Corpus verification cleanup and benchmarking

This commit is contained in:
Jeremy Daer 2016-09-18 21:39:07 -07:00
parent 4843056bd6
commit 2de9ab1091
No known key found for this signature in database
GPG Key ID: AB8F6399D5C60664
3 changed files with 25 additions and 47 deletions

1
.gitignore vendored
View File

@ -10,4 +10,3 @@
/gems
/mail-*.gem
/rdoc
/spec/fixtures/emails/failed_emails/

View File

@ -1,2 +1,4 @@
# frozen_string_literal: true
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
require 'mail'
require 'mail/parsers'

View File

@ -1,3 +1,5 @@
require 'benchmark'
namespace :corpus do
task :load_mail do
@ -11,7 +13,7 @@ namespace :corpus do
task :verify_all => :load_mail do
root_of_corpus = ENV['LOCATION'] || 'corpus/spam'
@save_failures_to = ENV['SAVE_TO'] || 'spec/fixtures/emails/failed_emails'
@save_failures_to = ENV['SAVE_TO'] || 'corpus/failed_emails'
@failed_emails = []
@checked_count = 0
@ -24,40 +26,18 @@ namespace :corpus do
raise "\n\tSupply path to corpus: LOCATION=/path/to/corpus\n\n"
end
if @save_failures_to
if not File.directory?(@save_failures_to)
raise "\n\tPath '#{@save_failures_to}' is not a directory.\n\n"
end
@save_failures_to = File.expand_path(@save_failures_to)
puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
end
puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
puts "Checking '#{root_of_corpus}' directory (recursively)"
# we're tracking all the errors separately, don't clutter terminal
$stderr_backup = $stderr.dup
$stderr.reopen("/dev/null", "w")
STDERR = $stderr
dir_node(root_of_corpus)
# put our toys back now that we're done with them
$stderr = $stderr_backup.dup
STDERR = $stderr
elapsed = Benchmark.realtime { dir_node(root_of_corpus) }
puts "\n\n"
if @failed_emails.any?
report_failures_to_stdout
end
puts "Out of Total: #{@checked_count}"
if @save_failures_to
puts "Add SAVE_TO=/some/dir to save failed emails to for review.,"
puts "May result in a lot of saved files. Do a dry run first!\n\n"
else
puts "There are no errors"
end
puts 'Elapsed: %.2f ms' % (elapsed * 1000.0)
end
def dir_node(path)
@ -77,49 +57,46 @@ namespace :corpus do
end
end
end
# Handles a single file at +path+ by delegating directly to +verify+.
# NOTE(review): presumably invoked by dir_node for each regular file it
# encounters; the caller is not visible in this hunk -- confirm.
def file_node(path)
verify(path)
end
def verify(path)
result, message = parse_as_mail(path)
result, exception = parse_as_mail(path)
if result
print '.'
$stdout.flush
else
save_failure(path, message)
save_failure(path, exception)
print 'x'
end
end
def save_failure(path, message)
@failed_emails << [path, message]
def save_failure(path, exception)
@failed_emails << [path, exception]
if @save_failures_to
email_basename = File.basename(path)
failure_as_filename = message.gsub(/\W/, '_')
failure_as_filename = exception.message.gsub(/\W/, '_')
new_email_name = [failure_as_filename, email_basename].join("_")
FileUtils.mkdir_p(@save_failures_to)
File.open(File.join(@save_failures_to, new_email_name), 'w+') do |fh|
fh << File.read(path)
end
end
end
end
def parse_as_mail(path)
@checked_count += 1
begin
parsed_mail = Mail.read(path)
[true, nil]
rescue => e
[false, e.message]
end
Mail.read(path)
[true, nil]
rescue => e
[false, e]
end
def report_failures_to_stdout
@failed_emails.each do |failed|
puts "#{failed[0]} : #{failed[1]}"
@failed_emails.each do |path, exception|
puts "#{path}: #{exception.message}\n\t#{exception.backtrace.join("\n\t")}"
end
puts "Failed: #{@failed_emails.size}"
end
end