Corpus verification cleanup and benchmarking
This commit is contained in:
parent
4843056bd6
commit
2de9ab1091
|
@ -10,4 +10,3 @@
|
|||
/gems
|
||||
/mail-*.gem
|
||||
/rdoc
|
||||
/spec/fixtures/emails/failed_emails/
|
||||
|
|
|
@ -1,2 +1,4 @@
|
|||
# frozen_string_literal: true
|
||||
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
||||
require 'mail'
|
||||
require 'mail/parsers'
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
require 'benchmark'
|
||||
|
||||
namespace :corpus do
|
||||
|
||||
task :load_mail do
|
||||
|
@ -11,7 +13,7 @@ namespace :corpus do
|
|||
task :verify_all => :load_mail do
|
||||
|
||||
root_of_corpus = ENV['LOCATION'] || 'corpus/spam'
|
||||
@save_failures_to = ENV['SAVE_TO'] || 'spec/fixtures/emails/failed_emails'
|
||||
@save_failures_to = ENV['SAVE_TO'] || 'corpus/failed_emails'
|
||||
@failed_emails = []
|
||||
@checked_count = 0
|
||||
|
||||
|
@ -24,40 +26,18 @@ namespace :corpus do
|
|||
raise "\n\tSupply path to corpus: LOCATION=/path/to/corpus\n\n"
|
||||
end
|
||||
|
||||
if @save_failures_to
|
||||
if not File.directory?(@save_failures_to)
|
||||
raise "\n\tPath '#{@save_failures_to}' is not a directory.\n\n"
|
||||
end
|
||||
@save_failures_to = File.expand_path(@save_failures_to)
|
||||
puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
|
||||
end
|
||||
|
||||
puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
|
||||
puts "Checking '#{root_of_corpus}' directory (recursively)"
|
||||
|
||||
# we're tracking all the errors separately, don't clutter terminal
|
||||
$stderr_backup = $stderr.dup
|
||||
$stderr.reopen("/dev/null", "w")
|
||||
STDERR = $stderr
|
||||
|
||||
dir_node(root_of_corpus)
|
||||
|
||||
# put our toys back now that we're done with them
|
||||
$stderr = $stderr_backup.dup
|
||||
STDERR = $stderr
|
||||
elapsed = Benchmark.realtime { dir_node(root_of_corpus) }
|
||||
|
||||
puts "\n\n"
|
||||
|
||||
|
||||
if @failed_emails.any?
|
||||
report_failures_to_stdout
|
||||
end
|
||||
puts "Out of Total: #{@checked_count}"
|
||||
|
||||
if @save_failures_to
|
||||
puts "Add SAVE_TO=/some/dir to save failed emails to for review.,"
|
||||
puts "May result in a lot of saved files. Do a dry run first!\n\n"
|
||||
else
|
||||
puts "There are no errors"
|
||||
end
|
||||
puts 'Elapsed: %.2f ms' % (elapsed * 1000.0)
|
||||
end
|
||||
|
||||
def dir_node(path)
|
||||
|
@ -77,49 +57,46 @@ namespace :corpus do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
# Visit a single file in the corpus tree: hand it to verify, which
# parses it and records success or failure.
def file_node(path)
  verify path
end
|
||||
|
||||
|
||||
# Parse one email file and report progress on stdout:
# '.' for a successful parse, 'x' for a failure (which is also recorded
# via save_failure for the end-of-run report).
def verify(path)
  ok, exception = parse_as_mail(path)

  unless ok
    save_failure(path, exception)
    print 'x'
    return
  end

  print '.'
  $stdout.flush
end
|
||||
|
||||
# Record a failed email for the summary report and, when SAVE_TO is set,
# copy the offending file into @save_failures_to. The copy is renamed to
# "<sanitized error message>_<original basename>" so the failure cause is
# visible in the directory listing.
def save_failure(path, exception)
  @failed_emails << [path, exception]
  return unless @save_failures_to

  sanitized_reason = exception.message.gsub(/\W/, '_')
  destination = File.join(@save_failures_to,
                          [sanitized_reason, File.basename(path)].join("_"))
  FileUtils.mkdir_p(@save_failures_to)
  File.open(destination, 'w+') { |out| out << File.read(path) }
end
|
||||
end
|
||||
|
||||
# Attempt to parse the file at +path+ with Mail, bumping the running
# @checked_count either way.
#
# Returns a two-element array: [true, nil] on success, or
# [false, exception] when parsing raised.
def parse_as_mail(path)
  @checked_count += 1
  begin
    Mail.read(path)
    [true, nil]
  rescue StandardError => error
    [false, error]
  end
end
|
||||
|
||||
|
||||
# Print one line per failed email — its path, the exception message, and
# an indented backtrace — followed by a total failure count.
#
# Fix: Exception#backtrace is nil for an exception object that was never
# actually raised (e.g. constructed directly), and the old code would
# crash with NoMethodError on nil.join. Array(...) normalizes nil to []
# so the report always completes.
def report_failures_to_stdout
  @failed_emails.each do |path, exception|
    backtrace = Array(exception.backtrace).join("\n\t")
    puts "#{path}: #{exception.message}\n\t#{backtrace}"
  end
  puts "Failed: #{@failed_emails.size}"
end
|
||||
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue