namespace :corpus do

task :load_mail do
  require File.expand_path('../../../spec/environment')
  require 'mail'
end

# Used to run parsing against an arbitrary corpus of email.
# For example: http://plg.uwaterloo.ca/~gvcormac/treccorpus/
desc "Provide a LOCATION=/some/dir to verify parsing in bulk, otherwise defaults"
task :verify_all => :load_mail do

  root_of_corpus    = ENV['LOCATION'] || 'corpus/spam'
  @save_failures_to = ENV['SAVE_TO']  || 'spec/fixtures/emails/failed_emails'
  @failed_emails    = []
  @checked_count    = 0

  if root_of_corpus
    root_of_corpus = File.expand_path(root_of_corpus)
    if not File.directory?(root_of_corpus)
      raise "\n\tPath '#{root_of_corpus}' is not a directory.\n\n"
    end
  else
    raise "\n\tSupply path to corpus: LOCATION=/path/to/corpus\n\n"
  end

  if @save_failures_to
    if not File.directory?(@save_failures_to)
      raise "\n\tPath '#{@save_failures_to}' is not a directory.\n\n"
    end
    @save_failures_to = File.expand_path(@save_failures_to)
    puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
  end

  puts "Checking '#{root_of_corpus}' directory (recursively)"

  # we're tracking all the errors separately, don't clutter terminal
  $stderr_backup = $stderr.dup
  $stderr.reopen("/dev/null", "w")
  STDERR = $stderr

  dir_node(root_of_corpus)

  # put our toys back now that we're done with them
  $stderr = $stderr_backup.dup
  STDERR = $stderr

  puts "\n\n"

  if @failed_emails.any?
    report_failures_to_stdout
  end
  puts "Out of Total: #{@checked_count}"

  if @save_failures_to
    puts "Add SAVE_TO=/some/dir to save failed emails to for review.,"
    puts "May result in a lot of saved files. Do a dry run first!\n\n"
  else
    puts "There are no errors"
  end
end

def dir_node(path)
  puts "\n\n"
  puts "Checking emails in '#{path}':"

  entries = Dir.entries(path)

  entries.each do |entry|
    next if ['.', '..'].include?(entry)
    full_path = File.join(path, entry)

    if File.file?(full_path)
      file_node(full_path)
    elsif File.directory?(full_path)
      dir_node(full_path)
    end
  end
end

def file_node(path)
  verify(path)
end

def verify(path)
  result, message = parse_as_mail(path)
  if result
    print '.'
    $stdout.flush
  else
    save_failure(path, message)
    print 'x'
  end
end

def save_failure(path, message)
  @failed_emails << [path, message]
  if @save_failures_to
    email_basename = File.basename(path)
    failure_as_filename = message.gsub(/\W/, '_')
    new_email_name = [failure_as_filename, email_basename].join("_")
    File.open(File.join(@save_failures_to, new_email_name), 'w+') do |fh|
      fh << File.read(path)
    end 
  end
end

def parse_as_mail(path)
  @checked_count += 1
  begin
    parsed_mail = Mail.read(path)
    [true, nil]
  rescue => e
    [false, e.message]
  end
end

def report_failures_to_stdout
  @failed_emails.each do |failed|
    puts "#{failed[0]} : #{failed[1]}"
  end
  puts "Failed: #{@failed_emails.size}"
end

end