gitlab-org--gitlab-foss/lib/gitlab/email/reply_parser.rb
2015-08-20 14:25:56 -07:00

91 lines
2.4 KiB
Ruby

# Inspired in great part by Discourse's Email::Receiver
module Gitlab
  module Email
    # Extracts the text a user wrote in an email reply, dropping the quoted
    # original message and any forwarded-header block that follows it.
    #
    # The message object is duck-typed: this class calls `multipart?`,
    # `content_type`, `text_part`, `html_part`, `charset` and `body` on it
    # (the comments in this file name Mail::Message / Mail::Part).
    class ReplyParser
      # Labels that mail clients print in front of the quoted original
      # message. This must be an explicit array rather than a %w literal:
      # %w splits on whitespace, which turns "Reply To" into "Reply" plus a
      # second "To", and that duplicate inflates the same-line label count
      # used in #quoted_reply_start?.
      REPLYING_HEADER_LABELS = ["From", "Sent", "To", "Subject", "Reply To", "Cc", "Bcc", "Date"].freeze

      # Matches any of the labels above immediately followed by a colon,
      # anywhere in a line.
      REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })

      attr_accessor :message

      # @param message the email to parse (see the class comment for the
      #   methods called on it)
      def initialize(message)
        @message = message
      end

      # Returns the reply portion of the message as a UTF-8 string.
      #
      # The body is trimmed at the first line that looks like the start of a
      # quoted message, then passed through EmailReplyParser.parse_reply.
      # The encoding the body had before trimming is re-applied before the
      # final conversion to UTF-8.
      #
      # @return [String] the reply text; "" when no usable body was found
      def execute
        body = select_body(message)
        encoding = body.encoding

        body = discourse_email_trimmer(body)
        body = EmailReplyParser.parse_reply(body)

        body.force_encoding(encoding).encode("UTF-8")
      end

      private

      # Picks the body text to parse, preferring plain text over HTML.
      #
      # For a multipart message the text part is returned as-is when it can
      # be decoded. Otherwise the HTML part (or the whole message, when its
      # content type is text/html) is run through HtmlCleaner, and as a last
      # resort the message itself is decoded.
      #
      # @param message the email, or email part, to read
      # @return [String] the selected body; "" when nothing could be decoded
      #   or when the decoded text still contains raw MIME markers
      def select_body(message)
        html = nil

        if message.multipart?
          text = fix_charset(message.text_part)
          # prefer plain text
          return text if text

          html = fix_charset(message.html_part)
        elsif message.content_type =~ /text\/html/
          html = fix_charset(message)
        end

        body = html ? HtmlCleaner.new(html).output_html : fix_charset(message)

        # fix_charset returns nil when decoding fails; report "no content"
        # instead of handing nil to #execute, which calls String methods on it.
        return "" if body.nil?

        # Certain trigger phrases that mean we didn't parse correctly
        return "" if body =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/

        body
      end

      # Force encoding to UTF-8 on a Mail::Message or Mail::Part.
      #
      # When the object declares a charset, its decoded body is tagged with
      # that charset ("utf8" is rewritten to "UTF-8" first) and transcoded to
      # UTF-8. Without a charset the body's string form is returned unchanged.
      #
      # @param object the message or part to decode; may be nil
      # @return [String, nil] the body text, or nil when object is nil or any
      #   StandardError is raised while decoding (deliberate best effort)
      def fix_charset(object)
        return nil if object.nil?

        if object.charset
          object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s
        else
          object.body.to_s
        end
      rescue
        nil
      end

      # Cuts the body at the first line that looks like the start of a quoted
      # message and returns everything before it, stripped of surrounding
      # whitespace.
      #
      # @param body [String] the selected body text
      # @return [String] the text preceding the quoted message; "" when the
      #   very first line already starts the quote or the body is empty
      def discourse_email_trimmer(body)
        lines = body.scrub.lines.to_a

        # Index of the first quoted line, i.e. the number of lines to keep.
        # Counting kept lines (rather than remembering the last kept index)
        # lets a quote header on line 0 yield an empty reply.
        cutoff = lines.each_index.find { |idx| quoted_reply_start?(lines, idx) } || lines.size

        lines.first(cutoff).join.strip
      end

      # Tells whether lines[idx] looks like the start of a quoted message.
      #
      # @param lines [Array<String>] all lines of the body
      # @param idx [Integer] index of the line to inspect
      # @return [Boolean]
      def quoted_reply_start?(lines, idx)
        line = lines[idx]

        # This one might be controversial but so many reply lines have years,
        # times and end with a colon.
        return true if line =~ /\d{4}/ && line =~ /\d:\d\d/ && line =~ /\:$/
        return true if line =~ /On \w+ \d+,? \d+,?.*wrote:/

        # Headers on subsequent lines: this line and the next two must all
        # contain a header label.
        following = lines[idx, 3]
        return true if following.size == 3 && following.all? { |l| l =~ REPLYING_HEADER_REGEX }

        # Headers on the same line
        REPLYING_HEADER_LABELS.count { |label| line.include?(label) } >= 3
      end
    end
  end
end