Merge branch '41719-mr-title-fix' into 'master'

Render htmlentities correctly for links not supported by Rinku Closes #41719 See merge request gitlab-org/gitlab-ce!17180
2018-03-05 15:39:36 +00:00 · 2018-03-05 15:39:36 +00:00 · 089b2fb9fc
commit 089b2fb9fc
parent fa1debefe7 cb55bc3c07
6 changed files with 176 additions and 92 deletions
--- a/changelogs/unreleased/41719-mr-title-fix.yml
+++ b/changelogs/unreleased/41719-mr-title-fix.yml
@ -0,0 +1,5 @@
+---
+title: Render htmlentities correctly for links not supported by Rinku
+merge_request:
+author:
+type: fixed
--- a/lib/banzai/filter/autolink_filter.rb
+++ b/lib/banzai/filter/autolink_filter.rb
@ -25,8 +25,8 @@ module Banzai
      # period or comma for punctuation without those characters being included
      # in the generated link.
      #
-      # Rubular: http://rubular.com/r/cxjPyZc7Sb
-      LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://\S+)(?<!,|\.)}
+      # Rubular: http://rubular.com/r/JzPhi6DCZp
+      LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)}

      # Text matching LINK_PATTERN inside these elements will not be linked
      IGNORE_PARENTS = %w(a code kbd pre script style).to_set
@ -35,53 +35,19 @@ module Banzai
      TEXT_QUERY = %Q(descendant-or-self::text()[
        not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')})
        and contains(., '://')
-        and not(starts-with(., 'http'))
-        and not(starts-with(., 'ftp'))
      ]).freeze

+      PUNCTUATION_PAIRS = {
+        "'" => "'",
+        '"' => '"',
+        ')' => '(',
+        ']' => '[',
+        '}' => '{'
+      }.freeze
+
      def call
        return doc if context[:autolink] == false

-        rinku_parse
-        text_parse
-      end
-
-      private
-
-      # Run the text through Rinku as a first pass
-      #
-      # This will quickly autolink http(s) and ftp links.
-      #
-      # `@doc` will be re-parsed with the HTML String from Rinku.
-      def rinku_parse
-        # Convert the options from a Hash to a String that Rinku expects
-        options = tag_options(link_options)
-
-        # NOTE: We don't parse email links because it will erroneously match
-        # external Commit and CommitRange references.
-        #
-        # The final argument tells Rinku to link short URLs that don't include a
-        # period (e.g., http://localhost:3000/)
-        rinku = Rinku.auto_link(html, :urls, options, IGNORE_PARENTS.to_a, 1)
-
-        return if rinku == html
-
-        # Rinku returns a String, so parse it back to a Nokogiri::XML::Document
-        # for further processing.
-        @doc = parse_html(rinku)
-      end
-
-      # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
-      def contains_unsafe?(scheme)
-        return false unless scheme
-
-        scheme = scheme.strip.downcase
-        Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
-      end
-
-      # Autolinks any text matching LINK_PATTERN that Rinku didn't already
-      # replace
-      def text_parse
        doc.xpath(TEXT_QUERY).each do |node|
          content = node.to_html

@ -97,6 +63,16 @@ module Banzai
        doc
      end

+      private
+
+      # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
+      def contains_unsafe?(scheme)
+        return false unless scheme
+
+        scheme = scheme.strip.downcase
+        Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
+      end
+
      def autolink_match(match)
        # start by stripping out dangerous links
        begin
@ -112,12 +88,30 @@ module Banzai
        match.gsub!(/((?:&[\w#]+;)+)\z/, '')
        dropped = ($1 || '').html_safe

+        # To match the behaviour of Rinku, if the matched link ends with a
+        # closing part of a matched pair of punctuation, we remove that trailing
+        # character unless there are an equal number of closing and opening
+        # characters in the link.
+        if match.end_with?(*PUNCTUATION_PAIRS.keys)
+          close_character = match[-1]
+          close_count = match.count(close_character)
+          open_character = PUNCTUATION_PAIRS[close_character]
+          open_count = match.count(open_character)
+
+          if open_count != close_count || open_character == close_character
+            dropped += close_character
+            match = match[0..-2]
+          end
+        end
+
        options = link_options.merge(href: match)
-        content_tag(:a, match, options) + dropped
+        content_tag(:a, match.html_safe, options) + dropped
      end

      def autolink_filter(text)
-        text.gsub(LINK_PATTERN) { |match| autolink_match(match) }
+        Gitlab::StringRegexMarker.new(CGI.unescapeHTML(text), text.html_safe).mark(LINK_PATTERN) do |link, left:, right:|
+          autolink_match(link)
+        end
      end

      def link_options
--- a/lib/gitlab/string_range_marker.rb
+++ b/lib/gitlab/string_range_marker.rb
@ -14,7 +14,7 @@ module Gitlab
    end

    def mark(marker_ranges)
-      return rich_line unless marker_ranges
+      return rich_line unless marker_ranges&.any?

      if html_escaped
        rich_marker_ranges = []
--- a/lib/gitlab/string_regex_marker.rb
+++ b/lib/gitlab/string_regex_marker.rb
@ -1,13 +1,15 @@
 module Gitlab
  class StringRegexMarker < StringRangeMarker
    def mark(regex, group: 0, &block)
-      regex_match = raw_line.match(regex)
-      return rich_line unless regex_match
+      ranges = []

-      begin_index, end_index = regex_match.offset(group)
-      name_range = begin_index..(end_index - 1)
+      raw_line.scan(regex) do
+        begin_index, end_index = Regexp.last_match.offset(group)

-      super([name_range], &block)
+        ranges << (begin_index..(end_index - 1))
+      end
+
+      super(ranges, &block)
    end
  end
 end
--- a/spec/lib/banzai/filter/autolink_filter_spec.rb
+++ b/spec/lib/banzai/filter/autolink_filter_spec.rb
@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do
  include FilterSpecHelper

  let(:link) { 'http://about.gitlab.com/' }
+  let(:quotes) { ['"', "'"] }

  it 'does nothing when :autolink is false' do
    exp = act = link
@ -15,17 +16,7 @@ describe Banzai::Filter::AutolinkFilter do
    expect(filter(act).to_html).to eq exp
  end

-  context 'when the input contains no links' do
-    it 'does not parse_html back the rinku returned value' do
-      act = HTML::Pipeline.parse('<p>This text contains no links to autolink</p>')
-
-      expect_any_instance_of(described_class).not_to receive(:parse_html)
-
-      filter(act).to_html
-    end
-  end
-
-  context 'Rinku schemes' do
+  context 'Various schemes' do
    it 'autolinks http' do
      doc = filter("See #{link}")
      expect(doc.at_css('a').text).to eq link
@ -56,33 +47,27 @@ describe Banzai::Filter::AutolinkFilter do
      expect(doc.at_css('a')['href']).to eq link
    end

+    it 'autolinks multiple URLs' do
+      link1 = 'http://localhost:3000/'
+      link2 = 'http://google.com/'
+
+      doc = filter("See #{link1} and #{link2}")
+
+      found_links = doc.css('a')
+
+      expect(found_links.size).to eq(2)
+      expect(found_links[0].text).to eq(link1)
+      expect(found_links[0]['href']).to eq(link1)
+      expect(found_links[1].text).to eq(link2)
+      expect(found_links[1]['href']).to eq(link2)
+    end
+
    it 'accepts link_attr options' do
      doc = filter("See #{link}", link_attr: { class: 'custom' })

      expect(doc.at_css('a')['class']).to eq 'custom'
    end

-    described_class::IGNORE_PARENTS.each do |elem|
-      it "ignores valid links contained inside '#{elem}' element" do
-        exp = act = "<#{elem}>See #{link}</#{elem}>"
-        expect(filter(act).to_html).to eq exp
-      end
-    end
-
-    context 'when the input contains link' do
-      it 'does parse_html back the rinku returned value' do
-        act = HTML::Pipeline.parse("<p>See #{link}</p>")
-
-        expect_any_instance_of(described_class).to receive(:parse_html).at_least(:once).and_call_original
-
-        filter(act).to_html
-      end
-    end
-  end
-
-  context 'other schemes' do
-    let(:link) { 'foo://bar.baz/' }
-
    it 'autolinks smb' do
      link = 'smb:///Volumes/shared/foo.pdf'
      doc = filter("See #{link}")
@ -91,6 +76,21 @@ describe Banzai::Filter::AutolinkFilter do
      expect(doc.at_css('a')['href']).to eq link
    end

+    it 'autolinks multiple occurences of smb' do
+      link1 = 'smb:///Volumes/shared/foo.pdf'
+      link2 = 'smb:///Volumes/shared/bar.pdf'
+
+      doc = filter("See #{link1} and #{link2}")
+
+      found_links = doc.css('a')
+
+      expect(found_links.size).to eq(2)
+      expect(found_links[0].text).to eq(link1)
+      expect(found_links[0]['href']).to eq(link1)
+      expect(found_links[1].text).to eq(link2)
+      expect(found_links[1]['href']).to eq(link2)
+    end
+
    it 'autolinks irc' do
      link = 'irc://irc.freenode.net/git'
      doc = filter("See #{link}")
@ -132,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do
      expect(doc.at_css('a').text).to eq link
    end

+    it 'includes trailing punctuation when part of a balanced pair' do
+      described_class::PUNCTUATION_PAIRS.each do |close, open|
+        next if open.in?(quotes)
+
+        balanced_link = "#{link}#{open}abc#{close}"
+        balanced_actual = filter("See #{balanced_link}...")
+        unbalanced_link = "#{link}#{close}"
+        unbalanced_actual = filter("See #{unbalanced_link}...")
+
+        expect(balanced_actual.at_css('a').text).to eq(balanced_link)
+        expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
+        expect(unbalanced_actual.at_css('a').text).to eq(link)
+        expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
+      end
+    end
+
+    it 'removes trailing quotes' do
+      quotes.each do |quote|
+        balanced_link = "#{link}#{quote}abc#{quote}"
+        balanced_actual = filter("See #{balanced_link}...")
+        unbalanced_link = "#{link}#{quote}"
+        unbalanced_actual = filter("See #{unbalanced_link}...")
+
+        expect(balanced_actual.at_css('a').text).to eq(balanced_link[0...-1])
+        expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
+        expect(unbalanced_actual.at_css('a').text).to eq(link)
+        expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
+      end
+    end
+
+    it 'removes one closing punctuation mark when the punctuation in the link is unbalanced' do
+      complicated_link = "(#{link}(a'b[c'd]))'"
+      expected_complicated_link = %Q{(<a href="#{link}(a'b[c'd]))">#{link}(a'b[c'd]))</a>'}
+      actual = unescape(filter(complicated_link).to_html)
+
+      expect(actual).to eq(Rinku.auto_link(complicated_link))
+      expect(actual).to eq(expected_complicated_link)
+    end
+
    it 'does not include trailing HTML entities' do
      doc = filter("See &lt;&lt;&lt;#{link}&gt;&gt;&gt;")

@ -151,4 +190,29 @@ describe Banzai::Filter::AutolinkFilter do
      end
    end
  end
+
+  context 'when the link is inside a tag' do
+    %w[http rdar].each do |protocol|
+      it "renders text after the link correctly for #{protocol}" do
+        doc = filter(ERB::Util.html_escape_once("<#{protocol}://link><another>"))
+
+        expect(doc.children.last.text).to include('<another>')
+      end
+    end
+  end
+
+  # Rinku does not escape these characters in HTML attributes, but content_tag
+  # does. We don't care about that difference for these specs, though.
+  def unescape(html)
+    %w([ ] { }).each do |cgi_escape|
+      html.sub!(CGI.escape(cgi_escape), cgi_escape)
+    end
+
+    quotes.each do |html_escape|
+      html.sub!(CGI.escape_html(html_escape), html_escape)
+      html.sub!(CGI.escape(html_escape), CGI.escape_html(html_escape))
+    end
+
+    html
+  end
 end
--- a/spec/lib/gitlab/string_regex_marker_spec.rb
+++ b/spec/lib/gitlab/string_regex_marker_spec.rb
@ -2,17 +2,36 @@ require 'spec_helper'

 describe Gitlab::StringRegexMarker do
  describe '#mark' do
-    let(:raw)  { %{"name": "AFNetworking"} }
-    let(:rich) { %{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"AFNetworking"</span>}.html_safe }
-    subject do
-      described_class.new(raw, rich).mark(/"[^"]+":\s*"(?<name>[^"]+)"/, group: :name) do |text, left:, right:|
-        %{<a href="#">#{text}</a>}
+    context 'with a single occurrence' do
+      let(:raw)  { %{"name": "AFNetworking"} }
+      let(:rich) { %{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"AFNetworking"</span>}.html_safe }
+
+      subject do
+        described_class.new(raw, rich).mark(/"[^"]+":\s*"(?<name>[^"]+)"/, group: :name) do |text, left:, right:|
+          %{<a href="#">#{text}</a>}
+        end
+      end
+
+      it 'marks the match' do
+        expect(subject).to eq(%{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"<a href="#">AFNetworking</a>"</span>})
+        expect(subject).to be_html_safe
      end
    end

-    it 'marks the inline diffs' do
-      expect(subject).to eq(%{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"<a href="#">AFNetworking</a>"</span>})
-      expect(subject).to be_html_safe
+    context 'with multiple occurrences' do
+      let(:raw)  { %{a <b> <c> d} }
+      let(:rich) { %{a &lt;b&gt; &lt;c&gt; d}.html_safe }
+
+      subject do
+        described_class.new(raw, rich).mark(/<[a-z]>/) do |text, left:, right:|
+          %{<strong>#{text}</strong>}
+        end
+      end
+
+      it 'marks the matches' do
+        expect(subject).to eq(%{a <strong>&lt;b&gt;</strong> <strong>&lt;c&gt;</strong> d})
+        expect(subject).to be_html_safe
+      end
    end
  end
 end