Match Rinku's behaviour for closing punctuation in links

Rinku 2.0.0 (the version we use) will remove the last character of a link if
it's a closing part of a punctuation pair (different types of parentheses and
quotes), unless both of the below are true:

1. The matching pair has different start and end characters.
2. There are equal numbers of both in the matched string (they don't have to be
   balanced).
This commit is contained in:
Sean McGivern 2018-02-22 12:09:27 +00:00
parent 1a09d5cda8
commit cb55bc3c07
2 changed files with 95 additions and 34 deletions
lib/banzai/filter
spec/lib/banzai/filter

View file

@ -25,7 +25,7 @@ module Banzai
# period or comma for punctuation without those characters being included
# in the generated link.
#
# Rubular: http://rubular.com/r/cxjPyZc7Sb
# Rubular: http://rubular.com/r/JzPhi6DCZp
LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)}
# Text matching LINK_PATTERN inside these elements will not be linked
@ -37,23 +37,17 @@ module Banzai
and contains(., '://')
]).freeze
PUNCTUATION_PAIRS = {
"'" => "'",
'"' => '"',
')' => '(',
']' => '[',
'}' => '{'
}.freeze
def call
return doc if context[:autolink] == false
text_parse
end
private
# Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
def contains_unsafe?(scheme)
return false unless scheme
scheme = scheme.strip.downcase
Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
end
def text_parse
doc.xpath(TEXT_QUERY).each do |node|
content = node.to_html
@ -69,6 +63,16 @@ module Banzai
doc
end
private
# Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
def contains_unsafe?(scheme)
return false unless scheme
scheme = scheme.strip.downcase
Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
end
def autolink_match(match)
# start by stripping out dangerous links
begin
@ -84,6 +88,22 @@ module Banzai
match.gsub!(/((?:&[\w#]+;)+)\z/, '')
dropped = ($1 || '').html_safe
# To match the behaviour of Rinku, if the matched link ends with a
# closing part of a matched pair of punctuation, we remove that trailing
# character unless there are an equal number of closing and opening
# characters in the link.
if match.end_with?(*PUNCTUATION_PAIRS.keys)
close_character = match[-1]
close_count = match.count(close_character)
open_character = PUNCTUATION_PAIRS[close_character]
open_count = match.count(open_character)
if open_count != close_count || open_character == close_character
dropped += close_character
match = match[0..-2]
end
end
options = link_options.merge(href: match)
content_tag(:a, match.html_safe, options) + dropped
end

View file

@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do
include FilterSpecHelper
let(:link) { 'http://about.gitlab.com/' }
let(:quotes) { ['"', "'"] }
it 'does nothing when :autolink is false' do
exp = act = link
@ -15,16 +16,6 @@ describe Banzai::Filter::AutolinkFilter do
expect(filter(act).to_html).to eq exp
end
context 'when the input contains no links' do
it 'does not parse_html back the rinku returned value' do
act = HTML::Pipeline.parse('<p>This text contains no links to autolink</p>')
expect_any_instance_of(described_class).not_to receive(:parse_html)
filter(act).to_html
end
end
context 'Various schemes' do
it 'autolinks http' do
doc = filter("See #{link}")
@ -141,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do
expect(doc.at_css('a').text).to eq link
end
it 'includes trailing punctuation when part of a balanced pair' do
described_class::PUNCTUATION_PAIRS.each do |close, open|
next if open.in?(quotes)
balanced_link = "#{link}#{open}abc#{close}"
balanced_actual = filter("See #{balanced_link}...")
unbalanced_link = "#{link}#{close}"
unbalanced_actual = filter("See #{unbalanced_link}...")
expect(balanced_actual.at_css('a').text).to eq(balanced_link)
expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
expect(unbalanced_actual.at_css('a').text).to eq(link)
expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
end
end
it 'removes trailing quotes' do
quotes.each do |quote|
balanced_link = "#{link}#{quote}abc#{quote}"
balanced_actual = filter("See #{balanced_link}...")
unbalanced_link = "#{link}#{quote}"
unbalanced_actual = filter("See #{unbalanced_link}...")
expect(balanced_actual.at_css('a').text).to eq(balanced_link[0...-1])
expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
expect(unbalanced_actual.at_css('a').text).to eq(link)
expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
end
end
it 'removes one closing punctuation mark when the punctuation in the link is unbalanced' do
complicated_link = "(#{link}(a'b[c'd]))'"
expected_complicated_link = %Q{(<a href="#{link}(a'b[c'd]))">#{link}(a'b[c'd]))</a>'}
actual = unescape(filter(complicated_link).to_html)
expect(actual).to eq(Rinku.auto_link(complicated_link))
expect(actual).to eq(expected_complicated_link)
end
it 'does not include trailing HTML entities' do
doc = filter("See &lt;&lt;&lt;#{link}&gt;&gt;&gt;")
@ -162,16 +192,27 @@ describe Banzai::Filter::AutolinkFilter do
end
context 'when the link is inside a tag' do
it 'renders text after the link correctly for http' do
doc = filter(ERB::Util.html_escape_once("<http://link><another>"))
%w[http rdar].each do |protocol|
it "renders text after the link correctly for #{protocol}" do
doc = filter(ERB::Util.html_escape_once("<#{protocol}://link><another>"))
expect(doc.children.last.text).to include('<another>')
end
it 'renders text after the link correctly for not other protocol' do
doc = filter(ERB::Util.html_escape_once("<rdar://link><another>"))
expect(doc.children.last.text).to include('<another>')
end
end
# Rinku does not escape these characters in HTML attributes, but content_tag
# does. We don't care about that difference for these specs, though.
def unescape(html)
%w([ ] { }).each do |cgi_escape|
html.sub!(CGI.escape(cgi_escape), cgi_escape)
end
quotes.each do |html_escape|
html.sub!(CGI.escape_html(html_escape), html_escape)
html.sub!(CGI.escape(html_escape), CGI.escape_html(html_escape))
end
html
end
end