2018-07-23 09:32:32 +00:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2015-04-27 22:54:13 +00:00
|
|
|
require 'uri'
|
|
|
|
|
2015-12-15 14:51:16 +00:00
|
|
|
module Banzai
|
|
|
|
module Filter
|
2015-04-27 22:54:13 +00:00
|
|
|
# HTML Filter for auto-linking URLs in HTML.
|
|
|
|
#
|
|
|
|
# Based on HTML::Pipeline::AutolinkFilter
|
|
|
|
#
|
2019-01-14 22:57:54 +00:00
|
|
|
# Note that our CommonMark parser, `commonmarker` (using the autolink extension)
|
|
|
|
# handles standard autolinking, like http/https. We detect additional
|
|
|
|
# schemes (smb, rdar, etc).
|
|
|
|
#
|
2015-04-27 22:54:13 +00:00
|
|
|
# Context options:
|
|
|
|
# :autolink - Boolean, skips all processing done by this filter when false
|
|
|
|
# :link_attr - Hash of attributes for the generated links
|
|
|
|
#
|
|
|
|
class AutolinkFilter < HTML::Pipeline::Filter
|
|
|
|
include ActionView::Helpers::TagHelper
|
Extract SanitizeNodeLink and apply to WikiLinkFilter
The SanitizationFilter was running before the WikiFilter. Since
WikiFilter can modify links, we could see links that _should_ be stopped
by SanitizationFilter being rendered on the page. I (kerrizor) had
previously addressed the bug in: https://gitlab.com/gitlab-org/gitlab-ee/commit/7bc971915bbeadb950bb0e1f13510bf3038229a4
However, an additional exploit was discovered after that was merged.
Working through the issue, we couldn't simply shuffle the order of
filters, due to some implicit assumptions about the order of filters, so
instead we've extracted the logic that sanitizes a Nokogiri-generated
Node object, and applied it to the WikiLinkFilter as well.
On moving filters around:
Once we start moving around filters, we get cascading failures; fix one,
another one crops up. Many of the existing filters in the WikiPipeline
chain seem to assume that other filters have already done their work,
and thus operate on a "transform anything that's left" basis;
WikiFilter, for instance, assumes any link it finds in the markdown
should be prepended with the wiki_base_path.. but if it does that, it
also turns `href="@user"` into `href="/path/to/wiki/@user"`, which the
UserReferenceFilter doesn't see as a user reference it needs to
transform into a user profile link. This is true for all the reference
filters in the WikiPipeline.
2019-07-26 13:41:11 +00:00
|
|
|
include Gitlab::Utils::SanitizeNodeLink
|
2015-04-27 22:54:13 +00:00
|
|
|
|
|
|
|
# Pattern to match text that should be autolinked.
|
|
|
|
#
|
|
|
|
# A URI scheme begins with a letter and may contain letters, numbers,
|
|
|
|
# plus, period and hyphen. Schemes are case-insensitive but we're being
|
|
|
|
# picky here and allowing only lowercase for autolinks.
|
|
|
|
#
|
|
|
|
# See http://en.wikipedia.org/wiki/URI_scheme
|
|
|
|
#
|
2018-03-23 15:51:28 +00:00
|
|
|
# The negative lookbehind ensures that users can paste a URL followed by
|
|
|
|
# punctuation without those characters being included in the generated
|
|
|
|
# link. It matches the behaviour of Rinku 2.0.1:
|
|
|
|
# https://github.com/vmg/rinku/blob/v2.0.1/ext/rinku/autolink.c#L65
|
2015-04-29 18:17:29 +00:00
|
|
|
#
|
2018-03-23 15:51:28 +00:00
|
|
|
# Rubular: http://rubular.com/r/nrL3r9yUiq
|
2019-05-05 10:19:14 +00:00
|
|
|
# Capture group 1 is the whole candidate: a lowercase letter, one or more of
# `a-z 0-9 + . -`, the literal `://`, then one or more characters that are
# neither whitespace nor `>`. The negative lookbehind after the group forces
# the match to backtrack until it no longer ends in `?`, `!`, `.`, `,` or `:`.
LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!\?|!|\.|,|:)}.freeze
|
2015-04-27 22:54:13 +00:00
|
|
|
|
2015-04-29 22:45:38 +00:00
|
|
|
# Text matching LINK_PATTERN inside these elements will not be linked
|
2015-04-27 22:54:13 +00:00
|
|
|
IGNORE_PARENTS = Set.new(%w[a code kbd pre script style])

# The XPath query used to find the text nodes to parse: every text node that
# contains "://" and has none of the IGNORE_PARENTS elements as an ancestor.
TEXT_QUERY = %Q(descendant-or-self::text()[
  not(#{IGNORE_PARENTS.map { |tag| "ancestor::#{tag}" }.join(' or ')})
  and contains(., '://')
])
|
2016-08-02 15:51:17 +00:00
|
|
|
|
2018-02-22 12:09:27 +00:00
|
|
|
# Maps each closing punctuation character (key) to the opening character
# that pairs with it (value). Quotes pair with themselves.
PUNCTUATION_PAIRS = [
  ["'", "'"],
  ['"', '"'],
  [')', '('],
  [']', '['],
  ['}', '{']
].to_h.freeze
|
|
|
|
|
2015-04-27 22:54:13 +00:00
|
|
|
# Entry point of the filter.
#
# Returns `doc` untouched when `context[:autolink]` is exactly `false`.
# Otherwise each node selected by TEXT_QUERY whose HTML matches LINK_PATTERN
# is run through `autolink_filter` and replaced with the result when that
# result differs from the node's original HTML. Returns `doc`.
def call
  return doc if context[:autolink] == false

  doc.xpath(TEXT_QUERY).each do |text_node|
    original_html = text_node.to_html
    next unless original_html.match?(LINK_PATTERN)

    linked_html = autolink_filter(original_html)

    # Leave the node alone when autolinking produced no change.
    text_node.replace(linked_html) unless linked_html == original_html
  end

  doc
end
|
|
|
|
|
2018-02-22 12:09:27 +00:00
|
|
|
private
|
|
|
|
|
2016-11-07 16:27:35 +00:00
|
|
|
# Builds the replacement markup for a single autolink candidate.
#
# match - String matched by LINK_PATTERN. It is never modified, so frozen
#         strings are accepted.
#
# Returns `match` itself when it cannot be parsed as a URI or when
# `safe_protocol?` rejects its scheme. Returns `uri.to_s` when the link
# cannot be percent-encoded. Otherwise returns the `<a>` element built by
# `content_tag`, followed by any trailing HTML entities or unbalanced
# closing punctuation that were split off the link.
def autolink_match(match)
  # start by stripping out dangerous links
  begin
    uri = Addressable::URI.parse(match)
    return match unless safe_protocol?(uri.scheme)
  rescue Addressable::URI::InvalidURIError
    return match
  end

  # Remove any trailing HTML entities and store them for appending
  # outside the link element. The entity must be marked HTML safe in
  # order to be output literally rather than escaped.
  #
  # `sub` works on a copy, so the caller's string is left untouched. The
  # pattern is anchored at \z and therefore matches at most once.
  link = match.sub(/((?:&[\w#]+;)+)\z/, '')
  dropped = (Regexp.last_match(1) || '').html_safe

  # To match the behaviour of Rinku, if the matched link ends with a
  # closing part of a matched pair of punctuation, we remove that trailing
  # character unless there are an equal number of closing and opening
  # characters in the link.
  if link.end_with?(*PUNCTUATION_PAIRS.keys)
    close_character = link[-1]
    close_count = link.count(close_character)
    open_character = PUNCTUATION_PAIRS[close_character]
    open_count = link.count(open_character)

    # Quotes are their own opening character, so a trailing quote is
    # always dropped.
    if open_count != close_count || open_character == close_character
      dropped += close_character
      link = link[0..-2]
    end
  end

  # Since this came from a Text node, make sure the new href is encoded.
  # `commonmarker` percent encodes the domains of links it handles, so
  # do the same (instead of using `normalized_encode`).
  begin
    href_safe = Addressable::URI.encode(link).html_safe
  rescue Addressable::URI::InvalidURIError
    return uri.to_s
  end

  html_safe_match = link.html_safe
  options = link_options.merge(href: href_safe)

  content_tag(:a, html_safe_match, options) + dropped
end
|
|
|
|
|
|
|
|
# Autolinks every LINK_PATTERN occurrence in `text`.
#
# A Gitlab::StringRegexMarker is built from the HTML-unescaped text and from
# `text` marked HTML safe. Each link it yields is passed to `autolink_match`,
# and that result is marked HTML safe before being handed back to the marker.
#
# Returns whatever the marker's `mark` call returns.
def autolink_filter(text)
  unescaped = CGI.unescapeHTML(text)
  marker = Gitlab::StringRegexMarker.new(unescaped, text.html_safe)

  marker.mark(LINK_PATTERN) do |link, left:, right:, mode:|
    autolink_match(link).html_safe
  end
end
|
|
|
|
|
|
|
|
# Attributes applied to every generated link.
#
# Returns `context[:link_attr]`, or an empty Hash when that entry is nil or
# false. The value is memoized in `@link_options`.
def link_options
  @link_options ||= begin
    configured = context[:link_attr]
    configured || {}
  end
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|