gitlab-org--gitlab-foss/lib/banzai/filter/external_link_filter.rb

132 lines
4.0 KiB
Ruby

# frozen_string_literal: true
module Banzai
module Filter
# HTML Filter to modify the attributes of external links
class ExternalLinkFilter < HTML::Pipeline::Filter
SCHEMES = ['http', 'https', nil].freeze
RTLO = "\u202E"
ENCODED_RTLO = '%E2%80%AE'
def call
links.each do |node|
# URI.parse does stricter checking on the url than Addressable,
# such as on `mailto:` links. Since we've been using it, do an
# initial parse for validity and then use Addressable
# for IDN support, etc
uri = uri_strict(node_src(node))
if uri
node.set_attribute(node_src_attribute(node), uri.to_s)
addressable_uri = addressable_uri(node_src(node))
else
addressable_uri = nil
end
unless internal_url?(addressable_uri)
punycode_autolink_node!(addressable_uri, node)
sanitize_link_text!(node)
add_malicious_tooltip!(addressable_uri, node)
add_nofollow!(addressable_uri, node)
end
end
doc
end
private
# if this is a link to a proxied image, then `src` is already the correct
# proxied url, so work with the `data-canonical-src`
def node_src_attribute(node)
node['data-canonical-src'] ? 'data-canonical-src' : 'href'
end
def node_src(node)
node[node_src_attribute(node)]
end
def uri_strict(href)
URI.parse(href)
rescue URI::Error
nil
end
def addressable_uri(href)
Addressable::URI.parse(href)
rescue Addressable::URI::InvalidURIError
nil
end
def links
query = 'descendant-or-self::a[@href and not(@href = "")]'
doc.xpath(query)
end
def internal_url?(uri)
return false if uri.nil?
# Relative URLs miss a hostname AND a scheme
return true if !uri.hostname && !uri.scheme
uri.hostname == internal_url.hostname
end
def internal_url
@internal_url ||= URI.parse(Gitlab.config.gitlab.url)
end
# Only replace an autolink with an IDN with it's punycode
# version if we need emailable links. Otherwise let it
# be shown normally and the tooltips will show the
# punycode version.
def punycode_autolink_node!(uri, node)
return unless uri
return unless context[:emailable_links]
unencoded_uri_str = Addressable::URI.unencode(node_src(node))
if unencoded_uri_str == node.content && idn?(uri)
node.content = uri.normalize
end
end
# escape any right-to-left (RTLO) characters in link text
def sanitize_link_text!(node)
node.inner_html = node.inner_html.gsub(RTLO, ENCODED_RTLO)
end
# If the domain is an international domain name (IDN),
# let's expose with a tooltip in case it's intended
# to be malicious. This is particularly useful for links
# where the link text is not the same as the actual link.
# We will continue to show the unicode version of the domain
# in autolinked link text, which could contain emojis, etc.
#
# Also show the tooltip if the url contains the RTLO character,
# as this is an indicator of a malicious link
def add_malicious_tooltip!(uri, node)
if idn?(uri) || has_encoded_rtlo?(uri)
node.add_class('has-tooltip')
node.set_attribute('title', uri.normalize)
end
end
def add_nofollow!(uri, node)
if SCHEMES.include?(uri&.scheme)
license = true if node.attribute('rel')&.value == 'license'
node.set_attribute('rel', 'nofollow noreferrer noopener')
node.kwattr_append('rel', 'license') if license
node.set_attribute('target', '_blank')
end
end
def idn?(uri)
uri&.normalized_host&.start_with?('xn--')
end
def has_encoded_rtlo?(uri)
uri&.to_s&.include?(ENCODED_RTLO)
end
end
end
end