# frozen_string_literal: true

module Banzai
  module Filter
    # HTML Filter to modify the attributes of external links.
    #
    # For every `<a>` with a non-empty `href` that is not considered internal
    # (see #internal_url?), the filter:
    #
    # - replaces IDN autolink text with its punycode form when
    #   `context[:emailable_links]` is set,
    # - escapes right-to-left override (RTLO) characters in the link text,
    # - adds a warning tooltip for IDN hosts and URLs carrying an encoded RTLO,
    # - sets `rel="nofollow noreferrer noopener"` and `target="_blank"` for
    #   the schemes listed in SCHEMES.
    class ExternalLinkFilter < HTML::Pipeline::Filter
      # Schemes that receive `rel`/`target` hardening. `nil` covers URIs
      # without a scheme as well as links that could not be parsed at all.
      SCHEMES = ['http', 'https', nil].freeze
      RTLO = "\u202E"
      ENCODED_RTLO = '%E2%80%AE'
      # Hex digits in percent-encoded triplets are case-insensitive, so
      # `%e2%80%ae` must be detected just like `%E2%80%AE`.
      ENCODED_RTLO_PATTERN = /#{Regexp.escape(ENCODED_RTLO)}/i.freeze

      # Processes every candidate link in `doc` and returns `doc`.
      def call
        links.each do |node|
          # URI.parse does stricter checking on the url than Addressable,
          # such as on `mailto:` links. Since we've been using it, do an
          # initial parse for validity and then use Addressable
          # for IDN support, etc
          strict_uri = uri_strict(node_src(node))
          parsed_uri = nil

          if strict_uri
            # Write back the strictly parsed form, then re-read it so that
            # Addressable sees exactly what is now stored on the node.
            node.set_attribute(node_src_attribute(node), strict_uri.to_s)
            parsed_uri = addressable_uri(node_src(node))
          end

          # Links that failed either parse stay `nil` and are handled as
          # external below.
          next if internal_url?(parsed_uri)

          punycode_autolink_node!(parsed_uri, node)
          sanitize_link_text!(node)
          add_malicious_tooltip!(parsed_uri, node)
          add_nofollow!(parsed_uri, node)
        end

        doc
      end

      private

      # if this is a link to a proxied image, then `src` is already the correct
      # proxied url, so work with the `data-canonical-src`
      #
      # Returns the name of the attribute holding the link target.
      def node_src_attribute(node)
        node['data-canonical-src'] ? 'data-canonical-src' : 'href'
      end

      # Returns the link target stored in the attribute chosen by
      # #node_src_attribute.
      def node_src(node)
        node[node_src_attribute(node)]
      end

      # Parses `href` with the standard library's URI.
      # Returns the parsed URI, or nil when URI raises a URI::Error.
      def uri_strict(href)
        URI.parse(href)
      rescue URI::Error
        nil
      end

      # Parses `href` with Addressable.
      # Returns an Addressable::URI, or nil when the URI is invalid.
      def addressable_uri(href)
        Addressable::URI.parse(href)
      rescue Addressable::URI::InvalidURIError
        nil
      end

      # All `<a>` elements in the document (including the root itself)
      # that carry a non-empty `href`.
      def links
        query = 'descendant-or-self::a[@href and not(@href = "")]'
        doc.xpath(query)
      end

      # Whether `uri` is internal: either relative, or on the same hostname
      # as the configured GitLab URL. A nil (unparseable) uri is never
      # internal.
      def internal_url?(uri)
        return false if uri.nil?
        # Relative URLs miss a hostname AND a scheme
        return true if !uri.hostname && !uri.scheme

        uri.hostname == internal_url.hostname
      end

      # Memoized parse of the configured GitLab URL.
      def internal_url
        @internal_url ||= URI.parse(Gitlab.config.gitlab.url)
      end

      # Only replace an autolink with an IDN with its punycode
      # version if we need emailable links. Otherwise let it
      # be shown normally and the tooltips will show the
      # punycode version.
      def punycode_autolink_node!(uri, node)
        return unless uri
        return unless context[:emailable_links]

        unencoded_uri_str = Addressable::URI.unencode(node_src(node))

        # Only autolinks qualify: the visible text must equal the target.
        return unless unencoded_uri_str == node.content && idn?(uri)

        node.content = uri.normalize.to_s
      end

      # escape any right-to-left (RTLO) characters in link text
      def sanitize_link_text!(node)
        node.inner_html = node.inner_html.gsub(RTLO, ENCODED_RTLO)
      end

      # If the domain is an international domain name (IDN),
      # let's expose with a tooltip in case it's intended
      # to be malicious. This is particularly useful for links
      # where the link text is not the same as the actual link.
      # We will continue to show the unicode version of the domain
      # in autolinked link text, which could contain emojis, etc.
      #
      # Also show the tooltip if the url contains the RTLO character,
      # as this is an indicator of a malicious link
      def add_malicious_tooltip!(uri, node)
        # Both predicates are falsy for a nil uri, so `uri` is present below.
        return unless idn?(uri) || has_encoded_rtlo?(uri)

        node.add_class('has-tooltip')
        node.set_attribute('title', uri.normalize.to_s)
      end

      # Sets `rel="nofollow noreferrer noopener"` and `target="_blank"` on
      # links whose scheme is listed in SCHEMES. A pre-existing
      # `rel="license"` is carried over.
      def add_nofollow!(uri, node)
        return unless SCHEMES.include?(uri&.scheme)

        # Remember the license marker before `rel` is overwritten.
        license = node.attribute('rel')&.value == 'license'

        node.set_attribute('rel', 'nofollow noreferrer noopener')
        node.kwattr_append('rel', 'license') if license
        node.set_attribute('target', '_blank')
      end

      # Whether the normalized host is punycode-encoded (starts with `xn--`).
      # Returns nil when `uri` or its host is nil.
      def idn?(uri)
        uri&.normalized_host&.start_with?('xn--')
      end

      # Whether the URI contains a percent-encoded RTLO character, in either
      # upper- or lower-case hex. Returns nil when `uri` is nil.
      def has_encoded_rtlo?(uri)
        uri&.to_s&.match?(ENCODED_RTLO_PATTERN)
      end
    end
  end
end