2009-05-29 22:46:34 -04:00
|
|
|
require 'set'
|
2010-11-19 12:29:33 -05:00
|
|
|
require 'active_support/core_ext/class/attribute'
|
2009-05-29 22:46:34 -04:00
|
|
|
|
2007-11-25 22:45:54 -05:00
|
|
|
module HTML
|
|
|
|
# Base sanitizer. Tokenizes the input with HTML::Tokenizer, hands every parsed
# node to #process_node and joins whatever that method appended to the result.
# On its own it re-emits every node unchanged; subclasses override
# #process_node (and optionally #tokenize / #sanitizeable?) to filter.
class Sanitizer
  # Returns +text+ untouched when #sanitizeable? says there is nothing to do,
  # otherwise the concatenation of all nodes kept by #process_node.
  #
  # text    - the String to sanitize (may be nil).
  # options - Hash passed through to #tokenize and #process_node.
  def sanitize(text, options = {})
    return text unless sanitizeable?(text)

    tokenize(text, options).join
  end

  # True only for a non-nil, non-empty text that contains a "<" somewhere.
  def sanitizeable?(text)
    return false if text.nil? || text.empty?

    !text.index("<").nil?
  end

  protected

  # Walks the token stream of +text+ and returns the Array that #process_node
  # filled while visiting each node.
  def tokenize(text, options)
    tokenizer = HTML::Tokenizer.new(text)
    result = []

    while (token = tokenizer.next)
      node = Node.parse(nil, 0, 0, token, false)
      process_node node, result, options
    end

    result
  end

  # Default node handler: keep every node as its string form.
  def process_node(node, result, options)
    result << node.to_s
  end
end
|
2010-08-14 01:13:00 -04:00
|
|
|
|
2007-11-25 22:45:54 -05:00
|
|
|
# Sanitizer that keeps only HTML::Text nodes, strips HTML comments from the
# result, and re-runs itself until the output stops changing.
class FullSanitizer < Sanitizer
  # Matches an HTML comment plus one directly following newline, so a line
  # that held only a comment disappears entirely.
  COMMENT_WITH_TRAILING_NEWLINE = /<!--(.*?)-->[\n]?/m

  # Sanitizes +text+ with the inherited implementation, removes comments, and
  # recurses on the result until a pass leaves it equal to its input.
  #
  # text    - the String to sanitize (may be nil).
  # options - Hash forwarded to the inherited #sanitize.
  def sanitize(text, options = {})
    result = super

    # Non-mutating gsub: +result+ may be the caller's own (possibly frozen)
    # string when the parent returned +text+ untouched. Only substitute when
    # a comment is actually present so that object is handed back as-is.
    if result && result =~ COMMENT_WITH_TRAILING_NEWLINE
      result = result.gsub(COMMENT_WITH_TRAILING_NEWLINE, "")
    end

    # Recurse - handle all dirty nested tags
    result == text ? result : sanitize(result, options)
  end

  # Keeps a node only when its class is exactly HTML::Text.
  def process_node(node, result, options)
    result << node.to_s if node.class == HTML::Text
  end
end
|
2010-08-14 01:13:00 -04:00
|
|
|
|
2007-11-25 22:45:54 -05:00
|
|
|
# FullSanitizer variant whose #process_node drops only the tags listed in
# +included_tags+ and keeps every other node as its string form.
class LinkSanitizer < FullSanitizer
  cattr_accessor :included_tags, :instance_writer => false
  self.included_tags = Set.new(%w(a href))

  # True only for a non-nil, non-empty text that contains "<a" or "<href"
  # and also a ">" somewhere.
  def sanitizeable?(text)
    return false if text.nil? || text.empty?

    link_start = text.index("<a") || text.index("<href")
    # Double negation keeps the strict true/false return value.
    !!(link_start && text.index(">"))
  end

  protected

  # Appends the node's string form unless it is an HTML::Tag whose name is
  # one of +included_tags+.
  def process_node(node, result, options)
    result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name)
  end
end
|
2010-08-14 01:13:00 -04:00
|
|
|
|
2007-11-25 22:45:54 -05:00
|
|
|
# Sanitizer driven by whitelists: tags not in the allowed set are dropped,
# attributes not in the allowed set (or carrying a disallowed protocol) are
# removed, style attributes go through #sanitize_css, and "<" in non-tag
# nodes is escaped.
class WhiteListSanitizer < Sanitizer
  [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
   :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
    class_attribute attr, :instance_writer => false
  end

  # A regular expression of the valid characters used to separate protocols like
  # the ':' in 'http://foo.com'. Besides the literal colon it also matches the
  # entity / percent encoded spellings (&#58 with any number of leading zeros,
  # &#x70, %3A and &#37;3A).
  self.protocol_separator = /:|(&#0*58)|(&#x70)|(%|&#37;)3A/

  # Specifies a Set of HTML attributes that can have URIs.
  self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

  # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
  # to just escaping harmless tags like <font>
  self.bad_tags = Set.new(%w(script))

  # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
  self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
    sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
    acronym a img blockquote del ins))

  # Specifies the default Set of html attributes that the #sanitize helper will leave
  # in the allowed tag.
  self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

  # Specifies the default Set of protocols that are accepted in the value of a
  # URI attribute (see +uri_attributes+).
  self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
    feed svn urn aim rsync tag ssh sftp rtsp afs))

  # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
  self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
    border-color border-left-color border-right-color border-top-color clear color cursor direction display
    elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
    overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
    width))

  # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
  self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
    collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
    nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

  # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
  self.shorthand_css_properties = Set.new(%w(background border margin padding))

  # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute.
  #
  # style - anything responding to #to_s.
  #
  # Returns a String of "prop: value;" declarations joined by single spaces,
  # or '' when the input does not pass the syntax gauntlet.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet - anchored with \A / \z so the WHOLE value has to match; with
    # ^ / $ a single matching (even empty) line inside a multi-line value was
    # enough to wave the entire string through.
    if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
        style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
      return ''
    end

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      if allowed_css_properties.include?(prop.downcase)
        clean << prop + ': ' + val + ';'
      elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
        # A shorthand property is kept only when every whitespace separated
        # part is an allowed keyword, a hex colour, an rgb( fragment or a
        # short number with an optional unit.
        acceptable = val.split.all? do |keyword|
          allowed_css_keywords.include?(keyword) ||
            keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << prop + ': ' + val + ';' if acceptable
      end
    end
    clean.join(' ')
  end

  protected

  # Seeds the per-call state in +options+ (note: the caller's Hash is
  # modified) and delegates to the inherited tokenizer loop.
  #   :parent     - stack of tag names, most recently opened first
  #   :attributes - attribute whitelist, defaults to +allowed_attributes+
  #   :tags       - tag whitelist, defaults to +allowed_tags+
  def tokenize(text, options)
    options[:parent] = []
    options[:attributes] ||= allowed_attributes
    options[:tags] ||= allowed_tags
    super
  end

  # Appends to +result+ either the (attribute-cleaned) tag node, the escaped
  # string of a non-tag node, or nil when the node is to be dropped.
  def process_node(node, result, options)
    result << case node
      when HTML::Tag
        # Maintain the open-tag stack so following nodes know their parent.
        if node.closing == :close
          options[:parent].shift
        else
          options[:parent].unshift node.name
        end

        process_attributes_for node, options

        options[:tags].include?(node.name) ? node : nil
      else
        # Anything directly inside a bad tag (e.g. script) is discarded;
        # everything else is kept with "<" escaped.
        bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
    end
  end

  # Cleans the attributes of +node+ in place: deletes the ones that are not in
  # options[:attributes] or that carry a disallowed protocol, runs 'style'
  # through #sanitize_css and re-escapes every other value.
  def process_attributes_for(node, options)
    return unless node.attributes

    node.attributes.keys.each do |attr_name|
      value = node.attributes[attr_name].to_s

      if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
        node.attributes.delete(attr_name)
      else
        node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
      end
    end
  end

  # Truthy when +attr_name+ is a URI attribute and +value+ contains a
  # (possibly entity / percent encoded) protocol separator whose leading part
  # is not one of +allowed_protocols+.
  def contains_bad_protocols?(attr_name, value)
    uri_attributes.include?(attr_name) &&
      # split can return [] (e.g. for ":"), hence the to_s before downcase;
      # the resulting empty protocol is not allowed and counts as bad.
      (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(%|&#37;)3A/ && !allowed_protocols.include?(value.split(protocol_separator).first.to_s.downcase))
  end
end
|
2007-12-23 16:07:20 -05:00
|
|
|
end
|