1
0
Fork 0
mirror of https://github.com/rails/rails.git synced 2022-11-09 12:12:34 -05:00

Remove html-scanner and its tests.

This commit is contained in:
Timm 2014-05-23 23:34:46 +02:00
parent 017ddc6e24
commit 33019a321c
13 changed files with 0 additions and 2454 deletions

View file

@ -1,22 +0,0 @@
require 'active_support/deprecation'
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/html-scanner"
module HTML
extend ActiveSupport::Autoload
eager_autoload do
autoload :Scanner, 'html/sanitizer'
autoload :CDATA, 'html/node'
autoload :Document, 'html/document'
autoload :FullSanitizer, 'html/sanitizer'
autoload :LinkSanitizer, 'html/sanitizer'
autoload :Node, 'html/node'
autoload :Sanitizer, 'html/sanitizer'
autoload :Selector, 'html/selector'
autoload :Tag, 'html/node'
autoload :Text, 'html/node'
autoload :Tokenizer, 'html/tokenizer'
autoload :Version, 'html/version'
autoload :WhiteListSanitizer, 'html/sanitizer'
end
end

View file

@ -1,68 +0,0 @@
require 'html/tokenizer'
require 'html/node'
require 'html/selector'
require 'html/sanitizer'
module HTML #:nodoc:
# A top-level HTML document. You give it a body of text, and it will parse that
# text into a tree of nodes.
class Document #:nodoc:
# The root of the parsed document.
attr_reader :root
# Create a new Document from the given text.
def initialize(text, strict=false, xml=false)
tokenizer = Tokenizer.new(text)
@root = Node.new(nil)
node_stack = [ @root ]
while token = tokenizer.next
node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token, strict)
node_stack.last.children << node unless node.tag? && node.closing == :close
if node.tag?
if node_stack.length > 1 && node.closing == :close
if node_stack.last.name == node.name
if node_stack.last.children.empty?
node_stack.last.children << Text.new(node_stack.last, node.line, node.position, "")
end
node_stack.pop
else
open_start = node_stack.last.position - 20
open_start = 0 if open_start < 0
close_start = node.position - 20
close_start = 0 if close_start < 0
msg = <<EOF.strip
ignoring attempt to close #{node_stack.last.name} with #{node.name}
opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
closed at byte #{node.position}, line #{node.line}
attributes at open: #{node_stack.last.attributes.inspect}
text around open: #{text[open_start,40].inspect}
text around close: #{text[close_start,40].inspect}
EOF
strict ? raise(msg) : warn(msg)
end
elsif !node.childless?(xml) && node.closing != :close
node_stack.push node
end
end
end
end
# Search the tree for (and return) the first node that matches the given
# conditions. The conditions are interpreted differently for different node
# types, see HTML::Text#find and HTML::Tag#find.
def find(conditions)
@root.find(conditions)
end
# Search the tree for (and return) all nodes that match the given
# conditions. The conditions are interpreted differently for different node
# types, see HTML::Text#find and HTML::Tag#find.
def find_all(conditions)
@root.find_all(conditions)
end
end
end

View file

@ -1,532 +0,0 @@
require 'strscan'
module HTML #:nodoc:
class Conditions < Hash #:nodoc:
def initialize(hash)
super()
hash = { :content => hash } unless Hash === hash
hash = keys_to_symbols(hash)
hash.each do |k,v|
case k
when :tag, :content then
# keys are valid, and require no further processing
when :attributes then
hash[k] = keys_to_strings(v)
when :parent, :child, :ancestor, :descendant, :sibling, :before,
:after
hash[k] = Conditions.new(v)
when :children
hash[k] = v = keys_to_symbols(v)
v.each do |key,value|
case key
when :count, :greater_than, :less_than
# keys are valid, and require no further processing
when :only
v[key] = Conditions.new(value)
else
raise "illegal key #{key.inspect} => #{value.inspect}"
end
end
else
raise "illegal key #{k.inspect} => #{v.inspect}"
end
end
update hash
end
private
def keys_to_strings(hash)
Hash[hash.keys.map {|k| [k.to_s, hash[k]]}]
end
def keys_to_symbols(hash)
Hash[hash.keys.map do |k|
raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym)
[k.to_sym, hash[k]]
end]
end
end
# The base class of all nodes, textual and otherwise, in an HTML document.
class Node #:nodoc:
# The array of children of this node. Not all nodes have children.
attr_reader :children
# The parent node of this node. All nodes have a parent, except for the
# root node.
attr_reader :parent
# The line number of the input where this node was begun
attr_reader :line
# The byte position in the input where this node was begun
attr_reader :position
# Create a new node as a child of the given parent.
def initialize(parent, line=0, pos=0)
@parent = parent
@children = []
@line, @position = line, pos
end
# Returns a textual representation of the node.
def to_s
@children.join()
end
# Returns false (subclasses must override this to provide specific matching
# behavior.) +conditions+ may be of any type.
def match(conditions)
false
end
# Search the children of this node for the first node for which #find
# returns non +nil+. Returns the result of the #find call that succeeded.
def find(conditions)
conditions = validate_conditions(conditions)
@children.each do |child|
node = child.find(conditions)
return node if node
end
nil
end
# Search for all nodes that match the given conditions, and return them
# as an array.
def find_all(conditions)
conditions = validate_conditions(conditions)
matches = []
matches << self if match(conditions)
@children.each do |child|
matches.concat child.find_all(conditions)
end
matches
end
# Returns +false+. Subclasses may override this if they define a kind of
# tag.
def tag?
false
end
def validate_conditions(conditions)
Conditions === conditions ? conditions : Conditions.new(conditions)
end
def ==(node)
return false unless self.class == node.class && children.size == node.children.size
equivalent = true
children.size.times do |i|
equivalent &&= children[i] == node.children[i]
end
equivalent
end
class <<self
def parse(parent, line, pos, content, strict=true)
if content !~ /^<\S/
Text.new(parent, line, pos, content)
else
scanner = StringScanner.new(content)
unless scanner.skip(/</)
if strict
raise "expected <"
else
return Text.new(parent, line, pos, content)
end
end
if scanner.skip(/!\[CDATA\[/)
unless scanner.skip_until(/\]\]>/)
if strict
raise "expected ]]> (got #{scanner.rest.inspect} for #{content})"
else
scanner.skip_until(/\Z/)
end
end
return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/<!\[CDATA\[/, ''))
end
closing = ( scanner.scan(/\//) ? :close : nil )
return Text.new(parent, line, pos, content) unless name = scanner.scan(/[^\s!>\/]+/)
name.downcase!
unless closing
scanner.skip(/\s*/)
attributes = {}
while attr = scanner.scan(/[-\w:]+/)
value = true
if scanner.scan(/\s*=\s*/)
if delim = scanner.scan(/['"]/)
value = ""
while text = scanner.scan(/[^#{delim}\\]+|./)
case text
when "\\" then
value << text
break if scanner.eos?
value << scanner.getch
when delim
break
else value << text
end
end
else
value = scanner.scan(/[^\s>\/]+/)
end
end
attributes[attr.downcase] = value
scanner.skip(/\s*/)
end
closing = ( scanner.scan(/\//) ? :self : nil )
end
unless scanner.scan(/\s*>/)
if strict
raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
else
# throw away all text until we find what we're looking for
scanner.skip_until(/>/) or scanner.terminate
end
end
Tag.new(parent, line, pos, name, attributes, closing)
end
end
end
end
# A node that represents text, rather than markup.
class Text < Node #:nodoc:
attr_reader :content
# Creates a new text node as a child of the given parent, with the given
# content.
def initialize(parent, line, pos, content)
super(parent, line, pos)
@content = content
end
# Returns the content of this node.
def to_s
@content
end
# Returns +self+ if this node meets the given conditions. Text nodes support
# conditions of the following kinds:
#
# * if +conditions+ is a string, it must be a substring of the node's
# content
# * if +conditions+ is a regular expression, it must match the node's
# content
# * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
# is either a string or a regexp, and which is interpreted as described
# above.
def find(conditions)
match(conditions) && self
end
# Returns non-+nil+ if this node meets the given conditions, or +nil+
# otherwise. See the discussion of #find for the valid conditions.
def match(conditions)
case conditions
when String
@content == conditions
when Regexp
@content =~ conditions
when Hash
conditions = validate_conditions(conditions)
# Text nodes only have :content, :parent, :ancestor
unless (conditions.keys - [:content, :parent, :ancestor]).empty?
return false
end
match(conditions[:content])
else
nil
end
end
def ==(node)
return false unless super
content == node.content
end
end
# A CDATA node is simply a text node with a specialized way of displaying
# itself.
class CDATA < Text #:nodoc:
def to_s
"<![CDATA[#{super}]]>"
end
end
# A Tag is any node that represents markup. It may be an opening tag, a
# closing tag, or a self-closing tag. It has a name, and may have a hash of
# attributes.
class Tag < Node #:nodoc:
# Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
attr_reader :closing
# Either +nil+, or a hash of attributes for this node.
attr_reader :attributes
# The name of this tag.
attr_reader :name
# Create a new node as a child of the given parent, using the given content
# to describe the node. It will be parsed and the node name, attributes and
# closing status extracted.
def initialize(parent, line, pos, name, attributes, closing)
super(parent, line, pos)
@name = name
@attributes = attributes
@closing = closing
end
# A convenience for obtaining an attribute of the node. Returns +nil+ if
# the node has no attributes.
def [](attr)
@attributes ? @attributes[attr] : nil
end
# Returns non-+nil+ if this tag can contain child nodes.
def childless?(xml = false)
return false if xml && @closing.nil?
!@closing.nil? ||
@name =~ /^(img|br|hr|link|meta|area|base|basefont|
col|frame|input|isindex|param)$/ox
end
# Returns a textual representation of the node
def to_s
if @closing == :close
"</#{@name}>"
else
s = "<#{@name}"
@attributes.each do |k,v|
s << " #{k}"
s << "=\"#{v}\"" if String === v
end
s << " /" if @closing == :self
s << ">"
@children.each { |child| s << child.to_s }
s << "</#{@name}>" if @closing != :self && !@children.empty?
s
end
end
# If either the node or any of its children meet the given conditions, the
# matching node is returned. Otherwise, +nil+ is returned. (See the
# description of the valid conditions in the +match+ method.)
def find(conditions)
match(conditions) && self || super
end
# Returns +true+, indicating that this node represents an HTML tag.
def tag?
true
end
# Returns +true+ if the node meets any of the given conditions. The
# +conditions+ parameter must be a hash of any of the following keys
# (all are optional):
#
# * <tt>:tag</tt>: the node name must match the corresponding value
# * <tt>:attributes</tt>: a hash. The node's values must match the
# corresponding values in the hash.
# * <tt>:parent</tt>: a hash. The node's parent must match the
# corresponding hash.
# * <tt>:child</tt>: a hash. At least one of the node's immediate children
# must meet the criteria described by the hash.
# * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
# meet the criteria described by the hash.
# * <tt>:descendant</tt>: a hash. At least one of the node's descendants
# must meet the criteria described by the hash.
# * <tt>:sibling</tt>: a hash. At least one of the node's siblings must
# meet the criteria described by the hash.
# * <tt>:after</tt>: a hash. The node must be after any sibling meeting
# the criteria described by the hash, and at least one sibling must match.
# * <tt>:before</tt>: a hash. The node must be before any sibling meeting
# the criteria described by the hash, and at least one sibling must match.
# * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
# keys:
# ** <tt>:count</tt>: either a number or a range which must equal (or
# include) the number of children that match.
# ** <tt>:less_than</tt>: the number of matching children must be less than
# this number.
# ** <tt>:greater_than</tt>: the number of matching children must be
# greater than this number.
# ** <tt>:only</tt>: another hash consisting of the keys to use
# to match on the children, and only matching children will be
# counted.
#
# Conditions are matched using the following algorithm:
#
# * if the condition is a string, it must be a substring of the value.
# * if the condition is a regexp, it must match the value.
# * if the condition is a number, the value must match number.to_s.
# * if the condition is +true+, the value must not be +nil+.
# * if the condition is +false+ or +nil+, the value must be +nil+.
#
# Usage:
#
# # test if the node is a "span" tag
# node.match tag: "span"
#
# # test if the node's parent is a "div"
# node.match parent: { tag: "div" }
#
# # test if any of the node's ancestors are "table" tags
# node.match ancestor: { tag: "table" }
#
# # test if any of the node's immediate children are "em" tags
# node.match child: { tag: "em" }
#
# # test if any of the node's descendants are "strong" tags
# node.match descendant: { tag: "strong" }
#
# # test if the node has between 2 and 4 span tags as immediate children
# node.match children: { count: 2..4, only: { tag: "span" } }
#
# # get funky: test to see if the node is a "div", has a "ul" ancestor
# # and an "li" parent (with "class" = "enum"), and whether or not it has
# # a "span" descendant that contains # text matching /hello world/:
# node.match tag: "div",
# ancestor: { tag: "ul" },
# parent: { tag: "li",
# attributes: { class: "enum" } },
# descendant: { tag: "span",
# child: /hello world/ }
def match(conditions)
conditions = validate_conditions(conditions)
# check content of child nodes
if conditions[:content]
if children.empty?
return false unless match_condition("", conditions[:content])
else
return false unless children.find { |child| child.match(conditions[:content]) }
end
end
# test the name
return false unless match_condition(@name, conditions[:tag]) if conditions[:tag]
# test attributes
(conditions[:attributes] || {}).each do |key, value|
return false unless match_condition(self[key], value)
end
# test parent
return false unless parent.match(conditions[:parent]) if conditions[:parent]
# test children
return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child]
# test ancestors
if conditions[:ancestor]
return false unless catch :found do
p = self
throw :found, true if p.match(conditions[:ancestor]) while p = p.parent
end
end
# test descendants
if conditions[:descendant]
return false unless children.find do |child|
# test the child
child.match(conditions[:descendant]) ||
# test the child's descendants
child.match(:descendant => conditions[:descendant])
end
end
# count children
if opts = conditions[:children]
matches = children.select do |c|
(c.kind_of?(HTML::Tag) and (c.closing == :self or ! c.childless?))
end
matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
opts.each do |key, value|
next if key == :only
case key
when :count
if Integer === value
return false if matches.length != value
else
return false unless value.include?(matches.length)
end
when :less_than
return false unless matches.length < value
when :greater_than
return false unless matches.length > value
else raise "unknown count condition #{key}"
end
end
end
# test siblings
if conditions[:sibling] || conditions[:before] || conditions[:after]
siblings = parent ? parent.children : []
self_index = siblings.index(self)
if conditions[:sibling]
return false unless siblings.detect do |s|
s != self && s.match(conditions[:sibling])
end
end
if conditions[:before]
return false unless siblings[self_index+1..-1].detect do |s|
s != self && s.match(conditions[:before])
end
end
if conditions[:after]
return false unless siblings[0,self_index].detect do |s|
s != self && s.match(conditions[:after])
end
end
end
true
end
def ==(node)
return false unless super
return false unless closing == node.closing && self.name == node.name
attributes == node.attributes
end
private
# Match the given value to the given condition.
def match_condition(value, condition)
case condition
when String
value && value == condition
when Regexp
value && value.match(condition)
when Numeric
value == condition.to_s
when true
!value.nil?
when false, nil
value.nil?
else
false
end
end
end
end

View file

@ -1,202 +0,0 @@
require 'set'
require 'cgi'
require 'active_support/core_ext/module/attribute_accessors'
module HTML
module Scanner
def full_sanitizer
HTML::FullSanitizer
end
def link_sanitizer
HTML::LinkSanitizer
end
def white_list_sanitizer
HTML::WhiteListSanitizer
end
end
class Sanitizer
def sanitize(text, options = {})
validate_options(options)
return text unless sanitizeable?(text)
tokenize(text, options).join
end
def sanitizeable?(text)
!(text.nil? || text.empty? || !text.index("<"))
end
protected
def tokenize(text, options)
tokenizer = HTML::Tokenizer.new(text)
result = []
while token = tokenizer.next
node = Node.parse(nil, 0, 0, token, false)
process_node node, result, options
end
result
end
def process_node(node, result, options)
result << node.to_s
end
def validate_options(options)
if options[:tags] && !options[:tags].is_a?(Enumerable)
raise ArgumentError, "You should pass :tags as an Enumerable"
end
if options[:attributes] && !options[:attributes].is_a?(Enumerable)
raise ArgumentError, "You should pass :attributes as an Enumerable"
end
end
end
class FullSanitizer < Sanitizer
def sanitize(text, options = {})
result = super
# strip any comments, and if they have a newline at the end (ie. line with
# only a comment) strip that too
result = result.gsub(/<!--(.*?)-->[\n]?/m, "") if (result && result =~ /<!--(.*?)-->[\n]?/m)
# Recurse - handle all dirty nested tags
result == text ? result : sanitize(result, options)
end
def process_node(node, result, options)
result << node.to_s if node.class == HTML::Text
end
end
class LinkSanitizer < FullSanitizer
cattr_accessor :included_tags, :instance_writer => false
self.included_tags = Set.new(%w(a href))
def sanitizeable?(text)
!(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">")))
end
protected
def process_node(node, result, options)
result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name)
end
end
class WhiteListSanitizer < Sanitizer
[:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
:allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
class_attribute attr, :instance_writer => false
end
# A regular expression of the valid characters used to separate protocols like
# the ':' in 'http://foo.com'
self.protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i
# Specifies a Set of HTML attributes that can have URIs.
self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
# Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
# to just escaping harmless tags like &lt;font&gt;
self.bad_tags = Set.new(%w(script))
# Specifies the default Set of tags that the #sanitize helper will allow unscathed.
self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
acronym a img blockquote del ins))
# Specifies the default Set of html attributes that the #sanitize helper will leave
# in the allowed tag.
self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
# Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
feed svn urn aim rsync tag ssh sftp rtsp afs))
# Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
border-color border-left-color border-right-color border-top-color clear color cursor direction display
elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
width))
# Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
# Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
self.shorthand_css_properties = Set.new(%w(background border margin padding))
# Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute
def sanitize_css(style)
# disallow urls
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
# gauntlet
if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
return ''
end
clean = []
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
if allowed_css_properties.include?(prop.downcase)
clean << prop + ': ' + val + ';'
elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
unless val.split().any? do |keyword|
!allowed_css_keywords.include?(keyword) &&
keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
end
clean << prop + ': ' + val + ';'
end
end
end
clean.join(' ')
end
protected
def tokenize(text, options)
options[:parent] = []
options[:attributes] ||= allowed_attributes
options[:tags] ||= allowed_tags
super
end
def process_node(node, result, options)
result << case node
when HTML::Tag
if node.closing == :close
options[:parent].shift
else
options[:parent].unshift node.name
end
process_attributes_for node, options
options[:tags].include?(node.name) ? node : nil
else
bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
end
end
def process_attributes_for(node, options)
return unless node.attributes
node.attributes.keys.each do |attr_name|
value = node.attributes[attr_name].to_s
if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
node.attributes.delete(attr_name)
else
node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
end
end
end
def contains_bad_protocols?(attr_name, value)
uri_attributes.include?(attr_name) &&
(value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i && !allowed_protocols.include?(value.split(protocol_separator).first.downcase.strip))
end
end
end

View file

@ -1,830 +0,0 @@
#--
# Copyright (c) 2006 Assaf Arkin (http://labnotes.org)
# Under MIT and/or CC By license.
#++
module HTML
# Selects HTML elements using CSS 2 selectors.
#
# The +Selector+ class uses CSS selector expressions to match and select
# HTML elements.
#
# For example:
# selector = HTML::Selector.new "form.login[action=/login]"
# creates a new selector that matches any +form+ element with the class
# +login+ and an attribute +action+ with the value <tt>/login</tt>.
#
# === Matching Elements
#
# Use the #match method to determine if an element matches the selector.
#
# For simple selectors, the method returns an array with that element,
# or +nil+ if the element does not match. For complex selectors (see below)
# the method returns an array with all matched elements, of +nil+ if no
# match found.
#
# For example:
# if selector.match(element)
# puts "Element is a login form"
# end
#
# === Selecting Elements
#
# Use the #select method to select all matching elements starting with
# one element and going through all children in depth-first order.
#
# This method returns an array of all matching elements, an empty array
# if no match is found
#
# For example:
# selector = HTML::Selector.new "input[type=text]"
# matches = selector.select(element)
# matches.each do |match|
# puts "Found text field with name #{match.attributes['name']}"
# end
#
# === Expressions
#
# Selectors can match elements using any of the following criteria:
# * <tt>name</tt> -- Match an element based on its name (tag name).
# For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
# to match any element.
# * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
# <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
# * <tt>.class</tt> -- Match an element based on its class name, all
# class names if more than one specified.
# * <tt>[attr]</tt> -- Match an element that has the specified attribute.
# * <tt>[attr=value]</tt> -- Match an element that has the specified
# attribute and value. (More operators are supported see below)
# * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
# such as <tt>:nth-child</tt> and <tt>:empty</tt>.
# * <tt>:not(expr)</tt> -- Match an element that does not match the
# negation expression.
#
# When using a combination of the above, the element name comes first
# followed by identifier, class names, attributes, pseudo classes and
# negation in any order. Do not separate these parts with spaces!
# Space separation is used for descendant selectors.
#
# For example:
# selector = HTML::Selector.new "form.login[action=/login]"
# The matched element must be of type +form+ and have the class +login+.
# It may have other classes, but the class +login+ is required to match.
# It must also have an attribute called +action+ with the value
# <tt>/login</tt>.
#
# This selector will match the following element:
# <form class="login form" method="post" action="/login">
# but will not match the element:
# <form method="post" action="/logout">
#
# === Attribute Values
#
# Several operators are supported for matching attributes:
# * <tt>name</tt> -- The element must have an attribute with that name.
# * <tt>name=value</tt> -- The element must have an attribute with that
# name and value.
# * <tt>name^=value</tt> -- The attribute value must start with the
# specified value.
# * <tt>name$=value</tt> -- The attribute value must end with the
# specified value.
# * <tt>name*=value</tt> -- The attribute value must contain the
# specified value.
# * <tt>name~=word</tt> -- The attribute value must contain the specified
# word (space separated).
# * <tt>name|=word</tt> -- The attribute value must start with specified
# word.
#
# For example, the following two selectors match the same element:
# #my_id
# [id=my_id]
# and so do the following two selectors:
# .my_class
# [class~=my_class]
#
# === Alternatives, siblings, children
#
# Complex selectors use a combination of expressions to match elements:
# * <tt>expr1 expr2</tt> -- Match any element against the second expression
# if it has some parent element that matches the first expression.
# * <tt>expr1 > expr2</tt> -- Match any element against the second expression
# if it is the child of an element that matches the first expression.
# * <tt>expr1 + expr2</tt> -- Match any element against the second expression
# if it immediately follows an element that matches the first expression.
# * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
# that comes after an element that matches the first expression.
# * <tt>expr1, expr2</tt> -- Match any element against the first expression,
# or against the second expression.
#
# Since children and sibling selectors may match more than one element given
# the first element, the #match method may return more than one match.
#
# === Pseudo classes
#
# Pseudo classes were introduced in CSS 3. They are most often used to select
# elements in a given position:
# * <tt>:root</tt> -- Match the element only if it is the root element
# (no parent element).
# * <tt>:empty</tt> -- Match the element only if it has no child elements,
# and no text content.
# * <tt>:content(string)</tt> -- Match the element only if it has <tt>string</tt>
# as its text content (ignoring leading and trailing whitespace).
# * <tt>:only-child</tt> -- Match the element if it is the only child (element)
# of its parent element.
# * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
# of its parent element and its type.
# * <tt>:first-child</tt> -- Match the element if it is the first child (element)
# of its parent element.
# * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
# of its parent element of its type.
# * <tt>:last-child</tt> -- Match the element if it is the last child (element)
# of its parent element.
# * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
# of its parent element of its type.
# * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
# of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
# * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
# in each group of <tt>a</tt> child elements of its parent element.
# * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
# in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
# elements of its parent element.
# * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
# Same as <tt>:nth-child(2n+1)</tt>.
# * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
# fourth). Same as <tt>:nth-child(2n+2)</tt>.
# * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
# * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
# * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
# only elements of its type.
# * <tt>:not(selector)</tt> -- Match the element only if the element does not
# match the simple selector.
#
# As you can see, <tt>:nth-child</tt> pseudo class and its variant can get quite
# tricky and the CSS specification doesn't do a much better job explaining it.
# But after reading the examples and trying a few combinations, it's easy to
# figure out.
#
# For example:
# table tr:nth-child(odd)
# Selects every second row in the table starting with the first one.
#
# div p:nth-child(4)
# Selects the fourth paragraph in the +div+, but not if the +div+ contains
# other elements, since those are also counted.
#
# div p:nth-of-type(4)
# Selects the fourth paragraph in the +div+, counting only paragraphs, and
# ignoring all other elements.
#
# div p:nth-of-type(-n+4)
# Selects the first four paragraphs, ignoring all others.
#
# And you can always select an element that matches one set of rules but
# not another using <tt>:not</tt>. For example:
# p:not(.post)
# Matches all paragraphs that do not have the class <tt>.post</tt>.
#
# === Substitution Values
#
# You can use substitution with identifiers, class names and element values.
# A substitution takes the form of a question mark (<tt>?</tt>) and uses the
# next value in the argument list following the CSS expression.
#
# The substitution value may be a string or a regular expression. All other
# values are converted to strings.
#
# For example:
# selector = HTML::Selector.new "#?", /^\d+$/
# matches any element whose identifier consists of one or more digits.
#
# See http://www.w3.org/TR/css3-selectors/
class Selector
# An invalid selector.
class InvalidSelectorError < StandardError #:nodoc:
end
class << self
# :call-seq:
# Selector.for_class(cls) => selector
#
# Creates a new selector for the given class name.
def for_class(cls)
self.new([".?", cls])
end
# :call-seq:
# Selector.for_id(id) => selector
#
# Creates a new selector for the given id.
def for_id(id)
self.new(["#?", id])
end
end
# :call-seq:
# Selector.new(string, [values ...]) => selector
#
# Creates a new selector from a CSS 2 selector expression.
#
# The first argument is the selector expression. All other arguments
# are used for value substitution.
#
# Throws InvalidSelectorError is the selector expression is invalid.
def initialize(selector, *values)
raise ArgumentError, "CSS expression cannot be empty" if selector.empty?
@source = ""
values = values[0] if values.size == 1 && values[0].is_a?(Array)
# We need a copy to determine if we failed to parse, and also
# preserve the original pass by-ref statement.
statement = selector.strip.dup
# Create a simple selector, along with negation.
simple_selector(statement, values).each { |name, value| instance_variable_set("@#{name}", value) }
@alternates = []
@depends = nil
# Alternative selector.
if statement.sub!(/^\s*,\s*/, "")
second = Selector.new(statement, values)
@alternates << second
# If there are alternate selectors, we group them in the top selector.
if alternates = second.instance_variable_get(:@alternates)
second.instance_variable_set(:@alternates, [])
@alternates.concat alternates
end
@source << " , " << second.to_s
# Sibling selector: create a dependency into second selector that will
# match element immediately following this one.
elsif statement.sub!(/^\s*\+\s*/, "")
second = next_selector(statement, values)
@depends = lambda do |element, first|
if element = next_element(element)
second.match(element, first)
end
end
@source << " + " << second.to_s
# Adjacent selector: create a dependency into second selector that will
# match all elements following this one.
elsif statement.sub!(/^\s*~\s*/, "")
second = next_selector(statement, values)
@depends = lambda do |element, first|
matches = []
while element = next_element(element)
if subset = second.match(element, first)
if first && !subset.empty?
matches << subset.first
break
else
matches.concat subset
end
end
end
matches.empty? ? nil : matches
end
@source << " ~ " << second.to_s
# Child selector: create a dependency into second selector that will
# match a child element of this one.
elsif statement.sub!(/^\s*>\s*/, "")
second = next_selector(statement, values)
@depends = lambda do |element, first|
matches = []
element.children.each do |child|
if child.tag? && subset = second.match(child, first)
if first && !subset.empty?
matches << subset.first
break
else
matches.concat subset
end
end
end
matches.empty? ? nil : matches
end
@source << " > " << second.to_s
# Descendant selector: create a dependency into second selector that
# will match all descendant elements of this one. Note,
elsif statement =~ /^\s+\S+/ && statement != selector
second = next_selector(statement, values)
@depends = lambda do |element, first|
matches = []
stack = element.children.reverse
while node = stack.pop
next unless node.tag?
if subset = second.match(node, first)
if first && !subset.empty?
matches << subset.first
break
else
matches.concat subset
end
elsif children = node.children
stack.concat children.reverse
end
end
matches.empty? ? nil : matches
end
@source << " " << second.to_s
else
# The last selector is where we check that we parsed
# all the parts.
unless statement.empty? || statement.strip.empty?
raise ArgumentError, "Invalid selector: #{statement}"
end
end
end
# :call-seq:
# match(element, first?) => array or nil
#
# Matches an element against the selector.
#
# For a simple selector this method returns an array with the
# element if the element matches, nil otherwise.
#
# For a complex selector (sibling and descendant) this method
# returns an array with all matching elements, nil if no match is
# found.
#
# Use +first_only=true+ if you are only interested in the first element.
#
# For example:
# if selector.match(element)
# puts "Element is a login form"
# end
def match(element, first_only = false)
# Match element if no element name or element name same as element name
if matched = (!@tag_name || @tag_name == element.name)
# No match if one of the attribute matches failed
for attr in @attributes
if element.attributes[attr[0]] !~ attr[1]
matched = false
break
end
end
end
# Pseudo class matches (nth-child, empty, etc).
if matched
for pseudo in @pseudo
unless pseudo.call(element)
matched = false
break
end
end
end
# Negation. Same rules as above, but we fail if a match is made.
if matched && @negation
for negation in @negation
if negation[:tag_name] == element.name
matched = false
else
for attr in negation[:attributes]
if element.attributes[attr[0]] =~ attr[1]
matched = false
break
end
end
end
if matched
for pseudo in negation[:pseudo]
if pseudo.call(element)
matched = false
break
end
end
end
break unless matched
end
end
# If element matched but depends on another element (child,
# sibling, etc), apply the dependent matches instead.
if matched && @depends
matches = @depends.call(element, first_only)
else
matches = matched ? [element] : nil
end
# If this selector is part of the group, try all the alternative
# selectors (unless first_only).
if !first_only || !matches
@alternates.each do |alternate|
break if matches && first_only
if subset = alternate.match(element, first_only)
if matches
matches.concat subset
else
matches = subset
end
end
end
end
matches
end
# :call-seq:
# select(root) => array
#
# Selects and returns an array with all matching elements, beginning
# with one node and traversing through all children depth-first.
# Returns an empty array if no match is found.
#
# The root node may be any element in the document, or the document
# itself.
#
# For example:
# selector = HTML::Selector.new "input[type=text]"
# matches = selector.select(element)
# matches.each do |match|
# puts "Found text field with name #{match.attributes['name']}"
# end
def select(root)
matches = []
stack = [root]
while node = stack.pop
if node.tag? && subset = match(node, false)
subset.each do |match|
matches << match unless matches.any? { |item| item.equal?(match) }
end
elsif children = node.children
stack.concat children.reverse
end
end
matches
end
# Similar to #select but returns the first matching element. Returns +nil+
# if no element matches the selector.
def select_first(root)
stack = [root]
while node = stack.pop
if node.tag? && subset = match(node, true)
return subset.first if !subset.empty?
elsif children = node.children
stack.concat children.reverse
end
end
nil
end
def to_s #:nodoc:
@source
end
# Returns the next element after this one. Skips sibling text nodes.
#
# With the +name+ argument, returns the next element with that name,
# skipping other sibling elements.
def next_element(element, name = nil)
if siblings = element.parent.children
found = false
siblings.each do |node|
if node.equal?(element)
found = true
elsif found && node.tag?
return node if (name.nil? || node.name == name)
end
end
end
nil
end
protected
# Creates a simple selector given the statement and array of
# substitution values.
#
# Returns a hash with the values +tag_name+, +attributes+,
# +pseudo+ (classes) and +negation+.
#
# Called the first time with +can_negate+ true to allow
# negation. Called a second time with false since negation
# cannot be negated.
def simple_selector(statement, values, can_negate = true)
tag_name = nil
attributes = []
pseudo = []
negation = []
# Element name. (Note that in negation, this can come at
# any order, but for simplicity we allow if only first).
statement.sub!(/^(\*|[[:alpha:]][\w\-]*)/) do |match|
match.strip!
tag_name = match.downcase unless match == "*"
@source << match
"" # Remove
end
# Get identifier, class, attribute name, pseudo or negation.
while true
# Element identifier.
next if statement.sub!(/^#(\?|[\w\-]+)/) do
id = $1
if id == "?"
id = values.shift
end
@source << "##{id}"
id = Regexp.new("^#{Regexp.escape(id.to_s)}$") unless id.is_a?(Regexp)
attributes << ["id", id]
"" # Remove
end
# Class name.
next if statement.sub!(/^\.([\w\-]+)/) do
class_name = $1
@source << ".#{class_name}"
class_name = Regexp.new("(^|\s)#{Regexp.escape(class_name)}($|\s)") unless class_name.is_a?(Regexp)
attributes << ["class", class_name]
"" # Remove
end
# Attribute value.
next if statement.sub!(/^\[\s*([[:alpha:]][\w\-:]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^*]"|[^\]]*)\s*\]/) do
name, equality, value = $1, $2, $3
if value == "?"
value = values.shift
else
# Handle single and double quotes.
value.strip!
if (value[0] == ?" || value[0] == ?') && value[0] == value[-1]
value = value[1..-2]
end
end
@source << "[#{name}#{equality}'#{value}']"
attributes << [name.downcase.strip, attribute_match(equality, value)]
"" # Remove
end
# Root element only.
next if statement.sub!(/^:root/) do
pseudo << lambda do |element|
element.parent.nil? || !element.parent.tag?
end
@source << ":root"
"" # Remove
end
# Nth-child including last and of-type.
next if statement.sub!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match|
reverse = $1 == "last-"
of_type = $2 == "of-type"
@source << ":nth-#{$1}#{$2}("
case $3
when "odd"
pseudo << nth_child(2, 1, of_type, reverse)
@source << "odd)"
when "even"
pseudo << nth_child(2, 2, of_type, reverse)
@source << "even)"
when /^(\d+|\?)$/ # b only
b = ($1 == "?" ? values.shift : $1).to_i
pseudo << nth_child(0, b, of_type, reverse)
@source << "#{b})"
when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
a = ($1 == "?" ? values.shift :
$1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
b = ($2 == "?" ? values.shift : $2).to_i
pseudo << nth_child(a, b, of_type, reverse)
@source << (b >= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
else
raise ArgumentError, "Invalid nth-child #{match}"
end
"" # Remove
end
# First/last child (of type).
next if statement.sub!(/^:(first|last)-(child|of-type)/) do
reverse = $1 == "last"
of_type = $2 == "of-type"
pseudo << nth_child(0, 1, of_type, reverse)
@source << ":#{$1}-#{$2}"
"" # Remove
end
# Only child (of type).
next if statement.sub!(/^:only-(child|of-type)/) do
of_type = $1 == "of-type"
pseudo << only_child(of_type)
@source << ":only-#{$1}"
"" # Remove
end
# Empty: no child elements or meaningful content (whitespaces
# are ignored).
next if statement.sub!(/^:empty/) do
pseudo << lambda do |element|
empty = true
for child in element.children
if child.tag? || !child.content.strip.empty?
empty = false
break
end
end
empty
end
@source << ":empty"
"" # Remove
end
# Content: match the text content of the element, stripping
# leading and trailing spaces.
next if statement.sub!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do
content = $1
if content == "?"
content = values.shift
elsif (content[0] == ?" || content[0] == ?') && content[0] == content[-1]
content = content[1..-2]
end
@source << ":content('#{content}')"
content = Regexp.new("^#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
pseudo << lambda do |element|
text = ""
for child in element.children
unless child.tag?
text << child.content
end
end
text.strip =~ content
end
"" # Remove
end
# Negation. Create another simple selector to handle it.
if statement.sub!(/^:not\(\s*/, "")
raise ArgumentError, "Double negatives are not missing feature" unless can_negate
@source << ":not("
negation << simple_selector(statement, values, false)
raise ArgumentError, "Negation not closed" unless statement.sub!(/^\s*\)/, "")
@source << ")"
next
end
# No match: moving on.
break
end
# Return hash. The keys are mapped to instance variables.
{:tag_name=>tag_name, :attributes=>attributes, :pseudo=>pseudo, :negation=>negation}
end
# Create a regular expression to match an attribute value based
# on the equality operator (=, ^=, |=, etc).
def attribute_match(equality, value)
regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
case equality
when "=" then
# Match the attribute value in full
Regexp.new("^#{regexp}$")
when "~=" then
# Match a space-separated word within the attribute value
Regexp.new("(^|\s)#{regexp}($|\s)")
when "^="
# Match the beginning of the attribute value
Regexp.new("^#{regexp}")
when "$="
# Match the end of the attribute value
Regexp.new("#{regexp}$")
when "*="
# Match substring of the attribute value
regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
when "|=" then
# Match the first space-separated item of the attribute value
Regexp.new("^#{regexp}($|\s)")
else
raise InvalidSelectorError, "Invalid operation/value" unless value.empty?
# Match all attributes values (existence check)
//
end
end
# Returns a lambda that can match an element against the nth-child
# pseudo class, given the following arguments:
# * +a+ -- Value of a part.
# * +b+ -- Value of b part.
# * +of_type+ -- True to test only elements of this type (of-type).
# * +reverse+ -- True to count in reverse order (last-).
def nth_child(a, b, of_type, reverse)
# a = 0 means select at index b, if b = 0 nothing selected
return lambda { |element| false } if a == 0 && b == 0
# a < 0 and b < 0 will never match against an index
return lambda { |element| false } if a < 0 && b < 0
b = a + b + 1 if b < 0 # b < 0 just picks last element from each group
b -= 1 unless b == 0 # b == 0 is same as b == 1, otherwise zero based
lambda do |element|
# Element must be inside parent element.
return false unless element.parent && element.parent.tag?
index = 0
# Get siblings, reverse if counting from last.
siblings = element.parent.children
siblings = siblings.reverse if reverse
# Match element name if of-type, otherwise ignore name.
name = of_type ? element.name : nil
found = false
for child in siblings
# Skip text nodes/comments.
if child.tag? && (name == nil || child.name == name)
if a == 0
# Shortcut when a == 0 no need to go past count
if index == b
found = child.equal?(element)
break
end
elsif a < 0
# Only look for first b elements
break if index > b
if child.equal?(element)
found = (index % a) == 0
break
end
else
# Otherwise, break if child found and count == an+b
if child.equal?(element)
found = (index % a) == b
break
end
end
index += 1
end
end
found
end
end
# Creates a only child lambda. Pass +of-type+ to only look at
# elements of its type.
def only_child(of_type)
lambda do |element|
# Element must be inside parent element.
return false unless element.parent && element.parent.tag?
name = of_type ? element.name : nil
other = false
for child in element.parent.children
# Skip text nodes/comments.
if child.tag? && (name == nil || child.name == name)
unless child.equal?(element)
other = true
break
end
end
end
!other
end
end
# Called to create a dependent selector (sibling, descendant, etc).
# Passes the remainder of the statement that will be reduced to zero
# eventually, and array of substitution values.
#
# This method is called from four places, so it helps to put it here
# for reuse. The only logic deals with the need to detect comma
# separators (alternate) and apply them to the selector group of the
# top selector.
def next_selector(statement, values)
second = Selector.new(statement, values)
# If there are alternate selectors, we group them in the top selector.
if alternates = second.instance_variable_get(:@alternates)
second.instance_variable_set(:@alternates, [])
@alternates.concat alternates
end
second
end
end
# See HTML::Selector.new
def self.selector(statement, *values)
Selector.new(statement, *values)
end
class Tag
def select(selector, *values)
selector = HTML::Selector.new(selector, values)
selector.select(self)
end
end
end

View file

@ -1,107 +0,0 @@
require 'strscan'
module HTML #:nodoc:
# A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
# token is a string. Each string represents either "text", or an HTML element.
#
# This currently assumes valid XHTML, which means no free < or > characters.
#
# Usage:
#
# tokenizer = HTML::Tokenizer.new(text)
# while token = tokenizer.next
# p token
# end
class Tokenizer #:nodoc:
# The current (byte) position in the text
attr_reader :position
# The current line number
attr_reader :line
# Create a new Tokenizer for the given text.
def initialize(text)
text.encode!
@scanner = StringScanner.new(text)
@position = 0
@line = 0
@current_line = 1
end
# Returns the next token in the sequence, or +nil+ if there are no more tokens in
# the stream.
def next
return nil if @scanner.eos?
@position = @scanner.pos
@line = @current_line
if @scanner.check(/<\S/)
update_current_line(scan_tag)
else
update_current_line(scan_text)
end
end
private
# Treat the text at the current position as a tag, and scan it. Supports
# comments, doctype tags, and regular tags, and ignores less-than and
# greater-than characters within quoted strings.
def scan_tag
tag = @scanner.getch
if @scanner.scan(/!--/) # comment
tag << @scanner.matched
tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
elsif @scanner.scan(/!\[CDATA\[/)
tag << @scanner.matched
tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
elsif @scanner.scan(/!/) # doctype
tag << @scanner.matched
tag << consume_quoted_regions
else
tag << consume_quoted_regions
end
tag
end
# Scan all text up to the next < character and return it.
def scan_text
"#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
end
# Counts the number of newlines in the text and updates the current line
# accordingly.
def update_current_line(text)
text.scan(/\r?\n/) { @current_line += 1 }
end
# Skips over quoted strings, so that less-than and greater-than characters
# within the strings are ignored.
def consume_quoted_regions
text = ""
loop do
match = @scanner.scan_until(/['"<>]/) or break
delim = @scanner.matched
if delim == "<"
match = match.chop
@scanner.pos -= 1
end
text << match
break if delim == "<" || delim == ">"
# consume the quoted region
while match = @scanner.scan_until(/[\\#{delim}]/)
text << match
break if @scanner.matched == delim
break if @scanner.eos?
text << @scanner.getch # skip the escaped character
end
end
text
end
end
end

View file

@ -1,11 +0,0 @@
module HTML #:nodoc:
module Version #:nodoc:
MAJOR = 0
MINOR = 5
TINY = 3
STRING = [ MAJOR, MINOR, TINY ].join(".")
end
end

View file

@ -1,16 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner/html/node'
class CDATANodeTest < ActiveSupport::TestCase
def setup
@node = HTML::CDATA.new(nil, 0, 0, "<p>howdy</p>")
end
def test_to_s
assert_equal "<![CDATA[<p>howdy</p>]]>", @node.to_s
end
def test_content
assert_equal "<p>howdy</p>", @node.content
end
end

View file

@ -1,149 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner'
class DocumentTest < ActiveSupport::TestCase
def test_handle_doctype
doc = nil
assert_nothing_raised do
doc = HTML::Document.new <<-HTML.strip
<!DOCTYPE "blah" "blah" "blah">
<html>
</html>
HTML
end
assert_equal 3, doc.root.children.length
assert_equal %{<!DOCTYPE "blah" "blah" "blah">}, doc.root.children[0].content
assert_match %r{\s+}m, doc.root.children[1].content
assert_equal "html", doc.root.children[2].name
end
def test_find_img
doc = HTML::Document.new <<-HTML.strip
<html>
<body>
<p><img src="hello.gif"></p>
</body>
</html>
HTML
assert doc.find(:tag=>"img", :attributes=>{"src"=>"hello.gif"})
end
def test_find_all
doc = HTML::Document.new <<-HTML.strip
<html>
<body>
<p class="test"><img src="hello.gif"></p>
<div class="foo">
<p class="test">something</p>
<p>here is <em class="test">more</em></p>
</div>
</body>
</html>
HTML
all = doc.find_all :attributes => { :class => "test" }
assert_equal 3, all.length
assert_equal [ "p", "p", "em" ], all.map { |n| n.name }
end
def test_find_with_text
doc = HTML::Document.new <<-HTML.strip
<html>
<body>
<p>Some text</p>
</body>
</html>
HTML
assert doc.find(:content => "Some text")
assert doc.find(:tag => "p", :child => { :content => "Some text" })
assert doc.find(:tag => "p", :child => "Some text")
assert doc.find(:tag => "p", :content => "Some text")
end
def test_parse_xml
assert_nothing_raised { HTML::Document.new("<tags><tag/></tags>", true, true) }
assert_nothing_raised { HTML::Document.new("<outer><link>something</link></outer>", true, true) }
end
def test_parse_document
doc = HTML::Document.new(<<-HTML)
<div>
<h2>blah</h2>
<table>
</table>
</div>
HTML
assert_not_nil doc.find(:tag => "div", :children => { :count => 1, :only => { :tag => "table" } })
end
def test_tag_nesting_nothing_to_s
doc = HTML::Document.new("<tag></tag>")
assert_equal "<tag></tag>", doc.root.to_s
end
def test_tag_nesting_space_to_s
doc = HTML::Document.new("<tag> </tag>")
assert_equal "<tag> </tag>", doc.root.to_s
end
def test_tag_nesting_text_to_s
doc = HTML::Document.new("<tag>text</tag>")
assert_equal "<tag>text</tag>", doc.root.to_s
end
def test_tag_nesting_tag_to_s
doc = HTML::Document.new("<tag><nested /></tag>")
assert_equal "<tag><nested /></tag>", doc.root.to_s
end
def test_parse_cdata
doc = HTML::Document.new(<<-HTML)
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<title><![CDATA[<br>]]></title>
</head>
<body>
<p>this document has &lt;br&gt; for a title</p>
</body>
</html>
HTML
assert_nil doc.find(:tag => "title", :descendant => { :tag => "br" })
assert doc.find(:tag => "title", :child => "<br>")
end
def test_find_empty_tag
doc = HTML::Document.new("<div id='map'></div>")
assert_nil doc.find(:tag => "div", :attributes => { :id => "map" }, :content => /./)
assert doc.find(:tag => "div", :attributes => { :id => "map" }, :content => /\A\Z/)
assert doc.find(:tag => "div", :attributes => { :id => "map" }, :content => /^$/)
assert doc.find(:tag => "div", :attributes => { :id => "map" }, :content => "")
assert doc.find(:tag => "div", :attributes => { :id => "map" }, :content => nil)
end
def test_parse_invalid_document
assert_nothing_raised do
HTML::Document.new("<html>
<table>
<tr>
<td style=\"color: #FFFFFF; height: 17px; onclick=\"window.location.href='http://www.rmeinc.com/about_rme.aspx'\" style=\"cursor:pointer; height: 17px;\"; nowrap onclick=\"window.location.href='http://www.rmeinc.com/about_rme.aspx'\" onmouseout=\"this.bgColor='#0066cc'; this.style.color='#FFFFFF'\" onmouseover=\"this.bgColor='#ffffff'; this.style.color='#0033cc'\">About Us</td>
</tr>
</table>
</html>")
end
end
def test_invalid_document_raises_exception_when_strict
assert_raise RuntimeError do
HTML::Document.new("<html>
<table>
<tr>
<td style=\"color: #FFFFFF; height: 17px; onclick=\"window.location.href='http://www.rmeinc.com/about_rme.aspx'\" style=\"cursor:pointer; height: 17px;\"; nowrap onclick=\"window.location.href='http://www.rmeinc.com/about_rme.aspx'\" onmouseout=\"this.bgColor='#0066cc'; this.style.color='#FFFFFF'\" onmouseover=\"this.bgColor='#ffffff'; this.style.color='#0033cc'\">About Us</td>
</tr>
</table>
</html>", true)
end
end
end

View file

@ -1,90 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner/html/node'
class NodeTest < ActiveSupport::TestCase
class MockNode
def initialize(matched, value)
@matched = matched
@value = value
end
def find(conditions)
@matched && self
end
def to_s
@value.to_s
end
end
def setup
@node = HTML::Node.new("parent")
@node.children.concat [MockNode.new(false,1), MockNode.new(true,"two"), MockNode.new(false,:three)]
end
def test_match
assert !@node.match("foo")
end
def test_tag
assert !@node.tag?
end
def test_to_s
assert_equal "1twothree", @node.to_s
end
def test_find
assert_equal "two", @node.find('blah').to_s
end
def test_parse_strict
s = "<b foo='hello'' bar='baz'>"
assert_raise(RuntimeError) { HTML::Node.parse(nil,0,0,s) }
end
def test_parse_relaxed
s = "<b foo='hello'' bar='baz'>"
node = nil
assert_nothing_raised { node = HTML::Node.parse(nil,0,0,s,false) }
assert node.attributes.has_key?("foo")
assert !node.attributes.has_key?("bar")
end
def test_to_s_with_boolean_attrs
s = "<b foo bar>"
node = HTML::Node.parse(nil,0,0,s)
assert node.attributes.has_key?("foo")
assert node.attributes.has_key?("bar")
assert "<b foo bar>", node.to_s
end
def test_parse_with_unclosed_tag
s = "<span onmouseover='bang'"
node = nil
assert_nothing_raised { node = HTML::Node.parse(nil,0,0,s,false) }
assert node.attributes.has_key?("onmouseover")
end
def test_parse_with_valid_cdata_section
s = "<![CDATA[<span>contents</span>]]>"
node = nil
assert_nothing_raised { node = HTML::Node.parse(nil,0,0,s,false) }
assert_kind_of HTML::CDATA, node
assert_equal '<span>contents</span>', node.content
end
def test_parse_strict_with_unterminated_cdata_section
s = "<![CDATA[neverending..."
assert_raise(RuntimeError) { HTML::Node.parse(nil,0,0,s) }
end
def test_parse_relaxed_with_unterminated_cdata_section
s = "<![CDATA[neverending..."
node = nil
assert_nothing_raised { node = HTML::Node.parse(nil,0,0,s,false) }
assert_kind_of HTML::CDATA, node
assert_equal 'neverending...', node.content
end
end

View file

@ -1,244 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner/html/node'
class TagNodeTest < ActiveSupport::TestCase
def test_open_without_attributes
node = tag("<tag>")
assert_equal "tag", node.name
assert_equal Hash.new, node.attributes
assert_nil node.closing
end
def test_open_with_attributes
node = tag("<TAG1 foo=hey_ho x:bar=\"blah blah\" BAZ='blah blah blah' >")
assert_equal "tag1", node.name
assert_equal "hey_ho", node["foo"]
assert_equal "blah blah", node["x:bar"]
assert_equal "blah blah blah", node["baz"]
end
def test_self_closing_without_attributes
node = tag("<tag/>")
assert_equal "tag", node.name
assert_equal Hash.new, node.attributes
assert_equal :self, node.closing
end
def test_self_closing_with_attributes
node = tag("<tag a=b/>")
assert_equal "tag", node.name
assert_equal( { "a" => "b" }, node.attributes )
assert_equal :self, node.closing
end
def test_closing_without_attributes
node = tag("</tag>")
assert_equal "tag", node.name
assert_nil node.attributes
assert_equal :close, node.closing
end
def test_bracket_op_when_no_attributes
node = tag("</tag>")
assert_nil node["foo"]
end
def test_bracket_op_when_attributes
node = tag("<tag a=b/>")
assert_equal "b", node["a"]
end
def test_attributes_with_escaped_quotes
node = tag("<tag a='b\\'c' b=\"bob \\\"float\\\"\">")
assert_equal "b\\'c", node["a"]
assert_equal "bob \\\"float\\\"", node["b"]
end
def test_to_s
node = tag("<a b=c d='f' g=\"h 'i'\" />")
node = node.to_s
assert node.include?('a')
assert node.include?('b="c"')
assert node.include?('d="f"')
assert node.include?('g="h')
assert node.include?('i')
end
def test_tag
assert tag("<tag>").tag?
end
def test_match_tag_as_string
assert tag("<tag>").match(:tag => "tag")
assert !tag("<tag>").match(:tag => "b")
end
def test_match_tag_as_regexp
assert tag("<tag>").match(:tag => /t.g/)
assert !tag("<tag>").match(:tag => /t[bqs]g/)
end
def test_match_attributes_as_string
t = tag("<tag a=something b=else />")
assert t.match(:attributes => {"a" => "something"})
assert t.match(:attributes => {"b" => "else"})
end
def test_match_attributes_as_regexp
t = tag("<tag a=something b=else />")
assert t.match(:attributes => {"a" => /^something$/})
assert t.match(:attributes => {"b" => /e.*e/})
assert t.match(:attributes => {"a" => /me..i/, "b" => /.ls.$/})
end
def test_match_attributes_as_number
t = tag("<tag a=15 b=3.1415 />")
assert t.match(:attributes => {"a" => 15})
assert t.match(:attributes => {"b" => 3.1415})
assert t.match(:attributes => {"a" => 15, "b" => 3.1415})
end
def test_match_attributes_exist
t = tag("<tag a=15 b=3.1415 />")
assert t.match(:attributes => {"a" => true})
assert t.match(:attributes => {"b" => true})
assert t.match(:attributes => {"a" => true, "b" => true})
end
def test_match_attributes_not_exist
t = tag("<tag a=15 b=3.1415 />")
assert t.match(:attributes => {"c" => false})
assert t.match(:attributes => {"c" => nil})
assert t.match(:attributes => {"a" => true, "c" => false})
end
def test_match_parent_success
t = tag("<tag a=15 b='hello'>", tag("<foo k='value'>"))
assert t.match(:parent => {:tag => "foo", :attributes => {"k" => /v.l/, "j" => false}})
end
def test_match_parent_fail
t = tag("<tag a=15 b='hello'>", tag("<foo k='value'>"))
assert !t.match(:parent => {:tag => /kafka/})
end
def test_match_child_success
t = tag("<tag x:k='something'>")
tag("<child v=john a=kelly>", t)
tag("<sib m=vaughn v=james>", t)
assert t.match(:child => { :tag => "sib", :attributes => {"v" => /j/}})
assert t.match(:child => { :attributes => {"a" => "kelly"}})
end
def test_match_child_fail
t = tag("<tag x:k='something'>")
tag("<child v=john a=kelly>", t)
tag("<sib m=vaughn v=james>", t)
assert !t.match(:child => { :tag => "sib", :attributes => {"v" => /r/}})
assert !t.match(:child => { :attributes => {"v" => false}})
end
def test_match_ancestor_success
t = tag("<tag x:k='something'>", tag("<parent v=john a=kelly>", tag("<grandparent m=vaughn v=james>")))
assert t.match(:ancestor => {:tag => "parent", :attributes => {"a" => /ll/}})
assert t.match(:ancestor => {:attributes => {"m" => "vaughn"}})
end
def test_match_ancestor_fail
t = tag("<tag x:k='something'>", tag("<parent v=john a=kelly>", tag("<grandparent m=vaughn v=james>")))
assert !t.match(:ancestor => {:tag => /^parent/, :attributes => {"v" => /m/}})
assert !t.match(:ancestor => {:attributes => {"v" => false}})
end
def test_match_descendant_success
tag("<grandchild m=vaughn v=james>", tag("<child v=john a=kelly>", t = tag("<tag x:k='something'>")))
assert t.match(:descendant => {:tag => "child", :attributes => {"a" => /ll/}})
assert t.match(:descendant => {:attributes => {"m" => "vaughn"}})
end
def test_match_descendant_fail
tag("<grandchild m=vaughn v=james>", tag("<child v=john a=kelly>", t = tag("<tag x:k='something'>")))
assert !t.match(:descendant => {:tag => /^child/, :attributes => {"v" => /m/}})
assert !t.match(:descendant => {:attributes => {"v" => false}})
end
def test_match_child_count
t = tag("<tag x:k='something'>")
tag("hello", t)
tag("<child v=john a=kelly>", t)
tag("<sib m=vaughn v=james>", t)
assert t.match(:children => { :count => 2 })
assert t.match(:children => { :count => 2..4 })
assert t.match(:children => { :less_than => 4 })
assert t.match(:children => { :greater_than => 1 })
assert !t.match(:children => { :count => 3 })
end
def test_conditions_as_strings
t = tag("<tag x:k='something'>")
assert t.match("tag" => "tag")
assert t.match("attributes" => { "x:k" => "something" })
assert !t.match("tag" => "gat")
assert !t.match("attributes" => { "x:j" => "something" })
end
def test_attributes_as_symbols
t = tag("<child v=john a=kelly>")
assert t.match(:attributes => { :v => /oh/ })
assert t.match(:attributes => { :a => /ll/ })
end
def test_match_sibling
t = tag("<tag x:k='something'>")
tag("hello", t)
tag("<span a=b>", t)
tag("world", t)
m = tag("<span k=r>", t)
tag("<span m=l>", t)
assert m.match(:sibling => {:tag => "span", :attributes => {:a => true}})
assert m.match(:sibling => {:tag => "span", :attributes => {:m => true}})
assert !m.match(:sibling => {:tag => "span", :attributes => {:k => true}})
end
def test_match_sibling_before
t = tag("<tag x:k='something'>")
tag("hello", t)
tag("<span a=b>", t)
tag("world", t)
m = tag("<span k=r>", t)
tag("<span m=l>", t)
assert m.match(:before => {:tag => "span", :attributes => {:m => true}})
assert !m.match(:before => {:tag => "span", :attributes => {:a => true}})
assert !m.match(:before => {:tag => "span", :attributes => {:k => true}})
end
def test_match_sibling_after
t = tag("<tag x:k='something'>")
tag("hello", t)
tag("<span a=b>", t)
tag("world", t)
m = tag("<span k=r>", t)
tag("<span m=l>", t)
assert m.match(:after => {:tag => "span", :attributes => {:a => true}})
assert !m.match(:after => {:tag => "span", :attributes => {:m => true}})
assert !m.match(:after => {:tag => "span", :attributes => {:k => true}})
end
def test_tag_to_s
t = tag("<b x='foo'>")
tag("hello", t)
tag("<hr />", t)
assert_equal %(<b x="foo">hello<hr /></b>), t.to_s
end
private
def tag(content, parent=nil)
node = HTML::Node.parse(parent,0,0,content)
parent.children << node if parent
node
end
end

View file

@ -1,51 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner/html/node'
class TextNodeTest < ActiveSupport::TestCase
def setup
@node = HTML::Text.new(nil, 0, 0, "hello, howdy, aloha, annyeong")
end
def test_to_s
assert_equal "hello, howdy, aloha, annyeong", @node.to_s
end
def test_find_string
assert_equal @node, @node.find("hello, howdy, aloha, annyeong")
assert_equal false, @node.find("bogus")
end
def test_find_regexp
assert_equal @node, @node.find(/an+y/)
assert_nil @node.find(/b/)
end
def test_find_hash
assert_equal @node, @node.find(:content => /howdy/)
assert_nil @node.find(:content => /^howdy$/)
assert_equal false, @node.find(:content => "howdy")
end
def test_find_other
assert_nil @node.find(:hello)
end
def test_match_string
assert @node.match("hello, howdy, aloha, annyeong")
assert_equal false, @node.match("bogus")
end
def test_match_regexp
assert_not_nil @node, @node.match(/an+y/)
assert_nil @node.match(/b/)
end
def test_match_hash
assert_not_nil @node, @node.match(:content => "howdy")
assert_nil @node.match(:content => /^howdy$/)
end
def test_match_other
assert_nil @node.match(:hello)
end
end

View file

@ -1,132 +0,0 @@
require 'abstract_unit'
require 'action_view/vendor/html-scanner/html/tokenizer'
class TokenizerTest < ActiveSupport::TestCase
def test_blank
tokenize ""
assert_end
end
def test_space
tokenize " "
assert_next " "
assert_end
end
def test_tag_simple_open
tokenize "<tag>"
assert_next "<tag>"
assert_end
end
def test_tag_simple_self_closing
tokenize "<tag />"
assert_next "<tag />"
assert_end
end
def test_tag_simple_closing
tokenize "</tag>"
assert_next "</tag>"
end
def test_tag_with_single_quoted_attribute
tokenize %{<tag a='hello'>x}
assert_next %{<tag a='hello'>}
end
def test_tag_with_single_quoted_attribute_with_escape
tokenize %{<tag a='hello\\''>x}
assert_next %{<tag a='hello\\''>}
end
def test_tag_with_double_quoted_attribute
tokenize %{<tag a="hello">x}
assert_next %{<tag a="hello">}
end
def test_tag_with_double_quoted_attribute_with_escape
tokenize %{<tag a="hello\\"">x}
assert_next %{<tag a="hello\\"">}
end
def test_tag_with_unquoted_attribute
tokenize %{<tag a=hello>x}
assert_next %{<tag a=hello>}
end
def test_tag_with_lt_char_in_attribute
tokenize %{<tag a="x < y">x}
assert_next %{<tag a="x < y">}
end
def test_tag_with_gt_char_in_attribute
tokenize %{<tag a="x > y">x}
assert_next %{<tag a="x > y">}
end
def test_doctype_tag
tokenize %{<!DOCTYPE "blah" "blah" "blah">\n <html>}
assert_next %{<!DOCTYPE "blah" "blah" "blah">}
assert_next %{\n }
assert_next %{<html>}
end
def test_cdata_tag
tokenize %{<![CDATA[<br>]]>}
assert_next %{<![CDATA[<br>]]>}
assert_end
end
def test_unterminated_cdata_tag
tokenize %{<content:encoded><![CDATA[ neverending...}
assert_next %{<content:encoded>}
assert_next %{<![CDATA[ neverending...}
assert_end
end
def test_less_than_with_space
tokenize %{original < hello > world}
assert_next %{original }
assert_next %{< hello > world}
end
def test_less_than_without_matching_greater_than
tokenize %{hello <span onmouseover="gotcha"\n<b>foo</b>\nbar</span>}
assert_next %{hello }
assert_next %{<span onmouseover="gotcha"\n}
assert_next %{<b>}
assert_next %{foo}
assert_next %{</b>}
assert_next %{\nbar}
assert_next %{</span>}
assert_end
end
def test_unterminated_comment
tokenize %{hello <!-- neverending...}
assert_next %{hello }
assert_next %{<!-- neverending...}
assert_end
end
private
def tokenize(text)
@tokenizer = HTML::Tokenizer.new(text)
end
def assert_next(expected, message=nil)
token = @tokenizer.next
assert_equal expected, token, message
end
def assert_sequence(*expected)
assert_next expected.shift until expected.empty?
end
def assert_end(message=nil)
assert_nil @tokenizer.next, message
end
end