From f114b85d89cf98cf4a11731615df77e50901d0c1 Mon Sep 17 00:00:00 2001 From: ser Date: Fri, 1 Dec 2006 02:20:08 +0000 Subject: [PATCH] * Cross-patch from Ruby CVS; mostly Nabu edits. * Fixes ticket:68. ***** Note that this is an API change!!! ***** Entity declarations in the doctype now generate events that carry two arguments instead of one. * Implements ticket:15, using gwrite's suggestion. This allows Element to be subclassed. * Fixed namespace handling in XPath and Element. ***** Note that this is an API change!!! ***** Element.namespaces() now returns a hash of the namespace mappings that are relevant for that node. * Fixes a bug in multiple decodings * The changeset 1230:1231 was bad. The default behavior is *not* to use the native REXML encodings, but rather to use ICONV. I'll have to think of a better way of managing translations, but the REXML codecs are (a) less reliable than ICONV and, more importantly, (b) slower. The real solution is to use ICONV by default, but allow users to specify that they want to use the pure Ruby codecs. * Fixes ticket:61 (xpath_parser) * Fixes ticket:63 (UTF-16; UNILE decoding was bad) * Improves parsing error messages a little * Adds the ability to override the encoding detection in Source construction * Fixes an edge case in Functions::string, where document nodes weren't correctly converted * Fixes Functions::string() for Element and Document nodes * Fixes some problems in entity handling * Addresses ticket:66 * Fixes ticket:71 * Addresses ticket:78 NOTE that this also fixes what is technically another bug in REXML. REXML's XPath parser used to allow exponential notation in numbers. The XPath spec is specific about what a number is, and scientific notation is not included. Therefore, this has been fixed. 
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_8@11315 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/element.rb | 25 +++++++----------- lib/rexml/encoding.rb | 18 +++++++------ lib/rexml/encodings/UNILE.rb | 2 +- lib/rexml/encodings/UTF-16.rb | 3 ++- lib/rexml/functions.rb | 24 ++++++++++++++--- lib/rexml/node.rb | 6 ++--- lib/rexml/parsers/baseparser.rb | 4 --- lib/rexml/parsers/sax2parser.rb | 4 +++ lib/rexml/parsers/treeparser.rb | 3 ++- lib/rexml/rexml.rb | 8 +++--- lib/rexml/sax2listener.rb | 2 +- lib/rexml/source.rb | 25 +++++++++++++----- lib/rexml/text.rb | 47 +++++++++++++++++---------------- lib/rexml/xpath_parser.rb | 46 ++++++++++++++++++++++++++------ 14 files changed, 136 insertions(+), 81 deletions(-) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 435076420a..11e2039609 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -94,7 +94,7 @@ module REXML # new_a = d.root.clone # puts new_a # => "" def clone - Element.new self + self.class.new self end # Evaluates to the root node of the document that this element @@ -200,9 +200,9 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} namespaces = parent.namespaces if parent - namespaces |= attributes.namespaces + namespaces = namespaces.merge( attributes.namespaces ) return namespaces end @@ -494,13 +494,12 @@ module REXML # doc.root.add_element 'c' #-> 'Elliott' # doc.root.text = 'Russell' #-> 'Russell' # doc.root.text = nil #-> '' - def text=( text ) + def text=( text ) if text.kind_of? String text = Text.new( text, whitespace(), nil, raw() ) elsif text and !text.kind_of? Text text = Text.new( text.to_s, whitespace(), nil, raw() ) end - old_text = get_text if text.nil? old_text.remove unless old_text.nil? 
@@ -557,13 +556,9 @@ module REXML ################################################# def attribute( name, namespace=nil ) - prefix = '' - if namespace - prefix = attributes.prefixes.each { |prefix| - return "#{prefix}:" if namespace( prefix ) == namespace - } || '' - end - attributes.get_attribute( "#{prefix}#{name}" ) + prefix = nil + prefix = namespaces.index(namespace) if namespace + attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) end # Evaluates to +true+ if this element has any attributes set, false @@ -1172,16 +1167,16 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} each_attribute do |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end if @element.document and @element.document.doctype expn = @element.expanded_name expn = @element.document.doctype.name if expn.size == 0 @element.document.doctype.attributes_of(expn).each { |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } end namespaces diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index f003d6cc3b..e35c3acf7c 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -24,21 +24,22 @@ module REXML old_verbosity = $VERBOSE begin $VERBOSE = false - return if defined? @encoding and enc == @encoding + enc = enc.nil? ? nil : enc.upcase + return false if defined? 
@encoding and enc == @encoding if enc and enc != UTF_8 - @encoding = enc.upcase + @encoding = enc + raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ + @encoding.untaint begin require 'rexml/encodings/ICONV.rb' Encoding.apply(self, "ICONV") - rescue LoadError, Exception => err - raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ - @encoding.untaint - enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + rescue LoadError, Exception begin + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) require enc_file Encoding.apply(self, @encoding) - rescue LoadError - puts $!.message + rescue LoadError => err + puts err.message raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." end end @@ -50,6 +51,7 @@ module REXML ensure $VERBOSE = old_verbosity end + true end def check_encoding str diff --git a/lib/rexml/encodings/UNILE.rb b/lib/rexml/encodings/UNILE.rb index 0560a08361..d054140c40 100644 --- a/lib/rexml/encodings/UNILE.rb +++ b/lib/rexml/encodings/UNILE.rb @@ -18,7 +18,7 @@ module REXML def decode_unile(str) array_enc=str.unpack('C*') array_utf8 = [] - 2.step(array_enc.size-1, 2){|i| + 0.step(array_enc.size-1, 2){|i| array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100) } array_utf8.pack('U*') diff --git a/lib/rexml/encodings/UTF-16.rb b/lib/rexml/encodings/UTF-16.rb index 972169755e..792adfd44d 100644 --- a/lib/rexml/encodings/UTF-16.rb +++ b/lib/rexml/encodings/UTF-16.rb @@ -16,9 +16,10 @@ module REXML end def decode_utf16(str) + str = str[2..-1] if /^\376\377/ =~ str array_enc=str.unpack('C*') array_utf8 = [] - 2.step(array_enc.size-1, 2){|i| + 0.step(array_enc.size-1, 2){|i| array_utf8 << (array_enc.at(i+1) + array_enc.at(i)*0x100) } array_utf8.pack('U*') diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index d741dbdab7..cad4f6a8c9 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -117,16 +117,30 @@ module REXML 
elsif defined? object.node_type if object.node_type == :attribute object.value - elsif object.node_type == :element - object.text + elsif object.node_type == :element || object.node_type == :document + string_value(object) else object.to_s end + elsif object.nil? + return "" else object.to_s end end + def Functions::string_value( o ) + rv = "" + o.children.each { |e| + if e.node_type == :text + rv << e.to_s + elsif e.node_type == :element + rv << string_value( e ) + end + } + rv + end + # UNTESTED def Functions::concat( *objects ) objects.join @@ -139,7 +153,7 @@ module REXML # Fixed by Mike Stok def Functions::contains( string, test ) - string(string).include? string(test) + string(string).include?(string(test)) end # Kouhei fixed this @@ -326,7 +340,9 @@ module REXML else str = string( object ) #puts "STRING OF #{object.inspect} = #{str}" - if str =~ /^-?\.?\d/ + # If XPath ever gets scientific notation... + #if str =~ /^\s*-?(\d*\.?\d+|\d+\.)([Ee]\d*)?\s*$/ + if str =~ /^\s*-?(\d*\.?\d+|\d+\.)\s*$/ str.to_f else (0.0 / 0.0) diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index e5dec72a9d..7226e5be6c 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -55,10 +55,8 @@ module REXML return nil end - # Returns the index that +self+ has in its parent's elements array, so that - # the following equation holds true: - # - # node == node.parent.elements[node.index_in_parent] + # Returns the position that +self+ holds in its parent's array, indexed + # from 1. def index_in_parent parent.index(self)+1 end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c57ea58dc7..fecd801d6f 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -146,8 +146,6 @@ module REXML # Returns true if there are no more events def empty? - #STDERR.puts "@source.empty? = #{@source.empty?}" - #STDERR.puts "@stack.empty? = #{@stack.empty?}" return (@source.empty? and @stack.empty?) 
end @@ -365,8 +363,6 @@ module REXML else md = @source.match( TEXT_PATTERN, true ) if md[0].length == 0 - puts "EMPTY = #{empty?}" - puts "BUFFER = \"#{@source.buffer}\"" @source.match( /(\s+)/, true ) end #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 61a216cec1..6c7fbe000a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -16,6 +16,10 @@ module REXML @tag_stack = [] @entities = {} end + + def source + @parser.source + end def add_listener( listener ) @parser.add_listener( listener ) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 500a53f426..a53fa41925 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -23,7 +23,8 @@ module REXML case event[0] when :end_document unless tag_stack.empty? - raise ParseException.new("No close tag for #{tag_stack.inspect}") + #raise ParseException.new("No close tag for #{tag_stack.inspect}") + raise ParseException.new("No close tag for #{@build_context.xpath}") end return when :start_element diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 68759ab3f8..bff1cd9815 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -10,8 +10,8 @@ # # Main page:: http://www.germane-software.com/software/rexml # Author:: Sean Russell -# Version:: 3.1.5 -# Date:: 2006/250 +# Version:: 3.1.6 +# Date:: 2006/335 # # This API documentation can be downloaded from the REXML home page, or can # be accessed online[http://www.germane-software.com/software/rexml_doc] @@ -21,8 +21,8 @@ # online[http://www.germane-software.com/software/rexml/docs/tutorial.html] module REXML COPYRIGHT = "Copyright © 2001-2006 Sean Russell " - DATE = "2006/250" - VERSION = "3.1.5" + DATE = "2006/335" + VERSION = "3.1.6" Copyright = COPYRIGHT Version = VERSION diff --git a/lib/rexml/sax2listener.rb b/lib/rexml/sax2listener.rb index 9a992917e6..8db1389d06 100644 --- 
a/lib/rexml/sax2listener.rb +++ b/lib/rexml/sax2listener.rb @@ -70,7 +70,7 @@ module REXML # ["open-hatch", "PUBLIC", "\"-//Textuality//TEXT Standard open-hatch boilerplate//EN\"", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] # # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl content + def entitydecl name, decl end # def notationdecl content diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index c51f504811..2fee99c0e9 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -6,7 +6,7 @@ module REXML # Generates a Source object # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given - def SourceFactory::create_from arg#, slurp=true + def SourceFactory::create_from(arg) if arg.kind_of? String Source.new(arg) elsif arg.respond_to? :read and @@ -35,16 +35,23 @@ module REXML # Constructor # @param arg must be a String, and should be a valid XML document - def initialize(arg) + # @param encoding if non-null, sets the encoding of the source to this + # value, overriding all encoding detection + def initialize(arg, encoding=nil) @orig = @buffer = arg - self.encoding = check_encoding( @buffer ) + if encoding + self.encoding = encoding + else + self.encoding = check_encoding( @buffer ) + end @line = 0 end + # Inherited from Encoding # Overridden to support optimized en/decoding def encoding=(enc) - super + return unless super @line_break = encode( '>' ) if enc != UTF_8 @buffer = decode(@buffer) @@ -124,7 +131,7 @@ module REXML #attr_reader :block_size # block_size has been deprecated - def initialize(arg, block_size=500) + def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false # Determining the encoding is a deceptively difficult issue to resolve. @@ -134,10 +141,12 @@ module REXML # if there is one. If there isn't one, the file MUST be UTF-8, as per # the XML spec. 
If there is one, we can determine the encoding from # it. + @buffer = "" str = @source.read( 2 ) - if /\A(?:\xfe\xff|\xff\xfe)/n =~ str + if encoding + self.encoding = encoding + elsif /\A(?:\xfe\xff|\xff\xfe)/n =~ str self.encoding = check_encoding( str ) - @line_break = encode( '>' ) else @line_break = '>' end @@ -159,6 +168,8 @@ module REXML str = @source.readline(@line_break) str = decode(str) if @to_utf and str @buffer << str + rescue Iconv::IllegalSequence + raise rescue @source = nil end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 55bc9f50f8..3de9170623 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -42,6 +42,7 @@ module REXML # Use this field if you have entities defined for some text, and you don't # want REXML to escape that text in output. # Text.new( "<&", false, nil, false ) #-> "<&" + # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;" # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" @@ -172,17 +173,6 @@ module REXML end @unnormalized = Text::unnormalize( @string, doctype ) end - - def wrap(string, width, addnewline=false) - # Recursivly wrap string at width. - return string if string.length <= width - place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) - else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) - end - end # Sets the contents of this text node. This expects the text to be # unnormalized. It returns self. @@ -198,17 +188,28 @@ module REXML @raw = false end - def indent_text(string, level=1, style="\t", indentfirstline=true) - return string if level < 0 - new_string = '' - string.each { |line| - indent_string = style * level - new_line = (indent_string + line).sub(/[\s]+$/,'') - new_string << new_line - } - new_string.strip! 
unless indentfirstline - return new_string + def wrap(string, width, addnewline=false) + # Recursivly wrap string at width. + return string if string.length <= width + place = string.rindex(' ', width) # Position in string with last ' ' before cutoff + if addnewline then + return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + else + return string[0,place] + "\n" + wrap(string[place+1..-1], width) + end end + + def indent_text(string, level=1, style="\t", indentfirstline=true) + return string if level < 0 + new_string = '' + string.each { |line| + indent_string = style * level + new_line = (indent_string + line).sub(/[\s]+$/,'') + new_string << new_line + } + new_string.strip! unless indentfirstline + return new_string + end def write( writer, indent=-1, transitive=false, ie_hack=false ) s = to_s() @@ -286,9 +287,10 @@ module REXML def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input # Doing it like this rather than in a loop improves the speed + #copy = copy.gsub( EREFERENCE, '&' ) + copy = copy.gsub( "&", "&" ) if doctype # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, "&#{entity.name};" ) if entity.value and @@ -296,7 +298,6 @@ module REXML end else # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index a813236e10..3393113d6a 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -162,6 +162,10 @@ module REXML while path_stack.length > 0 #puts "Path stack = #{path_stack.inspect}" #puts "Nodeset is #{nodeset.inspect}" + if nodeset.length == 0 + path_stack.clear + return [] + end case (op = path_stack.shift) when :document nodeset = [ nodeset[0].root_node ] @@ -235,9 +239,11 @@ 
module REXML name = path_stack.shift for element in nodeset if element.node_type == :element - #puts element.name - attr = element.attribute( name, get_namespace(element, prefix) ) - new_nodeset << attr if attr + #puts "Element name = #{element.name}" + #puts "get_namespace( #{element.inspect}, #{prefix} ) = #{get_namespace(element, prefix)}" + attrib = element.attribute( name, get_namespace(element, prefix) ) + #puts "attrib = #{attrib.inspect}" + new_nodeset << attrib if attrib end end when :any @@ -299,8 +305,10 @@ module REXML #puts "Adding node #{node.inspect}" if result == (index+1) new_nodeset << node if result == (index+1) elsif result.instance_of? Array - #puts "Adding node #{node.inspect}" if result.size > 0 - new_nodeset << node if result.size > 0 + if result.size > 0 and result.inject(false) {|k,s| s or k} + #puts "Adding node #{node.inspect}" if result.size > 0 + new_nodeset << node if result.size > 0 + end else #puts "Adding node #{node.inspect}" if result new_nodeset << node if result @@ -381,9 +389,19 @@ module REXML node_types = ELEMENTS when :namespace - new_set = [] + new_nodeset = [] + prefix = path_stack.shift for node in nodeset - new_nodeset << node.namespace if node.node_type == :element or node.node_type == :attribute + if (node.node_type == :element or node.node_type == :attribute) + if (node.node_type == :element) + namespaces = node.namespaces + else + namespaces = node.element.namesapces + end + if (node.namespace == namespaces[prefix]) + new_nodeset << node + end + end end nodeset = new_nodeset @@ -404,6 +422,18 @@ module REXML #puts "RES => #{res.inspect}" return res + when :and + left = expr( path_stack.shift, nodeset.dup, context ) + #puts "LEFT => #{left.inspect} (#{left.class.name})" + if left == false || left.nil? 
|| !left.inject(false) {|a,b| a | b} + return [] + end + right = expr( path_stack.shift, nodeset.dup, context ) + #puts "RIGHT => #{right.inspect} (#{right.class.name})" + res = equality_relational_compare( left, op, right ) + #puts "RES => #{res.inspect}" + return res + when :div left = Functions::number(expr(path_stack.shift, nodeset, context)).to_f right = Functions::number(expr(path_stack.shift, nodeset, context)).to_f @@ -477,7 +507,7 @@ module REXML # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better # way of doing this, be my guest. This really sucks, but - # it took me three days to get it to work at all. + # it is a wonder it works at all. # ######################################################## def descendant_or_self( path_stack, nodeset )