From 7a07ba45a001475b257734cd1f46166c73f5519c Mon Sep 17 00:00:00 2001 From: ser Date: Fri, 2 Apr 2004 03:26:19 +0000 Subject: [PATCH] REXML changes backported from the 1.9 branch: * Minor source documentation changes * Changes to the pretty-printing code, including the addition of the word-wrap submission. * Bug fix for missing quotations in NOTATION DTD items * Bug fixes and improvements to whitespace handling in text nodes * Refactoring and bug fixes in encoding support * Minor speed optimizations in the core parser * Bug fixes in the SAX2 parser and the core parser * Copyright fixes * Version bump to REXML 3.0.0 * A change that caused speed degradation has been reversed * Addition of a value=() method in Text, for replacing the contents of a text node * Fixed the document order of the descendant-or-self axis in XPath git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_8@6071 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/comment.rb | 5 +++- lib/rexml/doctype.rb | 13 +++++++-- lib/rexml/document.rb | 7 +++-- lib/rexml/dtd/entitydecl.rb | 2 +- lib/rexml/dtd/notationdecl.rb | 2 +- lib/rexml/element.rb | 17 +++++------ lib/rexml/encodings/ISO-8859-1.rb | 4 +-- lib/rexml/encodings/SHIFT_JIS.rb | 34 +--------------------- lib/rexml/encodings/US-ASCII.rb | 4 +-- lib/rexml/node.rb | 7 ++++- lib/rexml/parsers/baseparser.rb | 17 ++++++----- lib/rexml/parsers/sax2parser.rb | 9 +++--- lib/rexml/rexml.rb | 6 ++-- lib/rexml/source.rb | 16 ++++------- lib/rexml/text.rb | 47 ++++++++++++++++++++++++++++++- lib/rexml/xpath_parser.rb | 42 +++++++++++++++++++-------- 16 files changed, 141 insertions(+), 91 deletions(-) diff --git a/lib/rexml/comment.rb b/lib/rexml/comment.rb index e439ddf9d8..7c3e79fe2a 100644 --- a/lib/rexml/comment.rb +++ b/lib/rexml/comment.rb @@ -39,7 +39,10 @@ module REXML # indentation will be this number of spaces, and children will be # indented an additional amount. # transitive:: - # Who knows?
+ # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. # ie_hack:: # Internet Explorer is the worst piece of crap to have ever been # written, with the possible exception of Windows itself. Since IE is diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index 084676afa9..b523155f8f 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -92,7 +92,10 @@ module REXML # indentation will be this number of spaces, and children will be # indented an additional amount. # transitive:: - # Who knows? + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. # ie_hack:: # Internet Explorer is the worst piece of crap to have ever been # written, with the possible exception of Windows itself. Since IE is @@ -109,7 +112,7 @@ module REXML output << " #@long_name" if @long_name output << " #@uri" if @uri unless @children.empty? - next_indent = indent + 2 + next_indent = indent + 1 output << ' [' child = nil # speed @children.each { |child| @@ -123,6 +126,10 @@ module REXML output << STOP end + def context + @parent.context + end + def entity( name ) @entities[name].unnormalized if @entities[name] end @@ -185,7 +192,7 @@ module REXML end def to_s - "" + "" end def write( output, indent=-1 ) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1eefaea92a..52500f2afd 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -145,7 +145,10 @@ module REXML # indentation will be this number of spaces, and children will be # indented an additional amount. Defaults to -1 # transitive:: - # What the heck does this do? 
Defaults to false + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. # ie_hack:: # Internet Explorer is the worst piece of crap to have ever been # written, with the possible exception of Windows itself. Since IE is @@ -191,7 +194,7 @@ module REXML build_context[-1] << event[1] else build_context.add( - Text.new( event[1], true, nil, true ) + Text.new( event[1], build_context.whitespace, nil, true ) ) unless ( event[1].strip.size==0 and build_context.ignore_whitespace_nodes diff --git a/lib/rexml/dtd/entitydecl.rb b/lib/rexml/dtd/entitydecl.rb index 164825570f..a5f1520f2b 100644 --- a/lib/rexml/dtd/entitydecl.rb +++ b/lib/rexml/dtd/entitydecl.rb @@ -42,7 +42,7 @@ module REXML end def write( output, indent ) - output << (' '*indent) if indent > 0 + indent( output, indent ) output << to_s end diff --git a/lib/rexml/dtd/notationdecl.rb b/lib/rexml/dtd/notationdecl.rb index d577ce0631..a47ff8f24b 100644 --- a/lib/rexml/dtd/notationdecl.rb +++ b/lib/rexml/dtd/notationdecl.rb @@ -25,7 +25,7 @@ module REXML end def write( output, indent ) - output << (' '*indent) if indent > 0 + indent( output, indent ) output << to_s end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index ffaeddbf54..b61d811141 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -98,8 +98,9 @@ module REXML # is the case if: # 1. Neither :+respect_whitespace+ nor :+compress_whitespace+ has any value # 2. The context has :+respect_whitespace+ set to :+all+ or - # an array containing the name of this element, and :+compress_whitespace+ - # isn't set to :+all+ or an array containing the name of this element. + # an array containing the name of this element, and + # :+compress_whitespace+ isn't set to :+all+ or an array containing the + # name of this element. 
# The evaluation is tested against +expanded_name+, and so is namespace # sensitive. def whitespace @@ -606,7 +607,9 @@ module REXML # indentation will be this number of spaces, and children will be # indented an additional amount. Defaults to -1 # transitive:: - # What the heck does this do? Defaults to false + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the parse tree of the document # ie_hack:: # Internet Explorer is the worst piece of crap to have ever been # written, with the possible exception of Windows itself. Since IE is @@ -632,7 +635,7 @@ module REXML else if transitive and indent>-1 and !@children[0].kind_of? Text writer << "\n" - indent writer, indent+2 + indent writer, indent+1 end writer << ">" write_children( writer, indent, transitive, ie_hack ) @@ -640,7 +643,7 @@ module REXML end if transitive and indent>-1 writer << "\n" - indent -= 2 if next_sibling.nil? + indent -= 1 if next_sibling.nil? indent(writer, indent) end writer << ">" @@ -661,12 +664,10 @@ module REXML # A private helper method def write_children( writer, indent, transitive, ie_hack ) cr = (indent < 0) ? '' : "\n" - #if size == 1 and @children[0].kind_of?(Text) - # self[0].write( writer, -1 ) if indent == -1 each { |child| child.write( writer, indent, transitive, ie_hack ) } else - next_indent = indent+2 + next_indent = indent+1 last_child=nil each { |child| unless child.kind_of? Text or last_child.kind_of? 
Text or transitive diff --git a/lib/rexml/encodings/ISO-8859-1.rb b/lib/rexml/encodings/ISO-8859-1.rb index 32ddfbc909..f4e4527c2d 100644 --- a/lib/rexml/encodings/ISO-8859-1.rb +++ b/lib/rexml/encodings/ISO-8859-1.rb @@ -1,6 +1,6 @@ module REXML module Encoding - @@__REXML_encoding_methods =<<-'EOL' + @@__REXML_encoding_methods = %q~ # Convert from UTF-8 def encode content array_utf8 = content.unpack('U*') @@ -20,6 +20,6 @@ module REXML def decode(str) str.unpack('C*').pack('U*') end - EOL + ~ end end diff --git a/lib/rexml/encodings/SHIFT_JIS.rb b/lib/rexml/encodings/SHIFT_JIS.rb index 27e4569403..e355704a7c 100644 --- a/lib/rexml/encodings/SHIFT_JIS.rb +++ b/lib/rexml/encodings/SHIFT_JIS.rb @@ -1,33 +1 @@ -begin - require 'uconv' - - module REXML - module Encoding - def to_shift_jis content - Uconv::u8tosjis(content) - end - - def from_shift_jis(str) - Uconv::sjistou8(str) - end - end - end -rescue LoadError - begin - require 'iconv' - module REXML - module Encoding - def from_shift_jis(str) - return Iconv::iconv("utf-8", "shift_jis", str).join('') - end - - def to_shift_jis content - return Iconv::iconv("shift_jis", "utf-8", content).join('') - end - end - end - rescue LoadError - raise "uconv or iconv is required for Japanese encoding support." 
- end - -end +require 'rexml/encodings/SHIFT-JIS' diff --git a/lib/rexml/encodings/US-ASCII.rb b/lib/rexml/encodings/US-ASCII.rb index 32ddfbc909..f4e4527c2d 100644 --- a/lib/rexml/encodings/US-ASCII.rb +++ b/lib/rexml/encodings/US-ASCII.rb @@ -1,6 +1,6 @@ module REXML module Encoding - @@__REXML_encoding_methods =<<-'EOL' + @@__REXML_encoding_methods = %q~ # Convert from UTF-8 def encode content array_utf8 = content.unpack('U*') @@ -20,6 +20,6 @@ module REXML def decode(str) str.unpack('C*').pack('U*') end - EOL + ~ end end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 41d9eee43b..5f414c03ef 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -25,7 +25,12 @@ module REXML end def indent to, ind - to << " "*ind unless ind<1 + if @parent and @parent.context and not @parent.context[:indentstyle].nil? then + indentstyle = @parent.context[:indentstyle] + else + indentstyle = ' ' + end + to << indentstyle*ind unless ind<1 end def parent? diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 057617d6e8..025d43db54 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -122,14 +122,14 @@ module REXML # Returns true if there are no more events def empty? - !has_next? + #puts "@source.empty? = #{@source.empty?}" + #puts "@stack.empty? = #{@stack.empty?}" + return (@source.empty? and @stack.empty?) end # Returns true if there are more events. Synonymous with !empty? def has_next? - return true if @closed - @source.read if @source.buffer.size==0 and !@source.empty? - (!@source.empty? and @source.buffer.strip.size>0) or @stack.size>0 or @closed + return !(@source.empty? and @stack.empty?) end # Push an event back on the head of the stream. 
This method @@ -329,9 +329,12 @@ module REXML end else md = @source.match( TEXT_PATTERN, true ) - #md = @source.match_to_consume( '<', TEXT_PATTERN ) - #@source.read - raise REXML::ParseException("no text to add") if md[0].length == 0 + if md[0].length == 0 + #puts "EMPTY = #{empty?}" + #puts "BUFFER = \"#{@source.buffer}\"" + @source.match( /(\s+)/, true ) + end + #return [ :text, "" ] if md[0].length == 0 # unnormalized = Text::unnormalize( md[1], self ) # return PullEvent.new( :text, md[1], unnormalized ) return [ :text, md[1] ] diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index aab87caae4..8c82cf8fc1 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -45,7 +45,7 @@ module REXML if args.size == 2 args[1].each { |match| @procs << [args[0], match, blok] } else - add( [args[0], /.*/, blok] ) + add( [args[0], nil, blok] ) end elsif args[0].kind_of? Array if args.size == 2 @@ -54,7 +54,7 @@ module REXML args[0].each { |match| add( [ :start_element, match, blok ] ) } end else - add([nil, /.*/, args[0]]) + add([nil, nil, args[0]]) end end @@ -164,9 +164,10 @@ module REXML def get_procs( symbol, name ) return nil if @procs.size == 0 @procs.find_all do |sym, match, block| + #puts sym.inspect+"=="+symbol.inspect+ "\t"+match.inspect+"=="+name.inspect+ "\t"+( (sym.nil? or symbol == sym) and ((name.nil? and match.nil?) or match.nil? or ( (name == match) or (match.kind_of? Regexp and name =~ match)))).to_s ( (sym.nil? or symbol == sym) and - (name.nil? or ( + ((name.nil? and match.nil?) or match.nil? or ( (name == match) or (match.kind_of? Regexp and name =~ match) ) @@ -179,7 +180,7 @@ module REXML @listeners.find_all do |sym, match, block| ( (sym.nil? or symbol == sym) and - (name.nil? or ( + ((name.nil? and match.nil?) or match.nil? or ( (name == match) or (match.kind_of? 
Regexp and name =~ match) ) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index b632188571..67b36a87f6 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -20,7 +20,7 @@ # be accessed online at http://www.germane-software.com/software/rexml_doc # A tutorial is available in docs/tutorial.html module REXML - Copyright = "Copyright #{Time.now.year} Sean Russell " - Date = "+2003/346" - Version = "2.7.3" + Copyright = "Copyright © 2001, 2002, 2003, 2004 Sean Russell " + Date = "+2004/088" + Version = "3.0.0" end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 2110e6db66..a524e483ef 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -31,7 +31,6 @@ module REXML def initialize(arg) @orig = @buffer = arg self.encoding = check_encoding( @buffer ) - #@buffer = decode(@buffer) unless @encoding == UTF_8 @line = 0 end @@ -96,7 +95,7 @@ module REXML # @return true if the Source is exhausted def empty? - @buffer.nil? + @buffer == "" end # @return the current line in the source @@ -113,17 +112,14 @@ module REXML class IOSource < Source #attr_reader :block_size + # block_size has been deprecated def initialize(arg, block_size=500) @er_source = @source = arg @to_utf = false - # READLINE OPT - # The following was commented out when IOSource started using readline - # to pull the data from the stream. - #@block_size = block_size - #super @source.read(@block_size) - @line_break = '>' - #super @source.readline( "\n" ) - super @source.readline( @line_break )+@source.read + # FIXME + # This is broken. If the user puts in enough carriage returns, this can fail + # to calculate the correct encoding. 
+ super @source.read( 100 ) @line_break = encode( '>' ) end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2494ad9e8a..2e54f9fa11 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -164,9 +164,54 @@ module REXML end @unnormalized = Text::unnormalize( @string, doctype ) end + + def wrap(string, width, addnewline=false) + # Recursivly wrap string at width. + return string if string.length <= width + place = string.rindex(' ', width) # Position in string with last ' ' before cutoff + if addnewline then + return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + else + return string[0,place] + "\n" + wrap(string[place+1..-1], width) + end + end + # Sets the contents of this text node. This expects the text to be + # unnormalized. It returns self. + # + # e = Element.new( "a" ) + # e.add_text( "foo" ) # foo + # e[0].value = "bar" # bar + # e[0].value = "" # <a> + def value=( val ) + @string = val.gsub( /\r\n?/, "\n" ) + @unnormalized = nil + @normalized = nil + @raw = false + end + + def indent(string, level=1, style="\t", indentfirstline=true) + return string if level < 0 + new_string = '' + string.each { |line| + indent_string = style * level + new_line = (indent_string + line).sub(/[\s]+$/,'') + new_string << new_line + } + new_string.strip! unless indentfirstline + return new_string + end + def write( writer, indent=-1, transitive=false, ie_hack=false ) - writer << to_s() + s = to_s() + if not (@parent and @parent.whitespace) then + s = wrap(s, 60, false) if @parent and @parent.context[:wordwrap] == :all + if @parent and not @parent.context[:indentstyle].nil? and indent > 0 and s.count("\n") > 0 + s = indent(s, indent, @parent.context[:indentstyle], false) + end + s.squeeze!(" \n\t") if @parent and !@parent.whitespace + end + writer << s end # Writes out text, substituting special characters beforehand. 
diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 9cd1e5d64c..377a51e885 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -29,7 +29,8 @@ module REXML def parse path, nodeset path_stack = @parser.parse( path ) - #puts "PARSE: #{path} => #{path_stack.inspect}" + #puts "PARSE: #{path} => #{path_stack.inspect}" + #puts "PARSE: nodeset = #{nodeset.collect{|x|x.to_s}.inspect}" match( path_stack, nodeset ) end @@ -46,7 +47,7 @@ module REXML def match( path_stack, nodeset ) while ( path_stack.size > 0 and nodeset.size > 0 ) - #puts "PARSE: #{path_stack.inspect} '#{nodeset.collect{|n|n.type}.inspect}'" + #puts "PARSE: #{path_stack.inspect} '#{nodeset.collect{|n|n.class}.inspect}'" nodeset = internal_parse( path_stack, nodeset ) #puts "NODESET: #{nodeset.size}" #puts "PATH_STACK: #{path_stack.inspect}" @@ -55,8 +56,9 @@ module REXML end def internal_parse path_stack, nodeset + #puts "INTERNAL_PARSE RETURNING WITH NO RESULTS" if nodeset.size == 0 or path_stack.size == 0 return nodeset if nodeset.size == 0 or path_stack.size == 0 - #puts "INTERNAL_PARSE: #{path_stack.inspect}, #{nodeset.collect{|n| n.type}.inspect}" + #puts "INTERNAL_PARSE: #{path_stack.inspect}, #{nodeset.collect{|n| n.class}.inspect}" case path_stack.shift when :document return [ nodeset[0].root.parent ] @@ -205,7 +207,7 @@ module REXML Functions::index = index+1 #puts "Node #{node} and index=#{index+1}" result = Predicate( predicate, node ) - #puts "Predicate returned #{result} (#{result.type}) for #{node.type}" + #puts "Predicate returned #{result} (#{result.class}) for #{node.class}" if result.kind_of? Numeric #puts "#{result} == #{index} => #{result == index}" new_nodeset << node if result == (index+1) @@ -285,6 +287,7 @@ module REXML end ########################################################## + # FIXME # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better # way of doing this, be my guest. 
This really sucks, but @@ -294,24 +297,39 @@ module REXML def descendant_or_self( path_stack, nodeset ) rs = [] d_o_s( path_stack, nodeset, rs ) - #puts "RS = #{rs.collect{|n|n.to_s}.inspect}" - rs.flatten.compact + #puts "RS = #{rs.collect{|n|n.to_s}.inspect}" + document_order(rs.flatten.compact) end def d_o_s( p, ns, r ) - #puts r.collect{|n|n.to_s}.inspect - #puts ns.collect{|n|n.to_s}.inspect nt = nil ns.each_index do |i| n = ns[i] x = match( p.clone, [ n ] ) - #puts "Got a match on #{p.inspect} for #{ns.collect{|n|n.to_s+"("+n.type.to_s+")"}.inspect}" nt = n.node_type - d_o_s( p, n.children, x ) if nt == :element or nt == :document - r[i,0] = [x] if x.size > 0 + d_o_s( p, n.children, x ) if nt == :element or nt == :document and n.children.size > 0 + r.concat(x) if x.size > 0 end end + + # Reorders an array of nodes so that they are in document order + # It tries to do this efficiently. + def document_order( array_of_nodes ) + new_arry = [] + array_of_nodes.each { |node| + node_idx = [] + np = node.node_type == :attribute ? node.element : node + while np.parent and np.parent.node_type == :element + node_idx << np.parent.children.index( np ) + np = np.parent + end + new_arry << [ node_idx.reverse.join, node ] + } + new_arry.sort{ |s1, s2| s1[0] <=> s2[0] }.collect{ |s| s[1] } + end + + def recurse( nodeset, &block ) for node in nodeset yield node @@ -324,7 +342,7 @@ module REXML def Predicate( predicate, node ) predicate = predicate.clone #puts "#"*20 - #puts "Predicate( #{predicate.inspect}, #{node.type} )" + #puts "Predicate( #{predicate.inspect}, #{node.class} )" results = [] case (predicate[0]) when :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq