diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index a5a58055b8..a169148f32 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -101,20 +101,20 @@ module REXML end @unnormalized = nil - @value = @normalized = Text::normalize( @value, doctype ) + @normalized = Text::normalize( @value, doctype ) end # Returns the UNNORMALIZED value of this attribute. That is, entities # have been expanded to their values def value - @unnormalized if @unnormalized + return @unnormalized if @unnormalized doctype = nil if @element doc = @element.document doctype = doc.doctype if doc end @normalized = nil - @value = @unnormalized = Text::unnormalize( @value, doctype ) + @unnormalized = Text::unnormalize( @value, doctype ) end # Returns a copy of this attribute diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index ffedac1b53..046012ba61 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -35,6 +35,10 @@ module REXML @string end + def value + @string + end + # Generates XML output of this object # # output:: diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index 652a04fce2..4a1ffb4336 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -6,55 +6,55 @@ require 'rexml/attlistdecl' require 'rexml/xmltokens' module REXML - # Represents an XML DOCTYPE declaration; that is, the contents of . DOCTYPES can be used to declare the DTD of a document, as well as - # being used to declare entities used in the document. - class DocType < Parent - include XMLTokens - START = "" - SYSTEM = "SYSTEM" - PUBLIC = "PUBLIC" - DEFAULT_ENTITIES = { - 'gt'=>EntityConst::GT, - 'lt'=>EntityConst::LT, - 'quot'=>EntityConst::QUOT, - "apos"=>EntityConst::APOS - } + # Represents an XML DOCTYPE declaration; that is, the contents of . DOCTYPES can be used to declare the DTD of a document, as well as + # being used to declare entities used in the document. + class DocType < Parent + include XMLTokens + START = "" + SYSTEM = "SYSTEM" + PUBLIC = "PUBLIC" + DEFAULT_ENTITIES = { + 'gt'=>EntityConst::GT, + 'lt'=>EntityConst::LT, + 'quot'=>EntityConst::QUOT, + "apos"=>EntityConst::APOS + } - # name is the name of the doctype - # external_id is the referenced DTD, if given - attr_reader :name, :external_id, :entities, :namespaces + # name is the name of the doctype + # external_id is the referenced DTD, if given + attr_reader :name, :external_id, :entities, :namespaces - # Constructor - # - # dt = DocType.new( 'foo', '-//I/Hate/External/IDs' ) - # # - # dt = DocType.new( doctype_to_clone ) - # # Incomplete. Shallow clone of doctype + # Constructor + # + # dt = DocType.new( 'foo', '-//I/Hate/External/IDs' ) + # # + # dt = DocType.new( doctype_to_clone ) + # # Incomplete. Shallow clone of doctype # # +Note+ that the constructor: # # Doctype.new( Source.new( "" ) ) # # is _deprecated_. Do not use it. It will probably disappear. - def initialize( first, parent=nil ) - @entities = DEFAULT_ENTITIES - @long_name = @uri = nil - if first.kind_of? String - super() - @name = first - @external_id = parent - elsif first.kind_of? DocType - super( parent ) - @name = first.name - @external_id = first.external_id - elsif first.kind_of? Array - super( parent ) - @name = first[0] - @external_id = first[1] - @long_name = first[2] - @uri = first[3] + def initialize( first, parent=nil ) + @entities = DEFAULT_ENTITIES + @long_name = @uri = nil + if first.kind_of? String + super() + @name = first + @external_id = parent + elsif first.kind_of? DocType + super( parent ) + @name = first.name + @external_id = first.external_id + elsif first.kind_of? Array + super( parent ) + @name = first[0] + @external_id = first[1] + @long_name = first[2] + @uri = first[3] elsif first.kind_of? Source super( parent ) parser = Parsers::BaseParser.new( first ) @@ -64,150 +64,215 @@ module REXML end else super() - end - end + end + end - def node_type - :doctype - end + def node_type + :doctype + end - def attributes_of element - rv = [] - each do |child| - child.each do |key,val| - rv << Attribute.new(key,val) - end if child.kind_of? AttlistDecl and child.element_name == element - end - rv - end + def attributes_of element + rv = [] + each do |child| + child.each do |key,val| + rv << Attribute.new(key,val) + end if child.kind_of? AttlistDecl and child.element_name == element + end + rv + end - def attribute_of element, attribute - att_decl = find do |child| - child.kind_of? AttlistDecl and - child.element_name == element and - child.include? attribute - end - return nil unless att_decl - att_decl[attribute] - end + def attribute_of element, attribute + att_decl = find do |child| + child.kind_of? AttlistDecl and + child.element_name == element and + child.include? attribute + end + return nil unless att_decl + att_decl[attribute] + end - def clone - DocType.new self - end + def clone + DocType.new self + end - # output:: - # Where to write the string - # indent:: - # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. - # transitive:: - # If transitive is true and indent is >= 0, then the output will be - # pretty-printed in such a way that the added whitespace does not affect - # the absolute *value* of the document -- that is, it leaves the value - # and number of Text nodes in the document unchanged. - # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. - # - def write( output, indent=0, transitive=false, ie_hack=false ) - indent( output, indent ) - output << START - output << ' ' - output << @name - output << " #@external_id" if @external_id - output << " #@long_name" if @long_name - output << " #@uri" if @uri - unless @children.empty? - next_indent = indent + 1 - output << ' [' - child = nil # speed - @children.each { |child| - output << "\n" - child.write( output, next_indent ) - } - output << "\n" - #output << ' '*next_indent - output << "]" - end - output << STOP - end + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. + # + def write( output, indent=0, transitive=false, ie_hack=false ) + indent( output, indent ) + output << START + output << ' ' + output << @name + output << " #@external_id" if @external_id + output << " #@long_name" if @long_name + output << " #@uri" if @uri + unless @children.empty? + next_indent = indent + 1 + output << ' [' + child = nil # speed + @children.each { |child| + output << "\n" + child.write( output, next_indent ) + } + #output << ' '*next_indent + output << "\n]" + end + output << STOP + end def context @parent.context end - def entity( name ) - @entities[name].unnormalized if @entities[name] - end + def entity( name ) + @entities[name].unnormalized if @entities[name] + end - def add child - super(child) - @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES - @entities[ child.name ] = child if child.kind_of? Entity - end - end + def add child + super(child) + @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES + @entities[ child.name ] = child if child.kind_of? Entity + end + + # This method retrieves the public identifier identifying the document's + # DTD. + # + # Method contributed by Henrik Martensson + def public + case @external_id + when "SYSTEM" + nil + when "PUBLIC" + strip_quotes(@long_name) + end + end + + # This method retrieves the system identifier identifying the document's DTD + # + # Method contributed by Henrik Martensson + def system + case @external_id + when "SYSTEM" + strip_quotes(@long_name) + when "PUBLIC" + @uri.kind_of?(String) ? strip_quotes(@uri) : nil + end + end + + # This method returns a list of notations that have been declared in the + # _internal_ DTD subset. Notations in the external DTD subset are not + # listed. + # + # Method contributed by Henrik Martensson + def notations + children().select {|node| node.kind_of?(REXML::NotationDecl)} + end + + # Retrieves a named notation. Only notations declared in the internal + # DTD subset can be retrieved. + # + # Method contributed by Henrik Martensson + def notation(name) + notations.find { |notation_decl| + notation_decl.name == name + } + end + + private + + # Method contributed by Henrik Martensson + def strip_quotes(quoted_string) + quoted_string =~ /^[\'\"].*[\´\"]$/ ? + quoted_string[1, quoted_string.length-2] : + quoted_string + end + end - # We don't really handle any of these since we're not a validating - # parser, so we can be pretty dumb about them. All we need to be able - # to do is spew them back out on a write() + # We don't really handle any of these since we're not a validating + # parser, so we can be pretty dumb about them. All we need to be able + # to do is spew them back out on a write() - # This is an abstract class. You never use this directly; it serves as a - # parent class for the specific declarations. - class Declaration < Child - def initialize src - super() - @string = src - end + # This is an abstract class. You never use this directly; it serves as a + # parent class for the specific declarations. + class Declaration < Child + def initialize src + super() + @string = src + end - def to_s - @string+'>' - end + def to_s + @string+'>' + end - def write( output, indent ) - output << (' '*indent) if indent > 0 - output << to_s - end - end - - public - class ElementDecl < Declaration - def initialize( src ) - super - end - end + def write( output, indent ) + output << (' '*indent) if indent > 0 + output << to_s + end + end + + public + class ElementDecl < Declaration + def initialize( src ) + super + end + end - class ExternalEntity < Child - def initialize( src ) - super() - @entity = src - end - def to_s - @entity - end - def write( output, indent ) - output << @entity - output << "\n" - end - end + class ExternalEntity < Child + def initialize( src ) + super() + @entity = src + end + def to_s + @entity + end + def write( output, indent ) + output << @entity + end + end - class NotationDecl < Child - def initialize name, middle, rest - @name = name - @middle = middle - @rest = rest - end + class NotationDecl < Child + attr_accessor :public, :system + def initialize name, middle, pub, sys + super(nil) + @name = name + @middle = middle + @public = pub + @system = sys + end - def to_s - "" - end + def to_s + "" + end - def write( output, indent=-1 ) - output << (' '*indent) if indent > 0 - output << to_s - end - end + def write( output, indent=-1 ) + output << (' '*indent) if indent > 0 + output << to_s + end + + # This method retrieves the name of the notation. + # + # Method contributed by Henrik Martensson + def name + @name + end + end end diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 8755e04de1..619a844257 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -16,166 +16,178 @@ module REXML # Document has a single child that can be accessed by root(). # Note that if you want to have an XML declaration written for a document # you create, you must add one; REXML documents do not write a default - # declaration for you. See |DECLARATION| and |write|. - class Document < Element - # A convenient default XML declaration. If you want an XML declaration, - # the easiest way to add one is mydoc << Document::DECLARATION + # declaration for you. See |DECLARATION| and |write|. + class Document < Element + # A convenient default XML declaration. If you want an XML declaration, + # the easiest way to add one is mydoc << Document::DECLARATION # +DEPRECATED+ # Use: mydoc << XMLDecl.default - DECLARATION = XMLDecl.default + DECLARATION = XMLDecl.default - # Constructor - # @param source if supplied, must be a Document, String, or IO. - # Documents have their context and Element attributes cloned. - # Strings are expected to be valid XML documents. IOs are expected - # to be sources of valid XML documents. - # @param context if supplied, contains the context of the document; - # this should be a Hash. - # NOTE that I'm not sure what the context is for; I cloned it out of - # the Electric XML API (in which it also seems to do nothing), and it - # is now legacy. It may do something, someday... it may disappear. - def initialize( source = nil, context = {} ) - super() - @context = context - return if source.nil? - if source.kind_of? Document - @context = source.context - super source - else - build( source ) - end - end + # Constructor + # @param source if supplied, must be a Document, String, or IO. + # Documents have their context and Element attributes cloned. + # Strings are expected to be valid XML documents. IOs are expected + # to be sources of valid XML documents. + # @param context if supplied, contains the context of the document; + # this should be a Hash. + # NOTE that I'm not sure what the context is for; I cloned it out of + # the Electric XML API (in which it also seems to do nothing), and it + # is now legacy. It may do something, someday... it may disappear. + def initialize( source = nil, context = {} ) + super() + @context = context + return if source.nil? + if source.kind_of? Document + @context = source.context + super source + else + build( source ) + end + end def node_type :document end - # Should be obvious - def clone - Document.new self - end + # Should be obvious + def clone + Document.new self + end - # According to the XML spec, a root node has no expanded name - def expanded_name - '' - #d = doc_type - #d ? d.name : "UNDEFINED" - end + # According to the XML spec, a root node has no expanded name + def expanded_name + '' + #d = doc_type + #d ? d.name : "UNDEFINED" + end - alias :name :expanded_name + alias :name :expanded_name - # We override this, because XMLDecls and DocTypes must go at the start - # of the document - def add( child ) - if child.kind_of? XMLDecl - @children.unshift child - elsif child.kind_of? DocType - if @children[0].kind_of? XMLDecl - @children[1,0] = child - else - @children.unshift child + # We override this, because XMLDecls and DocTypes must go at the start + # of the document + def add( child ) + if child.kind_of? XMLDecl + @children.unshift child + elsif child.kind_of? DocType + # Find first Element or DocType node and insert the decl right + # before it. If there is no such node, just insert the child at the + # end. If there is a child and it is an DocType, then replace it. + insert_before_index = 0 + @children.find { |x| + insert_before_index += 1 + x.kind_of?(Element) || x.kind_of?(DocType) + } + if @children[ insert_before_index ] # Not null = not end of list + if @children[ insert_before_index ].kind_of DocType + @children[ insert_before_index ] = child + else + @children[ index_before_index-1, 0 ] = child + end + else # Insert at end of list + @children[insert_before_index] = child end - child.parent = self - else - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end - end - alias :<< :add + child.parent = self + else + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + end + alias :<< :add - def add_element(arg=nil, arg2=nil) - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end + def add_element(arg=nil, arg2=nil) + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end - # @return the root Element of the document, or nil if this document - # has no children. - def root + # @return the root Element of the document, or nil if this document + # has no children. + def root elements[1] #self #@children.find { |item| item.kind_of? Element } - end + end - # @return the DocType child of the document, if one exists, - # and nil otherwise. - def doctype - @children.find { |item| item.kind_of? DocType } - end + # @return the DocType child of the document, if one exists, + # and nil otherwise. + def doctype + @children.find { |item| item.kind_of? DocType } + end - # @return the XMLDecl of this document; if no XMLDecl has been - # set, the default declaration is returned. - def xml_decl - rv = @children[0] + # @return the XMLDecl of this document; if no XMLDecl has been + # set, the default declaration is returned. + def xml_decl + rv = @children[0] return rv if rv.kind_of? XMLDecl rv = @children.unshift(XMLDecl.default)[0] - end + end - # @return the XMLDecl version of this document as a String. - # If no XMLDecl has been set, returns the default version. - def version - xml_decl().version - end + # @return the XMLDecl version of this document as a String. + # If no XMLDecl has been set, returns the default version. + def version + xml_decl().version + end - # @return the XMLDecl encoding of this document as a String. - # If no XMLDecl has been set, returns the default encoding. - def encoding - xml_decl().encoding - end + # @return the XMLDecl encoding of this document as a String. + # If no XMLDecl has been set, returns the default encoding. + def encoding + xml_decl().encoding + end - # @return the XMLDecl standalone value of this document as a String. - # If no XMLDecl has been set, returns the default setting. - def stand_alone? - xml_decl().stand_alone? - end + # @return the XMLDecl standalone value of this document as a String. + # If no XMLDecl has been set, returns the default setting. + def stand_alone? + xml_decl().stand_alone? + end - # Write the XML tree out, optionally with indent. This writes out the - # entire XML document, including XML declarations, doctype declarations, - # and processing instructions (if any are given). - # A controversial point is whether Document should always write the XML - # declaration () whether or not one is given by the - # user (or source document). REXML does not write one if one was not - # specified, because it adds unneccessary bandwidth to applications such - # as XML-RPC. - # - # - # output:: - # output an object which supports '<< string'; this is where the - # document will be written. - # indent:: - # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. Defaults to -1 - # transitive:: - # If transitive is true and indent is >= 0, then the output will be - # pretty-printed in such a way that the added whitespace does not affect - # the absolute *value* of the document -- that is, it leaves the value - # and number of Text nodes in the document unchanged. - # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. Defaults to false - def write( output=$stdout, indent_level=-1, transitive=false, ie_hack=false ) - output = Output.new( output, xml_decl.encoding ) if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) - @children.each { |node| - indent( output, indent_level ) if node.node_type == :element - if node.write( output, indent_level, transitive, ie_hack ) - output << "\n" unless indent_level<0 or node == @children[-1] + # Write the XML tree out, optionally with indent. This writes out the + # entire XML document, including XML declarations, doctype declarations, + # and processing instructions (if any are given). + # A controversial point is whether Document should always write the XML + # declaration () whether or not one is given by the + # user (or source document). REXML does not write one if one was not + # specified, because it adds unneccessary bandwidth to applications such + # as XML-RPC. + # + # + # output:: + # output an object which supports '<< string'; this is where the + # document will be written. + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. Defaults to -1 + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. Defaults to false + def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) + output = Output.new( output, xml_decl.encoding ) if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) + @children.each { |node| + indent( output, indent ) if node.node_type == :element + if node.write( output, indent, transitive, ie_hack ) + output << "\n" unless indent<0 or node == @children[-1] end - } - end + } + end - - def Document::parse_stream( source, listener ) - Parsers::StreamParser.new( source, listener ).parse - end + + def Document::parse_stream( source, listener ) + Parsers::StreamParser.new( source, listener ).parse + end - private - def build( source ) + private + def build( source ) Parsers::TreeParser.new( source, self ).parse - end - end + end + end end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 7f578ecb3d..80463d95b7 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -36,8 +36,6 @@ module REXML # If an Element, the object will be shallowly cloned; name, # attributes, and namespaces will be copied. Children will +not+ be # copied. - # If a Source, the source will be scanned and parsed for an Element, - # and all child elements will be recursively parsed as well. # parent:: # if supplied, must be a Parent, and will be used as # the parent of this object. @@ -223,7 +221,7 @@ module REXML # b.namespace("y") # -> '2' def namespace(prefix=nil) if prefix.nil? - prefix = self.prefix() + prefix = prefix() end if prefix == '' prefix = "xmlns" @@ -715,7 +713,7 @@ module REXML private def __to_xpath_helper node - rv = node.expanded_name + rv = node.expanded_name.clone if node.parent results = node.parent.find_all {|n| n.kind_of?(REXML::Element) and n.expanded_name == node.expanded_name @@ -1226,5 +1224,20 @@ module REXML rv.each{ |attr| attr.remove } return rv end + + # The +get_attribute_ns+ method retrieves a method by its namespace + # and name. Thus it is possible to reliably identify an attribute + # even if an XML processor has changed the prefix. + # + # Method contributed by Henrik Martensson + def get_attribute_ns(namespace, name) + each_attribute() { |attribute| + if name == attribute.name && + namespace == attribute.namespace() + return attribute + end + } + nil + end end end diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index edd3e80dfe..644957439e 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -1,58 +1,64 @@ # -*- mode: ruby; ruby-indent-level: 2; indent-tabs-mode: t; tab-width: 2 -*- vim: sw=2 ts=2 module REXML - module Encoding - @encoding_methods = {} - def self.register(enc, &block) - @encoding_methods[enc] = block - end - def self.apply(obj, enc) - @encoding_methods[enc][obj] - end - def self.encoding_method(enc) - @encoding_methods[enc] - end + module Encoding + @encoding_methods = {} + def self.register(enc, &block) + @encoding_methods[enc] = block + end + def self.apply(obj, enc) + @encoding_methods[enc][obj] + end + def self.encoding_method(enc) + @encoding_methods[enc] + end - # Native, default format is UTF-8, so it is declared here rather than in - # an encodings/ definition. - UTF_8 = 'UTF-8' - UTF_16 = 'UTF-16' - UNILE = 'UNILE' + # Native, default format is UTF-8, so it is declared here rather than in + # an encodings/ definition. + UTF_8 = 'UTF-8' + UTF_16 = 'UTF-16' + UNILE = 'UNILE' - # ID ---> Encoding name - attr_reader :encoding - def encoding=( enc ) - old_verbosity = $VERBOSE - begin - $VERBOSE = false - return if defined? @encoding and enc == @encoding - if enc - raise ArgumentError, "Bad encoding name #{enc}" unless /\A[\w-]+\z/n =~ enc - @encoding = enc.upcase.untaint - else - @encoding = UTF_8 - end - err = nil - [@encoding, "ICONV"].each do |enc| - begin - require File.join("rexml", "encodings", "#{enc}.rb") - return Encoding.apply(self, enc) - rescue LoadError, Exception => err - end - end - puts err.message - raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." - ensure - $VERBOSE = old_verbosity - end - end + # ID ---> Encoding name + attr_reader :encoding + def encoding=( enc ) + old_verbosity = $VERBOSE + begin + $VERBOSE = false + return if defined? @encoding and enc == @encoding + if enc and enc != UTF_8 + @encoding = enc.upcase + begin + require 'rexml/encodings/ICONV.rb' + Encoding.apply(self, "ICONV") + rescue LoadError, Exception => err + raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ + @encoding.untaint + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + begin + require enc_file + Encoding.apply(self, @encoding) + rescue LoadError + puts $!.message + raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." + end + end + else + @encoding = UTF_8 + require 'rexml/encodings/UTF-8.rb' + Encoding.apply(self, @encoding) + end + ensure + $VERBOSE = old_verbosity + end + end - def check_encoding str - # We have to recognize UTF-16, LSB UTF-16, and UTF-8 - return UTF_16 if str[0] == 254 && str[1] == 255 - return UNILE if str[0] == 255 && str[1] == 254 - str =~ /^\s*This API is experimental, and subject to change. - # parser = PullParser.new( "texttxet" ) - # while parser.has_next? - # res = parser.next - # puts res[1]['att'] if res.start_tag? and res[0] == 'b' - # end - # See the PullEvent class for information on the content of the results. - # The data is identical to the arguments passed for the various events to - # the StreamListener API. - # - # Notice that: - # parser = PullParser.new( "BAD DOCUMENT" ) - # while parser.has_next? - # res = parser.next - # raise res[1] if res.error? - # end - # - # Nat Price gave me some good ideas for the API. - class BaseParser - NCNAME_STR= '[\w:][\-\w\d.]*' - NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class BaseParser + NCNAME_STR= '[\w:][\-\w\d.]*' + NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" - NAMECHAR = '[\-\w\d\.:]' - NAME = "([\\w:]#{NAMECHAR}*)" - NMTOKEN = "(?:#{NAMECHAR})+" - NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" - REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" - REFERENCE_RE = /#{REFERENCE}/ + NAMECHAR = '[\-\w\d\.:]' + NAME = "([\\w:]#{NAMECHAR}*)" + NMTOKEN = "(?:#{NAMECHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + REFERENCE_RE = /#{REFERENCE}/ - DOCTYPE_START = /\A\s*)/um - ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um - COMMENT_START = /\A/um - CDATA_START = /\A/um - CDATA_PATTERN = //um - XMLDECL_START = /\A<\?xml\s/u; - XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>*/um - INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um - TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um - CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + DOCTYPE_START = /\A\s*)/um + ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + COMMENT_START = /\A/um + CDATA_START = /\A/um + CDATA_PATTERN = //um + XMLDECL_START = /\A<\?xml\s/u; + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um + INSTRUCTION_START = /\A<\?/u + INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um + TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um + CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um - VERSION = /\bversion\s*=\s*["'](.*?)['"]/um - ENCODING = /\bencoding=["'](.*?)['"]/um - STANDALONE = /\bstandalone=["'](.*?)['"]/um + VERSION = /\bversion\s*=\s*["'](.*?)['"]/um + ENCODING = /\bencoding=["'](.*?)['"]/um + STANDALONE = /\bstandalone=["'](.*?)['"]/um - ENTITY_START = /^\s*/um - SYSTEMENTITY = /^\s*(%.*?;)\s*$/um - ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" - NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" - ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" - ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" - ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" - DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" - ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" - ATTDEF_RE = /#{ATTDEF}/ - ATTLISTDECL_START = /^\s*/um - NOTATIONDECL_START = /^\s*/um - SYSTEM = /^\s*/um + ENTITY_START = /^\s*/um + SYSTEMENTITY = /^\s*(%.*?;)\s*$/um + ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" + NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" + ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" + ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" + ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" + DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" + ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" + ATTDEF_RE = /#{ATTDEF}/ + ATTLISTDECL_START = /^\s*/um + NOTATIONDECL_START = /^\s*/um + SYSTEM = /^\s*/um - TEXT_PATTERN = /\A([^<]*)/um + TEXT_PATTERN = /\A([^<]*)/um - # Entity constants - PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" - SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} - PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} - EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" - NDATADECL = "\\s+NDATA\\s+#{NAME}" - PEREFERENCE = "%#{NAME};" - ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} - PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" - ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" - PEDECL = "" - GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + # Entity constants + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "" + GEDECL = "" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - EREFERENCE = /&(?!#{NAME};)/ + EREFERENCE = /&(?!#{NAME};)/ - DEFAULT_ENTITIES = { - 'gt' => [/>/, '>', '>', />/], - 'lt' => [/</, '<', '<', / [/"/, '"', '"', /"/], - "apos" => [/'/, "'", "'", /'/] - } + DEFAULT_ENTITIES = { + 'gt' => [/>/, '>', '>', />/], + 'lt' => [/</, '<', '<', / [/"/, '"', '"', /"/], + "apos" => [/'/, "'", "'", /'/] + } - def initialize( source ) - self.stream = source - end + def initialize( source ) + self.stream = source + end def add_listener( listener ) if !defined?(@listeners) or !@listeners @@ -119,315 +119,320 @@ module REXML attr_reader :source - def stream=( source ) - if source.kind_of? String - @source = Source.new(source) - elsif source.kind_of? IO - @source = IOSource.new(source) - elsif source.kind_of? Source - @source = source - elsif defined? StringIO and source.kind_of? StringIO - @source = IOSource.new(source) - else - raise "#{source.class} is not a valid input stream. It must be \n"+ - "either a String, IO, StringIO or Source." - end - @closed = nil - @document_status = nil - @tags = [] - @stack = [] - @entities = [] - end + def stream=( source ) + @source = SourceFactory.create_from( source ) + @closed = nil + @document_status = nil + @tags = [] + @stack = [] + @entities = [] + end - # Returns true if there are no more events - def empty? - #puts "@source.empty? = #{@source.empty?}" - #puts "@stack.empty? = #{@stack.empty?}" + def position + if @source.respond_to? :position + @source.position + else + # FIXME + 0 + end + end + + # Returns true if there are no more events + def empty? + #STDERR.puts "@source.empty? = #{@source.empty?}" + #STDERR.puts "@stack.empty? = #{@stack.empty?}" return (@source.empty? and @stack.empty?) - end + end - # Returns true if there are more events. Synonymous with !empty? - def has_next? + # Returns true if there are more events. Synonymous with !empty? + def has_next? return !(@source.empty? and @stack.empty?) - end + end - # Push an event back on the head of the stream. This method - # has (theoretically) infinite depth. - def unshift token - @stack.unshift(token) - end + # Push an event back on the head of the stream. This method + # has (theoretically) infinite depth. + def unshift token + @stack.unshift(token) + end - # Peek at the +depth+ event in the stack. The first element on the stack - # is at depth 0. If +depth+ is -1, will parse to the end of the input - # stream and return the last event, which is always :end_document. - # Be aware that this causes the stream to be parsed up to the +depth+ - # event, so you can effectively pre-parse the entire document (pull the - # entire thing into memory) using this method. - def peek depth=0 - raise %Q[Illegal argument "#{depth}"] if depth < -1 - temp = [] - if depth == -1 - temp.push(pull()) until empty? - else - while @stack.size+temp.size < depth+1 - temp.push(pull()) - end - end - @stack += temp if temp.size > 0 - @stack[depth] - end + # Peek at the +depth+ event in the stack. The first element on the stack + # is at depth 0. If +depth+ is -1, will parse to the end of the input + # stream and return the last event, which is always :end_document. + # Be aware that this causes the stream to be parsed up to the +depth+ + # event, so you can effectively pre-parse the entire document (pull the + # entire thing into memory) using this method. + def peek depth=0 + raise %Q[Illegal argument "#{depth}"] if depth < -1 + temp = [] + if depth == -1 + temp.push(pull()) until empty? + else + while @stack.size+temp.size < depth+1 + temp.push(pull()) + end + end + @stack += temp if temp.size > 0 + @stack[depth] + end - # Returns the next event. This is a +PullEvent+ object. - def pull - if @closed - x, @closed = @closed, nil - return [ :end_element, x ] - end - return [ :end_document ] if empty? - return @stack.shift if @stack.size > 0 - @source.read if @source.buffer.size<2 - if @document_status == nil - @source.consume( /^\s*/um ) - word = @source.match( /(<[^>]*)>/um ) - word = word[1] unless word.nil? - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - @source.encoding = encoding - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone] - when INSTRUCTION_START - return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] - when DOCTYPE_START - md = @source.match( DOCTYPE_PATTERN, true ) - identity = md[1] - close = md[2] - identity =~ IDENTITY - name = $1 - raise REXML::ParseException("DOCTYPE is missing a name") if name.nil? - pub_sys = $2.nil? ? nil : $2.strip - long_name = $3.nil? ? nil : $3.strip - uri = $4.nil? ? nil : $4.strip - args = [ :start_doctype, name, pub_sys, long_name, uri ] - if close == ">" - @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/^\s*/um, true) - @stack << [ :end_doctype ] - else - @document_status = :in_doctype - end - return args - else - @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/\s*/um, true) - end - end - if @document_status == :in_doctype - md = @source.match(/\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] + # Returns the next event. This is a +PullEvent+ object. + def pull + if @closed + x, @closed = @closed, nil + return [ :end_element, x ] + end + return [ :end_document ] if empty? + return @stack.shift if @stack.size > 0 + @source.read if @source.buffer.size<2 + #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + if @document_status == nil + #@source.consume( /^\s*/um ) + word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = word[1] unless word.nil? + #STDERR.puts "WORD = #{word.inspect}" + case word + when COMMENT_START + return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + when XMLDECL_START + #STDERR.puts "XMLDECL" + results = @source.match( XMLDECL_PATTERN, true )[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + @source.encoding = encoding + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + when INSTRUCTION_START + return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + when DOCTYPE_START + md = @source.match( DOCTYPE_PATTERN, true ) + identity = md[1] + close = md[2] + identity =~ IDENTITY + name = $1 + raise REXML::ParseException("DOCTYPE is missing a name") if name.nil? + pub_sys = $2.nil? ? nil : $2.strip + long_name = $3.nil? ? nil : $3.strip + uri = $4.nil? ? nil : $4.strip + args = [ :start_doctype, name, pub_sys, long_name, uri ] + if close == ">" + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/^\s*/um, true) + @stack << [ :end_doctype ] + else + @document_status = :in_doctype + end + return args + when /^\s+/ + else + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/\s*/um, true) + end + end + if @document_status == :in_doctype + md = @source.match(/\s*(.*?>)/um) + case md[1] + when SYSTEMENTITY + match = @source.match( SYSTEMENTITY, true )[1] + return [ :externalentity, match ] - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] + when ELEMENTDECL_START + return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - # match is [ :entity, name, PUBLIC, pubid, href ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] + when ENTITY_START + match = @source.match( ENTITYDECL, true ).to_a.compact + match[0] = :entitydecl + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + # match is [ :entity, name, PUBLIC, pubid, href ] + else + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] + end + match << '%' if ref + return match + when ATTLISTDECL_START + md = @source.match( ATTLISTDECL_PATTERN, true ) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - end - end - return [ :attlistdecl, element, pairs, contents ] - when NOTATIONDECL_START - md = nil - if @source.match( PUBLIC ) - md = @source.match( PUBLIC, true ) - elsif @source.match( SYSTEM ) - md = @source.match( SYSTEM, true ) - else - raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) - end - return [ :notationdecl, md[1], md[2], md[3] ] - when CDATA_END - @document_status = :after_doctype - @source.match( CDATA_END, true ) - return [ :end_doctype ] - end - end - begin - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ - last_tag = @tags.pop - #md = @source.match_to_consume( '>', CLOSE_MATCH) - md = @source.match( CLOSE_MATCH, true ) - raise REXML::ParseException.new( "Missing end tag for "+ + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + end + end + return [ :attlistdecl, element, pairs, contents ] + when NOTATIONDECL_START + md = nil + if @source.match( PUBLIC ) + md = @source.match( PUBLIC, true ) + vals = [md[1],md[2],md[4],md[6]] + elsif @source.match( SYSTEM ) + md = @source.match( SYSTEM, true ) + vals = [md[1],md[2],nil,md[4]] + else + raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) + end + return [ :notationdecl, *vals ] + when CDATA_END + @document_status = :after_doctype + @source.match( CDATA_END, true ) + return [ :end_doctype ] + end + end + begin + if @source.buffer[0] == ?< + if @source.buffer[1] == ?/ + last_tag = @tags.pop + #md = @source.match_to_consume( '>', CLOSE_MATCH) + md = @source.match( CLOSE_MATCH, true ) + raise REXML::ParseException.new( "Missing end tag for "+ "'#{last_tag}' (got \"#{md[1]}\")", @source) unless last_tag == md[1] - return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! - md = @source.match(/\A(\s*[^>]*>)/um) - #puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) - return [ :comment, md[1] ] if md - else - md = @source.match( CDATA_PATTERN, true ) - return [ :cdata, md[1] ] if md - end - raise REXML::ParseException.new( "Declarations can only occur "+ - "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? - md = @source.match( INSTRUCTION_PATTERN, true ) - return [ :processing_instruction, md[1], md[2] ] if md - raise REXML::ParseException.new( "Bad instruction declaration", - @source) - else - # Get the next tag - md = @source.match(TAG_MATCH, true) - raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md - attrs = [] - if md[2].size > 0 - attrs = md[2].scan( ATTRIBUTE_PATTERN ) - raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 - end - - if md[4] - @closed = md[1] - else - @tags.push( md[1] ) - end - attributes = {} - attrs.each { |a,b,c| attributes[a] = c } - return [ :start_element, md[1], attributes ] - end - else - md = @source.match( TEXT_PATTERN, true ) + return [ :end_element, last_tag ] + elsif @source.buffer[1] == ?! + md = @source.match(/\A(\s*[^>]*>)/um) + #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" + raise REXML::ParseException.new("Malformed node", @source) unless md + if md[0][2] == ?- + md = @source.match( COMMENT_PATTERN, true ) + return [ :comment, md[1] ] if md + else + md = @source.match( CDATA_PATTERN, true ) + return [ :cdata, md[1] ] if md + end + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) + elsif @source.buffer[1] == ?? + md = @source.match( INSTRUCTION_PATTERN, true ) + return [ :processing_instruction, md[1], md[2] ] if md + raise REXML::ParseException.new( "Bad instruction declaration", + @source) + else + # Get the next tag + md = @source.match(TAG_MATCH, true) + raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md + attrs = [] + if md[2].size > 0 + attrs = md[2].scan( ATTRIBUTE_PATTERN ) + raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 + end + + if md[4] + @closed = md[1] + else + @tags.push( md[1] ) + end + attributes = {} + attrs.each { |a,b,c| attributes[a] = c } + return [ :start_element, md[1], attributes ] + end + else + md = @source.match( TEXT_PATTERN, true ) if md[0].length == 0 - #puts "EMPTY = #{empty?}" - #puts "BUFFER = \"#{@source.buffer}\"" + puts "EMPTY = #{empty?}" + puts "BUFFER = \"#{@source.buffer}\"" @source.match( /(\s+)/, true ) end + #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] - end - rescue REXML::ParseException - raise - rescue Exception, NameError => error - raise REXML::ParseException.new( "Exception parsing", - @source, self, (error ? error : $!) ) - end - return [ :dummy ] - end + # unnormalized = Text::unnormalize( md[1], self ) + # return PullEvent.new( :text, md[1], unnormalized ) + return [ :text, md[1] ] + end + rescue REXML::ParseException + raise + rescue Exception, NameError => error + raise REXML::ParseException.new( "Exception parsing", + @source, self, (error ? error : $!) ) + end + return [ :dummy ] + end - def entity( reference, entities ) - value = nil - value = entities[ reference ] if entities - if not value - value = DEFAULT_ENTITIES[ reference ] - value = value[2] if value - end - unnormalize( value, entities ) if value - end + def entity( reference, entities ) + value = nil + value = entities[ reference ] if entities + if not value + value = DEFAULT_ENTITIES[ reference ] + value = value[2] if value + end + unnormalize( value, entities ) if value + end - # Escapes all possible entities - def normalize( input, entities=nil, entity_filter=nil ) - copy = input.clone - # Doing it like this rather than in a loop improves the speed - copy.gsub!( EREFERENCE, '&' ) - entities.each do |key, value| - copy.gsub!( value, "&#{key};" ) unless entity_filter and - entity_filter.include?(entity) - end if entities - copy.gsub!( EREFERENCE, '&' ) - DEFAULT_ENTITIES.each do |key, value| - copy.gsub!( value[3], value[1] ) - end - copy - end + # Escapes all possible entities + def normalize( input, entities=nil, entity_filter=nil ) + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( EREFERENCE, '&' ) + entities.each do |key, value| + copy.gsub!( value, "&#{key};" ) unless entity_filter and + entity_filter.include?(entity) + end if entities + copy.gsub!( EREFERENCE, '&' ) + DEFAULT_ENTITIES.each do |key, value| + copy.gsub!( value[3], value[1] ) + end + copy + end - # Unescapes all possible entities - def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) - matches = rv.scan( REFERENCE_RE ) - return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - matches.collect!{|x|x[0]}.compact! - if matches.size > 0 - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = entity( entity_reference, entities ) - if entity_value - re = /&#{entity_reference};/ - rv.gsub!( re, entity_value ) - end - end - end - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er - end - end - rv.gsub!( /&/, '&' ) - end - rv - end - end - end + # Unescapes all possible entities + def unnormalize( string, entities=nil, filter=nil ) + rv = string.clone + rv.gsub!( /\r\n?/, "\n" ) + matches = rv.scan( REFERENCE_RE ) + return rv if matches.size == 0 + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + matches.collect!{|x|x[0]}.compact! + if matches.size > 0 + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = entity( entity_reference, entities ) + if entity_value + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + end + end + end + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er + end + end + rv.gsub!( /&/, '&' ) + end + rv + end + end + end end =begin diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index 0a328ea8fc..36dc7160c3 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -1,96 +1,100 @@ +require 'forwardable' + require 'rexml/parseexception' require 'rexml/parsers/baseparser' require 'rexml/xmltokens' module REXML - module Parsers - # = Using the Pull Parser - # This API is experimental, and subject to change. - # parser = PullParser.new( "texttxet" ) - # while parser.has_next? - # res = parser.next - # puts res[1]['att'] if res.start_tag? and res[0] == 'b' - # end - # See the PullEvent class for information on the content of the results. - # The data is identical to the arguments passed for the various events to - # the StreamListener API. - # - # Notice that: - # parser = PullParser.new( "BAD DOCUMENT" ) - # while parser.has_next? - # res = parser.next - # raise res[1] if res.error? - # end - # - # Nat Price gave me some good ideas for the API. - class PullParser - include XMLTokens + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class PullParser + include XMLTokens + extend Forwardable - def initialize stream - @entities = {} + def_delegators( :@parser, :has_next? ) + def_delegators( :@parser, :entity ) + def_delegators( :@parser, :empty? ) + def_delegators( :@parser, :source ) + + def initialize stream + @entities = {} @listeners = nil @parser = BaseParser.new( stream ) - end + @my_stack = [] + end def add_listener( listener ) @listeners = [] unless @listeners @listeners << listener end - def each - while has_next? - yield self.pull - end - end - - def peek depth=0 - PullEvent.new(@parser.peek(depth)) - end - - def has_next? - @parser.has_next? + def each + while has_next? + yield self.pull + end end - def pull - event = @parser.pull - case event[0] - when :entitydecl - @entities[ event[1] ] = - event[2] unless event[2] =~ /PUBLIC|SYSTEM/ - when :text - unnormalized = @parser.unnormalize( event[1], @entities ) - event << unnormalized - end - PullEvent.new( event ) - end + def peek depth=0 + if @my_stack.length <= depth + (depth - @my_stack.length + 1).times { + e = PullEvent.new(@parser.pull) + @my_stack.push(e) + } + end + @my_stack[depth] + end + + def pull + return @my_stack.shift if @my_stack.length > 0 + + event = @parser.pull + case event[0] + when :entitydecl + @entities[ event[1] ] = + event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + when :text + unnormalized = @parser.unnormalize( event[1], @entities ) + event << unnormalized + end + PullEvent.new( event ) + end def unshift token - @parser.unshift token + @my_stack.unshift token end + end - def entity reference - @parser.entity( reference ) + # A parsing event. The contents of the event are accessed as an +Array?, + # and the type is given either by the ...? methods, or by accessing the + # +type+ accessor. The contents of this object vary from event to event, + # but are identical to the arguments passed to +StreamListener+s for each + # event. + class PullEvent + # The type of this event. Will be one of :tag_start, :tag_end, :text, + # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, + # :notationdecl, :entity, :cdata, :xmldecl, or :error. + def initialize(arg) + @contents = arg end - def empty? - @parser.empty? - end - - end - - # A parsing event. The contents of the event are accessed as an +Array?, - # and the type is given either by the ...? methods, or by accessing the - # +type+ accessor. The contents of this object vary from event to event, - # but are identical to the arguments passed to +StreamListener+s for each - # event. - class PullEvent - # The type of this event. Will be one of :tag_start, :tag_end, :text, - # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, - # :notationdecl, :entity, :cdata, :xmldecl, or :error. - def initialize(arg) - @contents = arg - end - def []( start, endd=nil) if start.kind_of? Range @contents.slice( start.begin+1 .. start.end ) @@ -103,90 +107,90 @@ module REXML else raise "Illegal argument #{start.inspect} (#{start.class})" end - end + end - def event_type - @contents[0] - end + def event_type + @contents[0] + end - # Content: [ String tag_name, Hash attributes ] - def start_element? - @contents[0] == :start_element - end + # Content: [ String tag_name, Hash attributes ] + def start_element? + @contents[0] == :start_element + end - # Content: [ String tag_name ] - def end_element? - @contents[0] == :end_element - end + # Content: [ String tag_name ] + def end_element? + @contents[0] == :end_element + end - # Content: [ String raw_text, String unnormalized_text ] - def text? - @contents[0] == :text - end + # Content: [ String raw_text, String unnormalized_text ] + def text? + @contents[0] == :text + end - # Content: [ String text ] - def instruction? - @contents[0] == :processing_instruction - end + # Content: [ String text ] + def instruction? + @contents[0] == :processing_instruction + end - # Content: [ String text ] - def comment? - @contents[0] == :comment - end + # Content: [ String text ] + def comment? + @contents[0] == :comment + end - # Content: [ String name, String pub_sys, String long_name, String uri ] - def doctype? - @contents[0] == :start_doctype - end + # Content: [ String name, String pub_sys, String long_name, String uri ] + def doctype? + @contents[0] == :start_doctype + end - # Content: [ String text ] - def attlistdecl? - @contents[0] == :attlistdecl - end + # Content: [ String text ] + def attlistdecl? + @contents[0] == :attlistdecl + end - # Content: [ String text ] - def elementdecl? - @contents[0] == :elementdecl - end + # Content: [ String text ] + def elementdecl? + @contents[0] == :elementdecl + end - # Due to the wonders of DTDs, an entity declaration can be just about - # anything. There's no way to normalize it; you'll have to interpret the - # content yourself. However, the following is true: - # - # * If the entity declaration is an internal entity: - # [ String name, String value ] - # Content: [ String text ] - def entitydecl? - @contents[0] == :entitydecl - end + # Due to the wonders of DTDs, an entity declaration can be just about + # anything. There's no way to normalize it; you'll have to interpret the + # content yourself. However, the following is true: + # + # * If the entity declaration is an internal entity: + # [ String name, String value ] + # Content: [ String text ] + def entitydecl? + @contents[0] == :entitydecl + end - # Content: [ String text ] - def notationdecl? - @contents[0] == :notationdecl - end + # Content: [ String text ] + def notationdecl? + @contents[0] == :notationdecl + end - # Content: [ String text ] - def entity? - @contents[0] == :entity - end + # Content: [ String text ] + def entity? + @contents[0] == :entity + end - # Content: [ String text ] - def cdata? - @contents[0] == :cdata - end + # Content: [ String text ] + def cdata? + @contents[0] == :cdata + end - # Content: [ String version, String encoding, String standalone ] - def xmldecl? - @contents[0] == :xmldecl - end + # Content: [ String version, String encoding, String standalone ] + def xmldecl? + @contents[0] == :xmldecl + end - def error? - @contents[0] == :error - end + def error? + @contents[0] == :error + end - def inspect + def inspect @contents[0].to_s + ": " + @contents[1..-1].inspect - end - end - end + end + end + end end diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index d5ee1bcfcd..61a216cec1 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -1,9 +1,11 @@ require 'rexml/parsers/baseparser' require 'rexml/parseexception' require 'rexml/namespace' +require 'rexml/text' module REXML module Parsers + # SAX2Parser class SAX2Parser def initialize source @parser = BaseParser.new(source) @@ -36,6 +38,10 @@ module REXML # :start_prefix_mapping, :end_prefix_mapping, :characters, # :processing_instruction, :doctype, :attlistdecl, :elementdecl, # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment + # + # There is an additional symbol that can be listened for: :progress. + # This will be called for every event generated, passing in the current + # stream position. # # Array contains regular expressions or strings which will be matched # against fully qualified element names. @@ -161,6 +167,7 @@ module REXML :elementdecl, :cdata, :notationdecl, :xmldecl handle( *event ) end + handle( :progress, @parser.position ) end end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 996d613e15..256d0f611c 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -1,42 +1,46 @@ module REXML - module Parsers - class StreamParser - def initialize source, listener - @listener = listener - @parser = BaseParser.new( source ) - end - + module Parsers + class StreamParser + def initialize source, listener + @listener = listener + @parser = BaseParser.new( source ) + end + def add_listener( listener ) @parser.add_listener( listener ) end - - def parse - # entity string - while true - event = @parser.pull - case event[0] - when :end_document - return - when :start_element - attrs = event[2].each do |n, v| - event[2][n] = @parser.unnormalize( v ) - end - @listener.tag_start( event[1], attrs ) - when :end_element - @listener.tag_end( event[1] ) - when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) - when :processing_instruction - @listener.instruction( *event[1,2] ) + + def parse + # entity string + while true + event = @parser.pull + case event[0] + when :end_document + return + when :start_element + attrs = event[2].each do |n, v| + event[2][n] = @parser.unnormalize( v ) + end + @listener.tag_start( event[1], attrs ) + when :end_element + @listener.tag_end( event[1] ) + when :text + normalized = @parser.unnormalize( event[1] ) + @listener.text( normalized ) + when :processing_instruction + @listener.instruction( *event[1,2] ) when :start_doctype @listener.doctype( *event[1..-1] ) - when :comment, :attlistdecl, :notationdecl, :elementdecl, - :entitydecl, :cdata, :xmldecl, :attlistdecl - @listener.send( event[0].to_s, *event[1..-1] ) - end - end - end - end - end + when :end_doctype + # FIXME: remove this condition for milestone:3.2 + @listener.doctype_end if @listener.respond_to? :doctype_end + when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl + @listener.send( event[0].to_s, *event[1..-1] ) + when :entitydecl, :notationdecl + @listener.send( event[0].to_s, event[1..-1] ) + end + end + end + end + end end diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 57d11f7e23..500a53f426 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -19,8 +19,12 @@ module REXML begin while true event = @parser.pull + #STDERR.puts "TREEPARSER GOT #{event.inspect}" case event[0] when :end_document + unless tag_stack.empty? + raise ParseException.new("No close tag for #{tag_stack.inspect}") + end return when :start_element tag_stack.push(event[1]) @@ -35,10 +39,10 @@ module REXML @build_context[-1] << event[1] else @build_context.add( - Text.new( event[1], @build_context.whitespace, nil, true ) + Text.new(event[1], @build_context.whitespace, nil, true) ) unless ( - event[1].strip.size==0 and - @build_context.ignore_whitespace_nodes + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 ) end end diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 00fd50ad02..ca154443b5 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -10,8 +10,8 @@ # # Main page:: http://www.germane-software.com/software/rexml # Author:: Sean Russell -# Version:: 3.1.3 -# Date:: +2005/139 +# Version:: 3.1.4 +# Date:: 2006/104 # # This API documentation can be downloaded from the REXML home page, or can # be accessed online[http://www.germane-software.com/software/rexml_doc] @@ -20,7 +20,10 @@ # or can be accessed # online[http://www.germane-software.com/software/rexml/docs/tutorial.html] module REXML - Copyright = "Copyright © 2001-2005 Sean Russell " - Date = "+2005/139" - Version = "3.1.3" + COPYRIGHT = "Copyright © 2001-2006 Sean Russell " + DATE = "2006/104" + VERSION = "3.1.4" + + Copyright = COPYRIGHT + Version = VERSION end diff --git a/lib/rexml/sax2listener.rb b/lib/rexml/sax2listener.rb index 40a77ed464..9a992917e6 100644 --- a/lib/rexml/sax2listener.rb +++ b/lib/rexml/sax2listener.rb @@ -84,11 +84,14 @@ module REXML # @p version the version attribute value. EG, "1.0" # @p encoding the encoding attribute value, or nil. EG, "utf" # @p standalone the standalone attribute value, or nil. EG, nil + # @p spaced the declaration is followed by a line break def xmldecl version, encoding, standalone end # Called when a comment is encountered. # @p comment The content of the comment def comment comment end + def progress position + end end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 7251666160..cacab221db 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -7,12 +7,19 @@ module REXML # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given def SourceFactory::create_from arg#, slurp=true - if arg.kind_of? String - source = Source.new(arg) - elsif arg.kind_of? IO - source = IOSource.new(arg) - end - source + if arg.kind_of? String + Source.new(arg) + elsif arg.respond_to? :read and + arg.respond_to? :readline and + arg.respond_to? :nil? and + arg.respond_to? :eof? + IOSource.new(arg) + elsif arg.kind_of? Source + arg + else + raise "#{source.class} is not a valid input stream. It must walk \n"+ + "like either a String, IO, or Source." + end end end @@ -98,6 +105,10 @@ module REXML @buffer == "" end + def position + @orig.index( @buffer ) + end + # @return the current line in the source def current_line lines = @orig.split @@ -194,6 +205,10 @@ module REXML super and ( @source.nil? || @source.eof? ) end + def position + @er_source.stat.pipe? ? 0 : @er_source.pos + end + # @return the current line in the source def current_line begin diff --git a/lib/rexml/streamlistener.rb b/lib/rexml/streamlistener.rb index 3c3c5e3684..6f401125b5 100644 --- a/lib/rexml/streamlistener.rb +++ b/lib/rexml/streamlistener.rb @@ -39,6 +39,9 @@ module REXML # @p uri the uri of the doctype, or nil. EG, "bar" def doctype name, pub_sys, long_name, uri end + # Called when the doctype is done + def doctype_end + end # If a doctype includes an ATTLIST declaration, it will cause this # method to be called. The content is the declaration itself, unparsed. # EG, will come to this method as "el diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 9a83121af8..55bc9f50f8 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -39,8 +39,10 @@ module REXML # text. If this value is nil (the default), then the raw value of the # parent will be used as the raw value for this node. If there is no raw # value for the parent, and no value is supplied, the default is false. + # Use this field if you have entities defined for some text, and you don't + # want REXML to escape that text in output. # Text.new( "<&", false, nil, false ) #-> "<&" - # Text.new( "<&", false, nil, true ) #-> IllegalArgumentException + # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" # # and that the entity "r" is defined to be "russell" @@ -156,11 +158,11 @@ module REXML # # Assume that the entity "s" is defined to be "sean", and that the # # entity "r" is defined to be "russell" # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) - # t.string #-> "< & sean russell" + # t.value #-> "< & sean russell" # t = Text.new( "< & &s; russell", false, nil, false ) - # t.string #-> "< & sean russell" + # t.value #-> "< & sean russell" # u = Text.new( "sean russell", false, nil, true ) - # u.string #-> "sean russell" + # u.value #-> "sean russell" def value @unnormalized if @unnormalized doctype = nil @@ -282,9 +284,10 @@ module REXML EREFERENCE = /&(?!#{Entity::NAME};)/ # Escapes all possible entities def Text::normalize( input, doctype=nil, entity_filter=nil ) - copy = input.clone + copy = input # Doing it like this rather than in a loop improves the speed if doctype + # Replace all ampersands that aren't part of an entity copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, @@ -292,6 +295,7 @@ module REXML not( entity_filter and entity_filter.include?(entity) ) end else + # Replace all ampersands that aren't part of an entity copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index fbee315f0b..160ea96b31 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -82,10 +82,13 @@ module REXML @event_arg = event_arg end - attr_reader :done? attr_reader :event_type attr_accessor :event_arg + def done? + @done + end + def single? return (@event_type != :start_element and @event_type != :start_attribute) end diff --git a/lib/rexml/xmldecl.rb b/lib/rexml/xmldecl.rb index 47131ac816..b65604b762 100644 --- a/lib/rexml/xmldecl.rb +++ b/lib/rexml/xmldecl.rb @@ -2,71 +2,71 @@ require 'rexml/encoding' require 'rexml/source' module REXML - # NEEDS DOCUMENTATION - class XMLDecl < Child - include Encoding + # NEEDS DOCUMENTATION + class XMLDecl < Child + include Encoding - DEFAULT_VERSION = "1.0"; - DEFAULT_ENCODING = "UTF-8"; - DEFAULT_STANDALONE = "no"; - START = '<\?xml'; - STOP = '\?>'; + DEFAULT_VERSION = "1.0"; + DEFAULT_ENCODING = "UTF-8"; + DEFAULT_STANDALONE = "no"; + START = '<\?xml'; + STOP = '\?>'; - attr_accessor :version, :standalone + attr_accessor :version, :standalone attr_reader :writeencoding - def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) + def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) @writethis = true @writeencoding = !encoding.nil? - if version.kind_of? XMLDecl - super() - @version = version.version - self.encoding = version.encoding + if version.kind_of? XMLDecl + super() + @version = version.version + self.encoding = version.encoding @writeencoding = version.writeencoding - @standalone = version.standalone - else - super() - @version = version - self.encoding = encoding - @standalone = standalone - end - @version = DEFAULT_VERSION if @version.nil? - end + @standalone = version.standalone + else + super() + @version = version + self.encoding = encoding + @standalone = standalone + end + @version = DEFAULT_VERSION if @version.nil? + end - def clone - XMLDecl.new(self) - end + def clone + XMLDecl.new(self) + end - def write writer, indent_level=-1, transitive=false, ie_hack=false + def write writer, indent=-1, transitive=false, ie_hack=false return nil unless @writethis or writer.kind_of? Output - indent( writer, indent_level ) - writer << START.sub(/\\/u, '') + indent( writer, indent ) + writer << START.sub(/\\/u, '') if writer.kind_of? Output writer << " #{content writer.encoding}" else writer << " #{content encoding}" end - writer << STOP.sub(/\\/u, '') - end + writer << STOP.sub(/\\/u, '') + end - def ==( other ) - other.kind_of?(XMLDecl) and - other.version == @version and - other.encoding == self.encoding and - other.standalone == @standalone - end + def ==( other ) + other.kind_of?(XMLDecl) and + other.version == @version and + other.encoding == self.encoding and + other.standalone == @standalone + end - def xmldecl version, encoding, standalone - @version = version - self.encoding = encoding - @standalone = standalone - end + def xmldecl version, encoding, standalone + @version = version + self.encoding = encoding + @standalone = standalone + end - def node_type - :xmldecl - end + def node_type + :xmldecl + end - alias :stand_alone? :standalone + alias :stand_alone? :standalone alias :old_enc= :encoding= def encoding=( enc ) @@ -80,6 +80,11 @@ module REXML self.dowrite end + # Only use this if you do not want the XML declaration to be written; + # this object is ignored by the XML writer. Otherwise, instantiate your + # own XMLDecl and add it to the document. + # + # Note that XML 1.1 documents *must* include an XML declaration def XMLDecl.default rv = XMLDecl.new( "1.0" ) rv.nowrite @@ -98,12 +103,12 @@ module REXML START.sub(/\\/u, '') + " ... " + STOP.sub(/\\/u, '') end - private - def content(enc) - rv = "version='#@version'" - rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i - rv << " standalone='#@standalone'" if @standalone - rv - end - end + private + def content(enc) + rv = "version='#@version'" + rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i + rv << " standalone='#@standalone'" if @standalone + rv + end + end end diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index 6875f038e0..1ed440868b 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -2,76 +2,65 @@ require 'rexml/functions' require 'rexml/xpath_parser' module REXML - # Wrapper class. Use this class to access the XPath functions. - class XPath - include Functions - EMPTY_HASH = {} + # Wrapper class. Use this class to access the XPath functions. + class XPath + include Functions + EMPTY_HASH = {} - # Finds and returns the first node that matches the supplied xpath. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, returns the first - # node matching '*'. - # namespaces:: - # If supplied, a Hash which defines a namespace mapping. - # - # XPath.first( node ) - # XPath.first( doc, "//b"} ) - # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + # Finds and returns the first node that matches the supplied xpath. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, returns the first + # node matching '*'. + # namespaces:: + # If supplied, a Hash which defines a namespace mapping. + # + # XPath.first( node ) + # XPath.first( doc, "//b"} ) + # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) def XPath::first element, path=nil, namespaces={}, variables={} -=begin raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of? Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.first( path, element ); -=end -#=begin - raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash - raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of? Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).flatten[0] -#=end - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).flatten[0] + end - # Itterates over nodes that match the given path, calling the supplied - # block with the match. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, defaults to '*' - # namespaces:: - # If supplied, a Hash which defines a namespace mapping - # - # XPath.each( node ) { |el| ... } - # XPath.each( node, '/*[@attr='v']' ) { |el| ... } - # XPath.each( node, 'ancestor::x' ) { |el| ... } - def XPath::each element, path=nil, namespaces={}, variables={}, &block + # Itterates over nodes that match the given path, calling the supplied + # block with the match. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, defaults to '*' + # namespaces:: + # If supplied, a Hash which defines a namespace mapping + # + # XPath.each( node ) { |el| ... } + # XPath.each( node, '/*[@attr='v']' ) { |el| ... } + # XPath.each( node, 'ancestor::x' ) { |el| ... } + def XPath::each element, path=nil, namespaces={}, variables={}, &block raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of? Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).each( &block ) - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).each( &block ) + end - # Returns an array of nodes matching a given XPath. - def XPath::match element, path=nil, namespaces={}, variables={} - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path,element) - end - end + # Returns an array of nodes matching a given XPath. + def XPath::match element, path=nil, namespaces={}, variables={} + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path,element) + end + end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 91b8ad48c8..98ed70cc10 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,6 +76,8 @@ module REXML # Performs a depth-first (document order) XPath search, and returns the # first match. This is the fastest, lightest way to return a single result. + # + # FIXME: This method is incomplete! def first( path_stack, node ) #puts "#{depth}) Entering match( #{path.inspect}, #{tree.inspect} )" return nil if path.size == 0 @@ -123,14 +125,6 @@ module REXML r = expr( path_stack, nodeset ) #puts "MAIN EXPR => #{r.inspect}" r - - #while ( path_stack.size > 0 and nodeset.size > 0 ) - # #puts "MATCH: #{path_stack.inspect} '#{nodeset.collect{|n|n.class}.inspect}'" - # nodeset = expr( path_stack, nodeset ) - # #puts "NODESET: #{nodeset.inspect}" - # #puts "PATH_STACK: #{path_stack.inspect}" - #end - #nodeset end private @@ -158,9 +152,10 @@ module REXML #puts "IN QNAME" prefix = path_stack.shift name = path_stack.shift - ns = @namespaces[prefix] - ns = ns ? ns : '' + default_ns = @namespaces[prefix] + default_ns = default_ns ? default_ns : '' nodeset.delete_if do |node| + ns = default_ns # FIXME: This DOUBLES the time XPath searches take ns = node.namespace( prefix ) if node.node_type == :element and ns == '' #puts "NS = #{ns.inspect}" @@ -353,7 +348,7 @@ module REXML preceding_siblings = all_siblings[ 0 .. current_index-1 ].reverse #results += expr( path_stack.dclone, preceding_siblings ) end - nodeset = preceding_siblings + nodeset = preceding_siblings || [] node_types = ELEMENTS when :preceding @@ -385,10 +380,13 @@ module REXML return @variables[ var_name ] # :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq + # TODO: Special case for :or and :and -- not evaluate the right + # operand if the left alone determines result (i.e. is true for + # :or and false for :and). when :eq, :neq, :lt, :lteq, :gt, :gteq, :and, :or - left = expr( path_stack.shift, nodeset, context ) + left = expr( path_stack.shift, nodeset.dup, context ) #puts "LEFT => #{left.inspect} (#{left.class.name})" - right = expr( path_stack.shift, nodeset, context ) + right = expr( path_stack.shift, nodeset.dup, context ) #puts "RIGHT => #{right.inspect} (#{right.class.name})" res = equality_relational_compare( left, op, right ) #puts "RES => #{res.inspect}" @@ -472,8 +470,11 @@ module REXML def descendant_or_self( path_stack, nodeset ) rs = [] + #puts "#"*80 + #puts "PATH_STACK = #{path_stack.inspect}" + #puts "NODESET = #{nodeset.collect{|n|n.inspect}.inspect}" d_o_s( path_stack, nodeset, rs ) - #puts "RS = #{rs.collect{|n|n.to_s}.inspect}" + #puts "RS = #{rs.collect{|n|n.inspect}.inspect}" document_order(rs.flatten.compact) #rs.flatten.compact end