From 78d9dd71a619ae6b822063d13944c86c5d834eb8 Mon Sep 17 00:00:00 2001 From: ser Date: Sat, 15 Apr 2006 04:11:04 +0000 Subject: [PATCH] Short summary: This is a version bump to REXML 3.1.4 for Ruby HEAD. This change log is identical to the log for the 1.8 branch. It includes numerous bug fixes and is a pretty big patch, but is nonetheless a minor revision bump, since the API hasn't changed. For more information, see: http://www.germane-software.com/projects/rexml/milestone/3.1.4 For all tickets, see: http://www.germane-software.com/projects/rexml/ticket/# Where '#' is replaced with the ticket number. Changelog: * Fixed the documentation WRT the raw mode of text nodes (ticket #4) * Fixes roundup ticket #43: substring-after bug. * Fixed ticket #44, Element#xpath * Patch submitted by an anonymous donor to allow parsing of Tempfiles. I was hoping that, by now, that whole Source thing would have been changed to use duck typing and avoid this sort of ticket... but in the meantime, the patch has been applied. * Fixes ticket:30, XPath default namespace bug. The fix was provided by Lucas Nussbaum. * Aliases #size to #length, as per zdennis's request. * Fixes typo from previous commit * Fixes ticket #32, preceding-sibling fails attempting delete_if on nil nodeset * Merges a user-contributed patch for ticket #40 * Adds a forgotten-to-commit unit test for ticket #32 * Changes Date, Version, and Copyright to upper case, to avoid conflicts with the Date class. All of the other changes in the altered files are because Subversion doesn't allow block-level commits, like it should. English cased Version and Copyright are aliased to the upper case versions, for partial backward compatibility. * Resolves ticket #34, SAX parser change makes it impossible to parse IO feeds. * Moves parser.source.position() to parser.position() * Fixes ticket:48, repeated writes munging text content * Fixes ticket:46, adding methods for accessing notation DTD information. 
* Encodes some characters and removes a broken link in the documentation * Deals with carriage returns after XML declarations * Improved doctype handling * Whitespace handling changes * Applies a patch by David Tardon, which (incidentally) fixes ticket:50 * Closes #26, allowing anything that walks like an IO to be a source. * Ticket #31 - One unescape too many This wasn't really a bug, per se... "value" always returns a normalized string, and "value" is the method used to get the text() of an element. However, entities have no meaning in CDATA sections, so there's no justification for value to be normalizing the content of CData objects. This behavior has therefore been changed. * Ticket #45 -- Now parses notation declarations in DTDs properly. * Resolves ticket #49, Document.parse_stream returns ArgumentError * Adds documentation to clarify how XMLDecl works, to avoid invalid bug reports. * Addresses ticket #10, fixing the StreamParser API for DTDs. * Fixes ticket #42, XPath node-set function 'name' fails with relative node set parameter * Good patch by Aaron to fix ticket #53: REXML ignoring unbalanced tags at the end of a document. 
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10092 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/attribute.rb | 6 +- lib/rexml/cdata.rb | 4 + lib/rexml/doctype.rb | 401 ++++++++------- lib/rexml/document.rb | 278 ++++++----- lib/rexml/element.rb | 21 +- lib/rexml/encoding.rb | 110 +++-- lib/rexml/functions.rb | 18 +- lib/rexml/instruction.rb | 4 +- lib/rexml/parent.rb | 319 ++++++------ lib/rexml/parsers/baseparser.rb | 761 +++++++++++++++-------------- lib/rexml/parsers/pullparser.rb | 284 +++++------ lib/rexml/parsers/sax2parser.rb | 7 + lib/rexml/parsers/streamparser.rb | 74 +-- lib/rexml/parsers/treeparser.rb | 10 +- lib/rexml/rexml.rb | 13 +- lib/rexml/sax2listener.rb | 3 + lib/rexml/source.rb | 27 +- lib/rexml/streamlistener.rb | 3 + lib/rexml/text.rb | 14 +- lib/rexml/validation/validation.rb | 5 +- lib/rexml/xmldecl.rb | 113 +++-- lib/rexml/xpath.rb | 117 ++--- lib/rexml/xpath_parser.rb | 29 +- 23 files changed, 1385 insertions(+), 1236 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index a5a58055b8..a169148f32 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -101,20 +101,20 @@ module REXML end @unnormalized = nil - @value = @normalized = Text::normalize( @value, doctype ) + @normalized = Text::normalize( @value, doctype ) end # Returns the UNNORMALIZED value of this attribute. 
That is, entities # have been expanded to their values def value - @unnormalized if @unnormalized + return @unnormalized if @unnormalized doctype = nil if @element doc = @element.document doctype = doc.doctype if doc end @normalized = nil - @value = @unnormalized = Text::unnormalize( @value, doctype ) + @unnormalized = Text::unnormalize( @value, doctype ) end # Returns a copy of this attribute diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index ffedac1b53..046012ba61 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -35,6 +35,10 @@ module REXML @string end + def value + @string + end + # Generates XML output of this object # # output:: diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index 652a04fce2..4a1ffb4336 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -6,55 +6,55 @@ require 'rexml/attlistdecl' require 'rexml/xmltokens' module REXML - # Represents an XML DOCTYPE declaration; that is, the contents of . DOCTYPES can be used to declare the DTD of a document, as well as - # being used to declare entities used in the document. - class DocType < Parent - include XMLTokens - START = "" - SYSTEM = "SYSTEM" - PUBLIC = "PUBLIC" - DEFAULT_ENTITIES = { - 'gt'=>EntityConst::GT, - 'lt'=>EntityConst::LT, - 'quot'=>EntityConst::QUOT, - "apos"=>EntityConst::APOS - } + # Represents an XML DOCTYPE declaration; that is, the contents of . DOCTYPES can be used to declare the DTD of a document, as well as + # being used to declare entities used in the document. 
+ class DocType < Parent + include XMLTokens + START = "" + SYSTEM = "SYSTEM" + PUBLIC = "PUBLIC" + DEFAULT_ENTITIES = { + 'gt'=>EntityConst::GT, + 'lt'=>EntityConst::LT, + 'quot'=>EntityConst::QUOT, + "apos"=>EntityConst::APOS + } - # name is the name of the doctype - # external_id is the referenced DTD, if given - attr_reader :name, :external_id, :entities, :namespaces + # name is the name of the doctype + # external_id is the referenced DTD, if given + attr_reader :name, :external_id, :entities, :namespaces - # Constructor - # - # dt = DocType.new( 'foo', '-//I/Hate/External/IDs' ) - # # - # dt = DocType.new( doctype_to_clone ) - # # Incomplete. Shallow clone of doctype + # Constructor + # + # dt = DocType.new( 'foo', '-//I/Hate/External/IDs' ) + # # + # dt = DocType.new( doctype_to_clone ) + # # Incomplete. Shallow clone of doctype # # +Note+ that the constructor: # # Doctype.new( Source.new( "" ) ) # # is _deprecated_. Do not use it. It will probably disappear. - def initialize( first, parent=nil ) - @entities = DEFAULT_ENTITIES - @long_name = @uri = nil - if first.kind_of? String - super() - @name = first - @external_id = parent - elsif first.kind_of? DocType - super( parent ) - @name = first.name - @external_id = first.external_id - elsif first.kind_of? Array - super( parent ) - @name = first[0] - @external_id = first[1] - @long_name = first[2] - @uri = first[3] + def initialize( first, parent=nil ) + @entities = DEFAULT_ENTITIES + @long_name = @uri = nil + if first.kind_of? String + super() + @name = first + @external_id = parent + elsif first.kind_of? DocType + super( parent ) + @name = first.name + @external_id = first.external_id + elsif first.kind_of? Array + super( parent ) + @name = first[0] + @external_id = first[1] + @long_name = first[2] + @uri = first[3] elsif first.kind_of? 
Source super( parent ) parser = Parsers::BaseParser.new( first ) @@ -64,150 +64,215 @@ module REXML end else super() - end - end + end + end - def node_type - :doctype - end + def node_type + :doctype + end - def attributes_of element - rv = [] - each do |child| - child.each do |key,val| - rv << Attribute.new(key,val) - end if child.kind_of? AttlistDecl and child.element_name == element - end - rv - end + def attributes_of element + rv = [] + each do |child| + child.each do |key,val| + rv << Attribute.new(key,val) + end if child.kind_of? AttlistDecl and child.element_name == element + end + rv + end - def attribute_of element, attribute - att_decl = find do |child| - child.kind_of? AttlistDecl and - child.element_name == element and - child.include? attribute - end - return nil unless att_decl - att_decl[attribute] - end + def attribute_of element, attribute + att_decl = find do |child| + child.kind_of? AttlistDecl and + child.element_name == element and + child.include? attribute + end + return nil unless att_decl + att_decl[attribute] + end - def clone - DocType.new self - end + def clone + DocType.new self + end - # output:: - # Where to write the string - # indent:: - # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. - # transitive:: - # If transitive is true and indent is >= 0, then the output will be - # pretty-printed in such a way that the added whitespace does not affect - # the absolute *value* of the document -- that is, it leaves the value - # and number of Text nodes in the document unchanged. - # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. 
- # - def write( output, indent=0, transitive=false, ie_hack=false ) - indent( output, indent ) - output << START - output << ' ' - output << @name - output << " #@external_id" if @external_id - output << " #@long_name" if @long_name - output << " #@uri" if @uri - unless @children.empty? - next_indent = indent + 1 - output << ' [' - child = nil # speed - @children.each { |child| - output << "\n" - child.write( output, next_indent ) - } - output << "\n" - #output << ' '*next_indent - output << "]" - end - output << STOP - end + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. + # + def write( output, indent=0, transitive=false, ie_hack=false ) + indent( output, indent ) + output << START + output << ' ' + output << @name + output << " #@external_id" if @external_id + output << " #@long_name" if @long_name + output << " #@uri" if @uri + unless @children.empty? 
+ next_indent = indent + 1 + output << ' [' + child = nil # speed + @children.each { |child| + output << "\n" + child.write( output, next_indent ) + } + #output << ' '*next_indent + output << "\n]" + end + output << STOP + end def context @parent.context end - def entity( name ) - @entities[name].unnormalized if @entities[name] - end + def entity( name ) + @entities[name].unnormalized if @entities[name] + end - def add child - super(child) - @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES - @entities[ child.name ] = child if child.kind_of? Entity - end - end + def add child + super(child) + @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES + @entities[ child.name ] = child if child.kind_of? Entity + end + + # This method retrieves the public identifier identifying the document's + # DTD. + # + # Method contributed by Henrik Martensson + def public + case @external_id + when "SYSTEM" + nil + when "PUBLIC" + strip_quotes(@long_name) + end + end + + # This method retrieves the system identifier identifying the document's DTD + # + # Method contributed by Henrik Martensson + def system + case @external_id + when "SYSTEM" + strip_quotes(@long_name) + when "PUBLIC" + @uri.kind_of?(String) ? strip_quotes(@uri) : nil + end + end + + # This method returns a list of notations that have been declared in the + # _internal_ DTD subset. Notations in the external DTD subset are not + # listed. + # + # Method contributed by Henrik Martensson + def notations + children().select {|node| node.kind_of?(REXML::NotationDecl)} + end + + # Retrieves a named notation. Only notations declared in the internal + # DTD subset can be retrieved. + # + # Method contributed by Henrik Martensson + def notation(name) + notations.find { |notation_decl| + notation_decl.name == name + } + end + + private + + # Method contributed by Henrik Martensson + def strip_quotes(quoted_string) + quoted_string =~ /^[\'\"].*[\´\"]$/ ? 
+ quoted_string[1, quoted_string.length-2] : + quoted_string + end + end - # We don't really handle any of these since we're not a validating - # parser, so we can be pretty dumb about them. All we need to be able - # to do is spew them back out on a write() + # We don't really handle any of these since we're not a validating + # parser, so we can be pretty dumb about them. All we need to be able + # to do is spew them back out on a write() - # This is an abstract class. You never use this directly; it serves as a - # parent class for the specific declarations. - class Declaration < Child - def initialize src - super() - @string = src - end + # This is an abstract class. You never use this directly; it serves as a + # parent class for the specific declarations. + class Declaration < Child + def initialize src + super() + @string = src + end - def to_s - @string+'>' - end + def to_s + @string+'>' + end - def write( output, indent ) - output << (' '*indent) if indent > 0 - output << to_s - end - end - - public - class ElementDecl < Declaration - def initialize( src ) - super - end - end + def write( output, indent ) + output << (' '*indent) if indent > 0 + output << to_s + end + end + + public + class ElementDecl < Declaration + def initialize( src ) + super + end + end - class ExternalEntity < Child - def initialize( src ) - super() - @entity = src - end - def to_s - @entity - end - def write( output, indent ) - output << @entity - output << "\n" - end - end + class ExternalEntity < Child + def initialize( src ) + super() + @entity = src + end + def to_s + @entity + end + def write( output, indent ) + output << @entity + end + end - class NotationDecl < Child - def initialize name, middle, rest - @name = name - @middle = middle - @rest = rest - end + class NotationDecl < Child + attr_accessor :public, :system + def initialize name, middle, pub, sys + super(nil) + @name = name + @middle = middle + @public = pub + @system = sys + end - def to_s - "" - end + def to_s + 
"" + end - def write( output, indent=-1 ) - output << (' '*indent) if indent > 0 - output << to_s - end - end + def write( output, indent=-1 ) + output << (' '*indent) if indent > 0 + output << to_s + end + + # This method retrieves the name of the notation. + # + # Method contributed by Henrik Martensson + def name + @name + end + end end diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 8755e04de1..619a844257 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -16,166 +16,178 @@ module REXML # Document has a single child that can be accessed by root(). # Note that if you want to have an XML declaration written for a document # you create, you must add one; REXML documents do not write a default - # declaration for you. See |DECLARATION| and |write|. - class Document < Element - # A convenient default XML declaration. If you want an XML declaration, - # the easiest way to add one is mydoc << Document::DECLARATION + # declaration for you. See |DECLARATION| and |write|. + class Document < Element + # A convenient default XML declaration. If you want an XML declaration, + # the easiest way to add one is mydoc << Document::DECLARATION # +DEPRECATED+ # Use: mydoc << XMLDecl.default - DECLARATION = XMLDecl.default + DECLARATION = XMLDecl.default - # Constructor - # @param source if supplied, must be a Document, String, or IO. - # Documents have their context and Element attributes cloned. - # Strings are expected to be valid XML documents. IOs are expected - # to be sources of valid XML documents. - # @param context if supplied, contains the context of the document; - # this should be a Hash. - # NOTE that I'm not sure what the context is for; I cloned it out of - # the Electric XML API (in which it also seems to do nothing), and it - # is now legacy. It may do something, someday... it may disappear. - def initialize( source = nil, context = {} ) - super() - @context = context - return if source.nil? - if source.kind_of? 
Document - @context = source.context - super source - else - build( source ) - end - end + # Constructor + # @param source if supplied, must be a Document, String, or IO. + # Documents have their context and Element attributes cloned. + # Strings are expected to be valid XML documents. IOs are expected + # to be sources of valid XML documents. + # @param context if supplied, contains the context of the document; + # this should be a Hash. + # NOTE that I'm not sure what the context is for; I cloned it out of + # the Electric XML API (in which it also seems to do nothing), and it + # is now legacy. It may do something, someday... it may disappear. + def initialize( source = nil, context = {} ) + super() + @context = context + return if source.nil? + if source.kind_of? Document + @context = source.context + super source + else + build( source ) + end + end def node_type :document end - # Should be obvious - def clone - Document.new self - end + # Should be obvious + def clone + Document.new self + end - # According to the XML spec, a root node has no expanded name - def expanded_name - '' - #d = doc_type - #d ? d.name : "UNDEFINED" - end + # According to the XML spec, a root node has no expanded name + def expanded_name + '' + #d = doc_type + #d ? d.name : "UNDEFINED" + end - alias :name :expanded_name + alias :name :expanded_name - # We override this, because XMLDecls and DocTypes must go at the start - # of the document - def add( child ) - if child.kind_of? XMLDecl - @children.unshift child - elsif child.kind_of? DocType - if @children[0].kind_of? XMLDecl - @children[1,0] = child - else - @children.unshift child + # We override this, because XMLDecls and DocTypes must go at the start + # of the document + def add( child ) + if child.kind_of? XMLDecl + @children.unshift child + elsif child.kind_of? DocType + # Find first Element or DocType node and insert the decl right + # before it. If there is no such node, just insert the child at the + # end. 
If there is a child and it is an DocType, then replace it. + insert_before_index = 0 + @children.find { |x| + insert_before_index += 1 + x.kind_of?(Element) || x.kind_of?(DocType) + } + if @children[ insert_before_index ] # Not null = not end of list + if @children[ insert_before_index ].kind_of DocType + @children[ insert_before_index ] = child + else + @children[ index_before_index-1, 0 ] = child + end + else # Insert at end of list + @children[insert_before_index] = child end - child.parent = self - else - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end - end - alias :<< :add + child.parent = self + else + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + end + alias :<< :add - def add_element(arg=nil, arg2=nil) - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end + def add_element(arg=nil, arg2=nil) + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end - # @return the root Element of the document, or nil if this document - # has no children. - def root + # @return the root Element of the document, or nil if this document + # has no children. + def root elements[1] #self #@children.find { |item| item.kind_of? Element } - end + end - # @return the DocType child of the document, if one exists, - # and nil otherwise. - def doctype - @children.find { |item| item.kind_of? DocType } - end + # @return the DocType child of the document, if one exists, + # and nil otherwise. + def doctype + @children.find { |item| item.kind_of? DocType } + end - # @return the XMLDecl of this document; if no XMLDecl has been - # set, the default declaration is returned. - def xml_decl - rv = @children[0] + # @return the XMLDecl of this document; if no XMLDecl has been + # set, the default declaration is returned. + def xml_decl + rv = @children[0] return rv if rv.kind_of? 
XMLDecl rv = @children.unshift(XMLDecl.default)[0] - end + end - # @return the XMLDecl version of this document as a String. - # If no XMLDecl has been set, returns the default version. - def version - xml_decl().version - end + # @return the XMLDecl version of this document as a String. + # If no XMLDecl has been set, returns the default version. + def version + xml_decl().version + end - # @return the XMLDecl encoding of this document as a String. - # If no XMLDecl has been set, returns the default encoding. - def encoding - xml_decl().encoding - end + # @return the XMLDecl encoding of this document as a String. + # If no XMLDecl has been set, returns the default encoding. + def encoding + xml_decl().encoding + end - # @return the XMLDecl standalone value of this document as a String. - # If no XMLDecl has been set, returns the default setting. - def stand_alone? - xml_decl().stand_alone? - end + # @return the XMLDecl standalone value of this document as a String. + # If no XMLDecl has been set, returns the default setting. + def stand_alone? + xml_decl().stand_alone? + end - # Write the XML tree out, optionally with indent. This writes out the - # entire XML document, including XML declarations, doctype declarations, - # and processing instructions (if any are given). - # A controversial point is whether Document should always write the XML - # declaration () whether or not one is given by the - # user (or source document). REXML does not write one if one was not - # specified, because it adds unneccessary bandwidth to applications such - # as XML-RPC. - # - # - # output:: - # output an object which supports '<< string'; this is where the - # document will be written. - # indent:: - # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. 
Defaults to -1 - # transitive:: - # If transitive is true and indent is >= 0, then the output will be - # pretty-printed in such a way that the added whitespace does not affect - # the absolute *value* of the document -- that is, it leaves the value - # and number of Text nodes in the document unchanged. - # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. Defaults to false - def write( output=$stdout, indent_level=-1, transitive=false, ie_hack=false ) - output = Output.new( output, xml_decl.encoding ) if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) - @children.each { |node| - indent( output, indent_level ) if node.node_type == :element - if node.write( output, indent_level, transitive, ie_hack ) - output << "\n" unless indent_level<0 or node == @children[-1] + # Write the XML tree out, optionally with indent. This writes out the + # entire XML document, including XML declarations, doctype declarations, + # and processing instructions (if any are given). + # A controversial point is whether Document should always write the XML + # declaration () whether or not one is given by the + # user (or source document). REXML does not write one if one was not + # specified, because it adds unneccessary bandwidth to applications such + # as XML-RPC. + # + # + # output:: + # output an object which supports '<< string'; this is where the + # document will be written. + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. 
Defaults to -1 + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. Defaults to false + def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) + output = Output.new( output, xml_decl.encoding ) if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) + @children.each { |node| + indent( output, indent ) if node.node_type == :element + if node.write( output, indent, transitive, ie_hack ) + output << "\n" unless indent<0 or node == @children[-1] end - } - end + } + end - - def Document::parse_stream( source, listener ) - Parsers::StreamParser.new( source, listener ).parse - end + + def Document::parse_stream( source, listener ) + Parsers::StreamParser.new( source, listener ).parse + end - private - def build( source ) + private + def build( source ) Parsers::TreeParser.new( source, self ).parse - end - end + end + end end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 7f578ecb3d..80463d95b7 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -36,8 +36,6 @@ module REXML # If an Element, the object will be shallowly cloned; name, # attributes, and namespaces will be copied. Children will +not+ be # copied. - # If a Source, the source will be scanned and parsed for an Element, - # and all child elements will be recursively parsed as well. # parent:: # if supplied, must be a Parent, and will be used as # the parent of this object. 
@@ -223,7 +221,7 @@ module REXML # b.namespace("y") # -> '2' def namespace(prefix=nil) if prefix.nil? - prefix = self.prefix() + prefix = prefix() end if prefix == '' prefix = "xmlns" @@ -715,7 +713,7 @@ module REXML private def __to_xpath_helper node - rv = node.expanded_name + rv = node.expanded_name.clone if node.parent results = node.parent.find_all {|n| n.kind_of?(REXML::Element) and n.expanded_name == node.expanded_name @@ -1226,5 +1224,20 @@ module REXML rv.each{ |attr| attr.remove } return rv end + + # The +get_attribute_ns+ method retrieves a method by its namespace + # and name. Thus it is possible to reliably identify an attribute + # even if an XML processor has changed the prefix. + # + # Method contributed by Henrik Martensson + def get_attribute_ns(namespace, name) + each_attribute() { |attribute| + if name == attribute.name && + namespace == attribute.namespace() + return attribute + end + } + nil + end end end diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index edd3e80dfe..644957439e 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -1,58 +1,64 @@ # -*- mode: ruby; ruby-indent-level: 2; indent-tabs-mode: t; tab-width: 2 -*- vim: sw=2 ts=2 module REXML - module Encoding - @encoding_methods = {} - def self.register(enc, &block) - @encoding_methods[enc] = block - end - def self.apply(obj, enc) - @encoding_methods[enc][obj] - end - def self.encoding_method(enc) - @encoding_methods[enc] - end + module Encoding + @encoding_methods = {} + def self.register(enc, &block) + @encoding_methods[enc] = block + end + def self.apply(obj, enc) + @encoding_methods[enc][obj] + end + def self.encoding_method(enc) + @encoding_methods[enc] + end - # Native, default format is UTF-8, so it is declared here rather than in - # an encodings/ definition. - UTF_8 = 'UTF-8' - UTF_16 = 'UTF-16' - UNILE = 'UNILE' + # Native, default format is UTF-8, so it is declared here rather than in + # an encodings/ definition. 
+ UTF_8 = 'UTF-8' + UTF_16 = 'UTF-16' + UNILE = 'UNILE' - # ID ---> Encoding name - attr_reader :encoding - def encoding=( enc ) - old_verbosity = $VERBOSE - begin - $VERBOSE = false - return if defined? @encoding and enc == @encoding - if enc - raise ArgumentError, "Bad encoding name #{enc}" unless /\A[\w-]+\z/n =~ enc - @encoding = enc.upcase.untaint - else - @encoding = UTF_8 - end - err = nil - [@encoding, "ICONV"].each do |enc| - begin - require File.join("rexml", "encodings", "#{enc}.rb") - return Encoding.apply(self, enc) - rescue LoadError, Exception => err - end - end - puts err.message - raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." - ensure - $VERBOSE = old_verbosity - end - end + # ID ---> Encoding name + attr_reader :encoding + def encoding=( enc ) + old_verbosity = $VERBOSE + begin + $VERBOSE = false + return if defined? @encoding and enc == @encoding + if enc and enc != UTF_8 + @encoding = enc.upcase + begin + require 'rexml/encodings/ICONV.rb' + Encoding.apply(self, "ICONV") + rescue LoadError, Exception => err + raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ + @encoding.untaint + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + begin + require enc_file + Encoding.apply(self, @encoding) + rescue LoadError + puts $!.message + raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." + end + end + else + @encoding = UTF_8 + require 'rexml/encodings/UTF-8.rb' + Encoding.apply(self, @encoding) + end + ensure + $VERBOSE = old_verbosity + end + end - def check_encoding str - # We have to recognize UTF-16, LSB UTF-16, and UTF-8 - return UTF_16 if str[0] == 254 && str[1] == 255 - return UNILE if str[0] == 255 && str[1] == 254 - str =~ /^\s*This API is experimental, and subject to change. - # parser = PullParser.new( "texttxet" ) - # while parser.has_next? - # res = parser.next - # puts res[1]['att'] if res.start_tag? 
and res[0] == 'b' - # end - # See the PullEvent class for information on the content of the results. - # The data is identical to the arguments passed for the various events to - # the StreamListener API. - # - # Notice that: - # parser = PullParser.new( "BAD DOCUMENT" ) - # while parser.has_next? - # res = parser.next - # raise res[1] if res.error? - # end - # - # Nat Price gave me some good ideas for the API. - class BaseParser - NCNAME_STR= '[\w:][\-\w\d.]*' - NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. 
+ class BaseParser + NCNAME_STR= '[\w:][\-\w\d.]*' + NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" - NAMECHAR = '[\-\w\d\.:]' - NAME = "([\\w:]#{NAMECHAR}*)" - NMTOKEN = "(?:#{NAMECHAR})+" - NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" - REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" - REFERENCE_RE = /#{REFERENCE}/ + NAMECHAR = '[\-\w\d\.:]' + NAME = "([\\w:]#{NAMECHAR}*)" + NMTOKEN = "(?:#{NAMECHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + REFERENCE_RE = /#{REFERENCE}/ - DOCTYPE_START = /\A\s*)/um - ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um - COMMENT_START = /\A/um - CDATA_START = /\A/um - CDATA_PATTERN = //um - XMLDECL_START = /\A<\?xml\s/u; - XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>*/um - INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um - TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um - CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + DOCTYPE_START = /\A\s*)/um + ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + COMMENT_START = /\A/um + CDATA_START = /\A/um + CDATA_PATTERN = //um + XMLDECL_START = /\A<\?xml\s/u; + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um + INSTRUCTION_START = /\A<\?/u + INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um + TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um + CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um - VERSION = /\bversion\s*=\s*["'](.*?)['"]/um - ENCODING = /\bencoding=["'](.*?)['"]/um - STANDALONE = /\bstandalone=["'](.*?)['"]/um + VERSION = /\bversion\s*=\s*["'](.*?)['"]/um + ENCODING = /\bencoding=["'](.*?)['"]/um + STANDALONE = /\bstandalone=["'](.*?)['"]/um - ENTITY_START = /^\s*/um - SYSTEMENTITY = /^\s*(%.*?;)\s*$/um - ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" - NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" - ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" - ATTTYPE = 
"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" - ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" - DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" - ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" - ATTDEF_RE = /#{ATTDEF}/ - ATTLISTDECL_START = /^\s*/um - NOTATIONDECL_START = /^\s*/um - SYSTEM = /^\s*/um + ENTITY_START = /^\s*/um + SYSTEMENTITY = /^\s*(%.*?;)\s*$/um + ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" + NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" + ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" + ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" + ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" + DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" + ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" + ATTDEF_RE = /#{ATTDEF}/ + ATTLISTDECL_START = /^\s*/um + NOTATIONDECL_START = /^\s*/um + SYSTEM = /^\s*/um - TEXT_PATTERN = /\A([^<]*)/um + TEXT_PATTERN = /\A([^<]*)/um - # Entity constants - PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" - SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} - PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} - EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" - NDATADECL = "\\s+NDATA\\s+#{NAME}" - PEREFERENCE = "%#{NAME};" - ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} - PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" - ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" - PEDECL = "" - GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + # Entity constants + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = 
"(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "" + GEDECL = "" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - EREFERENCE = /&(?!#{NAME};)/ + EREFERENCE = /&(?!#{NAME};)/ - DEFAULT_ENTITIES = { - 'gt' => [/>/, '>', '>', />/], - 'lt' => [/</, '<', '<', / [/"/, '"', '"', /"/], - "apos" => [/'/, "'", "'", /'/] - } + DEFAULT_ENTITIES = { + 'gt' => [/>/, '>', '>', />/], + 'lt' => [/</, '<', '<', / [/"/, '"', '"', /"/], + "apos" => [/'/, "'", "'", /'/] + } - def initialize( source ) - self.stream = source - end + def initialize( source ) + self.stream = source + end def add_listener( listener ) if !defined?(@listeners) or !@listeners @@ -119,315 +119,320 @@ module REXML attr_reader :source - def stream=( source ) - if source.kind_of? String - @source = Source.new(source) - elsif source.kind_of? IO - @source = IOSource.new(source) - elsif source.kind_of? Source - @source = source - elsif defined? StringIO and source.kind_of? StringIO - @source = IOSource.new(source) - else - raise "#{source.class} is not a valid input stream. It must be \n"+ - "either a String, IO, StringIO or Source." - end - @closed = nil - @document_status = nil - @tags = [] - @stack = [] - @entities = [] - end + def stream=( source ) + @source = SourceFactory.create_from( source ) + @closed = nil + @document_status = nil + @tags = [] + @stack = [] + @entities = [] + end - # Returns true if there are no more events - def empty? - #puts "@source.empty? = #{@source.empty?}" - #puts "@stack.empty? = #{@stack.empty?}" + def position + if @source.respond_to? 
:position + @source.position + else + # FIXME + 0 + end + end + + # Returns true if there are no more events + def empty? + #STDERR.puts "@source.empty? = #{@source.empty?}" + #STDERR.puts "@stack.empty? = #{@stack.empty?}" return (@source.empty? and @stack.empty?) - end + end - # Returns true if there are more events. Synonymous with !empty? - def has_next? + # Returns true if there are more events. Synonymous with !empty? + def has_next? return !(@source.empty? and @stack.empty?) - end + end - # Push an event back on the head of the stream. This method - # has (theoretically) infinite depth. - def unshift token - @stack.unshift(token) - end + # Push an event back on the head of the stream. This method + # has (theoretically) infinite depth. + def unshift token + @stack.unshift(token) + end - # Peek at the +depth+ event in the stack. The first element on the stack - # is at depth 0. If +depth+ is -1, will parse to the end of the input - # stream and return the last event, which is always :end_document. - # Be aware that this causes the stream to be parsed up to the +depth+ - # event, so you can effectively pre-parse the entire document (pull the - # entire thing into memory) using this method. - def peek depth=0 - raise %Q[Illegal argument "#{depth}"] if depth < -1 - temp = [] - if depth == -1 - temp.push(pull()) until empty? - else - while @stack.size+temp.size < depth+1 - temp.push(pull()) - end - end - @stack += temp if temp.size > 0 - @stack[depth] - end + # Peek at the +depth+ event in the stack. The first element on the stack + # is at depth 0. If +depth+ is -1, will parse to the end of the input + # stream and return the last event, which is always :end_document. + # Be aware that this causes the stream to be parsed up to the +depth+ + # event, so you can effectively pre-parse the entire document (pull the + # entire thing into memory) using this method. 
+ def peek depth=0 + raise %Q[Illegal argument "#{depth}"] if depth < -1 + temp = [] + if depth == -1 + temp.push(pull()) until empty? + else + while @stack.size+temp.size < depth+1 + temp.push(pull()) + end + end + @stack += temp if temp.size > 0 + @stack[depth] + end - # Returns the next event. This is a +PullEvent+ object. - def pull - if @closed - x, @closed = @closed, nil - return [ :end_element, x ] - end - return [ :end_document ] if empty? - return @stack.shift if @stack.size > 0 - @source.read if @source.buffer.size<2 - if @document_status == nil - @source.consume( /^\s*/um ) - word = @source.match( /(<[^>]*)>/um ) - word = word[1] unless word.nil? - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - @source.encoding = encoding - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone] - when INSTRUCTION_START - return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] - when DOCTYPE_START - md = @source.match( DOCTYPE_PATTERN, true ) - identity = md[1] - close = md[2] - identity =~ IDENTITY - name = $1 - raise REXML::ParseException("DOCTYPE is missing a name") if name.nil? - pub_sys = $2.nil? ? nil : $2.strip - long_name = $3.nil? ? nil : $3.strip - uri = $4.nil? ? 
nil : $4.strip - args = [ :start_doctype, name, pub_sys, long_name, uri ] - if close == ">" - @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/^\s*/um, true) - @stack << [ :end_doctype ] - else - @document_status = :in_doctype - end - return args - else - @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/\s*/um, true) - end - end - if @document_status == :in_doctype - md = @source.match(/\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] + # Returns the next event. This is a +PullEvent+ object. + def pull + if @closed + x, @closed = @closed, nil + return [ :end_element, x ] + end + return [ :end_document ] if empty? + return @stack.shift if @stack.size > 0 + @source.read if @source.buffer.size<2 + #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + if @document_status == nil + #@source.consume( /^\s*/um ) + word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = word[1] unless word.nil? + #STDERR.puts "WORD = #{word.inspect}" + case word + when COMMENT_START + return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + when XMLDECL_START + #STDERR.puts "XMLDECL" + results = @source.match( XMLDECL_PATTERN, true )[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + @source.encoding = encoding + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + when INSTRUCTION_START + return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + when DOCTYPE_START + md = @source.match( DOCTYPE_PATTERN, true ) + identity = md[1] + close = md[2] + identity =~ IDENTITY + name = $1 + raise REXML::ParseException("DOCTYPE is missing a name") if name.nil? + pub_sys = $2.nil? ? 
nil : $2.strip + long_name = $3.nil? ? nil : $3.strip + uri = $4.nil? ? nil : $4.strip + args = [ :start_doctype, name, pub_sys, long_name, uri ] + if close == ">" + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/^\s*/um, true) + @stack << [ :end_doctype ] + else + @document_status = :in_doctype + end + return args + when /^\s+/ + else + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/\s*/um, true) + end + end + if @document_status == :in_doctype + md = @source.match(/\s*(.*?>)/um) + case md[1] + when SYSTEMENTITY + match = @source.match( SYSTEMENTITY, true )[1] + return [ :externalentity, match ] - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] + when ELEMENTDECL_START + return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - # match is [ :entity, name, PUBLIC, pubid, href ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? 
- element = md[1] - contents = md[0] + when ENTITY_START + match = @source.match( ENTITYDECL, true ).to_a.compact + match[0] = :entitydecl + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + # match is [ :entity, name, PUBLIC, pubid, href ] + else + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] + end + match << '%' if ref + return match + when ATTLISTDECL_START + md = @source.match( ATTLISTDECL_PATTERN, true ) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! 
- val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - end - end - return [ :attlistdecl, element, pairs, contents ] - when NOTATIONDECL_START - md = nil - if @source.match( PUBLIC ) - md = @source.match( PUBLIC, true ) - elsif @source.match( SYSTEM ) - md = @source.match( SYSTEM, true ) - else - raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) - end - return [ :notationdecl, md[1], md[2], md[3] ] - when CDATA_END - @document_status = :after_doctype - @source.match( CDATA_END, true ) - return [ :end_doctype ] - end - end - begin - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ - last_tag = @tags.pop - #md = @source.match_to_consume( '>', CLOSE_MATCH) - md = @source.match( CLOSE_MATCH, true ) - raise REXML::ParseException.new( "Missing end tag for "+ + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + end + end + return [ :attlistdecl, element, pairs, contents ] + when NOTATIONDECL_START + md = nil + if @source.match( PUBLIC ) + md = @source.match( PUBLIC, true ) + vals = [md[1],md[2],md[4],md[6]] + elsif @source.match( SYSTEM ) + md = @source.match( SYSTEM, true ) + vals = [md[1],md[2],nil,md[4]] + else + raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) + end + return [ :notationdecl, *vals ] + when CDATA_END + @document_status = :after_doctype + @source.match( CDATA_END, true ) + return [ :end_doctype ] + end + end + begin + if @source.buffer[0] == ?< + if @source.buffer[1] == ?/ + last_tag = @tags.pop + #md = @source.match_to_consume( '>', CLOSE_MATCH) + md = @source.match( CLOSE_MATCH, true ) + raise REXML::ParseException.new( "Missing end tag for "+ "'#{last_tag}' (got \"#{md[1]}\")", @source) unless last_tag == md[1] - return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! 
- md = @source.match(/\A(\s*[^>]*>)/um) - #puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) - return [ :comment, md[1] ] if md - else - md = @source.match( CDATA_PATTERN, true ) - return [ :cdata, md[1] ] if md - end - raise REXML::ParseException.new( "Declarations can only occur "+ - "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? - md = @source.match( INSTRUCTION_PATTERN, true ) - return [ :processing_instruction, md[1], md[2] ] if md - raise REXML::ParseException.new( "Bad instruction declaration", - @source) - else - # Get the next tag - md = @source.match(TAG_MATCH, true) - raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md - attrs = [] - if md[2].size > 0 - attrs = md[2].scan( ATTRIBUTE_PATTERN ) - raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 - end - - if md[4] - @closed = md[1] - else - @tags.push( md[1] ) - end - attributes = {} - attrs.each { |a,b,c| attributes[a] = c } - return [ :start_element, md[1], attributes ] - end - else - md = @source.match( TEXT_PATTERN, true ) + return [ :end_element, last_tag ] + elsif @source.buffer[1] == ?! + md = @source.match(/\A(\s*[^>]*>)/um) + #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" + raise REXML::ParseException.new("Malformed node", @source) unless md + if md[0][2] == ?- + md = @source.match( COMMENT_PATTERN, true ) + return [ :comment, md[1] ] if md + else + md = @source.match( CDATA_PATTERN, true ) + return [ :cdata, md[1] ] if md + end + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) + elsif @source.buffer[1] == ?? 
+ md = @source.match( INSTRUCTION_PATTERN, true ) + return [ :processing_instruction, md[1], md[2] ] if md + raise REXML::ParseException.new( "Bad instruction declaration", + @source) + else + # Get the next tag + md = @source.match(TAG_MATCH, true) + raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md + attrs = [] + if md[2].size > 0 + attrs = md[2].scan( ATTRIBUTE_PATTERN ) + raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 + end + + if md[4] + @closed = md[1] + else + @tags.push( md[1] ) + end + attributes = {} + attrs.each { |a,b,c| attributes[a] = c } + return [ :start_element, md[1], attributes ] + end + else + md = @source.match( TEXT_PATTERN, true ) if md[0].length == 0 - #puts "EMPTY = #{empty?}" - #puts "BUFFER = \"#{@source.buffer}\"" + puts "EMPTY = #{empty?}" + puts "BUFFER = \"#{@source.buffer}\"" @source.match( /(\s+)/, true ) end + #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] - end - rescue REXML::ParseException - raise - rescue Exception, NameError => error - raise REXML::ParseException.new( "Exception parsing", - @source, self, (error ? error : $!) ) - end - return [ :dummy ] - end + # unnormalized = Text::unnormalize( md[1], self ) + # return PullEvent.new( :text, md[1], unnormalized ) + return [ :text, md[1] ] + end + rescue REXML::ParseException + raise + rescue Exception, NameError => error + raise REXML::ParseException.new( "Exception parsing", + @source, self, (error ? error : $!) 
) + end + return [ :dummy ] + end - def entity( reference, entities ) - value = nil - value = entities[ reference ] if entities - if not value - value = DEFAULT_ENTITIES[ reference ] - value = value[2] if value - end - unnormalize( value, entities ) if value - end + def entity( reference, entities ) + value = nil + value = entities[ reference ] if entities + if not value + value = DEFAULT_ENTITIES[ reference ] + value = value[2] if value + end + unnormalize( value, entities ) if value + end - # Escapes all possible entities - def normalize( input, entities=nil, entity_filter=nil ) - copy = input.clone - # Doing it like this rather than in a loop improves the speed - copy.gsub!( EREFERENCE, '&' ) - entities.each do |key, value| - copy.gsub!( value, "&#{key};" ) unless entity_filter and - entity_filter.include?(entity) - end if entities - copy.gsub!( EREFERENCE, '&' ) - DEFAULT_ENTITIES.each do |key, value| - copy.gsub!( value[3], value[1] ) - end - copy - end + # Escapes all possible entities + def normalize( input, entities=nil, entity_filter=nil ) + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( EREFERENCE, '&' ) + entities.each do |key, value| + copy.gsub!( value, "&#{key};" ) unless entity_filter and + entity_filter.include?(entity) + end if entities + copy.gsub!( EREFERENCE, '&' ) + DEFAULT_ENTITIES.each do |key, value| + copy.gsub!( value[3], value[1] ) + end + copy + end - # Unescapes all possible entities - def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) - matches = rv.scan( REFERENCE_RE ) - return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - matches.collect!{|x|x[0]}.compact! 
- if matches.size > 0 - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = entity( entity_reference, entities ) - if entity_value - re = /&#{entity_reference};/ - rv.gsub!( re, entity_value ) - end - end - end - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er - end - end - rv.gsub!( /&/, '&' ) - end - rv - end - end - end + # Unescapes all possible entities + def unnormalize( string, entities=nil, filter=nil ) + rv = string.clone + rv.gsub!( /\r\n?/, "\n" ) + matches = rv.scan( REFERENCE_RE ) + return rv if matches.size == 0 + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + matches.collect!{|x|x[0]}.compact! + if matches.size > 0 + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = entity( entity_reference, entities ) + if entity_value + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + end + end + end + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er + end + end + rv.gsub!( /&/, '&' ) + end + rv + end + end + end end =begin diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index 0a328ea8fc..36dc7160c3 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -1,96 +1,100 @@ +require 'forwardable' + require 'rexml/parseexception' require 'rexml/parsers/baseparser' require 'rexml/xmltokens' module REXML - module Parsers - # = Using the Pull Parser - # This API is experimental, and subject to change. - # parser = PullParser.new( "texttxet" ) - # while parser.has_next? - # res = parser.next - # puts res[1]['att'] if res.start_tag? 
and res[0] == 'b' - # end - # See the PullEvent class for information on the content of the results. - # The data is identical to the arguments passed for the various events to - # the StreamListener API. - # - # Notice that: - # parser = PullParser.new( "BAD DOCUMENT" ) - # while parser.has_next? - # res = parser.next - # raise res[1] if res.error? - # end - # - # Nat Price gave me some good ideas for the API. - class PullParser - include XMLTokens + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class PullParser + include XMLTokens + extend Forwardable - def initialize stream - @entities = {} + def_delegators( :@parser, :has_next? ) + def_delegators( :@parser, :entity ) + def_delegators( :@parser, :empty? ) + def_delegators( :@parser, :source ) + + def initialize stream + @entities = {} @listeners = nil @parser = BaseParser.new( stream ) - end + @my_stack = [] + end def add_listener( listener ) @listeners = [] unless @listeners @listeners << listener end - def each - while has_next? - yield self.pull - end - end - - def peek depth=0 - PullEvent.new(@parser.peek(depth)) - end - - def has_next? - @parser.has_next? + def each + while has_next? 
+ yield self.pull + end end - def pull - event = @parser.pull - case event[0] - when :entitydecl - @entities[ event[1] ] = - event[2] unless event[2] =~ /PUBLIC|SYSTEM/ - when :text - unnormalized = @parser.unnormalize( event[1], @entities ) - event << unnormalized - end - PullEvent.new( event ) - end + def peek depth=0 + if @my_stack.length <= depth + (depth - @my_stack.length + 1).times { + e = PullEvent.new(@parser.pull) + @my_stack.push(e) + } + end + @my_stack[depth] + end + + def pull + return @my_stack.shift if @my_stack.length > 0 + + event = @parser.pull + case event[0] + when :entitydecl + @entities[ event[1] ] = + event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + when :text + unnormalized = @parser.unnormalize( event[1], @entities ) + event << unnormalized + end + PullEvent.new( event ) + end def unshift token - @parser.unshift token + @my_stack.unshift token end + end - def entity reference - @parser.entity( reference ) + # A parsing event. The contents of the event are accessed as an +Array?, + # and the type is given either by the ...? methods, or by accessing the + # +type+ accessor. The contents of this object vary from event to event, + # but are identical to the arguments passed to +StreamListener+s for each + # event. + class PullEvent + # The type of this event. Will be one of :tag_start, :tag_end, :text, + # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, + # :notationdecl, :entity, :cdata, :xmldecl, or :error. + def initialize(arg) + @contents = arg end - def empty? - @parser.empty? - end - - end - - # A parsing event. The contents of the event are accessed as an +Array?, - # and the type is given either by the ...? methods, or by accessing the - # +type+ accessor. The contents of this object vary from event to event, - # but are identical to the arguments passed to +StreamListener+s for each - # event. - class PullEvent - # The type of this event. 
Will be one of :tag_start, :tag_end, :text, - # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, - # :notationdecl, :entity, :cdata, :xmldecl, or :error. - def initialize(arg) - @contents = arg - end - def []( start, endd=nil) if start.kind_of? Range @contents.slice( start.begin+1 .. start.end ) @@ -103,90 +107,90 @@ module REXML else raise "Illegal argument #{start.inspect} (#{start.class})" end - end + end - def event_type - @contents[0] - end + def event_type + @contents[0] + end - # Content: [ String tag_name, Hash attributes ] - def start_element? - @contents[0] == :start_element - end + # Content: [ String tag_name, Hash attributes ] + def start_element? + @contents[0] == :start_element + end - # Content: [ String tag_name ] - def end_element? - @contents[0] == :end_element - end + # Content: [ String tag_name ] + def end_element? + @contents[0] == :end_element + end - # Content: [ String raw_text, String unnormalized_text ] - def text? - @contents[0] == :text - end + # Content: [ String raw_text, String unnormalized_text ] + def text? + @contents[0] == :text + end - # Content: [ String text ] - def instruction? - @contents[0] == :processing_instruction - end + # Content: [ String text ] + def instruction? + @contents[0] == :processing_instruction + end - # Content: [ String text ] - def comment? - @contents[0] == :comment - end + # Content: [ String text ] + def comment? + @contents[0] == :comment + end - # Content: [ String name, String pub_sys, String long_name, String uri ] - def doctype? - @contents[0] == :start_doctype - end + # Content: [ String name, String pub_sys, String long_name, String uri ] + def doctype? + @contents[0] == :start_doctype + end - # Content: [ String text ] - def attlistdecl? - @contents[0] == :attlistdecl - end + # Content: [ String text ] + def attlistdecl? + @contents[0] == :attlistdecl + end - # Content: [ String text ] - def elementdecl? 
- @contents[0] == :elementdecl - end + # Content: [ String text ] + def elementdecl? + @contents[0] == :elementdecl + end - # Due to the wonders of DTDs, an entity declaration can be just about - # anything. There's no way to normalize it; you'll have to interpret the - # content yourself. However, the following is true: - # - # * If the entity declaration is an internal entity: - # [ String name, String value ] - # Content: [ String text ] - def entitydecl? - @contents[0] == :entitydecl - end + # Due to the wonders of DTDs, an entity declaration can be just about + # anything. There's no way to normalize it; you'll have to interpret the + # content yourself. However, the following is true: + # + # * If the entity declaration is an internal entity: + # [ String name, String value ] + # Content: [ String text ] + def entitydecl? + @contents[0] == :entitydecl + end - # Content: [ String text ] - def notationdecl? - @contents[0] == :notationdecl - end + # Content: [ String text ] + def notationdecl? + @contents[0] == :notationdecl + end - # Content: [ String text ] - def entity? - @contents[0] == :entity - end + # Content: [ String text ] + def entity? + @contents[0] == :entity + end - # Content: [ String text ] - def cdata? - @contents[0] == :cdata - end + # Content: [ String text ] + def cdata? + @contents[0] == :cdata + end - # Content: [ String version, String encoding, String standalone ] - def xmldecl? - @contents[0] == :xmldecl - end + # Content: [ String version, String encoding, String standalone ] + def xmldecl? + @contents[0] == :xmldecl + end - def error? - @contents[0] == :error - end + def error? 
+ @contents[0] == :error + end - def inspect + def inspect @contents[0].to_s + ": " + @contents[1..-1].inspect - end - end - end + end + end + end end diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index d5ee1bcfcd..61a216cec1 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -1,9 +1,11 @@ require 'rexml/parsers/baseparser' require 'rexml/parseexception' require 'rexml/namespace' +require 'rexml/text' module REXML module Parsers + # SAX2Parser class SAX2Parser def initialize source @parser = BaseParser.new(source) @@ -36,6 +38,10 @@ module REXML # :start_prefix_mapping, :end_prefix_mapping, :characters, # :processing_instruction, :doctype, :attlistdecl, :elementdecl, # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment + # + # There is an additional symbol that can be listened for: :progress. + # This will be called for every event generated, passing in the current + # stream position. # # Array contains regular expressions or strings which will be matched # against fully qualified element names. 
@@ -161,6 +167,7 @@ module REXML :elementdecl, :cdata, :notationdecl, :xmldecl handle( *event ) end + handle( :progress, @parser.position ) end end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 996d613e15..256d0f611c 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -1,42 +1,46 @@ module REXML - module Parsers - class StreamParser - def initialize source, listener - @listener = listener - @parser = BaseParser.new( source ) - end - + module Parsers + class StreamParser + def initialize source, listener + @listener = listener + @parser = BaseParser.new( source ) + end + def add_listener( listener ) @parser.add_listener( listener ) end - - def parse - # entity string - while true - event = @parser.pull - case event[0] - when :end_document - return - when :start_element - attrs = event[2].each do |n, v| - event[2][n] = @parser.unnormalize( v ) - end - @listener.tag_start( event[1], attrs ) - when :end_element - @listener.tag_end( event[1] ) - when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) - when :processing_instruction - @listener.instruction( *event[1,2] ) + + def parse + # entity string + while true + event = @parser.pull + case event[0] + when :end_document + return + when :start_element + attrs = event[2].each do |n, v| + event[2][n] = @parser.unnormalize( v ) + end + @listener.tag_start( event[1], attrs ) + when :end_element + @listener.tag_end( event[1] ) + when :text + normalized = @parser.unnormalize( event[1] ) + @listener.text( normalized ) + when :processing_instruction + @listener.instruction( *event[1,2] ) when :start_doctype @listener.doctype( *event[1..-1] ) - when :comment, :attlistdecl, :notationdecl, :elementdecl, - :entitydecl, :cdata, :xmldecl, :attlistdecl - @listener.send( event[0].to_s, *event[1..-1] ) - end - end - end - end - end + when :end_doctype + # FIXME: remove this condition for milestone:3.2 + 
@listener.doctype_end if @listener.respond_to? :doctype_end + when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl + @listener.send( event[0].to_s, *event[1..-1] ) + when :entitydecl, :notationdecl + @listener.send( event[0].to_s, event[1..-1] ) + end + end + end + end + end end diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 57d11f7e23..500a53f426 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -19,8 +19,12 @@ module REXML begin while true event = @parser.pull + #STDERR.puts "TREEPARSER GOT #{event.inspect}" case event[0] when :end_document + unless tag_stack.empty? + raise ParseException.new("No close tag for #{tag_stack.inspect}") + end return when :start_element tag_stack.push(event[1]) @@ -35,10 +39,10 @@ module REXML @build_context[-1] << event[1] else @build_context.add( - Text.new( event[1], @build_context.whitespace, nil, true ) + Text.new(event[1], @build_context.whitespace, nil, true) ) unless ( - event[1].strip.size==0 and - @build_context.ignore_whitespace_nodes + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 ) end end diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 00fd50ad02..ca154443b5 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -10,8 +10,8 @@ # # Main page:: http://www.germane-software.com/software/rexml # Author:: Sean Russell -# Version:: 3.1.3 -# Date:: +2005/139 +# Version:: 3.1.4 +# Date:: 2006/104 # # This API documentation can be downloaded from the REXML home page, or can # be accessed online[http://www.germane-software.com/software/rexml_doc] @@ -20,7 +20,10 @@ # or can be accessed # online[http://www.germane-software.com/software/rexml/docs/tutorial.html] module REXML - Copyright = "Copyright © 2001-2005 Sean Russell " - Date = "+2005/139" - Version = "3.1.3" + COPYRIGHT = "Copyright © 2001-2006 Sean Russell " + DATE = "2006/104" + VERSION = "3.1.4" + + Copyright = COPYRIGHT + Version = VERSION end diff --git 
a/lib/rexml/sax2listener.rb b/lib/rexml/sax2listener.rb index 40a77ed464..9a992917e6 100644 --- a/lib/rexml/sax2listener.rb +++ b/lib/rexml/sax2listener.rb @@ -84,11 +84,14 @@ module REXML # @p version the version attribute value. EG, "1.0" # @p encoding the encoding attribute value, or nil. EG, "utf" # @p standalone the standalone attribute value, or nil. EG, nil + # @p spaced the declaration is followed by a line break def xmldecl version, encoding, standalone end # Called when a comment is encountered. # @p comment The content of the comment def comment comment end + def progress position + end end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 7251666160..cacab221db 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -7,12 +7,19 @@ module REXML # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given def SourceFactory::create_from arg#, slurp=true - if arg.kind_of? String - source = Source.new(arg) - elsif arg.kind_of? IO - source = IOSource.new(arg) - end - source + if arg.kind_of? String + Source.new(arg) + elsif arg.respond_to? :read and + arg.respond_to? :readline and + arg.respond_to? :nil? and + arg.respond_to? :eof? + IOSource.new(arg) + elsif arg.kind_of? Source + arg + else + raise "#{source.class} is not a valid input stream. It must walk \n"+ + "like either a String, IO, or Source." + end end end @@ -98,6 +105,10 @@ module REXML @buffer == "" end + def position + @orig.index( @buffer ) + end + # @return the current line in the source def current_line lines = @orig.split @@ -194,6 +205,10 @@ module REXML super and ( @source.nil? || @source.eof? ) end + def position + @er_source.stat.pipe? ? 
0 : @er_source.pos + end + # @return the current line in the source def current_line begin diff --git a/lib/rexml/streamlistener.rb b/lib/rexml/streamlistener.rb index 3c3c5e3684..6f401125b5 100644 --- a/lib/rexml/streamlistener.rb +++ b/lib/rexml/streamlistener.rb @@ -39,6 +39,9 @@ module REXML # @p uri the uri of the doctype, or nil. EG, "bar" def doctype name, pub_sys, long_name, uri end + # Called when the doctype is done + def doctype_end + end # If a doctype includes an ATTLIST declaration, it will cause this # method to be called. The content is the declaration itself, unparsed. # EG, will come to this method as "el diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 9a83121af8..55bc9f50f8 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -39,8 +39,10 @@ module REXML # text. If this value is nil (the default), then the raw value of the # parent will be used as the raw value for this node. If there is no raw # value for the parent, and no value is supplied, the default is false. + # Use this field if you have entities defined for some text, and you don't + # want REXML to escape that text in output. 
# Text.new( "<&", false, nil, false ) #-> "<&" - # Text.new( "<&", false, nil, true ) #-> IllegalArgumentException + # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" # # and that the entity "r" is defined to be "russell" @@ -156,11 +158,11 @@ module REXML # # Assume that the entity "s" is defined to be "sean", and that the # # entity "r" is defined to be "russell" # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) - # t.string #-> "< & sean russell" + # t.value #-> "< & sean russell" # t = Text.new( "< & &s; russell", false, nil, false ) - # t.string #-> "< & sean russell" + # t.value #-> "< & sean russell" # u = Text.new( "sean russell", false, nil, true ) - # u.string #-> "sean russell" + # u.value #-> "sean russell" def value @unnormalized if @unnormalized doctype = nil @@ -282,9 +284,10 @@ module REXML EREFERENCE = /&(?!#{Entity::NAME};)/ # Escapes all possible entities def Text::normalize( input, doctype=nil, entity_filter=nil ) - copy = input.clone + copy = input # Doing it like this rather than in a loop improves the speed if doctype + # Replace all ampersands that aren't part of an entity copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, @@ -292,6 +295,7 @@ module REXML not( entity_filter and entity_filter.include?(entity) ) end else + # Replace all ampersands that aren't part of an entity copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index fbee315f0b..160ea96b31 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -82,10 +82,13 @@ module REXML @event_arg = event_arg end - attr_reader :done? attr_reader :event_type attr_accessor :event_arg + def done? 
+ @done + end + def single? return (@event_type != :start_element and @event_type != :start_attribute) end diff --git a/lib/rexml/xmldecl.rb b/lib/rexml/xmldecl.rb index 47131ac816..b65604b762 100644 --- a/lib/rexml/xmldecl.rb +++ b/lib/rexml/xmldecl.rb @@ -2,71 +2,71 @@ require 'rexml/encoding' require 'rexml/source' module REXML - # NEEDS DOCUMENTATION - class XMLDecl < Child - include Encoding + # NEEDS DOCUMENTATION + class XMLDecl < Child + include Encoding - DEFAULT_VERSION = "1.0"; - DEFAULT_ENCODING = "UTF-8"; - DEFAULT_STANDALONE = "no"; - START = '<\?xml'; - STOP = '\?>'; + DEFAULT_VERSION = "1.0"; + DEFAULT_ENCODING = "UTF-8"; + DEFAULT_STANDALONE = "no"; + START = '<\?xml'; + STOP = '\?>'; - attr_accessor :version, :standalone + attr_accessor :version, :standalone attr_reader :writeencoding - def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) + def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) @writethis = true @writeencoding = !encoding.nil? - if version.kind_of? XMLDecl - super() - @version = version.version - self.encoding = version.encoding + if version.kind_of? XMLDecl + super() + @version = version.version + self.encoding = version.encoding @writeencoding = version.writeencoding - @standalone = version.standalone - else - super() - @version = version - self.encoding = encoding - @standalone = standalone - end - @version = DEFAULT_VERSION if @version.nil? - end + @standalone = version.standalone + else + super() + @version = version + self.encoding = encoding + @standalone = standalone + end + @version = DEFAULT_VERSION if @version.nil? + end - def clone - XMLDecl.new(self) - end + def clone + XMLDecl.new(self) + end - def write writer, indent_level=-1, transitive=false, ie_hack=false + def write writer, indent=-1, transitive=false, ie_hack=false return nil unless @writethis or writer.kind_of? 
Output - indent( writer, indent_level ) - writer << START.sub(/\\/u, '') + indent( writer, indent ) + writer << START.sub(/\\/u, '') if writer.kind_of? Output writer << " #{content writer.encoding}" else writer << " #{content encoding}" end - writer << STOP.sub(/\\/u, '') - end + writer << STOP.sub(/\\/u, '') + end - def ==( other ) - other.kind_of?(XMLDecl) and - other.version == @version and - other.encoding == self.encoding and - other.standalone == @standalone - end + def ==( other ) + other.kind_of?(XMLDecl) and + other.version == @version and + other.encoding == self.encoding and + other.standalone == @standalone + end - def xmldecl version, encoding, standalone - @version = version - self.encoding = encoding - @standalone = standalone - end + def xmldecl version, encoding, standalone + @version = version + self.encoding = encoding + @standalone = standalone + end - def node_type - :xmldecl - end + def node_type + :xmldecl + end - alias :stand_alone? :standalone + alias :stand_alone? :standalone alias :old_enc= :encoding= def encoding=( enc ) @@ -80,6 +80,11 @@ module REXML self.dowrite end + # Only use this if you do not want the XML declaration to be written; + # this object is ignored by the XML writer. Otherwise, instantiate your + # own XMLDecl and add it to the document. + # + # Note that XML 1.1 documents *must* include an XML declaration def XMLDecl.default rv = XMLDecl.new( "1.0" ) rv.nowrite @@ -98,12 +103,12 @@ module REXML START.sub(/\\/u, '') + " ... 
" + STOP.sub(/\\/u, '') end - private - def content(enc) - rv = "version='#@version'" - rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i - rv << " standalone='#@standalone'" if @standalone - rv - end - end + private + def content(enc) + rv = "version='#@version'" + rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i + rv << " standalone='#@standalone'" if @standalone + rv + end + end end diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index 6875f038e0..1ed440868b 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -2,76 +2,65 @@ require 'rexml/functions' require 'rexml/xpath_parser' module REXML - # Wrapper class. Use this class to access the XPath functions. - class XPath - include Functions - EMPTY_HASH = {} + # Wrapper class. Use this class to access the XPath functions. + class XPath + include Functions + EMPTY_HASH = {} - # Finds and returns the first node that matches the supplied xpath. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, returns the first - # node matching '*'. - # namespaces:: - # If supplied, a Hash which defines a namespace mapping. - # - # XPath.first( node ) - # XPath.first( doc, "//b"} ) - # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + # Finds and returns the first node that matches the supplied xpath. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, returns the first + # node matching '*'. + # namespaces:: + # If supplied, a Hash which defines a namespace mapping. + # + # XPath.first( node ) + # XPath.first( doc, "//b"} ) + # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) def XPath::first element, path=nil, namespaces={}, variables={} -=begin raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of? 
Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - parser.first( path, element ); -=end -#=begin - raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash - raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of? Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).flatten[0] -#=end - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).flatten[0] + end - # Itterates over nodes that match the given path, calling the supplied - # block with the match. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, defaults to '*' - # namespaces:: - # If supplied, a Hash which defines a namespace mapping - # - # XPath.each( node ) { |el| ... } - # XPath.each( node, '/*[@attr='v']' ) { |el| ... } - # XPath.each( node, 'ancestor::x' ) { |el| ... } - def XPath::each element, path=nil, namespaces={}, variables={}, &block + # Itterates over nodes that match the given path, calling the supplied + # block with the match. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, defaults to '*' + # namespaces:: + # If supplied, a Hash which defines a namespace mapping + # + # XPath.each( node ) { |el| ... } + # XPath.each( node, '/*[@attr='v']' ) { |el| ... } + # XPath.each( node, 'ancestor::x' ) { |el| ... } + def XPath::each element, path=nil, namespaces={}, variables={}, &block raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.kind_of? Hash raise "The variables argument, if supplied, must be a hash object." 
unless variables.kind_of? Hash - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).each( &block ) - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).each( &block ) + end - # Returns an array of nodes matching a given XPath. - def XPath::match element, path=nil, namespaces={}, variables={} - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path,element) - end - end + # Returns an array of nodes matching a given XPath. + def XPath::match element, path=nil, namespaces={}, variables={} + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path,element) + end + end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 91b8ad48c8..98ed70cc10 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,6 +76,8 @@ module REXML # Performs a depth-first (document order) XPath search, and returns the # first match. This is the fastest, lightest way to return a single result. + # + # FIXME: This method is incomplete! 
def first( path_stack, node ) #puts "#{depth}) Entering match( #{path.inspect}, #{tree.inspect} )" return nil if path.size == 0 @@ -123,14 +125,6 @@ module REXML r = expr( path_stack, nodeset ) #puts "MAIN EXPR => #{r.inspect}" r - - #while ( path_stack.size > 0 and nodeset.size > 0 ) - # #puts "MATCH: #{path_stack.inspect} '#{nodeset.collect{|n|n.class}.inspect}'" - # nodeset = expr( path_stack, nodeset ) - # #puts "NODESET: #{nodeset.inspect}" - # #puts "PATH_STACK: #{path_stack.inspect}" - #end - #nodeset end private @@ -158,9 +152,10 @@ module REXML #puts "IN QNAME" prefix = path_stack.shift name = path_stack.shift - ns = @namespaces[prefix] - ns = ns ? ns : '' + default_ns = @namespaces[prefix] + default_ns = default_ns ? default_ns : '' nodeset.delete_if do |node| + ns = default_ns # FIXME: This DOUBLES the time XPath searches take ns = node.namespace( prefix ) if node.node_type == :element and ns == '' #puts "NS = #{ns.inspect}" @@ -353,7 +348,7 @@ module REXML preceding_siblings = all_siblings[ 0 .. current_index-1 ].reverse #results += expr( path_stack.dclone, preceding_siblings ) end - nodeset = preceding_siblings + nodeset = preceding_siblings || [] node_types = ELEMENTS when :preceding @@ -385,10 +380,13 @@ module REXML return @variables[ var_name ] # :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq + # TODO: Special case for :or and :and -- not evaluate the right + # operand if the left alone determines result (i.e. is true for + # :or and false for :and). 
when :eq, :neq, :lt, :lteq, :gt, :gteq, :and, :or - left = expr( path_stack.shift, nodeset, context ) + left = expr( path_stack.shift, nodeset.dup, context ) #puts "LEFT => #{left.inspect} (#{left.class.name})" - right = expr( path_stack.shift, nodeset, context ) + right = expr( path_stack.shift, nodeset.dup, context ) #puts "RIGHT => #{right.inspect} (#{right.class.name})" res = equality_relational_compare( left, op, right ) #puts "RES => #{res.inspect}" @@ -472,8 +470,11 @@ module REXML def descendant_or_self( path_stack, nodeset ) rs = [] + #puts "#"*80 + #puts "PATH_STACK = #{path_stack.inspect}" + #puts "NODESET = #{nodeset.collect{|n|n.inspect}.inspect}" d_o_s( path_stack, nodeset, rs ) - #puts "RS = #{rs.collect{|n|n.to_s}.inspect}" + #puts "RS = #{rs.collect{|n|n.inspect}.inspect}" document_order(rs.flatten.compact) #rs.flatten.compact end