ruby--ruby/lib/rexml/document.rb

require "rexml/element"
require "rexml/xmldecl"
require "rexml/source"
require "rexml/comment"
require "rexml/doctype"
require "rexml/instruction"
require "rexml/rexml"
require "rexml/parseexception"
require "rexml/output"
require "rexml/parsers/baseparser"
require "rexml/parsers/streamparser"
require "rexml/parsers/treeparser"

module REXML
  # Represents a full XML document, including PIs, a doctype, etc.  A
  # Document has a single root element that can be accessed by root().
  # Note that if you want to have an XML declaration written for a document
  # you create, you must add one; REXML documents do not write a default
  # declaration for you.  See |DECLARATION| and |write|.
  class Document < Element
    # A convenient default XML declaration.  If you want an XML declaration,
    # the easiest way to add one is mydoc << Document::DECLARATION
    # +DEPRECATED+
    # Use: mydoc << XMLDecl.default
    DECLARATION = XMLDecl.default

    # Constructor
    # @param source if supplied, must be a Document, String, or IO.
    # Documents have their context and Element attributes cloned.
    # Strings are expected to be valid XML documents.  IOs are expected
    # to be sources of valid XML documents.
    # @param context if supplied, contains the context of the document;
    # this should be a Hash.
    def initialize( source = nil, context = {} )
      # Must exist before any parsing starts; record_entity_expansion
      # increments it.
      @entity_expansion_count = 0
      super()
      @context = context
      return if source.nil?
      if source.kind_of? Document
        # Copy constructor: take over the other document's context and let
        # Element#initialize copy the rest.
        @context = source.context
        super source
      else
        build( source )
      end
    end

    # @return the Symbol :document
    def node_type
      :document
    end

    # @return a new Document built by passing this one to Document.new
    def clone
      Document.new self
    end

    # According to the XML spec, a root node has no expanded name
    # @return an empty String
    def expanded_name
      ''
    end

    alias :name :expanded_name

    # We override this, because XMLDecls and DocTypes must go at the start
    # of the document.
    #
    # * An XMLDecl becomes the first child; if the first child already is
    #   an XMLDecl it is replaced, so the document never holds two.
    # * A DocType replaces an existing DocType; otherwise it is inserted
    #   right before the first Element, or appended if there is none.
    # * Anything else is handed to the superclass implementation.
    #
    # @param child the node to add
    # @raise RuntimeError if, after adding, the document has more than one
    # root element
    def add( child )
      if child.kind_of? XMLDecl
        if @children[0].kind_of? XMLDecl
          @children[0] = child
        else
          @children.unshift child
        end
        child.parent = self
      elsif child.kind_of? DocType
        # Position of the first Element or DocType child, or nil when the
        # document contains neither.
        insert_before_index = @children.find_index { |x|
          x.kind_of?(Element) || x.kind_of?(DocType)
        }
        if insert_before_index
          if @children[ insert_before_index ].kind_of? DocType
            # Only one doctype per document: replace the existing one.
            @children[ insert_before_index ] = child
          else
            # The doctype must precede the root element.
            @children.insert( insert_before_index, child )
          end
        else  # No element or doctype yet: insert at end of list
          @children << child
        end
        child.parent = self
      else
        rv = super
        raise "attempted adding second root element to document" if @elements.size > 1
        rv
      end
    end
    alias :<< :add

    # Adds an element through the superclass implementation, forwarding
    # both arguments unchanged.
    # @return whatever the superclass add_element returns
    # @raise RuntimeError if, after adding, the document has more than one
    # root element
    def add_element(arg=nil, arg2=nil)
      rv = super
      raise "attempted adding second root element to document" if @elements.size > 1
      rv
    end

    # @return the root Element of the document, or nil if this document
    # has no children.
    def root
      elements[1]
    end

    # @return the DocType child of the document, if one exists,
    # and nil otherwise.
    def doctype
      @children.find { |item| item.kind_of? DocType }
    end

    # @return the XMLDecl of this document; if no XMLDecl has been
    # set, the default declaration is returned.
    # Note that in the latter case the default declaration is also stored
    # as the first child of the document.
    def xml_decl
      rv = @children[0]
      return rv if rv.kind_of? XMLDecl
      @children.unshift(XMLDecl.default)[0]
    end

    # @return the XMLDecl version of this document as a String.
    # If no XMLDecl has been set, returns the default version.
    def version
      xml_decl().version
    end

    # @return the XMLDecl encoding of this document as a String.
    # If no XMLDecl has been set, returns the default encoding.
    def encoding
      xml_decl().encoding
    end

    # @return the XMLDecl standalone value of this document as a String.
    # If no XMLDecl has been set, returns the default setting.
    def stand_alone?
      xml_decl().stand_alone?
    end

    # Write the XML tree out, optionally with indent.  This writes out the
    # entire XML document, including XML declarations, doctype declarations,
    # and processing instructions (if any are given).
    #
    # A controversial point is whether Document should always write the XML
    # declaration (<?xml version='1.0'?>) whether or not one is given by the
    # user (or source document).  REXML does not write one if one was not
    # specified, because it adds unnecessary bandwidth to applications such
    # as XML-RPC.
    #
    # See also the classes in the rexml/formatters package for the proper way
    # to change the default formatting of XML output
    #
    # _Examples_
    #   Document.new("<a><b/></a>").write
    #
    #   output_string = ""
    #   Document.new("<a><b/></a>").write( output_string )
    #
    # output::
    #   An object which supports '<< string'; this is where the
    #   document will be written.  Defaults to $stdout.  Unless it already
    #   is an Output, it is wrapped in one when the document's encoding is
    #   not UTF-8.
    # indent::
    #   An integer.  If -1, no indenting will be used; otherwise, the
    #   indentation will be twice this number of spaces, and children will be
    #   indented an additional amount.  For a value of 3, every item will be
    #   indented 3 more levels, or 6 more spaces (2 * 3). Defaults to -1
    # transitive::
    #   If transitive is true and indent is >= 0, then the output will be
    #   pretty-printed in such a way that the added whitespace does not affect
    #   the absolute *value* of the document -- that is, it leaves the value
    #   and number of Text nodes in the document unchanged.
    # ie_hack::
    #   This hack inserts a space before the /> on empty tags to address
    #   a limitation of Internet Explorer.  Defaults to false
    def write( output=$stdout, indent=-1, transitive=false, ie_hack=false )
      if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output)
        output = Output.new( output, xml_decl.encoding )
      end
      formatter = if indent > -1
          if transitive
            # Within this file the transitive formatter is only used on
            # this path, so it is loaded on demand.
            require "rexml/formatters/transitive"
            REXML::Formatters::Transitive.new( indent, ie_hack )
          else
            REXML::Formatters::Pretty.new( indent, ie_hack )
          end
        else
          REXML::Formatters::Default.new( ie_hack )
        end
      formatter.write( self, output )
    end

    # Runs a Parsers::StreamParser over +source+, handing it +listener+.
    # @param source the XML source passed to the stream parser
    # @param listener the object passed to the stream parser as listener
    def Document::parse_stream( source, listener )
      Parsers::StreamParser.new( source, listener ).parse
    end

    # Shared by every Document; see Document::entity_expansion_limit=.
    @@entity_expansion_limit = 10_000

    # Set the entity expansion limit. By default the limit is set to 10000.
    def Document::entity_expansion_limit=( val )
      @@entity_expansion_limit = val
    end

    # Get the entity expansion limit. By default the limit is set to 10000.
    def Document::entity_expansion_limit
      return @@entity_expansion_limit
    end

    # Number of entity expansions recorded for this document so far.
    attr_reader :entity_expansion_count

    # Counts one more entity expansion for this document.
    # @raise RuntimeError once the count exceeds
    # Document::entity_expansion_limit
    def record_entity_expansion
      @entity_expansion_count += 1
      if @entity_expansion_count > @@entity_expansion_limit
        raise "number of entity expansions exceeded, processing aborted."
      end
    end

    private

    # Populates this document by running a Parsers::TreeParser over +source+.
    def build( source )
      Parsers::TreeParser.new( source, self ).parse
    end
  end
end