mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	trans -> transitive. [ruby-dev:32040], r13686 * lib/rexml/text.rb (Text.check): fix check for illegal character. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@16840 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
		
			
				
	
	
		
			404 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			404 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
require 'rexml/entity'
 | 
						|
require 'rexml/doctype'
 | 
						|
require 'rexml/child'
 | 
						|
require 'rexml/doctype'
 | 
						|
require 'rexml/parseexception'
 | 
						|
 | 
						|
module REXML
 | 
						|
  # Represents text nodes in an XML document
 | 
						|
  class Text < Child
 | 
						|
    include Comparable
 | 
						|
    # The order in which the substitutions occur.  When text is written,
    # SPECIALS[i] is replaced by SUBSTITUTES[i].  The first pattern only
    # matches an ampersand that does not already start a reference.
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
    # Characters which are substituted in written strings.  When text is
    # read, the escaped form SETUTITSBUS[i] is replaced by SLAICEPS[i].
    # '&amp;' comes last so that freshly produced ampersands are not
    # re-interpreted by the other four replacements.
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
 | 
						|
 | 
						|
    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    # Pattern for the context-sensitive part of Text.check: a '<', or an
    # '&' optionally followed by a named reference (Entity::NAME plus ';')
    # or a numeric reference ('#', optional leading zeros, decimal or
    # x-prefixed hex digits, ';').  The whole match is capture group 1.
    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
 | 
						|
    # Matches a numeric character reference such as "&#65;" or "&#x41;".
    # Leading zeros are skipped; group 1 is the decimal digits or the
    # 'x'-prefixed hexadecimal digits.
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
 | 
						|
    # Code points accepted as characters: tab, LF, CR and the three ranges
    # below (this is the Char production of XML 1.0).
    VALID_CHAR = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    # Regexp that matches a string consisting only of VALID_CHAR characters.
    # It is anchored with \A and \z rather than ^ and $: the line anchors
    # would let a multi-line string match as soon as one of its lines is
    # clean, hiding illegal characters on the other lines.
    if String.method_defined? :encode
      # Build a character class from VALID_CHAR: single code points become
      # one character, ranges become "first-last".
      VALID_XML_CHARS = Regexp.new('\A['+
        VALID_CHAR.map { |item|
          case item
          when Integer
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
      ']*\z')
    else
      # Byte-oriented equivalent for Rubies without String#encode.
      VALID_XML_CHARS = /\A(
           [\x09\x0A\x0D\x20-\x7E]            # ASCII
         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
         |  \xEF[\x80-\xBE]{2}                #
         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
       )*\z/nx;
    end
 | 
						|
 | 
						|
    # Constructor
    # +arg+ if a String, the content is set to a copy of the String.  If a
    # Text, the object is shallowly cloned (its string and raw flag are
    # taken over).  Anything else raises a RuntimeError.
    #
    # +respect_whitespace+ (boolean, false) if true, whitespace is
    # respected; otherwise runs of spaces, newlines and tabs in a String
    # argument are squeezed.
    #
    # +parent+ (nil) if this is a Parent object, the parent
    # will be set to this.
    #
    # +raw+ (nil) This argument can be given three values.
    # If true, then the value used to construct this object is expected to
    # contain no unescaped XML markup, and REXML will not change the text. If
    # this value is false, the string may contain any characters, and REXML will
    # escape any and all defined entities whose values are contained in the
    # text.  If this value is nil (the default), then the raw value of the
    # parent will be used as the raw value for this node.  If there is no raw
    # value for the parent, and no value is supplied, the default is false.
    # Use this field if you have entities defined for some text, and you don't
    # want REXML to escape that text in output.
    #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
    #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
    #   Text.new( "<&", false, nil, true )  #-> Parse exception
    #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
    #   # Assume that the entity "s" is defined to be "sean"
    #   # and that the entity    "r" is defined to be "russell"
    #   Text.new( "sean russell" )          #-> "&s; &r;"
    #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
    #
    # +entity_filter+ (nil) This can be an array of entities to match in the
    # supplied text.  This argument is only useful if +raw+ is set to false.
    #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
    #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
    # In the last example, the +entity_filter+ argument is ignored.
    #
    # +illegal+ INTERNAL USE ONLY.  The argument is accepted but not read
    # here; the validity check below always uses NEEDS_A_SECOND_CHECK.
    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

      @raw = false

      if parent
        super( parent )
        @raw = parent.raw
      else
        @parent = nil
      end

      # An explicit +raw+ overrides whatever was inherited from the parent.
      @raw = raw unless raw.nil?
      @entity_filter = entity_filter
      @normalized = @unnormalized = nil

      if arg.kind_of? String
        # dup (not clone) so that a frozen argument still yields a string
        # that the in-place edits below can modify.
        @string = arg.dup
        @string.squeeze!(" \n\t") unless respect_whitespace
      elsif arg.kind_of? Text
        @string = arg.to_s
        @raw = arg.raw
      else
        raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
      end

      # Normalize line endings: CRLF and lone CR both become LF.
      @string.gsub!( /\r\n?/, "\n" )

      # Raw text is emitted verbatim, so validate it once it has a parent.
      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
    end
 | 
						|
 | 
						|
    # Sets the parent via the superclass setter, then re-validates the
    # string with Text.check when this node is raw and now has a parent
    # (the doctype used for the entity check is looked up through the
    # parent).
    def parent=(parent)
      super(parent)
      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
    end
 | 
						|
 | 
						|
    # check for illegal characters
    #
    # +string+ the raw text to validate
    # +pattern+ Regexp scanned over +string+ for the context-sensitive
    # checks; the code reads capture groups 1, 3 and 5 of each match
    # (callers in this file pass NEEDS_A_SECOND_CHECK)
    # +doctype+ object whose +entities+ hash lists declared entities, or nil
    #
    # Raises a RuntimeError for the first problem found: a character outside
    # VALID_CHAR, a match that does not end in ';', a numeric reference to a
    # code point outside VALID_CHAR, or a named reference that is neither in
    # SUBSTITUTES nor declared in +doctype+.
    def Text.check(string, pattern, doctype)

      # illegal anywhere
      if string !~ VALID_XML_CHARS
        # The regexp only says "something is wrong"; walk the characters to
        # name the offender in the error message.
        if String.method_defined? :encode
          string.chars.each do |c|
            case c.ord
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        else
          # NOTE(review): legacy branch for Rubies without String#encode.
          # unpack('U') returns an Array, which none of the VALID_CHAR
          # entries match with ===, so the first scanned chunk is always
          # reported — confirm before relying on the reported character.
          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
            case c.unpack('U')
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        end
      end

      # context sensitive
      string.scan(pattern) do
        if $1[-1] != ?;
          # The match did not end in ';' (e.g. a bare '<' or an unterminated
          # '&' with NEEDS_A_SECOND_CHECK).
          raise "Illegal character '#{$1}' in raw string \"#{string}\""
        elsif $1[0] == ?&
          if $5 and $5[0] == ?#
            # Numeric reference: group 5 is read as '#' followed by decimal
            # digits or 'x' plus hex digits (presumably this numbering relies
            # on Entity::NAME contributing one capture group — verify).
            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
            when *VALID_CHAR
            else
              raise "Illegal character '#{$1}' in raw string \"#{string}\""
            end
          elsif $3 and !SUBSTITUTES.include?($1)
            # Named reference that is not one of the SUBSTITUTES strings:
            # it must be declared in the doctype.
            if !doctype or !doctype.entities.has_key?($3)
              raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
            end
          end
        end
      end
    end
 | 
						|
 | 
						|
    # Returns the symbol identifying this kind of node: always :text.
    def node_type
      :text
    end
 | 
						|
 | 
						|
    # Returns true when the stored string has no characters.
    def empty?
      @string.size == 0
    end
 | 
						|
 | 
						|
 | 
						|
    # Returns a new Text built from this one through the Text constructor
    # (which takes over this node's string and raw flag).  No parent is
    # passed, so the copy starts out unattached.
    def clone
      Text.new(self)
    end
 | 
						|
 | 
						|
 | 
						|
    # Appends text to this text node.  The text is appended in the +raw+ mode
    # of this text node.  CRLF and lone CR in +to_append+ are converted to LF
    # first.  Returns the node's internal string.
    def <<( to_append )
      # The cached results of to_s / value describe the old content; drop
      # them so the next call recomputes from the extended string.
      @normalized = @unnormalized = nil
      @string << to_append.gsub( /\r\n?/, "\n" )
    end
 | 
						|
 | 
						|
 | 
						|
    # +other+ a String or a Text (anything responding to to_s)
    # +returns+ the result of (to_s <=> other.to_s)
    def <=>( other )
      to_s() <=> other.to_s
    end
 | 
						|
 | 
						|
    # Returns the doctype of the document this node's parent belongs to.
    # Returns nil when the node has no parent or the parent has no document.
    def doctype
      return nil unless @parent
      doc = @parent.document
      doc.doctype if doc
    end
 | 
						|
 | 
						|
    # Regexp built from Entity::REFERENCE; Text::unnormalize substitutes
    # every match of it.
    REFERENCE = /#{Entity::REFERENCE}/
 | 
						|
    # Returns the string value of this text node.  This string is always
    # escaped, meaning that it is a valid XML text node string, and all
    # entities that can be escaped, have been inserted.  This method respects
    # the entity filter set in the constructor.
    #
    #   # Assume that the entity "s" is defined to be "sean", and that the
    #   # entity "r" is defined to be "russell"
    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
    #   t.to_s   #-> "&lt; &amp; &s; russell"
    #   t = Text.new( "< & &s; russell", false, nil, false )
    #   t.to_s   #-> "&lt; &amp; &s; russell"
    #   u = Text.new( "sean russell", false, nil, true )
    #   u.to_s   #-> "sean russell"
    #
    # A raw node returns its stored string untouched.  Otherwise the
    # normalized form is computed once and cached in @normalized.
    def to_s
      return @string if @raw
      return @normalized if @normalized

      @normalized = Text::normalize( @string, doctype, @entity_filter )
    end
 | 
						|
 | 
						|
    # Returns the inspect form of the stored string (quoted, with escapes).
    def inspect
      @string.inspect
    end
 | 
						|
 | 
						|
    # Returns the string value of this text.  This is the text without
    # entities, as it might be used programmatically, or printed to the
    # console.  This ignores the 'raw' attribute setting, and any
    # entity_filter.
    #
    #   # Assume that the entity "s" is defined to be "sean", and that the
    #   # entity "r" is defined to be "russell"
    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
    #   t.value   #-> "< & sean russell"
    #   t = Text.new( "< & &s; russell", false, nil, false )
    #   t.value   #-> "< & sean russell"
    #   u = Text.new( "sean russell", false, nil, true )
    #   u.value   #-> "sean russell"
    #
    # The result is computed once with Text::unnormalize and cached in
    # @unnormalized.
    def value
      return @unnormalized if @unnormalized
      @unnormalized = Text::unnormalize( @string, doctype )
    end
 | 
						|
 | 
						|
    # Sets the contents of this text node.  This expects the text to be
    # unnormalized.  CRLF and lone CR are converted to LF, both cached
    # string forms are discarded, and the node's raw flag is reset to false.
    #
    #   e = Element.new( "a" )
    #   e.add_text( "foo" )   # <a>foo</a>
    #   e[0].value = "bar"    # <a>bar</a>
    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
    def value=( val )
      @string = val.gsub( /\r\n?/, "\n" )
      @unnormalized = nil
      @normalized = nil
      @raw = false
    end
 | 
						|
 
 | 
						|
     # Recursively wraps +string+ so that lines break at a space at or
     # before +width+ characters.  The space chosen as break point is
     # replaced by a newline.
     #
     # +addnewline+ (false) if true, a newline is also put in front of the
     # result; this applies only when the string actually had to be broken
     # and is not passed on to the recursive calls.
     #
     # A string no longer than +width+ is returned unchanged.
     def wrap(string, width, addnewline=false)
       return string if string.length <= width
       # Last space at or before the cutoff.  When there is none (a word
       # longer than +width+), fall back to the first space after the cutoff
       # so that the word is kept whole.
       place = string.rindex(' ', width) || string.index(' ', width)
       # No space anywhere: nothing to break on.
       return string unless place
       wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
       addnewline ? "\n" + wrapped : wrapped
     end
 | 
						|
 | 
						|
    # Prefixes every line of +string+ with +style+ repeated +level+ times
    # and removes trailing whitespace from each line.  Note that the line
    # terminator is part of that trailing whitespace, so the indented lines
    # are joined without newlines between them.
    #
    # +level+ (1) number of repetitions of +style+; a negative level
    # returns +string+ unchanged
    # +style+ ("\t") the indentation unit
    # +indentfirstline+ (true) if false, leading and trailing whitespace is
    # stripped from the result, which removes the first line's indentation
    def indent_text(string, level=1, style="\t", indentfirstline=true)
      return string if level < 0
      indent_string = style * level
      new_string = ''.dup
      # each_line rather than String#each, which only exists on Ruby 1.8.
      string.each_line { |line|
        new_string << (indent_string + line).sub(/[\s]+$/,'')
      }
      new_string.strip! unless indentfirstline
      return new_string
    end
 | 
						|
 
 | 
						|
    # == DEPRECATED
    # See REXML::Formatters
    #
    # Emits a deprecation warning, then writes this node to +writer+ with
    # REXML::Formatters::Pretty when +indent+ is greater than -1 and with
    # REXML::Formatters::Default otherwise.  +transitive+ and +ie_hack+ are
    # accepted but not used.
    def write( writer, indent=-1, transitive=false, ie_hack=false )
      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
      formatter = if indent > -1
          REXML::Formatters::Pretty.new( indent )
        else
          REXML::Formatters::Default.new
        end
      formatter.write( self, writer )
    end
 | 
						|
 | 
						|
    # FIXME
    # This probably won't work properly
    #
    # Returns the parent's xpath with "/text()" appended.  The node must
    # have a parent; the result does not distinguish between several text
    # children of the same parent.
    def xpath
      path = @parent.xpath
      path += "/text()"
      return path
    end
 | 
						|
 | 
						|
    # Writes out text, substituting special characters beforehand.
    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and then write out; it is not modified
    #
    #   z=utf8.unpack("U*")
    #   ascOut=""
    #   z.each{|r|
    #     if r <  0x100
    #       ascOut.concat(r.chr)
    #     else
    #       ascOut.concat(sprintf("&#x%x;", r))
    #     end
    #   }
    #   puts ascOut
    def write_with_substitution out, input
      # dup (not clone) so the in-place substitutions also work when
      # +input+ is frozen.
      copy = input.dup
      # Doing it like this rather than in a loop improves the speed
      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
      out << copy
    end
 | 
						|
 | 
						|
    # Reads text, substituting entities
    #
    # +input+ the escaped text; it is not modified
    # +illegal+ (nil) optional pattern; if given and it matches the text, a
    # ParseException is raised
    #
    # Returns a copy of +input+ with CRLF and lone CR turned into LF.  If
    # the text contains an '&', the escaped forms in SETUTITSBUS are
    # replaced by the characters in SLAICEPS and numeric character
    # references are replaced by the characters they denote.
    def Text::read_with_substitution( input, illegal=nil )
      # dup (not clone) so the in-place substitutions also work when
      # +input+ is frozen.
      copy = input.dup

      if copy =~ illegal
        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
      end if illegal

      copy.gsub!( /\r\n?/, "\n" )
      if copy.include? ?&
        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
        # Numeric character references: "&#65;" or "&#x41;" (leading zeros
        # skipped, lowercase hex digits only).
        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
          m=$1
          #m='0' if m==''
          # "x41" -> "0x41" so that Integer() parses it as hexadecimal
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
      end
      copy
    end
 | 
						|
 | 
						|
    # Matches an ampersand that is not followed by Entity::NAME and ';'.
    EREFERENCE = /&(?!#{Entity::NAME};)/
 | 
						|
    # Escapes all possible entities
    #
    # +input+ the text to escape (converted with to_s)
    # +doctype+ (nil) if given, its entities are used for the substitution;
    # otherwise DocType::DEFAULT_ENTITIES are used
    # +entity_filter+ (nil) only consulted together with +doctype+: an
    # entity for which entity_filter.include?(entity) is true is skipped
    #
    # Returns a new string in which every '&' has become '&amp;' and every
    # occurrence of an entity's value has become a reference to that entity.
    def Text::normalize( input, doctype=nil, entity_filter=nil )
      copy = input.to_s
      # Doing it like this rather than in a loop improves the speed
      #copy = copy.gsub( EREFERENCE, '&amp;' )
      # Escape ampersands first, so the '&' of the references inserted
      # below is not escaped again.
      copy = copy.gsub( "&", "&amp;" )
      if doctype
        # Replace entity values with references to the doctype's entities
        doctype.entities.each_value do |entity|
          copy = copy.gsub( entity.value,
            "&#{entity.name};" ) if entity.value and
              not( entity_filter and entity_filter.include?(entity) )
        end
      else
        # Replace entity values with references to the default entities
        DocType::DEFAULT_ENTITIES.each_value do |entity|
          copy = copy.gsub(entity.value, "&#{entity.name};" )
        end
      end
      copy
    end
 | 
						|
 | 
						|
    # Unescapes all possible entities
    #
    # +string+ the escaped text
    # +doctype+ (nil) if given, named references are resolved with
    # doctype.entity; otherwise DocType::DEFAULT_ENTITIES is consulted
    # +filter+ (nil) names of entities whose references are left as they are
    # +illegal+ (nil) accepted but not used
    #
    # Returns a new string with CRLF and lone CR turned into LF and every
    # match of REFERENCE replaced: numeric references by the character they
    # denote, '&amp;' by '&', other named references by their value.  A
    # named reference that cannot be resolved is kept unchanged.
    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
        ref = $&
        if ref[1] == ?#
          # Numeric reference: "&#x..;" is hexadecimal, "&#..;" is decimal.
          if ref[2] == ?x
            [ref[3...-1].to_i(16)].pack('U*')
          else
            [ref[2...-1].to_i].pack('U*')
          end
        elsif ref == '&amp;'
          '&'
        elsif filter and filter.include?( ref[1...-1] )
          ref
        elsif doctype
          doctype.entity( ref[1...-1] ) or ref
        else
          entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
          entity_value ? entity_value.value : ref
        end
      }
    end
 | 
						|
  end
 | 
						|
end
 |