mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4790 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
		
			
				
	
	
		
			279 lines
		
	
	
	
		
			8.6 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			279 lines
		
	
	
	
		
			8.6 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
require 'rexml/entity'
 | 
						|
 | 
						|
module REXML
	# Represents text nodes in an XML document.
	#
	# The content is held in @string.  Two derived forms are computed lazily
	# and cached: the escaped form returned by #to_s (@normalized) and the
	# unescaped form returned by #value (@unnormalized).
	class Text < Child
		include Comparable

		# The order in which the substitutions occur.  SPECIALS[i] is replaced
		# by SUBSTITUTES[i]; the ampersand pattern comes first so that the
		# ampersands introduced by the later substitutions are not re-escaped.
		SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
		SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
		# Characters which are substituted in written strings.
		# SETUTITSBUS[i] is replaced by SLAICEPS[i] when reading; '&amp;' is
		# handled last.
		SLAICEPS = [ '<', '>', '"', "'", '&' ]
		SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

		# If +raw+ is true, then REXML leaves the value alone
		attr_accessor :raw

		# Matches a '<', or an '&' that does not start a name or a numeric
		# character reference.  Group 1 is the offending character.
		ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
		# Matches a numeric character reference.  Group 1 is the number with
		# leading zeros dropped, either decimal digits or 'x' plus hex digits.
		NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
		# Matches any entity or character reference.
		REFERENCE = /#{Entity::REFERENCE}/
		# Matches an '&' that is not the start of a named entity reference.
		EREFERENCE = /&(?!#{Entity::NAME};)/

		# Constructor
		#
		# +arg+ if a String, the content is set to a copy of the String.  If a
		# Text, the content is set to a copy of <tt>arg.to_s</tt> and the +raw+
		# setting is taken from +arg+.  Anything else raises an Exception.
		#
		# +respect_whitespace+ (boolean, false) unless true, runs of repeated
		# spaces, newlines or tabs in a String +arg+ are squeezed to a single
		# character.
		#
		# +parent+ (nil) if given, it is passed to the superclass constructor
		# and its +raw+ value becomes the default for this node.
		#
		# +raw+ (nil) This argument can be given three values.
		# If true, the string is expected to contain no unescaped XML markup;
		# it is checked against +illegal+ and returned unchanged by #to_s.
		# If false, #to_s escapes the string.  If nil (the default), the raw
		# value of the parent is used, or false if there is no parent.
		#   Text.new( "<&", false, nil, false ).to_s  #-> "&lt;&amp;"
		#   Text.new( "<&", false, nil, true )        #-> raises Exception
		#   Text.new( "&lt;&amp;", false, nil, true ).to_s  #-> "&lt;&amp;"
		#
		# +entity_filter+ (nil) stored and handed to Text::normalize by #to_s;
		# it has no effect when the node is raw.
		#
		# +illegal+ INTERNAL USE ONLY.  The pattern used to validate raw text.
		#
		# Line endings ("\r\n" and "\r") are always converted to "\n".
		def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, 
			entity_filter=nil, illegal=ILLEGAL )

			@raw = false

			if parent
				super( parent )
				@raw = parent.raw 
			else
				@parent = nil
			end

			# An explicit raw argument overrides what the parent supplied.
			@raw = raw unless raw.nil?
			@entity_filter = entity_filter
			@normalized = @unnormalized = nil

			if arg.kind_of? String
				# dup rather than clone: a frozen argument must still yield a
				# mutable private copy.
				@string = arg.dup
				@string.squeeze!(" \n\t") unless respect_whitespace
			elsif arg.kind_of? Text
				# to_s may hand back arg's internal or cached string; copy it so
				# that appending to this node never changes arg.
				@string = arg.to_s.dup
				@raw = arg.raw
			else
				raise Exception.new( "Illegal argument of type #{arg.class} for Text constructor (#{arg})" )
			end

			@string.gsub!( /\r\n?/, "\n" )

			# check for illegal characters
			if @raw and @string =~ illegal
				raise Exception.new(
					"Illegal character '#{$1}' in raw string \"#{@string}\""
				)
			end
		end

		# Returns the symbol identifying this kind of node.
		def node_type
			:text
		end

		# Returns true if the stored string has no characters.
		def empty?
			@string.empty?
		end

		# Returns a new Text built from this one (see the constructor's
		# handling of a Text argument).
		def clone
			return Text.new(self)
		end

		# Appends text to this text node.  The text is appended in the +raw+ mode
		# of this text node, with line endings converted to "\n".  Cached
		# results of #to_s and #value are discarded.  Returns the internal
		# string, as before.
		def <<( to_append )
			@string << to_append.gsub( /\r\n?/, "\n" )
			# The cached forms were computed from the old content.
			@normalized = @unnormalized = nil
			@string
		end

		# +other+ a String or a Text
		# +returns+ the result of (to_s <=> arg.to_s)
		def <=>( other )
			to_s() <=> other.to_s
		end

		# Returns the string value of this text node.  For a raw node this is
		# the stored string itself.  Otherwise it is the stored string passed
		# through Text::normalize, using the doctype of the parent's document
		# (if any) and the entity filter set in the constructor.  The result
		# is cached until the node is appended to.
		#
		#   t = Text.new( "< & x", false, nil, false )
		#   t.to_s   #-> "&lt; &amp; x"
		#   u = Text.new( "sean russell", false, nil, true )
		#   u.to_s   #-> "sean russell"
		def to_s
			return @string if @raw
			return @normalized if @normalized

			doctype = nil
			if @parent
				doc = @parent.document
				doctype = doc.doctype if doc
			end

			@normalized = Text::normalize( @string, doctype, @entity_filter )
		end

		# Returns the string value of this text.  This is the text without
		# entities, as it might be used programmatically, or printed to the
		# console.  This ignores the 'raw' attribute setting, and any
		# entity_filter.  The result is cached until the node is appended to.
		#
		#   t = Text.new( "&lt; &amp; x", false, nil, false )
		#   t.value   #-> "< & x"
		#   u = Text.new( "sean russell", false, nil, true )
		#   u.value   #-> "sean russell"
		def value
			return @unnormalized if @unnormalized

			doctype = nil
			if @parent
				doc = @parent.document
				doctype = doc.doctype if doc
			end
			@unnormalized = Text::unnormalize( @string, doctype )
		end

		# Appends #to_s to +writer+.  The +indent+, +transitive+ and +ie_hack+
		# arguments are accepted but not used here.
		def write( writer, indent=-1, transitive=false, ie_hack=false ) 
			writer << to_s()
		end

		# Writes out text, substituting special characters beforehand.
		# +out+ A String, IO, or any other object supporting <<( String )
		# +input+ the text to substitute and then write out; it is not modified
		#
		# Snippet kept from the original documentation; it turns a UTF-8
		# string into one where code points of 0x100 and above are written
		# as hexadecimal character references:
		#
		#   z=utf8.unpack("U*")
		#   ascOut=""
		#   z.each{|r|
		#     if r <  0x100
		#       ascOut.concat(r.chr)
		#     else
		#       ascOut.concat(sprintf("&#x%x;", r))
		#     end
		#   }
		#   puts ascOut
		def write_with_substitution out, input
			copy = input.dup
			# Doing it like this rather than in a loop improves the speed
			copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
			copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
			copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
			copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
			copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
			copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
			out << copy
		end

		# Reads text, substituting entities.
		#
		# +input+ the escaped text; it is not modified.
		# +illegal+ (nil) an optional pattern; if it matches +input+ a
		# ParseException is raised.
		#
		# Returns a new string with line endings converted to "\n" and, if the
		# text contains an ampersand, the five predefined entities and numeric
		# character references replaced by the characters they stand for.
		def Text::read_with_substitution( input, illegal=nil )
			copy = input.dup

			if illegal and copy =~ illegal
				raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
			end
			
			copy.gsub!( /\r\n?/, "\n" )
			if copy.include?( '&' )
				copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
				copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
				copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
				copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
				copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
				copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m|
					m=$1
					#m='0' if m==''
					# Integer() needs the "0x" prefix to read hexadecimal.
					m = "0#{m}" if m[0, 1] == 'x'
					[Integer(m)].pack('U*')
				}
			end
			copy
		end

		# Escapes all possible entities.
		#
		# +input+ the text to escape; it is not modified.
		# +doctype+ (nil) if given, every entity in <tt>doctype.entities</tt>
		# that has a value is used; otherwise DocType::DEFAULT_ENTITIES is.
		# +entity_filter+ (nil) only consulted when +doctype+ is given: an
		# entity for which <tt>entity_filter.include?(entity)</tt> is true is
		# not substituted.
		#
		# Returns a new string in which every '&' that does not start a named
		# entity reference has become "&amp;" and every occurrence of an
		# entity's value has become a reference to that entity.
		def Text::normalize( input, doctype=nil, entity_filter=nil )
			copy = input.dup
			# Ampersands first, so those introduced below are left alone.
			copy.gsub!( EREFERENCE, '&amp;' )
			if doctype
				doctype.entities.each_value do |entity|
					value = entity.value
					next unless value
					# NOTE(review): the filter is compared against the entity
					# object itself, while Text::unnormalize compares its filter
					# against entity names -- confirm which one callers pass.
					next if entity_filter and entity_filter.include?(entity)
					copy.gsub!( value, "&#{entity.name};" )
				end
			else
				DocType::DEFAULT_ENTITIES.each_value do |entity|
					copy.gsub!(entity.value, "&#{entity.name};" )
				end
			end
			copy
		end

		# Unescapes all possible entities.
		#
		# +string+ the escaped text; it is not modified.
		# +doctype+ (nil) if given, entity values are looked up with
		# <tt>doctype.entity(name)</tt>; otherwise DocType::DEFAULT_ENTITIES
		# is used.
		# +filter+ (nil) names of entities that must not be expanded.
		# +illegal+ (nil) accepted but not used.
		#
		# Returns a new string with line endings converted to "\n", numeric
		# character references replaced by their characters and known named
		# references replaced by their values.  "&amp;" is replaced last, and
		# only when the text contained at least one named reference.
		def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
			rv = string.dup
			rv.gsub!( /\r\n?/, "\n" )
			matches = rv.scan(REFERENCE)
			return rv if matches.empty?
			rv.gsub!( NUMERICENTITY ) {|m|
				m=$1
				# Integer() needs the "0x" prefix to read hexadecimal.
				m = "0#{m}" if m[0, 1] == 'x'
				[Integer(m)].pack('U*')
			}
			# The first capture holds the entity name; it is nil for numeric
			# references, which were handled above.
			names = matches.collect {|x| x[0] }.compact.uniq
			unless names.empty?
				names.each do |entity_reference|
					next if filter and filter.include?(entity_reference)
					if doctype
						entity_value = doctype.entity( entity_reference )
					else
						entity = DocType::DEFAULT_ENTITIES[ entity_reference ]
						entity_value = entity && entity.value
					end
					# A literal pattern and a block replacement keep '.' in the
					# name and backslashes in the value from being interpreted.
					rv.gsub!( "&#{entity_reference};" ) { entity_value } if entity_value
				end
				rv.gsub!( /&amp;/, '&' )
			end
			rv
		end
	end
end
 |