2003-06-09 21:31:01 -04:00
|
|
|
require 'rexml/entity'
|
|
|
|
|
|
|
|
module REXML
|
|
|
|
# Represents text nodes in an XML document
|
|
|
|
class Text < Child
|
|
|
|
include Comparable
|
|
|
|
# Patterns for the characters that are escaped when text is written out, in
# the order in which the substitutions occur.  The ampersand pattern skips
# any "&" that already begins an entity or character reference.
SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
# Replacement text for each pattern in SPECIALS, index for index.
SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
# Characters which are substituted in written strings
SLAICEPS = [ '<', '>', '"', "'", '&' ]
# Patterns matching the escaped form of each character in SLAICEPS, index
# for index.  "&amp;" is last so that it is unescaped after the others.
SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
|
|
|
|
|
|
|
|
# If +raw+ is true, then REXML leaves the value alone
|
|
|
|
attr_accessor :raw
|
|
|
|
|
|
|
|
# Matches the first character that is not allowed in a raw string: a literal
# "<", or an "&" that is followed neither by an entity name (Entity::NAME,
# defined outside this file) nor by a numeric character reference ending in
# ";".  The offending character is captured in group 1.
ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
|
|
|
|
# Matches a numeric character reference such as "&#65;", "&#0065;" or
# "&#x41;".  Leading zeros are skipped; group 1 holds the decimal digits, or
# "x" followed by the hexadecimal digits.
NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
|
|
|
|
|
|
|
|
# Constructor
#
# +arg+ if a String, the content is set to a copy of the String.  If a
# Text, its string value and +raw+ setting are copied.  Any other type
# raises an Exception.
#
# +respect_whitespace+ (boolean, false) if true, whitespace is
# respected; otherwise runs of a repeated space, newline or tab in a
# String +arg+ are squeezed to a single character.
#
# +parent+ (nil) if this is a Parent object, the parent
# will be set to this.
#
# +raw+ (nil) This argument can be given three values.
# If true, then the value used to construct this object is expected to
# contain no unescaped XML markup, and REXML will not change the text. If
# this value is false, the string may contain any characters, and REXML will
# escape any and all defined entities whose values are contained in the
# text.  If this value is nil (the default), then the raw value of the
# parent will be used as the raw value for this node.  If there is no raw
# value for the parent, and no value is supplied, the default is false.
#   Text.new( "<&", false, nil, false )        #-> "&lt;&amp;"
#   Text.new( "<&", false, nil, true )         #-> raises Exception (illegal character)
#   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
#   # Assume that the entity "s" is defined to be "sean"
#   # and that the entity "r" is defined to be "russell"
#   Text.new( "sean russell" )                 #-> "&s; &r;"
#   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
#
# +entity_filter+ (nil) This can be an array of entities to match in the
# supplied text.  This argument is only useful if +raw+ is set to false.
#   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
#   Text.new( "sean russell", false, nil, true, ["s"] )  #-> "sean russell"
# In the last example, the +entity_filter+ argument is ignored.
#
# +illegal+ INTERNAL USE ONLY.  Pattern that a raw string must not match;
# its first capture group is quoted in the error message.
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
               entity_filter=nil, illegal=ILLEGAL )

  @raw = false

  if parent
    super( parent )
    @raw = parent.raw
  else
    @parent = nil
  end

  # An explicit +raw+ argument overrides whatever the parent supplied.
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter
  @normalized = @unnormalized = nil

  if arg.kind_of? String
    # dup rather than clone: the copy is edited in place below, which a
    # clone of a frozen String would not allow.
    @string = arg.dup
    @string.squeeze!(" \n\t") unless respect_whitespace
  elsif arg.kind_of? Text
    # Copy the string so that later in-place edits of either node cannot
    # leak into the other.
    @string = arg.to_s.dup
    @raw = arg.raw
  else
    raise Exception.new( "Illegal argument of type #{arg.class} for Text constructor (#{arg})" )
  end

  # Normalize line ends to "\n"
  @string.gsub!( /\r\n?/, "\n" )

  # check for illegal characters
  if @raw && @string =~ illegal
    raise Exception.new(
      "Illegal character '#{$1}' in raw string \"#{@string}\""
    )
  end
end
|
|
|
|
|
|
|
|
# Returns the symbol identifying this kind of node, which is always :text.
def node_type
  :text
end
|
|
|
|
|
|
|
|
# Returns true if this node's string has no characters at all.
def empty?
  @string.empty?
end
|
|
|
|
|
|
|
|
|
|
|
|
# Returns a new Text built from this node by passing it to Text.new.
def clone
  Text.new(self)
end
|
|
|
|
|
|
|
|
|
|
|
|
# Appends text to this text node.  The text is appended in the +raw+ mode
# of this text node.
#
# +to_append+ the String to add; "\r\n" and "\r" line ends become "\n"
# +returns+ the node's internal string
def <<( to_append )
  # The cached results of to_s and value were computed from the old
  # content, so they must be rebuilt on next use.
  @normalized = @unnormalized = nil
  @string << to_append.gsub( /\r\n?/, "\n" )
end

# +other+ a String or a Text
# +returns+ the result of (to_s <=> arg.to_s)
def <=>( other )
  to_s() <=> other.to_s
end

# Pattern for an entity reference, built from Entity::REFERENCE (defined
# outside this file).
REFERENCE = /#{Entity::REFERENCE}/

# Returns the string value of this text node.  This string is always
# escaped, meaning that it is a valid XML text node string, and all
# entities that can be escaped, have been inserted.  This method respects
# the entity filter set in the constructor.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.to_s   #-> "sean russell"
#
# A raw node returns its string unchanged.  Otherwise the escaped string is
# computed once with Text::normalize and cached.
def to_s
  return @string if @raw
  return @normalized if @normalized

  # Use the doctype of the owning document, when there is one.
  doctype = nil
  if @parent
    doc = @parent.document
    doctype = doc.doctype if doc
  end

  @normalized = Text::normalize( @string, doctype, @entity_filter )
end

# Returns the string value of this text.  This is the text without
# entities, as it might be used programmatically, or printed to the
# console.  This ignores the 'raw' attribute setting, and any
# entity_filter.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.value   #-> "< & sean russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.value   #-> "< & sean russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.value   #-> "sean russell"
#
# The result is computed once with Text::unnormalize and cached.
def value
  return @unnormalized if @unnormalized

  # Use the doctype of the owning document, when there is one.
  doctype = nil
  if @parent
    doc = @parent.document
    doctype = doc.doctype if doc
  end

  @unnormalized = Text::unnormalize( @string, doctype )
end
|
2004-02-13 17:40:14 -05:00
|
|
|
|
|
|
|
# Recursively wraps +string+ so that lines break at a space at or before
# column +width+.
#
# +string+ the text to wrap
# +width+ the column at which to break
# +addnewline+ (false) if true, the result is prefixed with a newline
#
# +returns+ the wrapped text.  +string+ itself is returned when it is no
# longer than +width+ or contains no space to break at.
def wrap(string, width, addnewline=false)
  return string if string.length <= width

  # Break at the last space at or before the cutoff.  When the first word is
  # itself longer than +width+, break at the first space after it instead.
  place = string.rindex(' ', width) || string.index(' ', width)
  # Without any space there is nowhere to break.
  return string unless place

  wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
  addnewline ? "\n" + wrapped : wrapped
end
|
|
|
|
|
|
|
|
# Prefixes every line of +string+ with +style+ repeated +level+ times.
#
# +string+ the text to indent
# +level+ (1) how many times +style+ is repeated; a negative level returns
# +string+ unchanged
# +style+ ("\t") the text used for one level of indentation
# +indentfirstline+ (true) if false, leading and trailing whitespace is
# stripped from the result, which removes the first line's indentation
#
# NOTE(review): trailing whitespace is removed from each line before it is
# appended, and that includes the line terminator, so the lines end up
# joined without newlines.  This is kept as found; confirm it is intended.
def indent(string, level=1, style="\t", indentfirstline=true)
  return string if level < 0
  indent_string = style * level
  new_string = ''
  # each_line, because String#each no longer exists in current Ruby.
  string.each_line { |line|
    new_string << (indent_string + line).sub(/[\s]+$/,'')
  }
  new_string.strip! unless indentfirstline
  return new_string
end
|
|
|
|
|
2003-06-09 21:31:01 -04:00
|
|
|
# Writes the escaped string of this node (see to_s) to +writer+.
#
# +writer+ any object supporting <<( String )
# +indent+ (-1) indentation level; text containing a newline is passed to
# the indent helper when this is greater than zero
# +transitive+, +ie_hack+ accepted but not used by this method
#
# Wrapping and indenting are skipped when the parent's +whitespace+ flag is
# set.  Otherwise the text is wrapped at 60 columns if the parent's context
# has :wordwrap set to :all, and is indented with the context's
# :indentstyle, or a default style when there is no parent or no such
# setting.
#
# +returns+ the result of writer << text
def write( writer, indent=-1, transitive=false, ie_hack=false )
  s = to_s()
  unless @parent && @parent.whitespace
    s = wrap(s, 60, false) if @parent && @parent.context[:wordwrap] == :all
    if @parent && !@parent.context[:indentstyle].nil?
      indentstyle = @parent.context[:indentstyle]
    else
      indentstyle = ' '
    end
    if s.count("\n") > 0 && indent > 0
      # The parentheses make this a call to the indent method rather than a
      # reference to the +indent+ argument.
      s = indent(s, indent, indentstyle, false)
    end
  end
  writer << s
end
|
|
|
|
|
|
|
|
# Writes out text, substituting special characters beforehand.
#
# +out+ A String, IO, or any other object supporting <<( String )
# +input+ the text to substitute and then write out; it is not modified
#
# +returns+ the result of out << text
#
# Note kept from the original source, showing how UTF-8 text can be turned
# into ASCII with character references:
#
#   z=utf8.unpack("U*")
#   ascOut=""
#   z.each{|r|
#     if r < 0x100
#       ascOut.concat(r.chr)
#     else
#       ascOut.concat(sprintf("&#x%x;", r))
#     end
#   }
#   puts ascOut
def write_with_substitution( out, input )
  # dup rather than clone, so that a frozen +input+ can still be edited.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
  copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
  copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
  copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
  copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
  copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
  out << copy
end
|
|
|
|
|
|
|
|
# Reads text, substituting entities
#
# +input+ the escaped text; it is not modified
# +illegal+ (nil) optional pattern; if +input+ matches it a ParseException
# is raised
#
# +returns+ a copy of +input+ with line ends normalized to "\n", the five
# escaped characters listed in SETUTITSBUS replaced by the characters in
# SLAICEPS, and numeric character references replaced by the characters
# they denote.
#
# NOTE(review): "&amp;" is unescaped before numeric references are
# expanded, so input such as "&amp;#65;" is unescaped twice.  Kept as
# found; confirm whether a single pass is wanted.
def Text::read_with_substitution( input, illegal=nil )
  # dup rather than clone, so that a frozen +input+ can still be edited.
  copy = input.dup

  if illegal && copy =~ illegal
    raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
  end

  copy.gsub!( /\r\n?/, "\n" )
  if copy.include? ?&
    copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
    copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
    copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
    copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
    copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
    # Numeric character references, decimal or hexadecimal (either case).
    copy.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
      code = $1
      # Integer() needs the "0x" prefix to read hexadecimal digits.
      code = "0#{code}" if code[0] == ?x
      [Integer(code)].pack('U*')
    }
  end
  copy
end
|
|
|
|
|
|
|
|
# Matches an "&" that is not the start of an entity reference, that is, one
# not followed by an entity name (Entity::NAME, defined outside this file)
# and ";".
EREFERENCE = /&(?!#{Entity::NAME};)/

# Escapes all possible entities
#
# +input+ the unescaped text; it is not modified
# +doctype+ (nil) if given, its entities are used; otherwise
# DocType::DEFAULT_ENTITIES are used
# +entity_filter+ (nil) entities found in this collection are not
# substituted; only consulted when +doctype+ is given
#
# +returns+ a copy of +input+ in which every bare "&" has become "&amp;"
# and every occurrence of an entity's value has been replaced by a
# reference to that entity.
#
# NOTE(review): the filter is tested against the entity object itself, not
# its name, although the constructor documentation shows an array of
# names.  Kept as found; confirm which is intended.
def Text::normalize( input, doctype=nil, entity_filter=nil )
  # dup rather than clone, so that a frozen +input+ can still be edited.
  copy = input.dup
  # Escape every ampersand that is not already part of an entity reference
  copy.gsub!( EREFERENCE, '&amp;' )
  # Doing it like this rather than in a loop improves the speed
  if doctype
    doctype.entities.each_value do |entity|
      copy.gsub!( entity.value,
        "&#{entity.name};" ) if entity.value and
          not( entity_filter and entity_filter.include?(entity) )
    end
  else
    DocType::DEFAULT_ENTITIES.each_value do |entity|
      copy.gsub!(entity.value, "&#{entity.name};" )
    end
  end
  copy
end
|
|
|
|
|
|
|
|
# Unescapes all possible entities
#
# +string+ the escaped text; it is not modified
# +doctype+ (nil) if given, entity values are looked up with
# doctype.entity; otherwise DocType::DEFAULT_ENTITIES is used
# +filter+ (nil) entity names found in this collection are left as
# references
# +illegal+ (nil) accepted but not used by this method
#
# +returns+ a copy of +string+ with line ends normalized to "\n", numeric
# character references replaced by their characters, known entity
# references replaced by their values, and finally "&amp;" replaced by "&".
def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
  # dup rather than clone, so that a frozen +string+ can still be edited.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE )
  return rv if matches.size == 0

  rv.gsub!( NUMERICENTITY ) {
    code = $1
    # Integer() needs the "0x" prefix to read hexadecimal digits.
    code = "0#{code}" if code[0] == ?x
    [Integer(code)].pack('U*')
  }

  # The first capture group of each match is the entity name; matches
  # without one are dropped, and each name is handled once.
  names = matches.collect {|x| x[0] }.compact.uniq
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      if doctype
        entity_value = doctype.entity( entity_reference )
      else
        entity = DocType::DEFAULT_ENTITIES[ entity_reference ]
        entity_value = entity && entity.value
      end
      # Escape the name: characters such as "." must match literally.
      re = /&#{Regexp.escape( entity_reference )};/
      rv.gsub!( re, entity_value ) if entity_value
    end
    # "&amp;" goes last, so that the "&" it yields is not read as the start
    # of another reference.
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
|
|
|
|
end
|
|
|
|
end
|