1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* Changes to the encoding mechanism. If iconv is found, it is used first
  for encoding changes.  This should be the case on all 1.8 installations.
  When it isn't found (<1.6), the native REXML encoding mechanism is used.
  This cleaned out some files, and tightened up the code a bit; and iconv
  should be faster than the pure Ruby code.
* Changed deprecated assert_not_nil to assert throughout the tests.
* Parse exceptions are a little more verbose, and extend RuntimeError.
* Bug fixes to XPathParser
* The Light API is still shifting, like the sands of the desert.
* Fixed a new Ruby 1.8.0 warning, added some speed optimizations, and
  tightened error reporting in the base parser


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4737 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
ser 2003-10-10 12:54:46 +00:00
parent 662532be00
commit 7d21c237cc
23 changed files with 185 additions and 224 deletions

View file

@ -2,61 +2,49 @@ module REXML
module Encoding
@@uconv_available = false
ENCODING_CLAIMS = { }
# Registers +encoding_str+ in ENCODING_CLAIMS under a pattern that
# identifies documents using that encoding.
#
# encoding_str:: the name stored as the value in ENCODING_CLAIMS
# match::        optional Regexp used as the key; when omitted, a
#                case-insensitive pattern for an XML declaration that
#                names +encoding_str+ is built
#
# Returns +encoding_str+.
def Encoding.claim( encoding_str, match=nil )
  # The '?' of "<?xml" has to be escaped: left bare it only makes the '<'
  # optional, and the pattern can then never match a real XML declaration.
  match ||= /^\s*<\?xml\s*version=(['"]).*?\1\s*encoding=(["'])#{encoding_str}\2/i
  ENCODING_CLAIMS[ match ] = encoding_str
end
# Native, default format is UTF-8, so it is declared here rather than in
# an encodings/ definition.
UTF_8 = 'UTF-8'
claim( UTF_8 )
UTF_16 = 'UTF-16'
UNILE = 'UNILE'
# ID ---> Encoding name
attr_reader :encoding
# Sets @encoding (stored upper-cased) and loads the file that supplies the
# conversion routines for it.  A nil +enc+ is treated as UTF_8.
#
# $VERBOSE is switched off while the files are loaded and restored in the
# ensure clause, including on the early return and when an error is raised.
#
# Raises Exception when neither the iconv wrapper nor an encoding-specific
# file can be loaded for a non-UTF-8 encoding.
#
# NOTE(review): this body contains both a `require` of the encoding file
# and an iconv-first `load` sequence; it looks like two strategies merged
# together -- confirm which one is intended.
def encoding=( enc )
enc = UTF_8 unless enc
old_verbosity = $VERBOSE
begin
$VERBOSE = false
# Nothing to do when the same encoding is already set.  The comparison uses
# +enc+ as given, while @encoding is stored upper-cased.
return if defined? @encoding and enc == @encoding
if enc and enc != UTF_8
@encoding = enc.upcase
# This require sits outside the inner begin/rescue, so a LoadError raised
# here is not handled by the fallback below.
require "rexml/encodings/#@encoding" unless @encoding == UTF_8
begin
load 'rexml/encodings/ICONV.rb'
# Presumably a probe: converting an empty string fails if iconv does not
# know @encoding -- TODO confirm.
Iconv::iconv( UTF_8, @encoding, "" )
rescue LoadError, Exception => err
# Fallback: load rexml/encodings/<ENCODING>.rb instead (err is not used).
enc_file = File.join( "rexml", "encodings", "#@encoding.rb" )
begin
load enc_file
rescue LoadError
raise Exception.new( "No decoder found for encoding #@encoding. Please install iconv." )
end
end
else
# Reached when enc equals UTF_8 (nil was already replaced above).
enc = UTF_8
@encoding = enc.upcase
load 'rexml/encodings/UTF-8.rb'
end
ensure
$VERBOSE = old_verbosity
end
end
# Works out the encoding name for the document text +str+.
#
# The patterns registered in ENCODING_CLAIMS are tried first; the value of
# the first matching entry is returned.  When none matches, a declared
# encoding raises a RuntimeError, otherwise the first two elements of +str+
# are compared with byte-order-mark values and UTF_8 is the final default.
#
# NOTE(review): both literal patterns below contain an unescaped '?' after
# '<', which makes the '<' optional rather than matching "<?xml" -- confirm
# whether they can match a real XML declaration.
def check_encoding str
# Hash#find yields [pattern, name] pairs; rv is the first matching pair or nil.
rv = ENCODING_CLAIMS.find{|k,v| str =~ k }
# Raise an exception if there is a declared encoding and we don't
# recognize it
unless rv
if str =~ /^\s*<?xml\s*version=(['"]).*?\1\s*encoding=(["'])(.*?)\2/
raise "A matching encoding handler was not found for encoding '#{$3}', or the encoding handler failed to load due to a missing support library (such as uconv)."
else
# We have to recognize UTF-16, LSB UTF-16, and UTF-8
# NOTE(review): these comparisons need str[0] / str[1] to be integer byte
# values; where String#[] returns a one-character String they are never
# true -- confirm against the targeted Ruby version.
return UTF_16 if str[0] == 254 && str[1] == 255
return UNILE if str[0] == 255 && str[1] == 254
# NOTE(review): the first backreference is \2, used before group 2 has been
# opened (the pattern above uses \1 at this position), and $1 below is the
# quote character of the version attribute; the encoding name is captured
# by group 3.  Looks unintended -- verify.
str =~ /^\s*<?xml\s*version=(['"]).*?\2\s*encoding=(["'])(.*?)\2/um
return $1.upcase if $1
return UTF_8
end
end
# rv is a [pattern, name] pair; hand back the name.
return rv[1]
end
# Identity conversion: hands +str+ back unchanged.
def to_utf_8( str )
  str
end
# Identity conversion: hands +content+ back unchanged.
def from_utf_8( content )
  content
end
end
# Runs at load time: looks through every load-path directory for
# rexml/encodings/*_decl.rb files and requires each one found.
module Encodingses
  decl_files = []
  $:.each do |load_dir|
    # Only glob for declaration files where the encodings directory exists.
    unless Dir[ File.join(load_dir, 'rexml', 'encodings') ].empty?
      decl_files |= Dir[ File.join(load_dir, 'rexml', 'encodings', '*_decl.rb') ]
    end
    # Reduce to bare file names and drop duplicates after every directory.
    decl_files.collect!{ |path| File.basename(path) }
    decl_files.uniq!
  end
  decl_files.each { |decl| require "rexml/encodings/#{decl}" }
end
end

View file

@ -3,11 +3,11 @@ begin
module REXML
module Encoding
def from_euc_jp(str)
def decode(str)
return Uconv::euctou8(str)
end
def to_euc_jp content
def encode content
return Uconv::u8toeuc(content)
end
end
@ -17,12 +17,12 @@ rescue LoadError
require 'iconv'
module REXML
module Encoding
def from_euc_jp(str)
return Iconv::iconv("utf-8", "euc-jp", str).join('')
def decode(str)
return Iconv::iconv("utf-8", "euc-jp", str)[0]
end
def to_euc_jp content
return Iconv::iconv("euc-jp", "utf-8", content).join('')
def encode content
return Iconv::iconv("euc-jp", "utf-8", content)[0]
end
end
end

View file

@ -1,6 +0,0 @@
# Declaration for the EUC-JP encoding.
module REXML
module Encoding
# Name under which the encoding is registered.
EUC_JP = 'EUC-JP'
# Registers the name through claim, with no explicit pattern argument.
claim( EUC_JP )
end
end

View file

@ -0,0 +1,14 @@
require "iconv"
raise LoadError unless defined? Iconv
module REXML
  module Encoding
    # Converts +str+ from the encoding named by @encoding into UTF-8 via
    # Iconv and returns the first element of the result.
    def decode( str )
      converted = Iconv.iconv( UTF_8, @encoding, str )
      converted[0]
    end

    # Converts UTF-8 +content+ into the encoding named by @encoding via
    # Iconv and returns the first element of the result.
    def encode( content )
      converted = Iconv.iconv( @encoding, UTF_8, content )
      converted[0]
    end
  end
end

View file

@ -1,7 +1,7 @@
module REXML
module Encoding
# Convert from UTF-8
def to_iso_8859_1 content
def encode content
array_utf8 = content.unpack('U*')
array_enc = []
array_utf8.each do |num|
@ -16,7 +16,7 @@ module REXML
end
# Convert to UTF-8
def from_iso_8859_1(str)
def decode(str)
str.unpack('C*').pack('U*')
end
end

View file

@ -1,6 +0,0 @@
# Declaration for the ISO-8859-1 encoding.
module REXML
module Encoding
# Name under which the encoding is registered.
ISO_8859_1 = 'ISO-8859-1'
# Registers the name through claim, with no explicit pattern argument.
claim( ISO_8859_1 )
end
end

View file

@ -1,6 +0,0 @@
# Declaration for the Shift-JIS encoding.
module REXML
module Encoding
# Both spellings (hyphen and underscore) are registered through claim.
claim( 'Shift-JIS' )
claim( 'Shift_JIS' )
end
end

View file

@ -1,6 +1,6 @@
module REXML
module Encoding
def to_unile content
def encode content
array_utf8 = content.unpack("U*")
array_enc = []
array_utf8.each do |num|
@ -15,7 +15,7 @@ module REXML
array_enc.pack('C*')
end
def from_unile(str)
def decode(str)
array_enc=str.unpack('C*')
array_utf8 = []
2.step(array_enc.size-1, 2){|i|

View file

@ -1,6 +0,0 @@
# Declaration for the UNILE encoding.
module REXML
module Encoding
# Name under which the encoding is registered.
UNILE = 'UNILE'
# Registered with an explicit pattern: the octal escapes \377\376 are the
# bytes 0xFF 0xFE at the start of a line.
claim( UNILE, /^\377\376/ )
end
end

View file

@ -1,7 +1,7 @@
module REXML
module Encoding
# Convert from UTF-8
def to_us_ascii content
def encode content
array_utf8 = content.unpack('U*')
array_enc = []
array_utf8.each do |num|
@ -16,7 +16,7 @@ module REXML
end
# Convert to UTF-8
def from_us_ascii(str)
def decode(str)
str.unpack('C*').pack('U*')
end
end

View file

@ -1,6 +0,0 @@
# Declaration for the US-ASCII encoding.
module REXML
module Encoding
# Name under which the encoding is registered.
US_ASCII = 'US-ASCII'
# Registers the name through claim, with no explicit pattern argument.
claim( US_ASCII )
end
end

View file

@ -1,6 +1,6 @@
module REXML
module Encoding
def to_utf_16 content
def encode content
array_utf8 = content.unpack("U*")
array_enc = []
array_utf8.each do |num|
@ -15,7 +15,7 @@ module REXML
array_enc.pack('C*')
end
def from_utf_16(str)
def decode(str)
array_enc=str.unpack('C*')
array_utf8 = []
2.step(arrayEnc.size-1, 2){|i|

View file

@ -1,6 +0,0 @@
# Declaration for the UTF-16 encoding.
module REXML
module Encoding
# Name under which the encoding is registered.
UTF_16 = 'UTF-16'
# Registered with an explicit pattern: the octal escapes \376\377 are the
# bytes 0xFE 0xFF at the start of a line.
claim( UTF_16, /^\376\377/ )
end
end

View file

@ -0,0 +1,11 @@
module REXML
  module Encoding
    # Identity conversion: +content+ is returned unchanged.
    def encode( content )
      return content
    end

    # Identity conversion: +str+ is returned unchanged.
    def decode( str )
      return str
    end
  end
end

View file

@ -1,76 +1,58 @@
require 'rexml/xmltokens'
require 'rexml/light/node'
# Development model
# document = Node.new
# Add an element "foo" to the document
# foo = document << "foo"
# # Set attribute "attr" on foo
# foo["attr"] = "la"
# # Set another attribute in a different namespace
# foo["attr", "namespace"] = "too"
# # Swap foo into another namespace
# foo.namespace = "blah"
# # Add a couple of element nodes to foo
# foo << "a"
# foo << "b"
# # Access the children of foo in various ways
# a = foo[0]
# foo.each { |child|
# #...
# }
# # Add text to foo
# # Add instruction
# # Add comment
# # Get the root of the document
# document == a.root
# # Write the document out
# puts document.to_s
# [ :element, parent, name, attributes, children* ]
# a = Node.new
# a << "B" # => <a>B</a>
# a.b # => <a>B<b/></a>
# a.b[1] # => <a>B<b/><b/><a>
# a.b[1]["x"] = "y" # => <a>B<b/><b x="y"/></a>
# a.b[0].c # => <a>B<b><c/></b><b x="y"/></a>
# a.b.c << "D" # => <a>B<b><c>D</c></b><b x="y"/></a>
module REXML
module Light
# Represents a tagged XML element. Elements are characterized by
# having children, attributes, and names, and can themselves be
# children.
class Node < Array
alias :_old_get :[]
alias :_old_put :[]=
class Node
NAMESPLIT = /^(?:(#{XMLTokens::NCNAME_STR}):)?(#{XMLTokens::NCNAME_STR})/u
PARENTS = [ :element, :document, :doctype ]
# Create a new element.
def initialize node=nil
@node = node
if node.kind_of? String
node = [ :text, node ]
elsif node.nil?
node = [ :document, nil, nil ]
elsif node[0] == :start_element
node[0] = :element
elsif node[0] == :start_doctype
node[0] = :doctype
elsif node[0] == :start_document
node[0] = :document
end
replace( node )
_old_put( 1, 0, 1 )
_old_put( 1, nil )
end
def size
el!()
super-4
if PARENTS.include? @node[0]
@node[-1].size
else
0
end
end
def each( &block )
el!()
size.times { |x| yield( at(x+4) ) }
end
def name
el!()
at(2)
end
def name=( name_str, ns=nil )
el!()
pfx = ''
pfx = "#{prefix(ns)}:" if ns
_old_put(1, "#{pfx}#{name_str}")
_old_put(2, "#{pfx}#{name_str}")
end
def parent=( node )
@ -78,28 +60,23 @@ module REXML
end
def local_name
el!()
namesplit
@name
end
def local_name=( name_str )
el!()
_old_put( 1, "#@prefix:#{name_str}" )
end
def prefix( namespace=nil )
el!()
prefix_of( self, namespace )
end
def namespace( prefix=prefix() )
el!()
namespace_of( self, prefix )
end
def namespace=( namespace )
el!()
@prefix = prefix( namespace )
pfx = ''
pfx = "#@prefix:" if @prefix.size > 0
@ -107,7 +84,6 @@ module REXML
end
def []( reference, ns=nil )
el!()
if reference.kind_of? String
pfx = ''
pfx = "#{prefix(ns)}:" if ns
@ -125,7 +101,6 @@ module REXML
# Doesn't handle namespaces yet
def []=( reference, ns, value=nil )
el!()
if reference.kind_of? String
value = ns unless value
at( 3 )[reference] = value
@ -170,12 +145,10 @@ module REXML
end
def has_name?( name, namespace = '' )
el!()
at(3) == name and namespace() == namespace
end
def children
el!()
self
end
@ -187,14 +160,6 @@ module REXML
end
def el!
if node_type() != :element and node_type() != :document
_old_put( 0, :element )
push({})
end
self
end
private
def namesplit

View file

@ -8,10 +8,6 @@ module REXML
@output = real_IO
self.encoding = encd
eval <<-EOL
alias :encode :to_#{encoding.tr('-', '_').downcase}
alias :decode :from_#{encoding.tr('-', '_').downcase}
EOL
@to_utf = encd == UTF_8 ? false : true
end

View file

@ -1,5 +1,5 @@
module REXML
class ParseException < Exception
class ParseException < RuntimeError
attr_accessor :source, :parser, :continued_exception
def initialize( message, source=nil, parser=nil, exception=nil )
@ -12,9 +12,9 @@ module REXML
def to_s
# Quote the original exception, if there was one
if @continued_exception
err = @continued_exception.message
err = @continued_exception.inspect
err << "\n"
err << @continued_exception.backtrace[0..3].join("\n")
err << @continued_exception.backtrace.join("\n")
err << "\n...\n"
else
err = ""
@ -24,17 +24,24 @@ module REXML
err << super
# Add contextual information
err << "\n#{@source.current_line}\nLast 80 unconsumed characters:\n#{@source.buffer[0..80].gsub(/\n/, ' ')}\n" if @source
err << "\nContext:\n#{@parser.context}" if @parser
if @source
err << "\nLine: #{line}\n"
err << "Position: #{position}\n"
err << "Last 80 unconsumed characters:\n"
err << @source.buffer[0..80].gsub(/\n/, ' ')
err << "\n"
err << @source.buffer[0..80].unpack("U*").inspect
end
err
end
def position
@source.current_line[0] if @source
@source.current_line[0] if @source and @source.current_line
end
def line
@source.current_line[2] if @source
@source.current_line[2] if @source and @source.current_line
end
def context

View file

@ -89,10 +89,10 @@ module REXML
EREFERENCE = /&(?!#{NAME};)/
DEFAULT_ENTITIES = {
'gt' => [/&gt;/, '&gt;', '>'],
'lt' => [/&lt;/, '&lt;', '<'],
'quot' => [/&quot;/, '&quot;', '"'],
"apos" => [/&apos;/, "&apos;", "'"]
'gt' => [/&gt;/, '&gt;', '>', />/],
'lt' => [/&lt;/, '&lt;', '<', /</],
'quot' => [/&quot;/, '&quot;', '"', /"/],
"apos" => [/&apos;/, "&apos;", "'", /'/]
}
def initialize( source )
@ -126,6 +126,7 @@ module REXML
# Returns true if there are more events. Synonymous with !empty?
def has_next?
  # A closed parser always reports more events.
  return true if @closed
  # Refill an exhausted buffer while the source still has input.
  if @source.buffer.size == 0 && !@source.empty?
    @source.read
  end
  # Non-whitespace text left in the buffer means more events.
  return true if !@source.empty? && @source.buffer.strip.size > 0
  # Otherwise only already-queued events (or the closed flag) count.
  @stack.size > 0 || @closed
end
@ -143,7 +144,7 @@ module REXML
# event, so you can effectively pre-parse the entire document (pull the
# entire thing into memory) using this method.
def peek depth=0
raise 'Illegal argument "#{depth}"' if depth < -1
raise %Q[Illegal argument "#{depth}"] if depth < -1
temp = []
if depth == -1
temp.push(pull()) until empty?
@ -166,8 +167,9 @@ module REXML
return @stack.shift if @stack.size > 0
@source.read if @source.buffer.size==0
if @document_status == nil
@source.match( /^\s*/um, true )
word = @source.match( /^\s*(<.*?)>/um )
@source.consume( /^\s*/um )
word = @source.match( /(<.*?)>/um )
#word = @source.match_to( '>', /(<.*?)>/um )
word = word[1] unless word.nil?
case word
when COMMENT_START
@ -190,7 +192,7 @@ module REXML
close = md[2]
identity =~ IDENTITY
name = $1
raise "DOCTYPE is missing a name" if name.nil?
raise REXML::ParseException("DOCTYPE is missing a name") if name.nil?
pub_sys = $2.nil? ? nil : $2.strip
long_name = $3.nil? ? nil : $3.strip
uri = $4.nil? ? nil : $4.strip
@ -278,6 +280,7 @@ module REXML
if @source.buffer[0] == ?<
if @source.buffer[1] == ?/
last_tag = @tags.pop
#md = @source.match_to_consume( '>', CLOSE_MATCH)
md = @source.match( CLOSE_MATCH, true )
raise REXML::ParseException.new( "Missing end tag for '#{last_tag}' "+
"(got \"#{md[1]}\")", @source) unless last_tag == md[1]
@ -286,18 +289,20 @@ module REXML
md = @source.match(/\A(\s*[^>]*>)/um)
#puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
case md[1]
when CDATA_START
return [ :cdata, @source.match( CDATA_PATTERN, true )[1] ]
when COMMENT_START
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
end
elsif @source.buffer[1] == ??
md = @source.match( INSTRUCTION_PATTERN, true )
return [ :processing_instruction, md[1], md[2] ]
return [ :processing_instruction, md[1], md[2] ] if md
raise REXML::ParseException.new( "Bad instruction declaration",
@source)
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
@ -318,17 +323,19 @@ module REXML
return [ :start_element, md[1], attributes ]
end
else
md = @source.match(TEXT_PATTERN, true)
raise "no text to add" if md[0].length == 0
md = @source.match( TEXT_PATTERN, true )
#md = @source.match_to_consume( '<', TEXT_PATTERN )
#@source.read
raise REXML::ParseException("no text to add") if md[0].length == 0
# unnormalized = Text::unnormalize( md[1], self )
# return PullEvent.new( :text, md[1], unnormalized )
return [ :text, md[1] ]
end
rescue REXML::ParseException
raise $!
raise
rescue Exception, NameError => error
raise REXML::ParseException.new( "Exception parsing",
@source, self, error )
@source, self, (error ? error : $!) )
end
return [ :dummy ]
end
@ -354,7 +361,7 @@ module REXML
end if entities
copy.gsub!( EREFERENCE, '&amp;' )
DEFAULT_ENTITIES.each do |key, value|
copy.gsub!( value[2], value[1] )
copy.gsub!( value[3], value[1] )
end
copy
end

View file

@ -16,25 +16,25 @@ module REXML
end
def parse
root = context = REXML::Light::Node.new([ :document ])
root = context = [ :document ]
while true
event = @parser.pull
case event[0]
when :end_document
break
when :end_doctype
context = context.parent
context = context[1]
when :start_element, :start_doctype
new_node = REXML::Light::Node.new(event)
new_node = event
context << new_node
new_node.parent = context
new_node[1,0] = [context]
context = new_node
when :end_element, :end_doctype
context = context.parent
context = context[1]
else
new_node = REXML::Light::Node.new(event)
new_node = event
context << new_node
new_node.parent = context
new_node[1,0] = [context]
end
end
root

View file

@ -31,7 +31,7 @@ module REXML
results = filter([element], path)
when /^\*/u
results = filter(element.to_a, path)
when /^[\[!\w:]/u
when /^[[!\w:]/u
# match on child
matches = []
children = element.to_a

View file

@ -21,6 +21,6 @@
# A tutorial is available in docs/tutorial.html
# Release metadata for REXML.
module REXML
  # Copyright notice; the year is taken from the clock when the file loads.
  Copyright = "Copyright #{Time.now.year} Sean Russell <ser@germane-software.com>"
  # Release date string.  Each of Date and Version was previously assigned
  # twice, which made Ruby warn about an already initialized constant; only
  # the later (effective) values are kept.
  Date = "+2003/283"
  # Release version string.
  Version = "2.7.2"
end

View file

@ -39,10 +39,6 @@ module REXML
# Overridden to support optimized en/decoding
def encoding=(enc)
super
eval <<-EOL
alias :encode :to_#{encoding.tr('-', '_').downcase}
alias :decode :from_#{encoding.tr('-', '_').downcase}
EOL
@line_break = encode( '>' )
if enc != UTF_8
@buffer = decode(@buffer)
@ -78,8 +74,22 @@ module REXML
# No-op: the body is empty, so a call does nothing and returns nil.
def read
end
# Replaces @buffer with the text that follows the first match of +pattern+.
# The buffer is left alone when nothing matches.  Returns the new buffer
# contents, or nil when there was no match.
def consume( pattern )
  md = pattern.match( @buffer )
  @buffer = md.post_match if md
end
# Matches +pattern+ against @buffer and returns the MatchData (nil when
# there is no match).  +char+ is accepted but not used here.
def match_to( char, pattern )
  pattern.match( @buffer )
end
# Matches +pattern+ against @buffer and, on success, replaces @buffer with
# the text following the match.  Returns the MatchData, or nil when there
# is no match.  +char+ is accepted but not used here.
def match_to_consume( char, pattern )
  md = pattern.match(@buffer)
  # Only consume on a successful match; assigning the post-match text
  # unconditionally would set @buffer to nil whenever the pattern fails.
  @buffer = md.post_match if md
  return md
end
# Matches +pattern+ against @buffer and returns the MatchData (nil when
# there is no match).  When +consume+ is true and the match succeeds,
# @buffer is replaced by the text following the match.
def match pattern, consume=false
  # The match is performed once; the duplicated call was redundant.
  md = pattern.match(@buffer)
  @buffer = md.post_match if consume and md
  return md
end
@ -112,7 +122,9 @@ module REXML
#@block_size = block_size
#super @source.read(@block_size)
@line_break = '>'
super @source.readline( @line_break )
#super @source.readline( "\n" )
super @source.readline( @line_break )+@source.read
@line_break = encode( '>' )
end
def scan pattern, consume=false
@ -145,11 +157,15 @@ module REXML
str = @source.readline('>')
str = decode(str) if @to_utf and str
@buffer << str
rescue
rescue Exception, NameError
@source = nil
end
end
# Delegates to match with the consume flag set, returning its result.
def consume( pattern )
  consume_flag = true
  match( pattern, consume_flag )
end
def match pattern, consume=false
rv = pattern.match(@buffer)
@buffer = $' if consume and rv

View file

@ -2,16 +2,6 @@ require 'rexml/namespace'
require 'rexml/xmltokens'
require 'rexml/parsers/xpathparser'
# Ignore this class. It adds a __ne__ method, because Ruby doesn't seem to
# understand object.send( "!=", foo ), whereas it *does* understand "<", "==",
# and all of the other comparison methods. Stupid, and annoying, and not at
# all POLS.
# Reopens Object so that every object responds to __ne__.
class Object
# Named wrapper around the != operator: returns the result of self != b.
def __ne__(b)
self != b
end
end
module REXML
# You don't want to use this class. Really. Use XPath, which is a wrapper
# for this class. Believe me. You don't want to poke around in here.
@ -132,11 +122,10 @@ module REXML
when :child
#puts "CHILD"
new_nodeset = []
ps_clone = nil
nt = nil
for node in nodeset
#ps_clone = path_stack.clone
#new_nodeset += internal_parse( ps_clone, node.children ) if node.parent?
new_nodeset += node.children if node.parent?
nt = node.node_type
new_nodeset += node.children if nt == :element or nt == :document
end
#path_stack[0,(path_stack.size-ps_clone.size)] = []
return new_nodeset
@ -238,9 +227,11 @@ module REXML
when :descendant
#puts ":DESCENDANT"
results = []
nt = nil
for node in nodeset
nt = node.node_type
results += internal_parse( path_stack.clone.unshift( :descendant_or_self ),
node.children ) if node.parent?
node.children ) if nt == :element or nt == :document
end
return results
@ -310,11 +301,13 @@ module REXML
def d_o_s( p, ns, r )
#puts r.collect{|n|n.to_s}.inspect
#puts ns.collect{|n|n.to_s}.inspect
nt = nil
ns.each_index do |i|
n = ns[i]
x = match( p.clone, [ n ] )
#puts "Got a match on #{p.inspect} for #{ns.collect{|n|n.to_s+"("+n.type.to_s+")"}.inspect}"
d_o_s( p, n.children, x ) if n.parent?
nt = n.node_type
d_o_s( p, n.children, x ) if nt == :element or nt == :document
r[i,0] = [x] if x.size > 0
end
end