mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
58ef0f06c6
UTF-8 instead of US-ASCII. [ruby-core:46021] [Feature #6679] * parse.y (parser_initialize): set default parser encoding as UTF-8 instead of US-ASCII. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37485 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
296 lines
7.6 KiB
Ruby
296 lines
7.6 KiB
Ruby
# coding: US-ASCII
|
|
require 'rexml/encoding'
|
|
|
|
module REXML
|
|
# Generates Source objects. Use this class to create them.
|
|
class SourceFactory
  # Picks the Source that fits +arg+.
  # @param arg an IO-like object (one that responds to read, readline,
  #   nil? and eof?), a String-like object (one that responds to to_str),
  #   or an existing Source
  # @return an IOSource wrapping +arg+ (String-like input is wrapped in a
  #   StringIO first), or +arg+ itself when it already is a Source
  # @raise RuntimeError when +arg+ is none of the above
  def self.create_from(arg)
    io_like = [:read, :readline, :nil?, :eof?].all? { |name| arg.respond_to?(name) }
    return IOSource.new(arg) if io_like

    if arg.respond_to?(:to_str)
      # Loaded lazily: only String input needs StringIO.
      require 'stringio'
      return IOSource.new(StringIO.new(arg))
    end

    return arg if arg.kind_of?(Source)

    raise "#{arg.class} is not a valid input stream. It must walk \n" +
          "like either a String, an IO, or a Source."
  end
end
|
|
|
|
# A Source can be searched for patterns; it wraps buffers and other
# objects and provides consumption of text.
|
|
class Source
  # REXML's encoding mixin, not the core ::Encoding class; the name is
  # spelled out in full so the two cannot be confused. This class relies on
  # it for the inherited encoding= and for decode.
  include REXML::Encoding

  # Byte-order marks recognised by detect_encoding, each paired with the
  # encoding name it implies. The markers are tagged as binary explicitly so
  # that comparing them with the binary-tagged buffer does not depend on the
  # encoding declared for this source file.
  BOM_ENCODINGS = [
    ["\xfe\xff",     "UTF-16BE"],
    ["\xff\xfe",     "UTF-16LE"],
    ["\xef\xbb\xbf", "UTF-8"],
  ].map { |marker, name| [marker.dup.force_encoding("ASCII-8BIT"), name] }.freeze
  private_constant :BOM_ENCODINGS

  # The current buffer (what we're going to read next)
  attr_reader :buffer
  # The line number of the last consumed text
  attr_reader :line
  attr_reader :encoding

  # Constructor
  # @param arg must be a String, and should be a valid XML document
  # @param encoding if non-null, sets the encoding of the source to this
  # value, overriding all encoding detection
  def initialize(arg, encoding=nil)
    # Encoding handling re-tags (and may trim) the buffer in place, which a
    # frozen String would reject, so work on a private copy in that case.
    arg = arg.dup if arg.frozen?
    @orig = @buffer = arg
    if encoding
      self.encoding = encoding
    else
      detect_encoding
    end
    @line = 0
  end

  # Inherited from Encoding
  # Overridden to support optimized en/decoding: when the inherited setter
  # returns a true value, the buffer is re-tagged or transcoded through
  # encoding_updated.
  def encoding=(enc)
    return unless super
    encoding_updated
  end

  # Scans the source for a given pattern.  Note, that this is not your
  # usual scan() method.  For one thing, the pattern argument has some
  # requirements; for another, the source can be consumed.  You can easily
  # confuse this method.  Originally, the patterns were easier
  # to construct and this method more robust, because this method
  # generated search regexes on the fly; however, this was
  # computationally expensive and slowed down the entire REXML package
  # considerably, since this is by far the most commonly called method.
  # @param pattern must be a Regexp, and must be in the form of
  # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
  # will be returned; the second group is used if the consume flag is
  # set.
  # @param cons if true, the pattern returned will be consumed, leaving
  # everything after it in the Source.
  # @return the result of String#scan on the buffer, or nil if the buffer
  # itself is nil.
  def scan(pattern, cons=false)
    return nil if @buffer.nil?
    rv = @buffer.scan(pattern)
    # $' is the text following the last match made by String#scan.
    @buffer = $' if cons and rv.size>0
    rv
  end

  # Does nothing here; IOSource overrides it to pull more data from its IO.
  def read
  end

  # Drops everything up to and including the first match of +pattern+.
  # The buffer is left untouched when the pattern does not match.
  def consume( pattern )
    @buffer = $' if pattern.match( @buffer )
  end

  # @param char unused by this implementation
  # @return the MatchData for +pattern+ against the buffer, or nil
  def match_to( char, pattern )
    return pattern.match(@buffer)
  end

  # Like match_to, but also consumes the matched text.
  # @param char unused by this implementation
  # @return the MatchData for +pattern+ against the buffer, or nil
  def match_to_consume( char, pattern )
    md = pattern.match(@buffer)
    # Only advance on success. Assigning the post-match text
    # unconditionally would replace the buffer with nil after a failed
    # match and lose the rest of the document.
    @buffer = md.post_match if md
    return md
  end

  # @param pattern the Regexp to match against the buffer
  # @param cons if true, a successful match is consumed from the buffer
  # @return the MatchData, or nil when the pattern does not match
  def match(pattern, cons=false)
    md = pattern.match(@buffer)
    @buffer = md.post_match if cons and md
    return md
  end

  # @return true if the Source is exhausted
  def empty?
    @buffer == ""
  end

  # @return the offset of the unread buffer inside the original string, or
  # nil if the buffer cannot be found there
  def position
    # The unread text is the tail of the original, so search from the end:
    # the first occurrence would report the wrong offset for repeated
    # content and 0 for an exhausted buffer.
    @orig.rindex( @buffer )
  end

  # @return the current line in the source
  # NOTE(review): String#split without arguments splits on whitespace, and
  # grep with a String selects only elements equal to it, so this returns
  # an index only when the first 31 characters of the buffer are exactly
  # one whitespace-separated token of the original; otherwise nil. The
  # behaviour is kept as is because callers may rely on the nil result.
  def current_line
    tokens = @orig.split
    hit = tokens.grep( @buffer[0..30] ).last
    tokens.index( hit ) if hit
  end

  private
  # Sniffs a byte-order mark at the start of the buffer, removes it, and
  # sets the encoding it implies (UTF-8 when there is none).
  def detect_encoding
    original_encoding = @buffer.encoding
    detected_encoding = "UTF-8"
    begin
      # Compare raw bytes; the original tag is restored in the ensure.
      @buffer.force_encoding("ASCII-8BIT")
      marker, name = BOM_ENCODINGS.find { |bom, _| @buffer[0, bom.bytesize] == bom }
      if marker
        @buffer[0, marker.bytesize] = ""
        detected_encoding = name
      end
    ensure
      @buffer.force_encoding(original_encoding)
    end
    self.encoding = detected_encoding
  end

  # Brings the buffer in line with @encoding: anything other than UTF-8 is
  # passed through decode, UTF-8 data is merely tagged as such. @to_utf
  # records which of the two happened.
  def encoding_updated
    if @encoding != 'UTF-8'
      @buffer = decode(@buffer)
      @to_utf = true
    else
      @to_utf = false
      @buffer.force_encoding ::Encoding::UTF_8
    end
  end
end
|
|
|
|
# A Source that wraps an IO. See the Source class for method
# documentation.
|
|
class IOSource < Source
  #attr_reader :block_size

  # block_size has been deprecated
  # @param arg the IO-like object to read from
  # @param block_size ignored; kept so existing callers keep working
  # @param encoding if given, used as is; otherwise the first three bytes
  # of the IO are read ahead so a byte-order mark can be detected
  def initialize(arg, block_size=500, encoding=nil)
    @er_source = @source = arg
    @to_utf = false
    @pending_buffer = nil

    if encoding
      super("", encoding)
    else
      super(@source.read(3) || "")
    end

    # Data that is not transcoded still has to be tagged as UTF-8 when the
    # IO reports some other external encoding.
    @force_utf8 = !@to_utf &&
                  @buffer.respond_to?(:force_encoding) &&
                  @source.respond_to?(:external_encoding) &&
                  @source.external_encoding != ::Encoding::UTF_8
  end

  # Same contract as Source#scan, but keeps reading from the IO until the
  # pattern matches or the IO is exhausted.
  def scan(pattern, cons=false)
    rv = super
    # You'll notice that this next section is very similar to the same
    # section in match(), but just a liiittle different.  This is
    # because it is a touch faster to do it this way with scan()
    # than the way match() does it; enough faster to warrant duplicating
    # some code
    if rv.size == 0
      until @buffer =~ pattern or @source.nil?
        begin
          @buffer << readline
        rescue ::Encoding::InvalidByteSequenceError, ::Encoding::UndefinedConversionError
          # Transcoding failures are real errors and must reach the caller.
          # (This clause used to name Iconv::IllegalSequence; Iconv is no
          # longer part of Ruby, so evaluating that constant raised
          # NameError for every exception, including the ordinary EOF.)
          raise
        rescue
          # Anything else, typically EOFError, means there is no more data.
          @source = nil
        end
      end
      rv = super
    end
    # Object#taint does not exist on current Rubies.
    rv.taint if rv.respond_to?(:taint)
    rv
  end

  # Appends the next chunk of the IO to the buffer. A failure to read is
  # treated as end of input.
  def read
    begin
      @buffer << readline
    rescue StandardError
      # Limited to StandardError so that Interrupt, SystemExit and the like
      # are not mistaken for the end of the stream.
      @source = nil
    end
  end

  # Consumes the first match of +pattern+, reading more data if needed.
  def consume( pattern )
    match( pattern, true )
  end

  # Same contract as Source#match, but keeps reading from the IO until the
  # pattern matches or the IO is exhausted.
  def match( pattern, cons=false )
    rv = pattern.match(@buffer)
    @buffer = $' if cons and rv
    while !rv and @source
      begin
        @buffer << readline
        rv = pattern.match(@buffer)
        @buffer = $' if cons and rv
      rescue
        @source = nil
      end
    end
    # Object#taint does not exist on current Rubies.
    rv.taint if rv.respond_to?(:taint)
    rv
  end

  # @return true if the buffer is empty, no read-ahead bytes are waiting,
  # and the IO is gone or at EOF
  def empty?
    # Bytes read ahead by the constructor sit in @pending_buffer until the
    # first readline; they are still unread input.
    pending = !@pending_buffer.nil? && !@pending_buffer.empty?
    super and !pending and ( @source.nil? || @source.eof? )
  end

  # @return the position reported by the underlying IO, or 0 if it cannot
  # be determined
  def position
    @er_source.pos
  rescue StandardError
    0
  end

  # @return the current line in the source, as [pos, lineno, line]: the
  # IO's position, the IO's own lineno, and the number of readline calls
  # needed to get from the start of the IO back to that position. pos and
  # line are -1 when the IO raises IOError.
  # NOTE(review): this rewinds the IO and re-reads it, and does not restore
  # the exact position afterwards; it looks intended for error reporting
  # only. Confirm against callers.
  def current_line
    begin
      pos = @er_source.pos        # The byte position in the source
      lineno = @er_source.lineno  # The XML < position in the source
      @er_source.rewind
      line = 0                    # The \r\n position in the source
      begin
        while @er_source.pos < pos
          @er_source.readline
          line += 1
        end
      rescue
        # Running out of data while counting is fine; keep the count so far.
      end
    rescue IOError
      pos = -1
      line = -1
    end
    [pos, lineno, line]
  end

  private
  # Reads up to and including the next encoded ">" from the IO, prefixed by
  # any bytes that were read ahead, and returns the chunk tagged or
  # transcoded as UTF-8.
  # @raise EOFError when the IO is exhausted and nothing is pending
  def readline
    begin
      str = @source.readline(@line_break)
    rescue EOFError
      # The IO may have been drained by the read-ahead in the constructor;
      # those bytes must still be delivered once.
      raise unless @pending_buffer
      str = nil
    end
    if @pending_buffer
      str = str.nil? ? @pending_buffer : @pending_buffer + str
      @pending_buffer = nil
    end
    return nil if str.nil?

    if @to_utf
      decode(str)
    else
      str.force_encoding(::Encoding::UTF_8) if @force_utf8
      str
    end
  end

  # Re-targets the reader after the encoding changed: UTF-16 input switches
  # the IO to binary mode with that encoding, the record separator becomes
  # the encoded ">", and whatever is in the buffer is set aside, tagged
  # with the new encoding, to be prepended to the next chunk read.
  def encoding_updated
    case @encoding
    when "UTF-16BE", "UTF-16LE"
      @source.binmode
      @source.set_encoding(@encoding)
    end
    @line_break = encode(">")
    @pending_buffer, @buffer = @buffer, ""
    @pending_buffer.force_encoding(@encoding)
    super
  end
end
|
|
end
|