require 'rexml/encoding' module REXML # Generates Source-s. USE THIS CLASS. class SourceFactory # Generates a Source object # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given def SourceFactory::create_from(arg) if arg.respond_to? :read and arg.respond_to? :readline and arg.respond_to? :nil? and arg.respond_to? :eof? IOSource.new(arg) elsif arg.respond_to? :to_str require 'stringio' IOSource.new(StringIO.new(arg)) elsif arg.kind_of? Source arg else raise "#{arg.class} is not a valid input stream. It must walk \n"+ "like either a String, an IO, or a Source." end end end # A Source can be searched for patterns, and wraps buffers and other # objects and provides consumption of text class Source include Encoding # The current buffer (what we're going to read next) attr_reader :buffer # The line number of the last consumed text attr_reader :line attr_reader :encoding # Constructor # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) @orig = @buffer = arg if encoding self.encoding = encoding else detect_encoding end @line = 0 end # Inherited from Encoding # Overridden to support optimized en/decoding def encoding=(enc) return unless super encoding_updated end # Scans the source for a given pattern. Note, that this is not your # usual scan() method. For one thing, the pattern argument has some # requirements; for another, the source can be consumed. You can easily # confuse this method. Originally, the patterns were easier # to construct and this method more robust, because this method # generated search regexes on the fly; however, this was # computationally expensive and slowed down the entire REXML package # considerably, since this is by far the most commonly called method. # @param pattern must be a Regexp, and must be in the form of # /^\s*(#{your pattern, with no groups})(.*)/. The first group # will be returned; the second group is used if the consume flag is # set. # @param consume if true, the pattern returned will be consumed, leaving # everything after it in the Source. # @return the pattern, if found, or nil if the Source is empty or the # pattern is not found. def scan(pattern, cons=false) return nil if @buffer.nil? rv = @buffer.scan(pattern) @buffer = $' if cons and rv.size>0 rv end def read end def consume( pattern ) @buffer = $' if pattern.match( @buffer ) end def match_to( char, pattern ) return pattern.match(@buffer) end def match_to_consume( char, pattern ) md = pattern.match(@buffer) @buffer = $' return md end def match(pattern, cons=false) md = pattern.match(@buffer) @buffer = $' if cons and md return md end # @return true if the Source is exhausted def empty? @buffer == "" end def position @orig.index( @buffer ) end # @return the current line in the source def current_line lines = @orig.split res = lines.grep @buffer[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private def detect_encoding buffer_encoding = @buffer.encoding detected_encoding = "UTF-8" begin @buffer.force_encoding("ASCII-8BIT") if @buffer[0, 2] == "\xfe\xff" @buffer[0, 2] = "" detected_encoding = "UTF-16BE" elsif @buffer[0, 2] == "\xff\xfe" @buffer[0, 2] = "" detected_encoding = "UTF-16LE" elsif @buffer[0, 3] == "\xef\xbb\xbf" @buffer[0, 3] = "" detected_encoding = "UTF-8" end ensure @buffer.force_encoding(buffer_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' @buffer = decode(@buffer) @to_utf = true else @to_utf = false @buffer.force_encoding ::Encoding::UTF_8 end end end # A Source that wraps an IO. See the Source class for method # documentation class IOSource < Source #attr_reader :block_size # block_size has been deprecated def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false @pending_buffer = nil if encoding super("", encoding) else super(@source.read(3) || "") end if !@to_utf and @buffer.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true else @force_utf8 = false end end def scan(pattern, cons=false) rv = super # You'll notice that this next section is very similar to the same # section in match(), but just a liiittle different. This is # because it is a touch faster to do it this way with scan() # than the way match() does it; enough faster to warrent duplicating # some code if rv.size == 0 until @buffer =~ pattern or @source.nil? begin @buffer << readline rescue Iconv::IllegalSequence raise rescue @source = nil end end rv = super end rv.taint rv end def read begin @buffer << readline rescue Exception, NameError @source = nil end end def consume( pattern ) match( pattern, true ) end def match( pattern, cons=false ) rv = pattern.match(@buffer) @buffer = $' if cons and rv while !rv and @source begin @buffer << readline rv = pattern.match(@buffer) @buffer = $' if cons and rv rescue @source = nil end end rv.taint rv end def empty? super and ( @source.nil? || @source.eof? ) end def position @er_source.pos rescue 0 end # @return the current line in the source def current_line begin pos = @er_source.pos # The byte position in the source lineno = @er_source.lineno # The XML < position in the source @er_source.rewind line = 0 # The \r\n position in the source begin while @er_source.pos < pos @er_source.readline line += 1 end rescue end rescue IOError pos = -1 line = -1 end [pos, lineno, line] end private def readline str = @source.readline(@line_break) if @pending_buffer if str.nil? str = @pending_buffer else str = @pending_buffer + str end @pending_buffer = nil end return nil if str.nil? if @to_utf decode(str) else str.force_encoding(::Encoding::UTF_8) if @force_utf8 str end end def encoding_updated case @encoding when "UTF-16BE", "UTF-16LE" @source.binmode @source.set_encoding(@encoding) end @line_break = encode(">") @pending_buffer, @buffer = @buffer, "" @pending_buffer.force_encoding(@encoding) super end end end