From 2a42c1bd3a8fcaa59f778070bff0eac60757ced3 Mon Sep 17 00:00:00 2001 From: kou Date: Sun, 28 Oct 2012 12:31:20 +0000 Subject: [PATCH] * lib/rexml/source.rb (REXML::IOSource#initialize): Reduce @line_break initialize code. It should be done only in #encoding=. * lib/rexml/parsers/baseparser.rb: Don't set UTF-16 encoding to source by encoding="UTF-16" in XML declaration because UTF-16XX source encoding should be set in Source#initialize or IOSource#initialize. They should handle BOM. The parser should not have to consider it. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37361 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 10 ++++++++ lib/rexml/parsers/baseparser.rb | 11 ++++++++- lib/rexml/source.rb | 43 +++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index ffc27525ba..329f6d6a88 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +Sun Oct 28 21:25:11 2012 Kouhei Sutou + + * lib/rexml/source.rb (REXML::IOSource#initialize): Reduce + @line_break initialize code. It should be done only in #encoding=. + * lib/rexml/parsers/baseparser.rb: Don't set UTF-16 encoding to + source by encoding="UTF-16" in XML declaration because UTF-16XX + source encoding should be set in Source#initialize or + IOSource#intialize. They should handle BOM. Parser should not + consider about it. + Sun Oct 28 21:18:37 2012 Kouhei Sutou * test/rexml/test_document.rb: Add tests for parsing XML encoded diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ebffdaa8c7..dc4a1c8bee 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -212,7 +212,9 @@ module REXML version = version[1] unless version.nil? encoding = ENCODING.match(results) encoding = encoding[1] unless encoding.nil? 
- @source.encoding = encoding + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end standalone = STANDALONE.match(results) standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] @@ -493,6 +495,13 @@ module REXML end rv end + + private + def need_source_encoding_update?(xml_declaration_encoding) + return false if xml_declaration_encoding.nil? + return false if /\AUTF-16\z/i =~ xml_declaration_encoding + true + end end end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 112393cfd4..c15f63dcc8 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -144,22 +144,39 @@ module REXML # if there is one. If there isn't one, the file MUST be UTF-8, as per # the XML spec. If there is one, we can determine the encoding from # it. - @buffer = "" - str = @source.read( 2 ) || '' if encoding - self.encoding = encoding - elsif str[0,2] == "\xfe\xff" - @line_break = "\000>" - elsif str[0,2] == "\xff\xfe" - @line_break = ">\000" - elsif str[0,2] == "\xef\xbb" - str += @source.read(1) - str = '' if (str[2,1] == "\xBF") - @line_break = ">" + super("", encoding) else - @line_break = ">" + need_super_with_line = false + str = @source.read( 2 ) || '' + str.force_encoding("ASCII-8BIT") + if str[0, 2] == "\xfe\xff" + @source.binmode + @source.set_encoding("UTF-16BE") + super("", "UTF-16BE") + elsif str[0, 2] == "\xff\xfe" + @source.binmode + @source.set_encoding("UTF-16LE") + super("", "UTF-16LE") + elsif str[0, 2] == "\xef\xbb" + str += @source.read(1) + if str[2, 1] == "\xBF" + @source.set_encoding("UTF-8") + super("", "UTF-8") + else + need_super_with_line = true + end + else + need_super_with_line = true + end + if need_super_with_line + if @source.eof? + super(str) + else + super(str + @source.readline(">")) + end + end end - super( @source.eof? ? str : str+@source.readline( @line_break ) ) if !@to_utf and @buffer.respond_to?(:force_encoding) and