1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

merge revision(s) e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #1205]

Work around transcoding issue with non-ASCII compatible
	 encodings and xml escaping

	When using a non-ASCII compatible source and destination encoding
	and xml escaping (the :xml option to String#encode), the resulting
	string was broken, as it used the correct non-ASCII compatible
	encoding, but contained data that was ASCII-compatible instead of
	compatible with the string's encoding.

	Work around this issue by detecting the case where both the
	source and destination encoding are non-ASCII compatible, and
	transcoding the source string from the non-ASCII compatible
	encoding to UTF-8. The xml escaping code will correctly handle
	the UTF-8 source string and return the correctly encoded
	and escaped value.

	Fixes [Bug #12052]

	Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
	---
	 test/ruby/test_transcode.rb | 19 +++++++++++++++++++
	 transcode.c                 |  6 ++++++
	 2 files changed, 25 insertions(+)

	=?UTF-8?q?-=20add=20regression=20tests=20for=20U+6E7F=20(?=
	 =?UTF-8?q?=E6=B9=BF)=20in=20ISO-2022-JP?=
	MIME-Version: 1.0
	Content-Type: text/plain; charset=UTF-8
	Content-Transfer-Encoding: 8bit

	  In ISO-2022-JP, the bytes used to encode 湿 (U+6E7F) are the same as those for "<>".
	  This adds regression tests to make sure that these bytes, when representing
	  湿, are NOT escaped with encode("ISO-2022-JP", xml: :text) or similar.
	  These are additional regression tests for #12052.
	---
	 test/ruby/test_transcode.rb | 3 +++
	 1 file changed, 3 insertions(+)
This commit is contained in:
nagachika 2021-07-03 13:49:46 +09:00
parent 2aad080396
commit e62cccaeb0
3 changed files with 29 additions and 1 deletions

View file

@ -126,6 +126,28 @@ class TestTranscode < Test::Unit::TestCase
assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1'))
end
# Verifies that String#encode with the :xml option produces correctly escaped
# output for every pairing of the Unicode encodings listed below, both when the
# source encoding is taken from the string itself and when it is passed
# explicitly as the second argument to a string relabelled as UTF-8.
# Also checks that ISO-2022-JP multibyte data is left unescaped.
def test_encode_xml_multibyte
  unicode_names = %w[UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE]

  # [raw input, :xml mode, expected escaped result once converted to UTF-8]
  samples = [
    ["<>", :text, "&lt;&gt;"],
    ['<">', :attr, '"&lt;&quot;&gt;"'],
  ]

  unicode_names.product(unicode_names) do |src_enc, dst_enc|
    # First pass: source encoding comes from the string.
    # Second pass: bytes are relabelled as UTF-8 and src_enc is given explicitly.
    [false, true].each do |explicit_source|
      samples.each do |raw, mode, expected|
        input = raw.encode(src_enc)
        escaped =
          if explicit_source
            input.force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>mode)
          else
            input.encode(dst_enc, :xml=>mode)
          end
        assert_equal(expected, escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :#{mode}")
      end
    end
  end

  # regression test; U+6E7F (湿) uses the same bytes in ISO-2022-JP as "<>"
  {
    :text => "&lt;&gt;\u6E7F",
    :attr => "\"&lt;&gt;\u6E7F\"",
  }.each do |mode, expected|
    jis = "<>\u6E7F".encode("ISO-2022-JP")
    assert_equal(expected, jis.encode("ISO-2022-JP", :xml=>mode).encode("UTF-8"))
  end
end
def test_ascii_range
encodings = [
'US-ASCII', 'ASCII-8BIT',

View file

@ -2719,6 +2719,12 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
}
}
else {
if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
rb_encoding *utf8 = rb_utf8_encoding();
str = rb_str_conv_enc(str, senc, utf8);
senc = utf8;
sname = "UTF-8";
}
if (encoding_equal(sname, dname)) {
sname = "";
dname = "";

View file

@ -12,7 +12,7 @@
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
#define RUBY_VERSION_TEENY 2
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
#define RUBY_PATCHLEVEL 105
#define RUBY_PATCHLEVEL 106
#define RUBY_RELEASE_YEAR 2021
#define RUBY_RELEASE_MONTH 7