mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
merge revision(s) 391abc543cea118a9cd7d6310acadbfa352668ef,e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #12052]
Scan the coderange in the given encoding --- ext/-test-/string/enc_str_buf_cat.c | 14 ++++++++++++++ string.c | 32 ++++++++++++++++++++++--------- test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++ 3 files changed, 46 insertions(+), 9 deletions(-) Work around transcoding issue with non-ASCII compatible encodings and xml escaping When using a non-ASCII compatible source and destination encoding and xml escaping (the :xml option to String#encode), the resulting string was broken, as it used the correct non-ASCII compatible encoding, but contained data that was ASCII-compatible instead of compatible with the string's encoding. Work around this issue by detecting the case where both the source and destination encoding are non-ASCII compatible, and transcoding the source string from the non-ASCII compatible encoding to UTF-8. The xml escaping code will correctly handle the UTF-8 source string and then return the correctly encoded and escaped value. Fixes [Bug #12052] Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org> --- test/ruby/test_transcode.rb | 19 +++++++++++++++++++ transcode.c | 6 ++++++ 2 files changed, 25 insertions(+) - add regression tests for U+6E7F (湿) in ISO-2022-JP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ISO-2022-JP, the bytes used to encode U+6E7F (湿) are the same as those for "<>". This adds regression tests to make sure that these bytes, when representing 湿, are NOT escaped with encode("ISO-2022-JP", xml: :text) or similar. These are additional regression tests for #12052. --- test/ruby/test_transcode.rb | 3 +++ 1 file changed, 3 insertions(+)
This commit is contained in:
parent
ca6ebde821
commit
b93a2d9d2c
6 changed files with 75 additions and 10 deletions
|
|
@ -7,8 +7,22 @@ enc_str_buf_cat(VALUE str, VALUE str2)
|
|||
return rb_enc_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), rb_enc_get(str2));
|
||||
}
|
||||
|
||||
static VALUE
|
||||
str_conv_enc_opts(VALUE str, VALUE from, VALUE to, VALUE ecflags, VALUE ecopts)
|
||||
{
|
||||
rb_encoding *from_enc = NIL_P(from) ? NULL : rb_to_encoding(from);
|
||||
rb_encoding *to_enc = NIL_P(to) ? NULL : rb_to_encoding(to);
|
||||
int flags = NUM2INT(ecflags);
|
||||
if (!NIL_P(ecopts)) {
|
||||
Check_Type(ecopts, T_HASH);
|
||||
OBJ_FREEZE(ecopts);
|
||||
}
|
||||
return rb_str_conv_enc_opts(str, from_enc, to_enc, flags, ecopts);
|
||||
}
|
||||
|
||||
void
|
||||
Init_string_enc_str_buf_cat(VALUE klass)
|
||||
{
|
||||
rb_define_method(klass, "enc_str_buf_cat", enc_str_buf_cat, 1);
|
||||
rb_define_method(klass, "str_conv_enc_opts", str_conv_enc_opts, 4);
|
||||
}
|
||||
|
|
|
|||
32
string.c
32
string.c
|
|
@ -698,6 +698,18 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
|
|||
ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
|
||||
}
|
||||
|
||||
static int
|
||||
enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
|
||||
{
|
||||
if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
|
||||
rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
|
||||
return ENC_CODERANGE_BROKEN;
|
||||
}
|
||||
else {
|
||||
return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
rb_enc_str_coderange(VALUE str)
|
||||
{
|
||||
|
|
@ -706,14 +718,7 @@ rb_enc_str_coderange(VALUE str)
|
|||
if (cr == ENC_CODERANGE_UNKNOWN) {
|
||||
int encidx = ENCODING_GET(str);
|
||||
rb_encoding *enc = rb_enc_from_index(encidx);
|
||||
if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
|
||||
rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
|
||||
cr = ENC_CODERANGE_BROKEN;
|
||||
}
|
||||
else {
|
||||
cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
|
||||
enc);
|
||||
}
|
||||
cr = enc_coderange_scan(str, enc, encidx);
|
||||
ENC_CODERANGE_SET(str, cr);
|
||||
}
|
||||
return cr;
|
||||
|
|
@ -955,6 +960,15 @@ static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long
|
|||
rb_encoding *from, rb_encoding *to,
|
||||
int ecflags, VALUE ecopts);
|
||||
|
||||
static inline bool
|
||||
is_enc_ascii_string(VALUE str, rb_encoding *enc)
|
||||
{
|
||||
int encidx = rb_enc_to_index(enc);
|
||||
if (rb_enc_get_index(str) == encidx)
|
||||
return is_ascii_string(str);
|
||||
return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
|
||||
{
|
||||
|
|
@ -965,7 +979,7 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags,
|
|||
if (!to) return str;
|
||||
if (!from) from = rb_enc_get(str);
|
||||
if (from == to) return str;
|
||||
if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
|
||||
if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
|
||||
to == rb_ascii8bit_encoding()) {
|
||||
if (STR_ENC_GET(str) != to) {
|
||||
str = rb_str_dup(str);
|
||||
|
|
|
|||
|
|
@ -13,4 +13,13 @@ class Test_StringEncStrBufCat < Test::Unit::TestCase
|
|||
assert_equal(:unknown, Bug::String.new(cr_unknown_str).coderange, "an assertion for following tests")
|
||||
assert_equal(:valid, Bug::String.new(a8_str).enc_str_buf_cat(cr_unknown_str).coderange, Bug6509)
|
||||
end
|
||||
|
||||
def test_str_conv_enc
  # A US-ASCII string converted from "UTF-8" to "US-ASCII" is expected to
  # come back as the very same object.
  ascii = Bug::String.new("aaa".encode("US-ASCII"))
  converted = ascii.str_conv_enc_opts("UTF-8", "US-ASCII", 0, nil)
  assert_same(ascii, converted)

  # UTF-16LE bytes tagged as UTF-8: ascii_only? is checked first (this caches
  # the coderange), then the conversion is requested with an explicit
  # "UTF-16LE" source encoding and must still yield "aaa".
  mislabeled = Bug::String.new("aaa".encode("UTF-16LE").force_encoding("UTF-8"))
  assert_predicate(mislabeled, :ascii_only?) # cache coderange
  converted = mislabeled.str_conv_enc_opts("UTF-16LE", "UTF-8", 0, nil)
  assert_equal("aaa", converted)
end
|
||||
end
|
||||
|
|
|
|||
|
|
@ -126,6 +126,28 @@ class TestTranscode < Test::Unit::TestCase
|
|||
assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1'))
|
||||
end
|
||||
|
||||
# Checks that String#encode with the :xml option escapes correctly for every
# combination of UTF-8/UTF-16/UTF-32 source and destination encodings, both
# when the source encoding is taken from the string and when it is passed
# explicitly for a string tagged as UTF-8.
#
# The expected values are the XML-escaped forms: xml: :text turns "<>" into
# "&lt;&gt;", and xml: :attr additionally escapes the double quote and wraps
# the result in double quotes.
def test_encode_xml_multibyte
  encodings = %w'UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE'
  encodings.each do |src_enc|
    encodings.each do |dst_enc|
      escaped = "<>".encode(src_enc).encode(dst_enc, :xml=>:text)
      assert_equal("&lt;&gt;", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text")

      escaped = '<">'.encode(src_enc).encode(dst_enc, :xml=>:attr)
      assert_equal('"&lt;&quot;&gt;"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr")

      # Same bytes, but tagged as UTF-8 and with the real source encoding
      # given explicitly as the second argument.
      escaped = "<>".encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:text)
      assert_equal("&lt;&gt;", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text")

      escaped = '<">'.encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:attr)
      assert_equal('"&lt;&quot;&gt;"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr")
    end
  end
  # regression test; U+6E7F (湿) uses the same bytes in ISO-2022-JP as "<>"
  # Only the real "<>" may be escaped; the bytes that encode U+6E7F must survive.
  assert_equal(    "&lt;&gt;\u6E7F",   "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:text).encode("UTF-8"))
  assert_equal("\"&lt;&gt;\u6E7F\"", "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:attr).encode("UTF-8"))
end
|
||||
|
||||
def test_ascii_range
|
||||
encodings = [
|
||||
'US-ASCII', 'ASCII-8BIT',
|
||||
|
|
|
|||
|
|
@ -2719,6 +2719,12 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
|
|||
}
|
||||
}
|
||||
else {
|
||||
if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
|
||||
rb_encoding *utf8 = rb_utf8_encoding();
|
||||
str = rb_str_conv_enc(str, senc, utf8);
|
||||
senc = utf8;
|
||||
sname = "UTF-8";
|
||||
}
|
||||
if (encoding_equal(sname, dname)) {
|
||||
sname = "";
|
||||
dname = "";
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@
|
|||
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
|
||||
#define RUBY_VERSION_TEENY 3
|
||||
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
|
||||
#define RUBY_PATCHLEVEL 109
|
||||
#define RUBY_PATCHLEVEL 110
|
||||
|
||||
#define RUBY_RELEASE_YEAR 2021
|
||||
#define RUBY_RELEASE_MONTH 7
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue