From 576bdec03f0d58847690a0607c788ada433ce60f Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada
Date: Tue, 30 Aug 2022 18:12:08 +0900
Subject: [PATCH] [Bug #18973] Promote US-ASCII to ASCII-8BIT when adding 8-bit char

---
 internal/string.h         |  1 +
 sprintf.c                 | 13 +++++++++----
 string.c                  | 32 ++++++++++++++++++++++++--------
 test/ruby/test_sprintf.rb |  3 +++
 4 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/internal/string.h b/internal/string.h
index 8fb9553d03..46862d77f5 100644
--- a/internal/string.h
+++ b/internal/string.h
@@ -43,6 +43,7 @@ char *rb_str_to_cstr(VALUE str);
 const char *ruby_escaped_char(int c);
 void rb_str_make_independent(VALUE str);
 int rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc);
+int rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code);
 
 static inline bool STR_EMBED_P(VALUE str);
 static inline bool STR_SHARED_P(VALUE str);
diff --git a/sprintf.c b/sprintf.c
index b2bdd4a072..bfe25e1d3c 100644
--- a/sprintf.c
+++ b/sprintf.c
@@ -454,13 +454,18 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt)
                     str = tmp;
                     goto format_s1;
                 }
-                else {
-                    n = NUM2INT(val);
-                    if (n >= 0) n = rb_enc_codelen((c = n), enc);
-                }
+                n = NUM2INT(val);
+                if (n >= 0) n = rb_enc_codelen((c = n), enc);
                 if (n <= 0) {
                     rb_raise(rb_eArgError, "invalid character");
                 }
+                int encidx = rb_ascii8bit_appendable_encoding_index(enc, c);
+                if (encidx >= 0 && encidx != rb_enc_to_index(enc)) {
+                    /* special case */
+                    rb_enc_associate_index(result, encidx);
+                    enc = rb_enc_from_index(encidx);
+                    coderange = ENC_CODERANGE_VALID;
+                }
                 if (!(flags & FWIDTH)) {
                     CHECK(n);
                     rb_enc_mbcput(c, &buf[blen], enc);
diff --git a/string.c b/string.c
index 564812ae51..951aeca6dd 100644
--- a/string.c
+++ b/string.c
@@ -3481,17 +3481,13 @@ rb_str_concat(VALUE str1, VALUE str2)
         return rb_str_append(str1, str2);
     }
 
-    encidx = rb_enc_to_index(enc);
-    if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
-        /* US-ASCII automatically extended to ASCII-8BIT */
+    encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
+    if (encidx >= 0) {
         char buf[1];
         buf[0] = (char)code;
-        if (code > 0xFF) {
-            rb_raise(rb_eRangeError, "%u out of char range", code);
-        }
         rb_str_cat(str1, buf, 1);
-        if (encidx == ENCINDEX_US_ASCII && code > 127) {
-            rb_enc_associate_index(str1, ENCINDEX_ASCII_8BIT);
+        if (encidx != rb_enc_to_index(enc)) {
+            rb_enc_associate_index(str1, encidx);
             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
         }
     }
@@ -3524,6 +3520,26 @@ rb_str_concat(VALUE str1, VALUE str2)
     return str1;
 }
 
+int
+rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
+{
+    int encidx = rb_enc_to_index(enc);
+
+    if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
+        /* US-ASCII automatically extended to ASCII-8BIT */
+        if (code > 0xFF) {
+            rb_raise(rb_eRangeError, "%u out of char range", code);
+        }
+        if (encidx == ENCINDEX_US_ASCII && code > 127) {
+            return ENCINDEX_ASCII_8BIT;
+        }
+        return encidx;
+    }
+    else {
+        return -1;
+    }
+}
+
 /*
  *  call-seq:
  *    prepend(*other_strings) -> string
diff --git a/test/ruby/test_sprintf.rb b/test/ruby/test_sprintf.rb
index 803399fdb3..c453ecd350 100644
--- a/test/ruby/test_sprintf.rb
+++ b/test/ruby/test_sprintf.rb
@@ -369,6 +369,9 @@ class TestSprintf < Test::Unit::TestCase
     assert_equal(" " * BSIZ + "a", sprintf("%#{ BSIZ + 1 }c", ?a))
     assert_equal("a" + " " * BSIZ, sprintf("%-#{ BSIZ + 1 }c", ?a))
     assert_raise(ArgumentError) { sprintf("%c", -1) }
+    s = sprintf("%c".encode(Encoding::US_ASCII), 0x80)
+    assert_equal("\x80".b, s)
+    assert_predicate(s, :valid_encoding?)
   end
 
   def test_string