mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* string.c (rb_str_inspect): inspect as a dummy encoding string
when a UTF-16/32 (not BE/LE) string does not have a BOM. Unicode and some RFCs say that if a string labeled as UTF-16/32 doesn't have a BOM, it should be considered big endian. But many Windows programs generate little endian UTF-16 strings without a BOM. So String#inspect treats a string labeled UTF-16/32 without a BOM as a dummy encoding string. patched by Martin Duerst. [ruby-core:33461] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29984 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
87703540b0
commit
05d7d889ea
3 changed files with 38 additions and 4 deletions
11
ChangeLog
11
ChangeLog
|
@ -1,3 +1,14 @@
|
||||||
|
Wed Dec 1 01:29:15 2010 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
|
* string.c (rb_str_inspect): inspect as a dummy encoding string
|
||||||
|
when a UTF-16/32 (not BE/LE) string does not have a BOM.
|
||||||
|
Unicode and some RFCs say that if a string labeled as UTF-16/32
|
||||||
|
doesn't have a BOM, it should be considered big endian.
|
||||||
|
But many Windows programs generate little endian UTF-16
|
||||||
|
strings without a BOM. So String#inspect treats a string
|
||||||
|
labeled UTF-16/32 without a BOM as a dummy encoding string.
|
||||||
|
patched by Martin Duerst. [ruby-core:33461]
|
||||||
|
|
||||||
Tue Nov 30 17:04:10 2010 NARUSE, Yui <naruse@ruby-lang.org>
|
Tue Nov 30 17:04:10 2010 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
* addr2line.c (parse_debug_line_cu): ignore DW_LNE_set_discriminator.
|
* addr2line.c (parse_debug_line_cu): ignore DW_LNE_set_discriminator.
|
||||||
|
|
18
string.c
18
string.c
|
@ -4214,10 +4214,22 @@ rb_str_inspect(VALUE str)
|
||||||
p = RSTRING_PTR(str); pend = RSTRING_END(str);
|
p = RSTRING_PTR(str); pend = RSTRING_END(str);
|
||||||
prev = p;
|
prev = p;
|
||||||
if (enc == utf16) {
|
if (enc == utf16) {
|
||||||
enc = *p == (char)0xFF ? rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE");
|
const unsigned char *q = (const unsigned char *)p;
|
||||||
|
if (q[0] == 0xFE && q[1] == 0xFF)
|
||||||
|
enc = rb_enc_find("UTF-16BE");
|
||||||
|
else if (q[0] == 0xFF && q[1] == 0xFE) /* UTF-16LE BOM is the byte pair FF FE (U+FEFF little-endian) */
|
||||||
|
enc = rb_enc_find("UTF-16LE");
|
||||||
|
else
|
||||||
|
unicode_p = 0;
|
||||||
}
|
}
|
||||||
else if (enc == utf32) {
|
else if (enc == utf32) {
|
||||||
enc = *p == (char)0xFF ? rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE");
|
const unsigned char *q = (const unsigned char *)p;
|
||||||
|
if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
|
||||||
|
enc = rb_enc_find("UTF-32BE");
|
||||||
|
else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
|
||||||
|
enc = rb_enc_find("UTF-32LE");
|
||||||
|
else
|
||||||
|
unicode_p = 0;
|
||||||
}
|
}
|
||||||
while (p < pend) {
|
while (p < pend) {
|
||||||
unsigned int c, cc;
|
unsigned int c, cc;
|
||||||
|
@ -6004,7 +6016,6 @@ static VALUE
|
||||||
rb_str_each_codepoint(VALUE str)
|
rb_str_each_codepoint(VALUE str)
|
||||||
{
|
{
|
||||||
VALUE orig = str;
|
VALUE orig = str;
|
||||||
long len;
|
|
||||||
int n;
|
int n;
|
||||||
unsigned int c;
|
unsigned int c;
|
||||||
const char *ptr, *end;
|
const char *ptr, *end;
|
||||||
|
@ -6014,7 +6025,6 @@ rb_str_each_codepoint(VALUE str)
|
||||||
RETURN_ENUMERATOR(str, 0, 0);
|
RETURN_ENUMERATOR(str, 0, 0);
|
||||||
str = rb_str_new4(str);
|
str = rb_str_new4(str);
|
||||||
ptr = RSTRING_PTR(str);
|
ptr = RSTRING_PTR(str);
|
||||||
len = RSTRING_LEN(str);
|
|
||||||
end = RSTRING_END(str);
|
end = RSTRING_END(str);
|
||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
while (ptr < end) {
|
while (ptr < end) {
|
||||||
|
|
|
@ -232,6 +232,19 @@ class TestM17N < Test::Unit::TestCase
|
||||||
Encoding.default_external = orig_ext
|
Encoding.default_external = orig_ext
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_utf_16_32_inspect
|
||||||
|
str = "\u3042"
|
||||||
|
%w/UTF-16 UTF-32/.each do |enc|
|
||||||
|
%w/BE LE/.each do |endian|
|
||||||
|
s = str.encode(enc + endian)
|
||||||
|
# When a UTF-16/32 string doesn't have a BOM,
|
||||||
|
# inspect as a dummy encoding string.
|
||||||
|
assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect,
|
||||||
|
s.dup.force_encoding(enc).inspect)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def test_str_dump
|
def test_str_dump
|
||||||
[
|
[
|
||||||
e("\xfe"),
|
e("\xfe"),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue