From 19ab08653eb18b18507a9c4de09327c904387d7f Mon Sep 17 00:00:00 2001 From: akr Date: Tue, 26 Aug 2008 12:55:14 +0000 Subject: [PATCH] * transcode.c (rb_econv_open): disable newline conversion for ASCII incompatible encodings. (str_transcode0): don't need disable newline conversion here. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18870 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 6 +++++ include/ruby/encoding.h | 6 +++++ test/ruby/test_io_m17n.rb | 49 +++++++++++++++++++++++++++++++++++++-- transcode.c | 47 ++++++++++++++++++++++++++----------- 4 files changed, 92 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 438d502d40..df348e3934 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Tue Aug 26 21:53:56 2008 Tanaka Akira + + * transcode.c (rb_econv_open): disable newline conversion for ASCII + incompatible encodings. + (str_transcode0): don't need disable newline conversion here. + Tue Aug 26 21:44:39 2008 Tanaka Akira * transcode.c (rb_econv_binmode): binmode is effective only once. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 55b5bde354..fa3ebeff1d 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -297,12 +297,18 @@ void rb_econv_binmode(rb_econv_t *ec); #define ECONV_UNDEF_IGNORE 0x0010 #define ECONV_UNDEF_REPLACE 0x0020 +/* effective only if output is ascii compatible */ #define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100 + +/* effective only if input is ascii compatible */ #define ECONV_CRLF_NEWLINE_ENCODER 0x0200 #define ECONV_CR_NEWLINE_ENCODER 0x0400 +/* end of flags for rb_econv_open */ + /* flags for rb_econv_convert */ #define ECONV_PARTIAL_INPUT 0x10000 #define ECONV_OUTPUT_FOLLOWED_BY_INPUT 0x20000 +/* end of flags for rb_econv_convert */ #endif /* RUBY_ENCODING_H */ diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index 47d04a18d5..2caa2cb2b8 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -1154,8 +1154,18 @@ EOT } end + SYSTEM_NEWLINE = [] def system_newline - File::BINARY == 0 ? "\n" : "\r\n" + return SYSTEM_NEWLINE.first if !SYSTEM_NEWLINE.empty? + with_tmpdir { + open("newline", "wt") {|f| + f.print "\n" + } + open("newline", "rb") {|f| + SYSTEM_NEWLINE << f.read + } + } + SYSTEM_NEWLINE.first end def test_textmode_encode_newline @@ -1170,6 +1180,41 @@ EOT } end + def test_textmode_encode_newline_enc + with_tmpdir { + open("t.txt", "wt:euc-jp") {|f| + f.puts "abc\u3042" + f.puts "def\u3044" + } + content = File.read("t.txt", :mode=>"rb:ascii-8bit") + nl = system_newline + assert_equal("abc\xA4\xA2#{nl}def\xA4\xA4#{nl}", content) + } + end + + def test_textmode_read_ascii_incompat_internal + with_tmpdir { + generate_file("t.utf8.crlf", "a\r\nb\r\n") + open("t.utf8.crlf", "rt:utf-8:utf-16be") {|f| + content = f.read + # textmode doesn't affect for ascii incompatible internal encoding. + assert_equal("\0a\0\r\0\n\0b\0\r\0\n".force_encoding("UTF-16BE"), + content) + } + } + end + + def test_textmode_write_ascii_incompat_internal + with_tmpdir { + open("t.utf8.lf", "wt:utf-8:utf-16be") {|f| + f.print "\0a\0\n\0b\0\n".force_encoding("UTF-16BE") + } + content = File.read("t.utf8.lf", :mode=>"rb:ascii-8bit") + # textmode doesn't affect for ascii incompatible internal encoding. + assert_equal("a\nb\n", content) + } + end + def test_binary with_tmpdir { src = "a\nb\rc\r\nd\n" @@ -1180,7 +1225,7 @@ EOT open("t.txt", "r", :binmode=>true) {|f| assert_equal(src, f.read) } - if File::BINARY == 0 + if system_newline == "\n" open("t.txt", "r") {|f| assert_equal(src, f.read) } diff --git a/transcode.c b/transcode.c index 108a44b880..a35db7c0cb 100644 --- a/transcode.c +++ b/transcode.c @@ -748,6 +748,26 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) int num_additional; static rb_econv_t *ec; int flags = opts ? opts->flags : 0; + int universal_newline_decoder_added = 0; + + rb_encoding *senc, *denc; + int sidx, didx; + + senc = NULL; + if (*from) { + sidx = rb_enc_find_index(from); + if (0 <= sidx) { + senc = rb_enc_from_index(sidx); + } + } + + denc = NULL; + if (*to) { + didx = rb_enc_find_index(to); + if (0 <= didx) { + denc = rb_enc_from_index(didx); + } + } if (*from == '\0' && *to == '\0') { num_trans = 0; @@ -763,7 +783,8 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) } num_additional = 0; - if (flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER)) { + if ((!*from || (senc && rb_enc_asciicompat(senc))) && + (flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER))) { const char *name = (flags & ECONV_CRLF_NEWLINE_ENCODER) ? "crlf_newline" : "cr_newline"; transcoder_entry_t *e = get_transcoder_entry("", name); if (flags & ECONV_CRLF_NEWLINE_ENCODER) @@ -779,8 +800,12 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) num_trans++; num_additional++; } + else { + flags &= ~(ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER); + } - if (flags & ECONV_UNIVERSAL_NEWLINE_DECODER) { + if ((!*to || (denc && rb_enc_asciicompat(denc))) && + (flags & ECONV_UNIVERSAL_NEWLINE_DECODER)) { transcoder_entry_t *e = get_transcoder_entry("universal_newline", ""); if (!e) { xfree(entries); @@ -788,6 +813,10 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) } entries[num_trans++] = e; num_additional++; + universal_newline_decoder_added = 1; + } + else { + flags &= ~ECONV_UNIVERSAL_NEWLINE_DECODER; } ec = rb_econv_open_by_transcoder_entries(num_trans, entries); @@ -799,6 +828,7 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) ec->opts.flags = 0; else ec->opts = *opts; + ec->opts.flags = flags; ec->source_encoding_name = from; ec->destination_encoding_name = to; @@ -806,7 +836,7 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts) ec->last_tc = NULL; ec->last_trans_index = -1; } - else if (flags & ECONV_UNIVERSAL_NEWLINE_DECODER) { + else if (universal_newline_decoder_added) { ec->last_tc = ec->elems[ec->num_trans-2].tc; ec->last_trans_index = ec->num_trans-2; } @@ -1886,17 +1916,6 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, rb_econv_option_t *ecopts_arg else rb_econv_opts(Qnil, &ecopts); - /* disable newline conversion for ascii incompatible encoding. - * xxx: convert newline in ascii-compatible encoding? - * ex. UTF-16BE -> UTF-8 -> newline conversion -> UTF-8 -> UTF-16BE. - */ - if (!from_enc || !rb_enc_asciicompat(from_enc)) { - ecopts.flags &= ~(ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER); - } - if (!to_enc || !rb_enc_asciicompat(to_enc)) { - ecopts.flags &= ~ECONV_UNIVERSAL_NEWLINE_DECODER; - } - if ((ecopts.flags & (ECONV_UNIVERSAL_NEWLINE_DECODER| ECONV_CRLF_NEWLINE_ENCODER| ECONV_CR_NEWLINE_ENCODER)) == 0) {