diff --git a/ChangeLog b/ChangeLog index ff020234a3..6a1f240c98 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +Fri Aug 15 18:10:49 2008 Tanaka Akira + + * include/ruby/encoding.h (rb_econv_t): new field: last_error. + + * transcode.c (rb_trans_conv): new argument: result_position_ptr. + (rb_econv_convert): fill last_error. + (econv_result_to_symbol): extracted from econv_primitive_convert. + (econv_primitive_errinfo): new method. + Fri Aug 15 17:39:05 2008 Nobuyoshi Nakada * common.mk (parse.{c,h}): creates in the dependency order. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index d7ad0d0237..675bd5d90d 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -224,6 +224,17 @@ typedef struct { int num_finished; struct rb_transcoding *last_tc; + /* last error */ + struct { + rb_econv_result_t result; + const char *source_encoding; + const char *destination_encoding; + const unsigned char *error_bytes_start; + size_t error_bytes_len; + size_t readagain_len; + int partial_input; + } last_error; + /* The following fields are only for Encoding::Converter. * rb_econv_open set them NULL. */ rb_encoding *source_encoding; diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index 3ae46b184c..0a797aecd6 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -20,6 +20,14 @@ class TestEncodingConverter < Test::Unit::TestCase [o, ret, i]) end + def assert_errinfo(e_res, e_enc1, e_enc2, e_error_bytes, e_readagain_bytes, e_partial_input, ec) + assert_equal([e_res, e_enc1, e_enc2, + e_error_bytes && e_error_bytes.dup.force_encoding("ASCII-8BIT"), + e_readagain_bytes && e_readagain_bytes.dup.force_encoding("ASCII-8BIT"), + e_partial_input], + ec.primitive_errinfo) + end + def test_new assert_kind_of(Encoding::Converter, Encoding::Converter.new("UTF-8", "EUC-JP")) assert_kind_of(Encoding::Converter, Encoding::Converter.new(Encoding::UTF_8, Encoding::EUC_JP)) @@ -319,4 +327,42 @@ class TestEncodingConverter < Test::Unit::TestCase check_ec("abc\xA4\xA2def", "", :output_followed_by_input, *a) check_ec("abc\xA4\xA2def", "", :finished, *a) end + + def test_errinfo_invalid_euc_jp + ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") + ec.primitive_convert(src="\xff", dst="", nil, 10) + assert_errinfo(:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", "", nil, ec) + end + + def test_errinfo_undefined_hiragana + ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) + assert_errinfo(:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", "", nil, ec) + end + + def test_errinfo_invalid_partial_character + ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + ec.primitive_convert(src="\xa4", dst="", nil, 10) + assert_errinfo(:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil, ec) + end + + def test_errinfo_valid_partial_character + ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) + assert_errinfo(:source_buffer_empty, nil, nil, nil, nil, :partial_input, ec) + end + + def test_errinfo_invalid_utf16be + ec = Encoding::Converter.new("UTF-16BE", "UTF-8") + ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) + assert_errinfo(:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00", nil, ec) + assert_equal("@", src) + end + + def test_errinfo_invalid_utf16le + ec = Encoding::Converter.new("UTF-16LE", "UTF-8") + ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) + assert_errinfo(:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00", nil, ec) + assert_equal("", src) + end end diff --git a/transcode.c b/transcode.c index 16979b9b4b..92d9ec62fc 100644 --- a/transcode.c +++ b/transcode.c @@ -839,7 +839,8 @@ static rb_econv_result_t rb_trans_conv(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, - int flags) + int flags, + int *result_position_ptr) { int i; int needreport_index; @@ -888,7 +889,8 @@ rb_trans_conv(rb_econv_t *ec, rb_econv_result_t res; res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop, - (flags & ~ECONV_OUTPUT_FOLLOWED_BY_INPUT)|ECONV_PARTIAL_INPUT); + (flags & ~ECONV_OUTPUT_FOLLOWED_BY_INPUT)|ECONV_PARTIAL_INPUT, + result_position_ptr); if (res == econv_source_buffer_empty) return econv_output_followed_by_input; @@ -912,9 +914,13 @@ found_needreport: res == econv_output_followed_by_input) { ec->elems[i].last_result = econv_source_buffer_empty; } + if (result_position_ptr) + *result_position_ptr = i; return res; } } + if (result_position_ptr) + *result_position_ptr = -1; return econv_source_buffer_empty; } @@ -925,15 +931,32 @@ rb_econv_convert(rb_econv_t *ec, int flags) { rb_econv_result_t res; + int result_position; + + memset(&ec->last_error, 0, sizeof(ec->last_error)); if ((flags & ECONV_OUTPUT_FOLLOWED_BY_INPUT) || - ec->num_trans == 1) - return rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags); + ec->num_trans == 1) { + res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); + } + else { + flags |= ECONV_OUTPUT_FOLLOWED_BY_INPUT; + do { + res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position); + } while (res == econv_output_followed_by_input); + } + + ec->last_error.result = res; + ec->last_error.partial_input = flags & ECONV_PARTIAL_INPUT; + if (res == econv_invalid_byte_sequence || + res == econv_undefined_conversion) { + ec->last_error.source_encoding = ec->elems[result_position].tc->transcoder->from_encoding; + ec->last_error.destination_encoding = ec->elems[result_position].tc->transcoder->to_encoding; + ec->last_error.error_bytes_start = TRANSCODING_READBUF(ec->elems[result_position].tc); + ec->last_error.error_bytes_len = ec->elems[result_position].tc->recognized_len; + ec->last_error.readagain_len = ec->elems[result_position].tc->readagain_len; + } - flags |= ECONV_OUTPUT_FOLLOWED_BY_INPUT; - do { - res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags); - } while (res == econv_output_followed_by_input); return res; } @@ -1572,6 +1595,20 @@ econv_destination_encoding(VALUE self) return rb_enc_from_encoding(ec->destination_encoding); } +static VALUE +econv_result_to_symbol(rb_econv_result_t res) +{ + switch (res) { + case econv_invalid_byte_sequence: return ID2SYM(rb_intern("invalid_byte_sequence")); + case econv_undefined_conversion: return ID2SYM(rb_intern("undefined_conversion")); + case econv_destination_buffer_full: return ID2SYM(rb_intern("destination_buffer_full")); + case econv_source_buffer_empty: return ID2SYM(rb_intern("source_buffer_empty")); + case econv_finished: return ID2SYM(rb_intern("finished")); + case econv_output_followed_by_input: return ID2SYM(rb_intern("output_followed_by_input")); + default: return INT2NUM(res); /* should not be reached */ + } +} + /* * call-seq: * primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol @@ -1704,15 +1741,113 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) rb_enc_associate(output, ec->destination_encoding); } - switch (res) { - case econv_invalid_byte_sequence: return ID2SYM(rb_intern("invalid_byte_sequence")); - case econv_undefined_conversion: return ID2SYM(rb_intern("undefined_conversion")); - case econv_destination_buffer_full: return ID2SYM(rb_intern("destination_buffer_full")); - case econv_source_buffer_empty: return ID2SYM(rb_intern("source_buffer_empty")); - case econv_finished: return ID2SYM(rb_intern("finished")); - case econv_output_followed_by_input: return ID2SYM(rb_intern("output_followed_by_input")); - default: return INT2NUM(res); /* should not be reached */ + return econv_result_to_symbol(res); +} + +/* + * call-seq: + * primitive_errinfo -> array + * + * primitive_errinfo returns a precious information of last error result + * as a 6-elements array: + * + * [result, enc1, enc2, error_bytes, readagain_bytes, partial_input] + * + * result is the last result of primitive_convert. + * + * partial_input is :partial_input or nil. + * :partial_input means that Encoding::Converter::PARTIAL_INPUT is specified + * for primitive_convert. + * + * Other elements are only meaningful when result is + * :invalid_byte_sequence or :undefined_conversion. + * + * enc1 and enc2 indicats a conversion step as pair of strings. + * For example, EUC-JP to ISO-8859-1 is + * converted as EUC-JP -> UTF-8 -> ISO-8859-1. + * So [enc1, enc2] is ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"]. + * + * error_bytes and readagain_bytes indicats the byte sequences which causes the error. + * error_bytes is discarded portion. + * readagain_bytes is buffered portion which is read again on next conversion. + * + * Example: + * + * # \xff is invalid as EUC-JP. + * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS") + * ec.primitive_convert(src="\xff", dst="", nil, 10) + * p ec.primitive_errinfo + * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", "", nil] + * + * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1. + * # Since this error is occur in UTF-8 to ISO-8859-1 conversion, + * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82). + * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10) + * p ec.primitive_errinfo + * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", "", nil] + * + * # partial character is invalid + * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + * ec.primitive_convert(src="\xa4", dst="", nil, 10) + * p ec.primitive_errinfo + * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil] + * + * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by + * # partial characters. + * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") + * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT) + * p ec.primitive_errinfo + * #=> [:source_buffer_empty, nil, nil, nil, nil, :partial_input] + * + * # \xd8\x00\x00@ is invalid as UTF-16BE because + * # no low surrogate after high surrogate (\xd8\x00). + * # It is detected by 3rd byte (\00) which is part of next character. + * # So the high surrogate (\xd8\x00) is discarded and + * # the 3rd byte is read again later. + * # Since the byte is buffered in ec, it is dropped from src. + * ec = Encoding::Converter.new("UTF-16BE", "UTF-8") + * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10) + * p ec.primitive_errinfo + * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00", nil] + * p src + * #=> "@" + * + * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE. + * # The problem is detected by 4th byte. + * ec = Encoding::Converter.new("UTF-16LE", "UTF-8") + * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10) + * p ec.primitive_errinfo + * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00", nil] + * p src + * #=> "" + * + */ +static VALUE +econv_primitive_errinfo(VALUE self) +{ + rb_econv_t *ec = check_econv(self); + + VALUE ary; + + ary = rb_ary_new2(6); + + rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result)); + + if (ec->last_error.source_encoding) + rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding)); + + if (ec->last_error.destination_encoding) + rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding)); + + if (ec->last_error.error_bytes_start) { + rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len)); + rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len)); } + + rb_ary_store(ary, 5, ec->last_error.partial_input ? ID2SYM(rb_intern("partial_input")) : Qnil); + + return ary; } void @@ -1738,6 +1873,7 @@ Init_transcode(void) rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0); rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0); rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1); + rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0); rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT)); rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER));