From 74a36d5d1f17f1529090fb42a405d4677caa567e Mon Sep 17 00:00:00 2001 From: akr Date: Thu, 14 Aug 2008 15:56:39 +0000 Subject: [PATCH] * include/ruby/encoding.h (rb_econv_output): declared. * transcode_data.h (rb_transcoder): add resetsize_func field. * enc/trans/iso2022.trans (iso2022jp_reset_sequence_size): defined. (rb_EUC_JP_to_ISO_2022_JP): provide resetsize_func. * tool/transcode-tblgen.rb: set NULL for resetsize_func. * transcode.c (rb_econv_output): new function for inserting output. (output_replacement_character): use rb_econv_output. (transcode_loop): check return value of output_replacement_character. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18628 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 16 ++++++ enc/trans/iso2022.trans | 17 ++++-- include/ruby/encoding.h | 10 ++-- tool/transcode-tblgen.rb | 3 +- transcode.c | 114 +++++++++++++++++++++++++-------------- transcode_data.h | 3 +- 6 files changed, 114 insertions(+), 49 deletions(-) diff --git a/ChangeLog b/ChangeLog index 457bdedd05..94bd6e9ca2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +Fri Aug 15 00:52:40 2008 Tanaka Akira + + * include/ruby/encoding.h (rb_econv_output): declared. + + * transcode_data.h (rb_transcoder): add resetsize_func field. + + * enc/trans/iso2022.trans (iso2022jp_reset_sequence_size): defined. + (rb_EUC_JP_to_ISO_2022_JP): provide resetsize_func. + + * tool/transcode-tblgen.rb: set NULL for resetsize_func. + + * transcode.c (rb_econv_output): new function for inserting output. + (output_replacement_character): use rb_econv_output. + (transcode_loop): check return value of + output_replacement_character. + Thu Aug 14 23:47:21 2008 Tanaka Akira * include/ruby/encoding.h (ECONV_UNIVERSAL_NEWLINE_DECODER): defined. 
diff --git a/enc/trans/iso2022.trans b/enc/trans/iso2022.trans index 1d015eea54..0414493635 100644 --- a/enc/trans/iso2022.trans +++ b/enc/trans/iso2022.trans @@ -83,7 +83,7 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u if (t->stateful[0] == 0) { t->stateful[0] = 1; /* initialized flag */ - t->stateful[1] = 1; /* ASCII mode */ + t->stateful[1] = 1; /* G0 = ASCII */ } if (l != t->stateful[1]) { @@ -91,13 +91,13 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u *o++ = 0x1b; *o++ = '('; *o++ = 'B'; - t->stateful[1] = 1; + t->stateful[1] = 1; /* G0 = ASCII */ } else { *o++ = 0x1b; *o++ = '$'; *o++ = 'B'; - t->stateful[1] = 2; + t->stateful[1] = 2; /* G0 = JIS X 0208 1983 */ } } @@ -112,6 +112,14 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u return o - output0; } +static int +iso2022jp_reset_sequence_size(rb_transcoding *t) +{ + if (t->stateful[1] == 2) + return 3; + return 0; +} + static int finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o) { @@ -137,7 +145,8 @@ rb_EUC_JP_to_ISO_2022_JP = { 3, /* max_input */ 5, /* max_output */ NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, - finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp + finish_eucjp_to_iso2022jp, + iso2022jp_reset_sequence_size, finish_eucjp_to_iso2022jp }; void diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 7e188a0c1f..d7ad0d0237 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -230,11 +230,15 @@ typedef struct { rb_encoding *destination_encoding; } rb_econv_t; -rb_econv_t *rb_econv_open(const char *from, const char *to, int flags); +rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int flags); rb_econv_result_t rb_econv_convert(rb_econv_t *ec, - const unsigned char **input_ptr, const unsigned char *input_stop, - unsigned char **output_ptr, unsigned char *output_stop, + const unsigned char 
**source_buffer_ptr, const unsigned char *source_buffer_end, + unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags); +int rb_econv_output(rb_econv_t *ec, + const unsigned char *str, size_t len, + unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, + size_t *required_size); void rb_econv_close(rb_econv_t *ec); /* flags for rb_econv_open */ diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb index b0d35f6230..87bb29c885 100644 --- a/tool/transcode-tblgen.rb +++ b/tool/transcode-tblgen.rb @@ -449,7 +449,8 @@ static const rb_transcoder #{input_unit_length}, /* input_unit_length */ #{max_input}, /* max_input */ #{max_output}, /* max_output */ - NULL, NULL, NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, + NULL, NULL, NULL }; End tree_code + "\n" + transcoder_code diff --git a/transcode.c b/transcode.c index 2a68b4ebd9..b8c8d1a1ae 100644 --- a/transcode.c +++ b/transcode.c @@ -937,6 +937,58 @@ rb_econv_convert(rb_econv_t *ec, return res; } +int +rb_econv_output(rb_econv_t *ec, + const unsigned char *str, size_t len, /* string in destination encoding */ + unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, + size_t *required_size) +{ + size_t reset_len, total_len; + rb_transcoding *tc = ec->last_tc; + const rb_transcoder *tr = tc->transcoder; + + /* + * Assumptions for stateful encodings: + * + * - str can be output in the reset state and doesn't change the state. + * - an extra state changing sequence is acceptable if str contains + * a state changing sequence. + * + * Currently the replacement character for stateful encoding such as + * ISO-2022-JP is "?" and it has no state changing sequence. + * So the extra state changing sequence doesn't occur when + * rb_econv_output is used for replacement characters. + * + * These assumptions may be removed in the future. + * It needs to scan str to check state changing sequences in it. 
+ */ + + reset_len = 0; + if (tr->resetsize_func) { + reset_len = tr->resetsize_func(tc); + } + + total_len = reset_len + len; + if (total_len < len) + return -1; + + if (required_size) { + *required_size = total_len; + } + + if (destination_buffer_end - *destination_buffer_ptr < total_len) + return -1; + + if (reset_len) { + *destination_buffer_ptr += tr->resetstate_func(tc, *destination_buffer_ptr); + } + + memcpy(*destination_buffer_ptr, str, len); + *destination_buffer_ptr += len; + + return 0; +} + void rb_econv_close(rb_econv_t *ec) { @@ -968,58 +1020,40 @@ more_output_buffer( *out_stop_ptr = *out_start_ptr + new_len; } -static void +static int output_replacement_character( VALUE destination, unsigned char *(*resize_destination)(VALUE, int, int), - rb_transcoding *tc, + rb_econv_t *ec, unsigned char **out_start_ptr, unsigned char **out_pos, unsigned char **out_stop_ptr) { + rb_transcoding *tc = ec->last_tc; const rb_transcoder *tr; - int max_output; rb_encoding *enc; - const char *replacement; + const unsigned char *replacement; int len; + size_t required_size; tr = tc->transcoder; - max_output = tr->max_output; enc = rb_enc_find(tr->to_encoding); - /* - * Assumption for stateful encoding: - * - * - The replacement character can be output on resetted state and doesn't - * change the state. - * - it is acceptable that extra state changing sequence if the replacement - * character contains a state changing sequence. - * - * Currently the replacement character for stateful encoding such as - * ISO-2022-JP is "?" and it has no state changing sequence. - * So the extra state changing sequence don't occur. - * - * Thease assumption may be removed in future. - * It needs to scan the replacement character to check - * state changing sequences in the replacement character. 
- */ + replacement = (const unsigned char *)get_replacement_character(enc, &len); - if (tr->resetstate_func) { - if (*out_stop_ptr - *out_pos < max_output) - more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr); - *out_pos += tr->resetstate_func(tc, *out_pos); - } + if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0) + return 0; - if (*out_stop_ptr - *out_pos < max_output) - more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr); + if (required_size < len) + return -1; /* overflow */ - replacement = get_replacement_character(enc, &len); + more_output_buffer(destination, resize_destination, required_size, out_start_ptr, out_pos, out_stop_ptr); - memcpy(*out_pos, replacement, len); + if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0) + return 0; - *out_pos += len; - return; + return -1; } #if 1 @@ -1054,8 +1088,8 @@ resume: goto resume; } else if (opt&INVALID_REPLACE) { - output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop); - goto resume; + if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0) + goto resume; } rb_econv_close(ec); rb_raise(rb_eInvalidByteSequence, "invalid byte sequence"); @@ -1068,8 +1102,8 @@ resume: goto resume; } else if (opt&UNDEF_REPLACE) { - output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop); - goto resume; + if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0) + goto resume; } rb_econv_close(ec); rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)"); @@ -1135,8 +1169,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, break; } else if (opt&INVALID_REPLACE) { - 
output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop); - break; + if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0) + break; } rb_econv_close(ec); rb_raise(rb_eInvalidByteSequence, "invalid byte sequence"); @@ -1150,8 +1184,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, break; } else if (opt&UNDEF_REPLACE) { - output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop); - break; + if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0) + break; } rb_econv_close(ec); rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)"); diff --git a/transcode_data.h b/transcode_data.h index b53a1813df..69f3048124 100644 --- a/transcode_data.h +++ b/transcode_data.h @@ -107,8 +107,9 @@ struct rb_transcoder { VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */ int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */ int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */ - int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */ int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */ + int (*resetsize_func)(rb_transcoding*); /* -> len */ + int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */ }; void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);