diff --git a/ChangeLog b/ChangeLog index 1d01173cd1..1aa50b5a04 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,37 @@ +Thu Aug 7 23:43:11 2008 Tanaka Akira + + * transcode_data.h (rb_transcoding): new field "stateful". + (rb_transcoder): preprocessor and postprocessor fields removed. + change arguments of func_ii, func_si, func_io and func_so. + new field "finish_func". + + * tool/transcode-tblgen.rb: make FUNii, FUNsi and FUNio + generatable. + + * transcode.c (transcoder_lib_table): removed. + (transcoder_table): change structure. + (transcoder_key): removed because of the above structure change. + (make_transcoder_entry): new function. + (get_transcoder_entry): ditto. + (rb_register_transcoder): follow the structure change. + (declare_transcoder): ditto. + (transcode_search_path): new function for breadth first search to + find a list of converters. + (transcode_search_path_i): new function. + (transcode_dispatch_cb): ditto. + (transcode_dispatch): use transcode_search_path. + (transcode_loop): follow the argument change. + (str_transcode): preprocessor and postprocessor stuff removed. + + * enc/trans/iso2022.erb.c: new file. ISO-2022-JP conversion + re-implemented. + + * enc/trans/japanese.erb.c: ISO-2022-JP stuff removed. + + * enc/trans/utf_16_32.erb.c: follow argument change of FUNso. + + [ruby-dev:35798] + Thu Aug 7 22:55:44 2008 TAKAO Kouji * ext/readline/README.ja: updated API document for Readline module. diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c new file mode 100644 index 0000000000..c3f6be693c --- /dev/null +++ b/enc/trans/iso2022.erb.c @@ -0,0 +1,142 @@ +#include "transcode_data.h" + +<% + map = {} + map["1b2842"] = :func_so # designate US-ASCII to G0. "ESC ( B" + map["1b284a"] = :func_so # designate JIS X 0201 latin to G0. "ESC ( J" + map["1b2440"] = :func_so # designate JIS X 0208 1978 to G0. "ESC $ @" + map["1b2442"] = :func_so # designate JIS X 0208 1983 to G0. 
"ESC $ B" + map["{00-0d,10-1a,1c-7f}"] = :func_si + + map_jisx0208_rest = {} + map_jisx0208_rest["{21-7e}"] = :func_so +%> + +<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %> +<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %> + +static VALUE +fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l) +{ + if (t->stateful[0] == 0) + return (VALUE)NOMAP; + else if (0x21 <= s[0] && s[0] <= 0x7e) + return (VALUE)&iso2022jp_to_eucjp_jisx0208_rest; + else + return (VALUE)INVALID; +} + +static int +fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) +{ + if (s[0] == 0x1b) { + if (s[1] == '(') { + switch (s[l-1]) { + case 'B': + case 'J': + t->stateful[0] = 0; + break; + } + } + else { + switch (s[l-1]) { + case '@': + case 'B': + t->stateful[0] = 1; + break; + } + } + return 0; + } + else { + o[0] = s[0] | 0x80; + o[1] = s[1] | 0x80; + return 2; + } +} + +static const rb_transcoder +rb_ISO_2022_JP_to_EUC_JP = { + "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0, + NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp +}; + +<% + map_eucjp = { + "{0e,0f,1b}" => :undef, + "{00-0d,10-1a,1c-7f}" => :func_so, + "{a1-fe}{a1-fe}" => :func_so, + "8e{a1-fe}" => :undef, + "8f{a1-fe}{a1-fe}" => :undef, + } +%> + +<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %> + +static int +fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o) +{ + unsigned char *output0 = o; + + if (t->stateful[0] == 0) { + t->stateful[0] = 1; /* initialized flag */ + t->stateful[1] = 1; /* ASCII mode */ + } + + if (l != t->stateful[1]) { + if (l == 1) { + *o++ = 0x1b; + *o++ = '('; + *o++ = 'B'; + t->stateful[1] = 1; + } + else { + *o++ = 0x1b; + *o++ = '$'; + *o++ = 'B'; + t->stateful[1] = 2; + } + } + + if (l == 1) { + *o++ = s[0] & 0x7f; + } + else { + *o++ = s[0] & 
0x7f; + *o++ = s[1] & 0x7f; + } + + return o - output0; +} + +static int +finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o) +{ + unsigned char *output0 = o; + + if (t->stateful[0] == 0) + return 0; + + if (t->stateful[1] != 1) { + *o++ = 0x1b; + *o++ = '('; + *o++ = 'B'; + t->stateful[1] = 1; + } + + return o - output0; +} + +static const rb_transcoder +rb_EUC_JP_to_ISO_2022_JP = { + "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0, + NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp +}; + +void +Init_iso2022(void) +{ + rb_register_transcoder(&rb_ISO_2022_JP_to_EUC_JP); + rb_register_transcoder(&rb_EUC_JP_to_ISO_2022_JP); +} + diff --git a/enc/trans/japanese.erb.c b/enc/trans/japanese.erb.c index dce9ab5932..dae3bf1e03 100644 --- a/enc/trans/japanese.erb.c +++ b/enc/trans/japanese.erb.c @@ -17,235 +17,8 @@ <%= transcode_tblgen "UTF-8", "EUC-JP", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %> <%= transcode_tblgen "UTF-8", "CP51932", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %> -#define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte) -enum ISO_2022_ESCSEQ { - ISO_2022_CZD = '!', - ISO_2022_C1D = '"', - ISO_2022_GZD4 = '(', - ISO_2022_G1D4 = ')', - ISO_2022_G2D4 = '*', - ISO_2022_G3D4 = '+', - ISO_2022_G1D6 = '-', - ISO_2022_G2D6 = '.', - ISO_2022_G3D6 = '/', - ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('), - ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'), - ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'), - ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'), - ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'), - ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'), - ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'), - ISO_2022_DOCS = ISO_2022_ENCODING('%','I'), - ISO_2022_IRR = '&' -}; - - -#define ISO_2022_GZ_ASCII ISO_2022_ENCODING(ISO_2022_GZD4, 'B') -#define ISO_2022_GZ_JIS_X_0201_Katakana ISO_2022_ENCODING(ISO_2022_GZD4, 'I') -#define ISO_2022_GZ_JIS_X_0201_Roman ISO_2022_ENCODING(ISO_2022_GZD4, 'J') -#define ISO_2022_GZ_JIS_C_6226_1978 
ISO_2022_ENCODING(ISO_2022_GZDM4,'@') -#define ISO_2022_GZ_JIS_X_0208_1983 ISO_2022_ENCODING(ISO_2022_GZDM4,'B') -#define ISO_2022_GZ_JIS_X_0212_1990 ISO_2022_ENCODING(ISO_2022_GZDM4,'D') -#define ISO_2022_GZ_JIS_X_0213_2000_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'O') -#define ISO_2022_GZ_JIS_X_0213_2000_2 ISO_2022_ENCODING(ISO_2022_GZDM4,'P') -#define ISO_2022_GZ_JIS_X_0213_2004_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'Q') - -#define UNSUPPORTED_MODE TRANSCODE_ERROR - -static int -get_iso_2022_mode(const unsigned char **in_pos) -{ - int new_mode; - const unsigned char *in_p = *in_pos; - switch (*in_p++) { - case '(': - switch (*in_p++) { - case 'B': case 'I': case 'J': - new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1)); - break; - default: - rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( %c)", *(in_p-1)); - break; - } - break; - case '$': - switch (*in_p++) { - case '@': case 'A': case 'B': - new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); - break; - case '(': - switch (*in_p++) { - case 'D': case 'O': case 'P': case 'Q': - new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); - break; - default: - rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ ( %c)", *(in_p-1)); - break; - } - break; - default: - rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ %c)", *(in_p-1)); - break; - } - break; - default: - rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC %c)", *(in_p-1)); - break; - } - *in_pos = in_p; - return new_mode; -} - -static void -from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - rb_transcoding *my_transcoding) -{ - const rb_transcoder *my_transcoder = my_transcoding->transcoder; - const unsigned char *in_p = *in_pos; - unsigned char *out_p = *out_pos; - int cur_mode = ISO_2022_GZ_ASCII; - unsigned char c1; - unsigned char *out_s = out_stop - my_transcoder->max_output + 1; - while (in_p < in_stop) 
{ - if (out_p >= out_s) { - int len = (out_p - *out_pos); - int new_len = (len + my_transcoder->max_output) * 2; - *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); - out_p = *out_pos + len; - out_s = *out_pos + new_len - my_transcoder->max_output; - } - c1 = *in_p++; - if (c1 == 0x1B) { - cur_mode = get_iso_2022_mode(&in_p); - } - else if (c1 == 0x1E || c1 == 0x1F) { - /* SHIFT */ - rb_raise(UNSUPPORTED_MODE, "shift is not supported"); - } - else if (c1 >= 0x80) { - rb_raise(TRANSCODE_ERROR, "invalid byte sequence"); - } - else { - switch (cur_mode) { - case ISO_2022_GZ_ASCII: - case ISO_2022_GZ_JIS_X_0201_Roman: - *out_p++ = c1; - break; - case ISO_2022_GZ_JIS_X_0201_Katakana: - *out_p++ = 0x8E; - *out_p++ = c1 | 0x80; - break; - case ISO_2022_GZ_JIS_X_0212_1990: - *out_p++ = 0x8F; - case ISO_2022_GZ_JIS_C_6226_1978: - case ISO_2022_GZ_JIS_X_0208_1983: - *out_p++ = c1 | 0x80; - *out_p++ = *in_p++ | 0x80; - break; - } - } - } - /* cleanup */ - *in_pos = in_p; - *out_pos = out_p; -} - -static int -select_iso_2022_mode(unsigned char **out_pos, int new_mode) -{ - unsigned char *out_p = *out_pos; - *out_p++ = '\x1b'; - switch (new_mode>>8) { - case ISO_2022_GZD4: - *out_p++ = new_mode >> 8; - *out_p++ = new_mode & 0x7F; - break; - case ISO_2022_GZDM4: - *out_p++ = new_mode >> 16; - if ((new_mode & 0x7F) != '@' && - (new_mode & 0x7F) != 'A' && - (new_mode & 0x7F) != 'B') - { - *out_p++ = (new_mode>>8) & 0x7F; - } - *out_p++ = new_mode & 0x7F; - break; - default: - rb_raise(UNSUPPORTED_MODE, "this mode is not supported."); - break; - } - *out_pos = out_p; - return new_mode; -} - -static void -to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - rb_transcoding *my_transcoding) -{ - const rb_transcoder *my_transcoder = my_transcoding->transcoder; - const unsigned char *in_p = *in_pos; - unsigned char *out_p = *out_pos; - int cur_mode = 
ISO_2022_GZ_ASCII, new_mode = 0; - unsigned char next_byte; - unsigned char *out_s = out_stop - my_transcoder->max_output + 1; - while (in_p < in_stop) { - if (out_p >= out_s) { - int len = (out_p - *out_pos); - int new_len = (len + my_transcoder->max_output) * 2; - *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); - out_p = *out_pos + len; - out_s = *out_pos + new_len - my_transcoder->max_output; - } - next_byte = *in_p++; - if (next_byte < 0x80) { - new_mode = ISO_2022_GZ_ASCII; - } - else if (next_byte == 0x8E) { - new_mode = ISO_2022_GZ_JIS_X_0201_Katakana; - next_byte = *in_p++; - } - else if (next_byte == 0x8F) { - new_mode = ISO_2022_GZ_JIS_X_0212_1990; - next_byte = *in_p++; - } - else { - new_mode = ISO_2022_GZ_JIS_X_0208_1983; - } - if (cur_mode != new_mode) - cur_mode = select_iso_2022_mode(&out_p, new_mode); - if (cur_mode < 0xFFFF) { - *out_p++ = next_byte & 0x7F; - } - else { - *out_p++ = next_byte & 0x7F; - *out_p++ = *in_p++ & 0x7F; - } - } - if (cur_mode != ISO_2022_GZ_ASCII) - cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII); - /* cleanup */ - *in_pos = in_p; - *out_pos = out_p; -} - -static const rb_transcoder -rb_from_ISO_2022_JP = { - "ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0, - &from_iso_2022_jp_transcoder_preprocessor, NULL, -}; - -static const rb_transcoder -rb_to_ISO_2022_JP = { - "UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1, - NULL, &to_iso_2022_jp_transcoder_postprocessor, -}; - void Init_japanese(void) { <%= transcode_register_code %> - rb_register_transcoder(&rb_from_ISO_2022_JP); - rb_register_transcoder(&rb_to_ISO_2022_JP); } diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c index 5f5af5294e..67f84e74bf 100644 --- a/enc/trans/utf_16_32.erb.c +++ b/enc/trans/utf_16_32.erb.c @@ -1,7 +1,7 @@ #include "transcode_data.h" static int -fun_so_from_utf_16be(const unsigned char* s, unsigned char* o) +fun_so_from_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { if 
(!s[0] && s[1]<0x80) { o[0] = s[1]; @@ -29,7 +29,7 @@ fun_so_from_utf_16be(const unsigned char* s, unsigned char* o) } static int -fun_so_to_utf_16be(const unsigned char* s, unsigned char* o) +fun_so_to_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { if (!(s[0]&0x80)) { o[0] = 0x00; @@ -57,7 +57,7 @@ fun_so_to_utf_16be(const unsigned char* s, unsigned char* o) } static int -fun_so_from_utf_16le(const unsigned char* s, unsigned char* o) +fun_so_from_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { if (!s[1] && s[0]<0x80) { o[0] = s[0]; @@ -85,7 +85,7 @@ fun_so_from_utf_16le(const unsigned char* s, unsigned char* o) } static int -fun_so_to_utf_16le(const unsigned char* s, unsigned char* o) +fun_so_to_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { if (!(s[0]&0x80)) { o[1] = 0x00; @@ -113,7 +113,7 @@ fun_so_to_utf_16le(const unsigned char* s, unsigned char* o) } static int -fun_so_from_utf_32be(const unsigned char* s, unsigned char* o) +fun_so_from_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { if (!s[1]) { if (s[2]==0 && s[3]<0x80) { @@ -142,7 +142,7 @@ fun_so_from_utf_32be(const unsigned char* s, unsigned char* o) } static int -fun_so_to_utf_32be(const unsigned char* s, unsigned char* o) +fun_so_to_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { o[0] = 0; if (!(s[0]&0x80)) { @@ -168,13 +168,13 @@ fun_so_to_utf_32be(const unsigned char* s, unsigned char* o) } static int -fun_so_from_utf_32le(const unsigned char* s, unsigned char* o) +fun_so_from_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { return 1; } static int -fun_so_to_utf_32le(const unsigned char* s, unsigned char* o) +fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o) { return 4; } @@ -191,7 +191,7 @@ fun_so_to_utf_32le(const unsigned char* s, unsigned char* o) static 
const rb_transcoder rb_from_UTF_16BE = { "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0, - NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be + NULL, NULL, NULL, &fun_so_from_utf_16be }; <%= @@ -217,7 +217,7 @@ rb_from_UTF_16BE = { static const rb_transcoder rb_to_UTF_16BE = { "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1, - NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be + NULL, NULL, NULL, &fun_so_to_utf_16be }; <%= @@ -232,13 +232,13 @@ rb_to_UTF_16BE = { static const rb_transcoder rb_from_UTF_16LE = { "UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0, - NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16le + NULL, NULL, NULL, &fun_so_from_utf_16le }; static const rb_transcoder rb_to_UTF_16LE = { "UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1, - NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le + NULL, NULL, NULL, &fun_so_to_utf_16le }; <%= @@ -254,13 +254,13 @@ rb_to_UTF_16LE = { static const rb_transcoder rb_from_UTF_32BE = { "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0, - NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32be + NULL, NULL, NULL, &fun_so_from_utf_32be }; static const rb_transcoder rb_to_UTF_32BE = { "UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1, - NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be + NULL, NULL, NULL, &fun_so_to_utf_32be }; <%= @@ -276,13 +276,13 @@ rb_to_UTF_32BE = { static const rb_transcoder rb_from_UTF_32LE = { "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0, - NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32le + NULL, NULL, NULL, &fun_so_from_utf_32le }; static const rb_transcoder rb_to_UTF_32LE = { "UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1, - NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le + NULL, NULL, NULL, &fun_so_to_utf_32le }; void diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 7dface484c..26e56ffb0c 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -321,12 +321,13 @@ class TestTranscode < Test::Unit::TestCase assert_raise(RuntimeError) { "\x1b(A".encode("utf-8", "iso-2022-jp") } 
assert_raise(RuntimeError) { "\x1b$(A".encode("utf-8", "iso-2022-jp") } assert_raise(RuntimeError) { "\x1b$C".encode("utf-8", "iso-2022-jp") } - assert_raise(RuntimeError) { "\x1e".encode("utf-8", "iso-2022-jp") } + assert_raise(RuntimeError) { "\x0e".encode("utf-8", "iso-2022-jp") } assert_raise(RuntimeError) { "\x80".encode("utf-8", "iso-2022-jp") } assert_raise(RuntimeError) { "\x1b$(Dd!\x1b(B".encode("utf-8", "iso-2022-jp") } assert_raise(RuntimeError) { "\u9299".encode("iso-2022-jp") } - #@@@@ TODO: the next test should actually fail, because iso-2022-jp does not include half-width kana - check_both_ways("\uff71\uff72\uff73\uff74\uff75", "\x1b(I12345\x1b(B", "iso-2022-jp") # JIS X 0201 ァィゥェォ + assert_raise(RuntimeError) { "\u9299".encode("iso-2022-jp") } + assert_raise(RuntimeError) { "\uff71\uff72\uff73\uff74\uff75".encode("iso-2022-jp") } + assert_raise(RuntimeError) { "\x1b(I12345\x1b(B".encode("utf-8", "iso-2022-jp") } end def test_iso_2022_jp_1 diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb index bdc8752f50..767ea0bbf3 100644 --- a/tool/transcode-tblgen.rb +++ b/tool/transcode-tblgen.rb @@ -234,6 +234,12 @@ class ActionMap "UNDEF" when :invalid "INVALID" + when :func_ii + "FUNii" + when :func_si + "FUNsi" + when :func_io + "FUNio" when :func_so "FUNso" when /\A([0-9a-f][0-9a-f])\z/i diff --git a/transcode.c b/transcode.c index adca763318..3a1ab70a81 100644 --- a/transcode.c +++ b/transcode.c @@ -25,53 +25,78 @@ static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace; * Dispatch data and logic */ -static st_table *transcoder_table, *transcoder_lib_table; +typedef struct { + const char *from; + const char *to; + const char *lib; /* maybe null. it means that don't load the library. 
*/ + const rb_transcoder *transcoder; +} transcoder_entry_t; -#define TRANSCODER_INTERNAL_SEPARATOR '\t' +static st_table *transcoder_table; -static char * -transcoder_key(const char *from_e, const char *to_e) +static transcoder_entry_t * +make_transcoder_entry(const char *from, const char *to) { - int to_len = strlen(to_e); - int from_len = strlen(from_e); - char *const key = xmalloc(to_len + from_len + 2); + st_data_t val; + st_table *table2; - memcpy(key, to_e, to_len); - memcpy(key + to_len + 1, from_e, from_len + 1); - key[to_len] = TRANSCODER_INTERNAL_SEPARATOR; - return key; + if (!st_lookup(transcoder_table, (st_data_t)from, &val)) { + val = (st_data_t)st_init_strcasetable(); + st_add_direct(transcoder_table, (st_data_t)from, val); + } + table2 = (st_table *)val; + if (!st_lookup(table2, (st_data_t)to, &val)) { + transcoder_entry_t *entry = ALLOC(transcoder_entry_t); + entry->from = from; + entry->to = to; + entry->lib = NULL; + entry->transcoder = NULL; + val = (st_data_t)entry; + st_add_direct(table2, (st_data_t)to, val); + } + return (transcoder_entry_t *)val; +} + +static transcoder_entry_t * +get_transcoder_entry(const char *from, const char *to) +{ + st_data_t val; + st_table *table2; + + if (!st_lookup(transcoder_table, (st_data_t)from, &val)) { + return NULL; + } + table2 = (st_table *)val; + if (!st_lookup(table2, (st_data_t)to, &val)) { + return NULL; + } + return (transcoder_entry_t *)val; } void rb_register_transcoder(const rb_transcoder *tr) { - st_data_t k, val = 0; const char *const from_e = tr->from_encoding; const char *const to_e = tr->to_encoding; - char *const key = transcoder_key(from_e, to_e); - if (st_lookup(transcoder_table, (st_data_t)key, &val)) { - xfree(key); + transcoder_entry_t *entry; + + entry = make_transcoder_entry(from_e, to_e); + if (entry->transcoder) { rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", from_e, to_e); } - k = (st_data_t)key; - if (st_delete(transcoder_lib_table, &k, &val)) { - 
xfree((char *)k); - } - st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr); + + entry->transcoder = tr; } static void declare_transcoder(const char *to, const char *from, const char *lib) { - const char *const key = transcoder_key(to, from); - st_data_t k = (st_data_t)key, val; + transcoder_entry_t *entry; - if (st_delete(transcoder_lib_table, &k, &val)) { - xfree((char *)k); - } - st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib); + entry = make_transcoder_entry(from, to); + entry->lib = lib; } #define MAX_TRANSCODER_LIBNAME_LEN 64 @@ -90,38 +115,166 @@ rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0) +typedef struct search_path_queue_tag { + struct search_path_queue_tag *next; + const char *enc; +} search_path_queue_t; + +typedef struct { + st_table *visited; + search_path_queue_t *queue; + search_path_queue_t **queue_last_ptr; + const char *base_enc; +} search_path_bfs_t; + +static int +transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg) +{ + const char *to = (const char *)key; + search_path_bfs_t *bfs = (search_path_bfs_t *)arg; + search_path_queue_t *q; + + if (st_lookup(bfs->visited, (st_data_t)to, &val)) { + return ST_CONTINUE; + } + + q = ALLOC(search_path_queue_t); + q->enc = to; + q->next = NULL; + *bfs->queue_last_ptr = q; + bfs->queue_last_ptr = &q->next; + + st_add_direct(bfs->visited, (st_data_t)to, (st_data_t)bfs->base_enc); + return ST_CONTINUE; +} + +static int +transcode_search_path(const char *from, const char *to, + void (*callback)(const char *from, const char *to, int depth, void *arg), + void *arg) +{ + search_path_bfs_t bfs; + search_path_queue_t *q; + st_data_t val; + st_table *table2; + int found; + + q = ALLOC(search_path_queue_t); + q->enc = from; + q->next = NULL; + bfs.queue_last_ptr = &q->next; + bfs.queue = q; + + bfs.visited = st_init_strcasetable(); + st_add_direct(bfs.visited, (st_data_t)from, 
(st_data_t)NULL); + + while (bfs.queue) { + q = bfs.queue; + bfs.queue = q->next; + if (!bfs.queue) + bfs.queue_last_ptr = &bfs.queue; + + if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) { + xfree(q); + continue; + } + table2 = (st_table *)val; + + if (st_lookup(table2, (st_data_t)to, &val)) { + st_add_direct(bfs.visited, (st_data_t)to, (st_data_t)q->enc); + xfree(q); + found = 1; + goto cleanup; + } + + bfs.base_enc = q->enc; + st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs); + bfs.base_enc = NULL; + + xfree(q); + } + found = 0; + +cleanup: + while (bfs.queue) { + q = bfs.queue; + bfs.queue = q->next; + xfree(q); + } + + if (found) { + const char *enc = to; + int depth = 0; + while (1) { + st_lookup(bfs.visited, (st_data_t)enc, &val); + if (!val) + break; + depth++; + enc = (const char *)val; + } + enc = to; + while (1) { + st_lookup(bfs.visited, (st_data_t)enc, &val); + if (!val) + break; + callback((const char *)val, enc, --depth, arg); + enc = (const char *)val; + } + } + + st_free_table(bfs.visited); + + return found; +} + +static void +transcode_dispatch_cb(const char *from, const char *to, int depth, void *arg) +{ + const rb_transcoder **first_transcoder_ptr = (const rb_transcoder **)arg; + + transcoder_entry_t *entry; + + if (!*first_transcoder_ptr) + return; + + entry = get_transcoder_entry(from, to); + if (!entry) + goto failed; + + if (!entry->transcoder && entry->lib) { + const char *lib = entry->lib; + int len = strlen(lib); + char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; + + entry->lib = NULL; + + if (len > MAX_TRANSCODER_LIBNAME_LEN) goto failed; + memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); + memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); + if (!rb_require(path)) goto failed; + } + if (!entry->transcoder) + goto failed; + + if (depth == 0) + *first_transcoder_ptr = entry->transcoder; + + return; + +failed: + *first_transcoder_ptr = NULL; + return; +} + 
static const rb_transcoder * transcode_dispatch(const char *from_encoding, const char *to_encoding) { - char *const key = transcoder_key(from_encoding, to_encoding); - st_data_t k, val = 0; + const rb_transcoder *first_transcoder = (rb_transcoder *)1; - while (!st_lookup(transcoder_table, (k = (st_data_t)key), &val) && - st_delete(transcoder_lib_table, &k, &val)) { - const char *const lib = (const char *)val; - int len = strlen(lib); - char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; - - xfree((char *)k); - if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL; - memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); - memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); - if (!rb_require(path)) return NULL; + if (transcode_search_path(from_encoding, to_encoding, transcode_dispatch_cb, (void *)&first_transcoder)) { + return first_transcoder; } - if (!val) { - if (!st_lookup(transcoder_table, (st_data_t)key, &val)) { - xfree(key); - /* multistep logic, via UTF-8 */ - if (!encoding_equal(from_encoding, "UTF-8") && - !encoding_equal(to_encoding, "UTF-8") && - transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ - return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ - } - return NULL; - } - } - xfree(key); - return (rb_transcoder *)val; + return NULL; } static void @@ -245,17 +398,17 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, *out_p++ = getBT3(next_info); continue; case FUNii: - next_info = (VALUE)(*my_transcoder->func_ii)(next_info); + next_info = (VALUE)(*my_transcoder->func_ii)(my_transcoding, next_info); goto follow_info; case FUNsi: - next_info = (VALUE)(*my_transcoder->func_si)(char_start); + next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, char_start, (size_t)(in_p-char_start)); goto follow_info; break; case FUNio: - out_p += (VALUE)(*my_transcoder->func_io)(next_info, out_p); + out_p += 
(VALUE)(*my_transcoder->func_io)(my_transcoding, next_info, out_p); break; case FUNso: - out_p += (VALUE)(*my_transcoder->func_so)(char_start, out_p); + out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p); break; case INVALID: goto invalid; @@ -290,6 +443,16 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, continue; } /* cleanup */ + if (my_transcoder->finish_func) { + if (out_p >= out_s) { + int len = (out_p - *out_pos); + int new_len = (len + my_transcoder->max_output) * 2; + *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); + out_p = *out_pos + len; + out_s = *out_pos + new_len - my_transcoder->max_output; + } + out_p += my_transcoder->finish_func(my_transcoding, out_p); + } *in_pos = in_p; *out_pos = out_p; } @@ -401,21 +564,8 @@ str_transcode(int argc, VALUE *argv, VALUE *self) } my_transcoding.transcoder = my_transcoder; + memset(my_transcoding.stateful, 0, sizeof(my_transcoding.stateful)); - if (my_transcoder->preprocessor) { - fromp = sp = (unsigned char *)RSTRING_PTR(str); - slen = RSTRING_LEN(str); - blen = slen + 30; /* len + margin */ - dest = rb_str_tmp_new(blen); - bp = (unsigned char *)RSTRING_PTR(dest); - my_transcoding.ruby_string_dest = dest; - (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding); - if (fromp != sp+slen) { - rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); - } - rb_str_set_len(dest, (char *)bp - RSTRING_PTR(dest)); - str = dest; - } fromp = sp = (unsigned char *)RSTRING_PTR(str); slen = RSTRING_LEN(str); blen = slen + 30; /* len + margin */ @@ -431,21 +581,6 @@ str_transcode(int argc, VALUE *argv, VALUE *self) buf = (unsigned char *)RSTRING_PTR(dest); *bp = '\0'; rb_str_set_len(dest, bp - buf); - if (my_transcoder->postprocessor) { - str = dest; - fromp = sp = (unsigned char *)RSTRING_PTR(str); - slen = RSTRING_LEN(str); - blen = slen + 30; /* len + margin */ - dest 
= rb_str_tmp_new(blen); - bp = (unsigned char *)RSTRING_PTR(dest); - my_transcoding.ruby_string_dest = dest; - (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding); - if (fromp != sp+slen) { - rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); - } - buf = (unsigned char *)RSTRING_PTR(dest); - rb_str_set_len(dest, bp - buf); - } if (encoding_equal(my_transcoder->to_encoding, to_e)) { final_encoding = 1; @@ -541,7 +676,6 @@ void Init_transcode(void) { transcoder_table = st_init_strcasetable(); - transcoder_lib_table = st_init_strcasetable(); sym_invalid = ID2SYM(rb_intern("invalid")); sym_undef = ID2SYM(rb_intern("undef")); diff --git a/transcode_data.h b/transcode_data.h index 3fecaf6acc..92f8ade436 100644 --- a/transcode_data.h +++ b/transcode_data.h @@ -63,6 +63,8 @@ typedef struct rb_transcoding { VALUE ruby_string_dest; /* the String used as the conversion destination, or NULL if something else is being converted */ unsigned char *(*flush_func)(struct rb_transcoding*, int, int); + + unsigned char stateful[256]; /* opaque data for stateful encoding */ } rb_transcoding; /* static structure, one per supported encoding pair */ @@ -72,12 +74,11 @@ typedef struct rb_transcoder { const BYTE_LOOKUP *conv_tree_start; int max_output; int from_utf8; - void (*preprocessor)(const unsigned char**, unsigned char**, const unsigned char*, unsigned char*, struct rb_transcoding *); - void (*postprocessor)(const unsigned char**, unsigned char**, const unsigned char*, unsigned char*, struct rb_transcoding *); - VALUE (*func_ii)(VALUE); /* info -> info */ - VALUE (*func_si)(const unsigned char *); /* start -> info */ - int (*func_io)(VALUE, const unsigned char*); /* info -> output */ - int (*func_so)(const unsigned char*, unsigned char*); /* start -> output */ + VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */ + VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */ + int 
(*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */ + int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */ + int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */ } rb_transcoder; void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);