mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* transcode_data.h (rb_transcoder): add resetstate_func field for
resetting a state of stateful encoding. * enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify finish_eucjp_to_iso2022jp for resetstate_func. * tool/transcode-tblgen.rb: specify NULL for resetstate_func. * transcode.c (output_replacement_character): call resetstate_func before appending the replacement character. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18503 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
5f9b877ebe
commit
94ca2d94de
6 changed files with 80 additions and 27 deletions
13
ChangeLog
13
ChangeLog
|
@ -1,3 +1,16 @@
|
||||||
|
Tue Aug 12 07:41:13 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
|
* transcode_data.h (rb_transcoder): add resetstate_func field for
|
||||||
|
resetting a state of stateful encoding.
|
||||||
|
|
||||||
|
* enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify
|
||||||
|
finish_eucjp_to_iso2022jp for resetstate_func.
|
||||||
|
|
||||||
|
* tool/transcode-tblgen.rb: specify NULL for resetstate_func.
|
||||||
|
|
||||||
|
* transcode.c (output_replacement_character): call resetstate_func
|
||||||
|
before appending the replacement character.
|
||||||
|
|
||||||
Tue Aug 12 07:19:24 2008 Tanaka Akira <akr@fsij.org>
|
Tue Aug 12 07:19:24 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
* transcode.c (get_replacement_character): extracted from
|
* transcode.c (get_replacement_character): extracted from
|
||||||
|
|
|
@ -136,7 +136,8 @@ rb_EUC_JP_to_ISO_2022_JP = {
|
||||||
1, /* input_unit_length */
|
1, /* input_unit_length */
|
||||||
3, /* max_input */
|
3, /* max_input */
|
||||||
5, /* max_output */
|
5, /* max_output */
|
||||||
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
|
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
|
||||||
|
finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
|
||||||
};
|
};
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -303,6 +303,9 @@ class TestTranscode < Test::Unit::TestCase
|
||||||
"\xdc\x00".encode("EUC-JP", "UTF-16BE", :invalid=>:replace), "[ruby-dev:35776]")
|
"\xdc\x00".encode("EUC-JP", "UTF-16BE", :invalid=>:replace), "[ruby-dev:35776]")
|
||||||
assert_equal("ab?cd?ef",
|
assert_equal("ab?cd?ef",
|
||||||
"\0a\0b\xdc\x00\0c\0d\xdf\x00\0e\0f".encode("EUC-JP", "UTF-16BE", :invalid=>:replace))
|
"\0a\0b\xdc\x00\0c\0d\xdf\x00\0e\0f".encode("EUC-JP", "UTF-16BE", :invalid=>:replace))
|
||||||
|
|
||||||
|
assert_equal("\e$B!!\e(B?".force_encoding("ISO-2022-JP"),
|
||||||
|
"\xA1\xA1\xFF".encode("ISO-2022-JP", "EUC-JP", invalid: :replace))
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_undef_replace
|
def test_undef_replace
|
||||||
|
|
|
@ -446,7 +446,7 @@ static const rb_transcoder
|
||||||
#{input_unit_length}, /* input_unit_length */
|
#{input_unit_length}, /* input_unit_length */
|
||||||
#{max_input}, /* max_input */
|
#{max_input}, /* max_input */
|
||||||
#{max_output}, /* max_output */
|
#{max_output}, /* max_output */
|
||||||
NULL, NULL, NULL, NULL, NULL
|
NULL, NULL, NULL, NULL, NULL, NULL
|
||||||
};
|
};
|
||||||
End
|
End
|
||||||
tree_code + "\n" + transcoder_code
|
tree_code + "\n" + transcoder_code
|
||||||
|
|
85
transcode.c
85
transcode.c
|
@ -292,19 +292,6 @@ get_replacement_character(rb_encoding *enc, int *len_ret)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
output_replacement_character(unsigned char **out_pp, rb_encoding *enc)
|
|
||||||
{
|
|
||||||
const char *replacement;
|
|
||||||
int len;
|
|
||||||
replacement = get_replacement_character(enc, &len);
|
|
||||||
|
|
||||||
memcpy(*out_pp, replacement, len);
|
|
||||||
|
|
||||||
*out_pp += len;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Transcoding engine logic
|
* Transcoding engine logic
|
||||||
*/
|
*/
|
||||||
|
@ -818,6 +805,62 @@ more_output_buffer(
|
||||||
*out_stop_ptr = *out_start_ptr + new_len;
|
*out_stop_ptr = *out_start_ptr + new_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
output_replacement_character(
|
||||||
|
VALUE destination,
|
||||||
|
unsigned char *(*resize_destination)(VALUE, int, int),
|
||||||
|
rb_trans_t *ts,
|
||||||
|
unsigned char **out_start_ptr,
|
||||||
|
unsigned char **out_pos,
|
||||||
|
unsigned char **out_stop_ptr)
|
||||||
|
|
||||||
|
{
|
||||||
|
rb_transcoding *tc;
|
||||||
|
const rb_transcoder *tr;
|
||||||
|
int max_output;
|
||||||
|
rb_encoding *enc;
|
||||||
|
const char *replacement;
|
||||||
|
int len;
|
||||||
|
|
||||||
|
tc = ts->elems[ts->num_trans-1].tc;
|
||||||
|
tr = tc->transcoder;
|
||||||
|
max_output = tr->max_output;
|
||||||
|
enc = rb_enc_find(tr->to_encoding);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assumption for stateful encoding:
|
||||||
|
*
|
||||||
|
* - The replacement character can be output in the reset state and doesn't
|
||||||
|
* change the state.
|
||||||
|
* - It is acceptable to emit an extra state changing sequence if the replacement
|
||||||
|
* character contains a state changing sequence.
|
||||||
|
*
|
||||||
|
* Currently the replacement character for stateful encoding such as
|
||||||
|
* ISO-2022-JP is "?" and it has no state changing sequence.
|
||||||
|
* So the extra state changing sequence doesn't occur.
|
||||||
|
*
|
||||||
|
* These assumptions may be removed in the future.
|
||||||
|
* Removing them requires scanning the replacement character to check for
|
||||||
|
* state changing sequences in the replacement character.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (tr->resetstate_func) {
|
||||||
|
if (*out_stop_ptr - *out_pos < max_output)
|
||||||
|
more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
|
||||||
|
*out_pos += tr->resetstate_func(tc, *out_pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*out_stop_ptr - *out_pos < max_output)
|
||||||
|
more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
|
||||||
|
|
||||||
|
replacement = get_replacement_character(enc, &len);
|
||||||
|
|
||||||
|
memcpy(*out_pos, replacement, len);
|
||||||
|
|
||||||
|
*out_pos += len;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
static void
|
static void
|
||||||
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
||||||
|
@ -848,9 +891,7 @@ resume:
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
else if (opt&INVALID_REPLACE) {
|
else if (opt&INVALID_REPLACE) {
|
||||||
if (out_stop - *out_pos < max_output)
|
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
||||||
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
|
||||||
output_replacement_character(out_pos, rb_enc_find(to_encoding));
|
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
rb_trans_close(ts);
|
rb_trans_close(ts);
|
||||||
|
@ -864,9 +905,7 @@ resume:
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
else if (opt&UNDEF_REPLACE) {
|
else if (opt&UNDEF_REPLACE) {
|
||||||
if (out_stop - *out_pos < max_output)
|
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
||||||
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
|
||||||
output_replacement_character(out_pos, rb_enc_find(to_encoding));
|
|
||||||
goto resume;
|
goto resume;
|
||||||
}
|
}
|
||||||
rb_trans_close(ts);
|
rb_trans_close(ts);
|
||||||
|
@ -931,9 +970,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (opt&INVALID_REPLACE) {
|
else if (opt&INVALID_REPLACE) {
|
||||||
if (out_stop - *out_pos < max_output)
|
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
||||||
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
|
||||||
output_replacement_character(out_pos, rb_enc_find(to_encoding));
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
rb_trans_close(ts);
|
rb_trans_close(ts);
|
||||||
|
@ -948,9 +985,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (opt&UNDEF_REPLACE) {
|
else if (opt&UNDEF_REPLACE) {
|
||||||
if (out_stop - *out_pos < max_output)
|
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
||||||
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
|
|
||||||
output_replacement_character(out_pos, rb_enc_find(to_encoding));
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
rb_trans_close(ts);
|
rb_trans_close(ts);
|
||||||
|
|
|
@ -95,6 +95,7 @@ struct rb_transcoder {
|
||||||
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
|
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
|
||||||
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
|
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
|
||||||
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
|
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
|
||||||
|
int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
|
||||||
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
|
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue