mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined.
* transcode.c (output_hex_charref): new function. (rb_econv_convert): call output_hex_charref if ECONV_UNDEF_HEX_CHARREF. (Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19162 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
7d3598ee60
commit
c500c37fbc
4 changed files with 94 additions and 0 deletions
|
@ -1,3 +1,12 @@
|
||||||
|
Sat Sep 6 05:22:29 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
|
* include/ruby/encoding.h (ECONV_UNDEF_HEX_CHARREF): defined.
|
||||||
|
|
||||||
|
* transcode.c (output_hex_charref): new function.
|
||||||
|
(rb_econv_convert): call output_hex_charref if
|
||||||
|
ECONV_UNDEF_HEX_CHARREF.
|
||||||
|
(Init_transcode): Encoding::Converter::UNDEF_HEX_CHARREF added.
|
||||||
|
|
||||||
Sat Sep 6 03:52:47 2008 Tanaka Akira <akr@fsij.org>
|
Sat Sep 6 03:52:47 2008 Tanaka Akira <akr@fsij.org>
|
||||||
|
|
||||||
* transcode.c (rb_econv_convert): use ECONV_INVALID_MASK and
|
* transcode.c (rb_econv_convert): use ECONV_INVALID_MASK and
|
||||||
|
|
|
@ -255,6 +255,7 @@ void rb_econv_binmode(rb_econv_t *ec);
|
||||||
|
|
||||||
#define ECONV_UNDEF_MASK 0x00f0
|
#define ECONV_UNDEF_MASK 0x00f0
|
||||||
#define ECONV_UNDEF_REPLACE 0x0020
|
#define ECONV_UNDEF_REPLACE 0x0020
|
||||||
|
#define ECONV_UNDEF_HEX_CHARREF 0x0030
|
||||||
|
|
||||||
/* effective only if output is ascii compatible */
|
/* effective only if output is ascii compatible */
|
||||||
#define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100
|
#define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100
|
||||||
|
|
|
@ -670,4 +670,19 @@ class TestEncodingConverter < Test::Unit::TestCase
|
||||||
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace, :replace => "X")
|
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace, :replace => "X")
|
||||||
assert_equal("a X b", ec.convert("a \u3042 b"))
|
assert_equal("a X b", ec.convert("a \u3042 b"))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_hex_charref
|
||||||
|
ec = Encoding::Converter.new("UTF-8", "US-ASCII", Encoding::Converter::UNDEF_HEX_CHARREF)
|
||||||
|
assert_equal("あ", ec.convert("\u3042"))
|
||||||
|
|
||||||
|
ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNDEF_HEX_CHARREF)
|
||||||
|
assert_equal("\xa4\xcf\xa4\xa1\xa4\xa4♥\xa1\xa3".force_encoding("euc-jp"),
|
||||||
|
ec.convert("\u{306f 3041 3044 2665 3002}"))
|
||||||
|
|
||||||
|
ec = Encoding::Converter.new("UTF-8", "ISO-2022-JP", Encoding::Converter::UNDEF_HEX_CHARREF)
|
||||||
|
assert_equal("\e$B$O$!$$\e(B♥\e$B!#".force_encoding("ISO-2022-JP"),
|
||||||
|
ec.convert("\u{306f 3041 3044 2665 3002}"))
|
||||||
|
assert_equal("\e(B".force_encoding("ISO-2022-JP"),
|
||||||
|
ec.finish)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
69
transcode.c
69
transcode.c
|
@ -34,6 +34,12 @@ static VALUE sym_finished;
|
||||||
static VALUE sym_output_followed_by_input;
|
static VALUE sym_output_followed_by_input;
|
||||||
static VALUE sym_incomplete_input;
|
static VALUE sym_incomplete_input;
|
||||||
|
|
||||||
|
static unsigned char *
|
||||||
|
allocate_converted_string(const char *sname, const char *dname,
|
||||||
|
const unsigned char *str, size_t len,
|
||||||
|
unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
|
||||||
|
size_t *dst_len_ptr);
|
||||||
|
|
||||||
/* dynamic structure, one per conversion (similar to iconv_t) */
|
/* dynamic structure, one per conversion (similar to iconv_t) */
|
||||||
/* may carry conversion state (e.g. for iso-2022-jp) */
|
/* may carry conversion state (e.g. for iso-2022-jp) */
|
||||||
typedef struct rb_transcoding {
|
typedef struct rb_transcoding {
|
||||||
|
@ -1261,6 +1267,62 @@ rb_econv_convert0(rb_econv_t *ec,
|
||||||
|
|
||||||
static int output_replacement_character(rb_econv_t *ec);
|
static int output_replacement_character(rb_econv_t *ec);
|
||||||
|
|
||||||
|
static int
|
||||||
|
output_hex_charref(rb_econv_t *ec)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
unsigned char utfbuf[1024];
|
||||||
|
const unsigned char *utf;
|
||||||
|
size_t utf_len;
|
||||||
|
int utf_allocated = 0;
|
||||||
|
char charef_buf[16];
|
||||||
|
const unsigned char *p;
|
||||||
|
|
||||||
|
if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
|
||||||
|
utf = ec->last_error.error_bytes_start;
|
||||||
|
utf_len = ec->last_error.error_bytes_len;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
|
||||||
|
ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
|
||||||
|
utfbuf, sizeof(utfbuf),
|
||||||
|
&utf_len);
|
||||||
|
if (!utf)
|
||||||
|
return -1;
|
||||||
|
if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
|
||||||
|
utf_allocated = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (utf_len % 4 != 0)
|
||||||
|
goto fail;
|
||||||
|
|
||||||
|
p = utf;
|
||||||
|
while (4 <= utf_len) {
|
||||||
|
unsigned int u = 0;
|
||||||
|
u += p[0] << 24;
|
||||||
|
u += p[1] << 16;
|
||||||
|
u += p[2] << 8;
|
||||||
|
u += p[3];
|
||||||
|
snprintf(charef_buf, sizeof(charef_buf), "&#x%x;", u);
|
||||||
|
|
||||||
|
ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
|
||||||
|
if (ret == -1)
|
||||||
|
goto fail;
|
||||||
|
|
||||||
|
p += 4;
|
||||||
|
utf_len -= 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (utf_allocated)
|
||||||
|
xfree((void *)utf);
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
if (utf_allocated)
|
||||||
|
xfree((void *)utf);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
rb_econv_result_t
|
rb_econv_result_t
|
||||||
rb_econv_convert(rb_econv_t *ec,
|
rb_econv_convert(rb_econv_t *ec,
|
||||||
const unsigned char **input_ptr, const unsigned char *input_stop,
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
||||||
|
@ -1305,6 +1367,11 @@ rb_econv_convert(rb_econv_t *ec,
|
||||||
if (output_replacement_character(ec) == 0)
|
if (output_replacement_character(ec) == 0)
|
||||||
goto resume;
|
goto resume;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ECONV_UNDEF_HEX_CHARREF:
|
||||||
|
if (output_hex_charref(ec) == 0)
|
||||||
|
goto resume;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3424,10 +3491,12 @@ Init_transcode(void)
|
||||||
rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
|
rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
|
||||||
rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
|
rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
|
||||||
rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
|
rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
|
||||||
|
|
||||||
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
|
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
|
||||||
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
|
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
|
||||||
rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
|
rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
|
||||||
rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
|
rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
|
||||||
|
rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
|
||||||
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
|
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
|
||||||
rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT));
|
rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT));
|
||||||
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER));
|
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER));
|
||||||
|
|
Loading…
Reference in a new issue