mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* transcode.c (str_transcode, transcode_dispatch): added two-step conversion logic via UTF-8.
* transcode.c: some minor formatting fixes * transcode_data.h, transcode_data_iso_8859.c: Shortened extremely frequently used macros to shorten file length. * test/ruby/test_transcode.rb: Fixed name of test class; added setup method to ensure all necessary encodings exist; split tests into more test methods; added tests; fixed ordering of arguments in assert_equal to have expected result first. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14236 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
4f13113945
commit
f2b0dba1cf
5 changed files with 1452 additions and 1377 deletions
17
ChangeLog
17
ChangeLog
|
@ -52,6 +52,23 @@ Fri Dec 14 16:06:18 2007 Yukihiro Matsumoto <matz@ruby-lang.org>
|
|||
|
||||
* string.c (rb_str_casecmp): ditto.
|
||||
|
||||
Fri Dec 14 15:25:30 2007 Martin Duerst <duerst@it.aoyama.ac.jp>
|
||||
|
||||
* transcode.c (encoding_equal): new function.
|
||||
|
||||
* transcode.c (str_transcode, transcode_dispatch): added two-step
|
||||
conversion logic via UTF-8.
|
||||
|
||||
* transcode.c: some minor formatting fixes
|
||||
|
||||
* transcode_data.h, transcode_data_iso_8859.c: Shortened
|
||||
extremely frequently used macros to shorten file length.
|
||||
|
||||
* test/ruby/test_transcode.rb: Fixed name of test class;
|
||||
added setup method to ensure all necessary encodings exist;
|
||||
split tests into more test methods; added tests; fixed ordering
|
||||
of arguments in assert_equal to have expected result first.
|
||||
|
||||
Fri Dec 14 13:47:54 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
|
||||
* common.mk (ruby.imp): fix for circular dependency. a patch from
|
||||
|
|
|
@ -1,8 +1,22 @@
|
|||
# -*- encoding: US-ASCII -*- # make sure this runs in binary mode
|
||||
# -*- encoding: ASCII-8BIT -*- # make sure this runs in binary mode
|
||||
|
||||
require 'test/unit'
|
||||
class TestConvert < Test::Unit::TestCase
|
||||
def test_basic
|
||||
class TestTranscode < Test::Unit::TestCase
|
||||
def setup # trick to create all the necessary encodings
|
||||
all_encodings = [ 'ISO-8859-1', 'ISO-8859-2',
|
||||
'ISO-8859-3', 'ISO-8859-4',
|
||||
'ISO-8859-5', 'ISO-8859-6',
|
||||
'ISO-8859-7', 'ISO-8859-8',
|
||||
'ISO-8859-9', 'ISO-8859-10',
|
||||
'ISO-8859-11', 'ISO-8859-13',
|
||||
'ISO-8859-14', 'ISO-8859-15'
|
||||
]
|
||||
all_encodings.each do |enc|
|
||||
'abc'.encode(enc, 'UTF-8')
|
||||
end
|
||||
end
|
||||
|
||||
def test_errors
|
||||
# we don't have semantics for conversion without attribute yet
|
||||
# maybe 'convert to UTF-8' would be nice :-)
|
||||
assert_raise(ArgumentError) { 'abc'.encode }
|
||||
|
@ -13,43 +27,63 @@ class TestConvert < Test::Unit::TestCase
|
|||
assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') }
|
||||
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') }
|
||||
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') }
|
||||
assert_equal('abc'.force_encoding('utf-8').encode('iso-8859-1'), 'abc')
|
||||
# check that encoding is kept when no conversion is done
|
||||
assert_equal('abc'.force_encoding('Shift_JIS').encode('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
|
||||
assert_equal('abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
|
||||
# assert that encoding is correctly set
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding, "D\u00FCrst".encoding)
|
||||
# check that Encoding can be used as parameter
|
||||
assert_equal("D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')), "D\u00FCrst")
|
||||
end
|
||||
|
||||
# temporary, fix encoding
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-1'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-2'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-3'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-4'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-9'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-10'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-13'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-14'), "D\u00FCrst")
|
||||
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-15'), "D\u00FCrst")
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-1'), "D\xFCrst".force_encoding('iso-8859-1'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-2'), "D\xFCrst".force_encoding('iso-8859-2'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-3'), "D\xFCrst".force_encoding('iso-8859-3'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-4'), "D\xFCrst".force_encoding('iso-8859-4'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-9'), "D\xFCrst".force_encoding('iso-8859-9'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-10'), "D\xFCrst".force_encoding('iso-8859-10'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-13'), "D\xFCrst".force_encoding('iso-8859-13'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-14'), "D\xFCrst".force_encoding('iso-8859-14'))
|
||||
assert_equal("D\u00FCrst".encode('iso-8859-15'), "D\xFCrst".force_encoding('iso-8859-15'))
|
||||
# test length extension
|
||||
assert_equal(("\xA4"*20).encode('utf-8', 'iso-8859-15'), "\u20AC"*20)
|
||||
assert_equal(("\xA4"*20).encode!('utf-8', 'iso-8859-15'), "\u20AC"*20)
|
||||
|
||||
def test_arguments
|
||||
assert_equal('abc', 'abc'.force_encoding('utf-8').encode('iso-8859-1'))
|
||||
# check that encoding is kept when no conversion is done
|
||||
assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode('Shift_JIS'))
|
||||
assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'))
|
||||
# assert that encoding is correctly set
|
||||
assert_equal("D\u00FCrst".encoding, "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding)
|
||||
# check that Encoding can be used as parameter
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')))
|
||||
end
|
||||
|
||||
def test_length
|
||||
assert_equal("\u20AC"*20, ("\xA4"*20).encode('utf-8', 'iso-8859-15'))
|
||||
assert_equal("\u20AC"*20, ("\xA4"*20).encode!('utf-8', 'iso-8859-15'))
|
||||
assert_equal("\u20AC"*2000, ("\xA4"*2000).encode('utf-8', 'iso-8859-15'))
|
||||
assert_equal("\u20AC"*2000, ("\xA4"*2000).encode!('utf-8', 'iso-8859-15'))
|
||||
assert_equal("\u20AC"*200000, ("\xA4"*200000).encode('utf-8', 'iso-8859-15'))
|
||||
assert_equal("\u20AC"*200000, ("\xA4"*200000).encode!('utf-8', 'iso-8859-15'))
|
||||
end
|
||||
|
||||
def test_encodings
|
||||
# temporary, fix encoding
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-1'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-2'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-3'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-4'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-9'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-10'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-13'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-14'))
|
||||
assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-15'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-1'), "D\u00FCrst".encode('iso-8859-1'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\u00FCrst".encode('iso-8859-2'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-3').encoding, "D\u00FCrst".encode('iso-8859-3').encoding)
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-4'), "D\u00FCrst".encode('iso-8859-4'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-9'), "D\u00FCrst".encode('iso-8859-9'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-10'), "D\u00FCrst".encode('iso-8859-10'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-13'), "D\u00FCrst".encode('iso-8859-13'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-14'), "D\u00FCrst".encode('iso-8859-14'))
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-15'), "D\u00FCrst".encode('iso-8859-15'))
|
||||
assert_equal("r\xE9sum\xE9".force_encoding('iso-8859-1'), "r\u00E9sum\u00E9".encode('iso-8859-1'))
|
||||
assert_equal("el\xF5\xEDr\xE1s".force_encoding('iso-8859-2'),
|
||||
"\u0065\u006C\u0151\u00ED\u0072\u00E1\u0073".encode('iso-8859-2'))
|
||||
assert_equal("\xE3\xCA\xC8".force_encoding('iso-8859-6'), "\u0643\u062A\u0628".encode('iso-8859-6'))
|
||||
assert_equal( "\xDF\xD5\xE0\xD5\xD2\xDE\xD4".force_encoding('iso-8859-5'),
|
||||
"\u043F\u0435\u0440\u0435\u0432\u043E\u0434".encode('iso-8859-5'))
|
||||
end
|
||||
|
||||
def test_twostep
|
||||
assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1'))
|
||||
end
|
||||
|
||||
def test_all_bytes
|
||||
encodings_8859 = [
|
||||
'ISO-8859-1', 'ISO-8859-2',
|
||||
|
@ -69,7 +103,7 @@ class TestConvert < Test::Unit::TestCase
|
|||
test_start.encode('UTF-8','ISO-8859-1').encode('ISO-8859-1')
|
||||
encodings_8859.each do |enc|
|
||||
test_start = all_bytes
|
||||
assert_equal(test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT'), test_start)
|
||||
assert_equal(test_start, test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT'))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
88
transcode.c
88
transcode.c
|
@ -81,8 +81,8 @@ register_transcoder(const char *from_e, const char *to_e,
|
|||
{
|
||||
static int n = 0;
|
||||
if (n >= MAX_TRANSCODERS) {
|
||||
/* we are initializing, is it okay to use rb_raise here? */
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
|
||||
/* we are initializing, is it okay to use rb_raise here? */
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
|
||||
}
|
||||
transcoder_table[n].from_encoding = from_e;
|
||||
transcoder_table[n].to_encoding = to_e;
|
||||
|
@ -127,25 +127,37 @@ init_transcoder_table(void)
|
|||
register_transcoder(NULL, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
||||
static int
|
||||
encoding_equal(const char* encoding1, const char* encoding2)
|
||||
{
|
||||
return 0==strcasecmp(encoding1, encoding2);
|
||||
}
|
||||
|
||||
static transcoder*
|
||||
transcode_dispatch(const char* from_encoding, const char* to_encoding)
|
||||
{
|
||||
transcoder *candidate = transcoder_table;
|
||||
|
||||
for (candidate = transcoder_table; candidate->from_encoding; candidate++)
|
||||
if (0==strcasecmp(from_encoding, candidate->from_encoding)
|
||||
&& 0==strcasecmp(to_encoding, candidate->to_encoding))
|
||||
break;
|
||||
/* in the future, add multistep transcoding logic here */
|
||||
return candidate->from_encoding ? candidate : NULL;
|
||||
for (candidate = transcoder_table; candidate->from_encoding; candidate++) {
|
||||
if (encoding_equal(from_encoding, candidate->from_encoding)
|
||||
&& encoding_equal(to_encoding, candidate->to_encoding)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
/* multistep logic, via UTF-8 */
|
||||
if (!encoding_equal(from_encoding, "UTF-8")
|
||||
&& !encoding_equal(to_encoding, "UTF-8")
|
||||
&& transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */
|
||||
return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* dynamic structure, one per conversion (similar to iconv_t) */
|
||||
/* may carry conversion state (e.g. for iso-2022-jp) */
|
||||
typedef struct transcoding {
|
||||
VALUE ruby_string_dest; /* the String used as the conversion destination,
|
||||
or NULL if something else is being converted */
|
||||
or NULL if something else is being converted */
|
||||
char *(*flush_func)(struct transcoding*, int, int);
|
||||
} transcoding;
|
||||
|
||||
|
@ -201,7 +213,7 @@ transcode_loop(char **in_pos, char **out_pos,
|
|||
}
|
||||
next_table = next_table->info[next_offset];
|
||||
goto follow_byte;
|
||||
/* maybe rewrite the following cases to use fallthrough???? */
|
||||
/* maybe rewrite the following cases to use fallthrough???? */
|
||||
case ZERObt: /* drop input */
|
||||
continue;
|
||||
case ONEbt:
|
||||
|
@ -262,6 +274,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
|
|||
VALUE from_encval, to_encval;
|
||||
transcoder *my_transcoder;
|
||||
transcoding my_transcoding;
|
||||
int final_encoding = 0;
|
||||
|
||||
if (argc<1 || argc>2) {
|
||||
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
|
||||
|
@ -275,7 +288,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
|
|||
to_e = rb_enc_name(to_enc);
|
||||
}
|
||||
if (argc==1) {
|
||||
from_encidx = rb_enc_get_index(str);
|
||||
from_encidx = rb_enc_get_index(str);
|
||||
from_enc = rb_enc_from_index(from_encidx);
|
||||
from_e = rb_enc_name(from_enc);
|
||||
}
|
||||
|
@ -298,33 +311,44 @@ str_transcode(int argc, VALUE *argv, VALUE str)
|
|||
if (strcasecmp(from_e, to_e) == 0) {
|
||||
return Qnil;
|
||||
}
|
||||
if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
|
||||
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
|
||||
|
||||
while (!final_encoding) /* loop for multistep transcoding */
|
||||
{ /* later, maybe use smaller intermediate strings for very long strings */
|
||||
if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
|
||||
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
|
||||
}
|
||||
|
||||
fromp = sp = RSTRING_PTR(str);
|
||||
slen = RSTRING_LEN(str);
|
||||
blen = slen + 30; /* len + margin */
|
||||
dest = rb_str_tmp_new(blen);
|
||||
bp = RSTRING_PTR(dest);
|
||||
my_transcoding.ruby_string_dest = dest;
|
||||
my_transcoding.flush_func = str_transcoding_resize;
|
||||
|
||||
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
|
||||
if (fromp != sp+slen) {
|
||||
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
|
||||
}
|
||||
buf = RSTRING_PTR(dest);
|
||||
*bp = '\0';
|
||||
rb_str_set_len(dest, bp - buf);
|
||||
|
||||
rb_enc_associate(dest, to_enc);
|
||||
|
||||
if (encoding_equal(my_transcoder->to_encoding, to_e)) {
|
||||
final_encoding = 1;
|
||||
}
|
||||
else {
|
||||
from_e = my_transcoder->to_encoding;
|
||||
str = dest;
|
||||
}
|
||||
}
|
||||
|
||||
fromp = sp = RSTRING_PTR(str);
|
||||
slen = RSTRING_LEN(str);
|
||||
blen = slen + 30; /* len + margin */
|
||||
dest = rb_str_tmp_new(blen);
|
||||
bp = RSTRING_PTR(dest);
|
||||
my_transcoding.ruby_string_dest = dest;
|
||||
my_transcoding.flush_func = str_transcoding_resize;
|
||||
|
||||
/* for simple testing: */
|
||||
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
|
||||
if (fromp != sp+slen) {
|
||||
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
|
||||
}
|
||||
buf = RSTRING_PTR(dest);
|
||||
*bp = '\0';
|
||||
rb_str_set_len(dest, bp - buf);
|
||||
|
||||
/* set encoding */
|
||||
if (!to_enc) {
|
||||
to_encidx = rb_enc_replicate(to_e, rb_default_encoding());
|
||||
to_enc = rb_enc_from_index(to_encidx);
|
||||
}
|
||||
rb_enc_associate(dest, to_enc);
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
|
|
@ -22,10 +22,10 @@ typedef struct byte_lookup {
|
|||
#define UNDEF (PType 0x09) /* legal but undefined */
|
||||
#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. remove */
|
||||
|
||||
#define output1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
|
||||
#define output2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
|
||||
#define output3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
|
||||
#define output4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
|
||||
#define o1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
|
||||
#define o2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
|
||||
#define o3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
|
||||
#define o4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
|
||||
|
||||
#define getBT1(a) (((a)>> 8)&0xFF)
|
||||
#define getBT2(a) (((a)>>16)&0xFF)
|
||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue