* transcode.c (str_transcode, transcode_dispatch): added two-step conversion logic via UTF-8.

* transcode.c: some minor formatting fixes * transcode_data.h, transcode_data_iso_8859.c: Shortened extremely frequently used macros to shorten file length. * test/ruby/test_transcode.rb: Fixed name of test class; added setup method to ensure all necessary encodings exist; split tests into more test methods; added tests; fixed ordering of arguments in assert_equal to have expected result first. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14236 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2022-11-09 12:17:21 -05:00 · 2007-12-15 05:42:25 +00:00 · 2007-12-15 05:42:25 +00:00 · f2b0dba1cf
commit f2b0dba1cf
parent 4f13113945
5 changed files with 1452 additions and 1377 deletions
--- a/17
+++ b/17
@ -52,6 +52,23 @@ Fri Dec 14 16:06:18 2007  Yukihiro Matsumoto  <matz@ruby-lang.org>

 	* string.c (rb_str_casecmp): ditto.

+Fri Dec 14 15:25:30 2007  Martin Duerst  <duerst@it.aoyama.ac.jp>
+
+        * transcode.c (encoding_equal): new function.
+
+	* transcode.c (str_transcode, transcode_dispatch): added two-step
+          conversion logic via UTF-8.
+
+	* transcode.c: some minor formatting fixes
+
+	* transcode_data.h, transcode_data_iso_8859.c: Shortened
+	  extremely frequently used macros to shorten file length.
+
+	* test/ruby/test_transcode.rb: Fixed name of test class;
+	  added setup method to ensure all necessary encodings exist;
+	  split tests into more test methods; added tests; fixed ordering
+	  of arguments in assert_equal to have expected result first.
+
 Fri Dec 14 13:47:54 2007  Nobuyoshi Nakada  <nobu@ruby-lang.org>

 	* common.mk (ruby.imp): fix for circular dependency.  a patch from
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@ -1,8 +1,22 @@
-# -*- encoding: US-ASCII -*-   # make sure this runs in binary mode
+# -*- encoding: ASCII-8BIT -*-   # make sure this runs in binary mode

 require 'test/unit'
-class TestConvert < Test::Unit::TestCase
-  def test_basic
+class TestTranscode < Test::Unit::TestCase
+  def setup # trick to create all the necessary encodings
+    all_encodings = [ 'ISO-8859-1', 'ISO-8859-2',
+                      'ISO-8859-3', 'ISO-8859-4',
+                      'ISO-8859-5', 'ISO-8859-6',
+                      'ISO-8859-7', 'ISO-8859-8',
+                      'ISO-8859-9', 'ISO-8859-10',
+                      'ISO-8859-11', 'ISO-8859-13',
+                      'ISO-8859-14', 'ISO-8859-15'
+                    ]
+    all_encodings.each do |enc|
+      'abc'.encode(enc, 'UTF-8')
+    end
+  end
+
+  def test_errors
    # we don't have semantics for conversion without attribute yet
    # maybe 'convert to UTF-8' would be nice :-)
    assert_raise(ArgumentError) { 'abc'.encode }
@ -13,43 +27,63 @@ class TestConvert < Test::Unit::TestCase
    assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') }
    assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') }
    assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') }
-    assert_equal('abc'.force_encoding('utf-8').encode('iso-8859-1'), 'abc')
-    # check that encoding is kept when no conversion is done
-    assert_equal('abc'.force_encoding('Shift_JIS').encode('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
-    assert_equal('abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
-    # assert that encoding is correctly set
-    assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding, "D\u00FCrst".encoding)
-    # check that Encoding can be used as parameter
-    assert_equal("D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')), "D\u00FCrst")
+  end

-    # temporary, fix encoding
-    assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-1'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-2'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-3'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-4'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-9'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-10'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-13'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-14'), "D\u00FCrst")
-    assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-15'), "D\u00FCrst")
-    assert_equal("D\u00FCrst".encode('iso-8859-1'), "D\xFCrst".force_encoding('iso-8859-1'))
-    assert_equal("D\u00FCrst".encode('iso-8859-2'), "D\xFCrst".force_encoding('iso-8859-2'))
-    assert_equal("D\u00FCrst".encode('iso-8859-3'), "D\xFCrst".force_encoding('iso-8859-3'))
-    assert_equal("D\u00FCrst".encode('iso-8859-4'), "D\xFCrst".force_encoding('iso-8859-4'))
-    assert_equal("D\u00FCrst".encode('iso-8859-9'), "D\xFCrst".force_encoding('iso-8859-9'))
-    assert_equal("D\u00FCrst".encode('iso-8859-10'), "D\xFCrst".force_encoding('iso-8859-10'))
-    assert_equal("D\u00FCrst".encode('iso-8859-13'), "D\xFCrst".force_encoding('iso-8859-13'))
-    assert_equal("D\u00FCrst".encode('iso-8859-14'), "D\xFCrst".force_encoding('iso-8859-14'))
-    assert_equal("D\u00FCrst".encode('iso-8859-15'), "D\xFCrst".force_encoding('iso-8859-15'))
-    # test length extension
-    assert_equal(("\xA4"*20).encode('utf-8', 'iso-8859-15'), "\u20AC"*20)
-    assert_equal(("\xA4"*20).encode!('utf-8', 'iso-8859-15'), "\u20AC"*20)
-    
+  def test_arguments
+    assert_equal('abc', 'abc'.force_encoding('utf-8').encode('iso-8859-1'))
+    # check that encoding is kept when no conversion is done
+    assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode('Shift_JIS'))
+    assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'))
+    # assert that encoding is correctly set
+    assert_equal("D\u00FCrst".encoding, "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding)
+    # check that Encoding can be used as parameter
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')))
+  end
+
+  def test_length
+    assert_equal("\u20AC"*20, ("\xA4"*20).encode('utf-8', 'iso-8859-15'))
+    assert_equal("\u20AC"*20, ("\xA4"*20).encode!('utf-8', 'iso-8859-15'))
+    assert_equal("\u20AC"*2000, ("\xA4"*2000).encode('utf-8', 'iso-8859-15'))
+    assert_equal("\u20AC"*2000, ("\xA4"*2000).encode!('utf-8', 'iso-8859-15'))
+    assert_equal("\u20AC"*200000, ("\xA4"*200000).encode('utf-8', 'iso-8859-15'))
+    assert_equal("\u20AC"*200000, ("\xA4"*200000).encode!('utf-8', 'iso-8859-15'))
  end
  
+  def test_encodings
+    # temporary, fix encoding
+    assert_equal("D\u00FCrst", "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-1'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-2'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-3'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-4'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-9'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-10'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-13'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-14'))
+    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', 'iso-8859-15'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-1'), "D\u00FCrst".encode('iso-8859-1'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\u00FCrst".encode('iso-8859-2'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-3').encoding, "D\u00FCrst".encode('iso-8859-3').encoding)
+    assert_equal("D\xFCrst".force_encoding('iso-8859-4'), "D\u00FCrst".encode('iso-8859-4'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-9'), "D\u00FCrst".encode('iso-8859-9'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-10'), "D\u00FCrst".encode('iso-8859-10'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-13'), "D\u00FCrst".encode('iso-8859-13'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-14'), "D\u00FCrst".encode('iso-8859-14'))
+    assert_equal("D\xFCrst".force_encoding('iso-8859-15'), "D\u00FCrst".encode('iso-8859-15'))
+    assert_equal("r\xE9sum\xE9".force_encoding('iso-8859-1'), "r\u00E9sum\u00E9".encode('iso-8859-1'))
+    assert_equal("el\xF5\xEDr\xE1s".force_encoding('iso-8859-2'),
+        "\u0065\u006C\u0151\u00ED\u0072\u00E1\u0073".encode('iso-8859-2'))
+    assert_equal("\xE3\xCA\xC8".force_encoding('iso-8859-6'), "\u0643\u062A\u0628".encode('iso-8859-6'))
+    assert_equal( "\xDF\xD5\xE0\xD5\xD2\xDE\xD4".force_encoding('iso-8859-5'),
+        "\u043F\u0435\u0440\u0435\u0432\u043E\u0434".encode('iso-8859-5'))
+  end
+
+  def test_twostep
+    assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1'))
+  end
+
  def test_all_bytes
    encodings_8859 = [
      'ISO-8859-1', 'ISO-8859-2',
@ -69,7 +103,7 @@ class TestConvert < Test::Unit::TestCase
    test_start.encode('UTF-8','ISO-8859-1').encode('ISO-8859-1')
    encodings_8859.each do |enc|
      test_start = all_bytes
-      assert_equal(test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT'), test_start) 
+      assert_equal(test_start, test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT')) 
    end
  end
 end
--- a/transcode.c
+++ b/transcode.c
@ -81,8 +81,8 @@ register_transcoder(const char *from_e, const char *to_e,
 {
    static int n = 0;
    if (n >= MAX_TRANSCODERS) {
-        /* we are initializing, is it okay to use rb_raise here? */
-        rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
+	/* we are initializing, is it okay to use rb_raise here? */
+	rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
    }
    transcoder_table[n].from_encoding = from_e;
    transcoder_table[n].to_encoding = to_e;
@ -127,25 +127,37 @@ init_transcoder_table(void)
    register_transcoder(NULL, NULL, NULL, 0, 0);
 }

+static int
+encoding_equal(const char* encoding1, const char* encoding2)
+{
+    return 0==strcasecmp(encoding1, encoding2);
+}

 static transcoder*
 transcode_dispatch(const char* from_encoding, const char* to_encoding)
 {
    transcoder *candidate = transcoder_table;
    
-    for (candidate = transcoder_table; candidate->from_encoding; candidate++)
-        if (0==strcasecmp(from_encoding, candidate->from_encoding)
-            && 0==strcasecmp(to_encoding, candidate->to_encoding))
-                break;
-    /* in the future, add multistep transcoding logic here */
-    return candidate->from_encoding ? candidate : NULL;
+    for (candidate = transcoder_table; candidate->from_encoding; candidate++) {
+	if (encoding_equal(from_encoding, candidate->from_encoding)
+	    && encoding_equal(to_encoding, candidate->to_encoding)) {
+		return candidate;
+	}
+    }
+    /* multistep logic, via UTF-8 */
+    if (!encoding_equal(from_encoding, "UTF-8")
+	&& !encoding_equal(to_encoding, "UTF-8")
+	&& transcode_dispatch("UTF-8", to_encoding)) {  /* check that we have a second step */
+	    return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */
+    }
+    return NULL;
 }

 /* dynamic structure, one per conversion (similar to iconv_t) */
 /* may carry conversion state (e.g. for iso-2022-jp) */
 typedef struct transcoding {
    VALUE ruby_string_dest; /* the String used as the conversion destination,
-                               or NULL if something else is being converted */
+			       or NULL if something else is being converted */
    char *(*flush_func)(struct transcoding*, int, int);
 } transcoding;

@ -201,7 +213,7 @@ transcode_loop(char **in_pos, char **out_pos,
 	    }
 	    next_table = next_table->info[next_offset];
 	    goto follow_byte;
-            /* maybe rewrite the following cases to use fallthrough???? */
+	    /* maybe rewrite the following cases to use fallthrough???? */
 	  case ZERObt: /* drop input */
 	    continue;
 	  case ONEbt:
@ -262,6 +274,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
    VALUE from_encval, to_encval;
    transcoder *my_transcoder;
    transcoding my_transcoding;
+    int final_encoding = 0;

    if (argc<1 || argc>2) {
 	rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
@ -275,7 +288,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
 	to_e = rb_enc_name(to_enc);
    }
    if (argc==1) {
-        from_encidx = rb_enc_get_index(str);
+	from_encidx = rb_enc_get_index(str);
 	from_enc = rb_enc_from_index(from_encidx);
 	from_e = rb_enc_name(from_enc);
    }
@ -298,33 +311,44 @@ str_transcode(int argc, VALUE *argv, VALUE str)
    if (strcasecmp(from_e, to_e) == 0) {
 	return Qnil;
    }
-    if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
-	rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
+
+    while (!final_encoding) /* loop for multistep transcoding */
+    {                       /* later, maybe use smaller intermediate strings for very long strings */
+	if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
+	    rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
+	}
+
+	fromp = sp = RSTRING_PTR(str);
+	slen = RSTRING_LEN(str);
+	blen = slen + 30; /* len + margin */
+	dest = rb_str_tmp_new(blen);
+	bp = RSTRING_PTR(dest);
+	my_transcoding.ruby_string_dest = dest;
+	my_transcoding.flush_func = str_transcoding_resize;
+
+	transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
+	if (fromp != sp+slen) {
+	    rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
+	}
+	buf = RSTRING_PTR(dest);
+	*bp = '\0';
+	rb_str_set_len(dest, bp - buf);
+
+	rb_enc_associate(dest, to_enc);
+	
+	if (encoding_equal(my_transcoder->to_encoding, to_e)) {
+	    final_encoding = 1;
+	}
+	else {
+	    from_e = my_transcoder->to_encoding;
+	    str = dest;
+	}
    }
-
-    fromp = sp = RSTRING_PTR(str);
-    slen = RSTRING_LEN(str);
-    blen = slen + 30; /* len + margin */
-    dest = rb_str_tmp_new(blen);
-    bp = RSTRING_PTR(dest);
-    my_transcoding.ruby_string_dest = dest;
-    my_transcoding.flush_func = str_transcoding_resize;
-
-    /* for simple testing: */
-    transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
-    if (fromp != sp+slen) {
-	rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
-    }
-    buf = RSTRING_PTR(dest);
-    *bp = '\0';
-    rb_str_set_len(dest, bp - buf);
-
    /* set encoding */
    if (!to_enc) {
 	to_encidx = rb_enc_replicate(to_e, rb_default_encoding());
 	to_enc = rb_enc_from_index(to_encidx);
    }
-    rb_enc_associate(dest, to_enc);

    return dest;
 }
--- a/transcode_data.h
+++ b/transcode_data.h
@ -22,10 +22,10 @@ typedef struct byte_lookup {
 #define UNDEF   (PType 0x09)   /* legal but undefined */
 #define ZERObt  (PType 0x0A)   /* zero bytes of payload, i.e. remove */

-#define output1(b1)          ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
-#define output2(b1,b2)       ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
-#define output3(b1,b2,b3)    ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
-#define output4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
+#define o1(b1)          ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
+#define o2(b1,b2)       ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
+#define o3(b1,b2,b3)    ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
+#define o4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))

 #define getBT1(a)      (((a)>> 8)&0xFF)
 #define getBT2(a)      (((a)>>16)&0xFF)
--- a/transcode_data_iso_8859.c
+++ b/transcode_data_iso_8859.c