add encoding conversion from/to CESU-8

Add encoding conversion (transcoding) from UTF-8 to CESU-8 and back. CESU-8 is an encoding similar to UTF-8, but it encodes code points above U+FFFF as a pair of surrogates, each of which is in turn encoded as if it were a UTF-8 code point. This preserves the same binary sorting order as in UTF-16. It is also somewhat similar (although not exactly identical) to an encoding used internally by Java. This completes issue #15995. Changes: enc/trans/cesu_8.trans: add encoding conversion from/to CESU-8; test/ruby/test_transcode.rb: add tests for the above.
2022-11-09 12:17:21 -05:00 · 2019-07-14 10:58:50 +09:00 · 2019-07-14 10:58:50 +09:00 · 369ff79394
commit 369ff79394
parent ac2866005b
2 changed files with 107 additions and 0 deletions
--- a/enc/trans/cesu_8.trans
+++ b/enc/trans/cesu_8.trans
@ -0,0 +1,85 @@
 #include "transcode_data.h"
 <%
  # Byte patterns that are spelled identically in UTF-8 and CESU-8
  # (U+0000..U+FFFF without the surrogate range); both directions pass
  # them through untouched via :nomap.
  passthrough_patterns = [
    "{00-7f}",
    "{c2-df}{80-bf}",
    "e0{a0-bf}{80-bf}",
    "{e1-ec}{80-bf}{80-bf}",
    "ed{80-9f}{80-bf}",
    "{ee-ef}{80-bf}{80-bf}",
  ]

  # CESU-8 -> UTF-8: an encoded high surrogate followed by an encoded low
  # surrogate is handed to the func_so callback.
  map = {}
  passthrough_patterns.each { |pattern| map[pattern] = :nomap }
  map["ed{a0-af}{80-bf}ed{b0-bf}{80-bf}"] = :func_so # surrogate pairs
  transcode_generate_node(ActionMap.parse(map), "from_CESU_8")

  # UTF-8 -> CESU-8: every four-byte sequence is handed to the func_so
  # callback.
  map = {}
  passthrough_patterns.each { |pattern| map[pattern] = :nomap }
  [
    "f0{90-bf}{80-bf}{80-bf}",      # planes 1-3
    "{f1-f3}{80-bf}{80-bf}{80-bf}", # planes 4-15
    "f4{80-8f}{80-bf}{80-bf}",      # plane 16
  ].each { |pattern| map[pattern] = :func_so }
  transcode_generate_node(ActionMap.parse(map), "to_CESU_8")
 %>
 <%= transcode_generated_code %>
 /*
  * Convert one CESU-8 surrogate pair (6 bytes at s) into the equivalent
  * 4-byte UTF-8 sequence written to o.
  *
  * Only s[1], s[2], s[4] and s[5] are read; the from_CESU_8 map routes
  * input here only when it matches ed{a0-af}{80-bf}ed{b0-bf}{80-bf}, so the
  * two lead bytes carry no payload.  statep, l and osize are unused.
  *
  * Returns the number of bytes written, which is always 4.
  */
 static ssize_t
 fun_so_from_cesu_8(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
 {
    /* 10 payload bits of the high and of the low surrogate */
    const unsigned int high10 = ((unsigned int)(s[1] & 0x0F) << 6) | (unsigned int)(s[2] & 0x3F);
    const unsigned int low10  = ((unsigned int)(s[4] & 0x0F) << 6) | (unsigned int)(s[5] & 0x3F);
    /* recombine the halves and undo the 0x10000 surrogate offset */
    const unsigned int codepoint = ((high10 << 10) | low10) + 0x10000;

    o[0] = (unsigned char)(0xF0 | (codepoint >> 18));
    o[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
    o[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
    o[3] = (unsigned char)(0x80 | (codepoint & 0x3F));
    return 4;
 }
 /*
  * Convert one 4-byte UTF-8 sequence at s into the equivalent CESU-8
  * surrogate pair (two 3-byte sequences, 6 bytes in total) written to o.
  *
  * The to_CESU_8 map routes only four-byte sequences (lead bytes f0..f4)
  * here.  statep, l and osize are unused.
  *
  * Returns the number of bytes written, which is always 6.
  */
 static ssize_t
 fun_so_to_cesu_8(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
 {
    /* decode the four-byte sequence */
    unsigned int offset = ((unsigned int)(s[0] & 0x07) << 18)
                        | ((unsigned int)(s[1] & 0x3F) << 12)
                        | ((unsigned int)(s[2] & 0x3F) << 6)
                        |  (unsigned int)(s[3] & 0x3F);
    unsigned int high10, low10;

    /* surrogates encode the distance from U+10000 */
    offset -= 0x10000;
    high10 = offset >> 10;
    low10 = offset & 0x3FF;

    /* high surrogate */
    o[0] = 0xED;
    o[1] = (unsigned char)(0xA0 | (high10 >> 6));
    o[2] = (unsigned char)(0x80 | (high10 & 0x3F));
    /* low surrogate */
    o[3] = 0xED;
    o[4] = (unsigned char)(0xB0 | (low10 >> 6));
    o[5] = (unsigned char)(0x80 | (low10 & 0x3F));
    return 6;
 }
 /*
  * Transcoder descriptor for the CESU-8 -> UTF-8 direction.  It uses the
  * generated from_CESU_8 table and fun_so_from_cesu_8 as its only callback.
  */
 static const rb_transcoder
 rb_from_CESU_8 = {
    "CESU-8", "UTF-8", from_CESU_8, /* source encoding, destination encoding, table */
    TRANSCODE_TABLE_INFO,
    1, /* input_unit_length */
    6, /* max_input: a surrogate pair is two 3-byte sequences */
    4, /* max_output: fun_so_from_cesu_8 writes 4 bytes */
    /* NOTE(review): the from_CESU_8 map leaves {00-7f} unmapped, so the
     * source side looks ASCII-compatible; confirm that "decoder" is the
     * intended classification here. */
    asciicompat_decoder, /* asciicompat_type */
    0, NULL, NULL, /* state_size, state_init, state_fini */
    /* presumably the last of these four slots is the func_so callback,
     * matching the :func_so action in the map -- verify against the
     * rb_transcoder declaration */
    NULL, NULL, NULL, fun_so_from_cesu_8
 };
 /*
  * Transcoder descriptor for the UTF-8 -> CESU-8 direction.  It uses the
  * generated to_CESU_8 table and fun_so_to_cesu_8 as its only callback.
  */
 static const rb_transcoder
 rb_to_CESU_8 = {
    "UTF-8", "CESU-8", to_CESU_8, /* source encoding, destination encoding, table */
    TRANSCODE_TABLE_INFO,
    1, /* input_unit_length */
    4, /* max_input: longest sequence matched by the to_CESU_8 map */
    6, /* max_output: fun_so_to_cesu_8 writes 6 bytes */
    /* NOTE(review): the to_CESU_8 map leaves {00-7f} unmapped, so the
     * destination side looks ASCII-compatible; confirm that "encoder" is
     * the intended classification here. */
    asciicompat_encoder, /* asciicompat_type */
    0, NULL, NULL, /* state_size, state_init, state_fini */
    /* presumably the last of these four slots is the func_so callback,
     * matching the :func_so action in the map -- verify against the
     * rb_transcoder declaration */
    NULL, NULL, NULL, fun_so_to_cesu_8
 };
 /*
  * Initialization routine for this transcoder unit, declared through the
  * TRANS_INIT macro.  Registers both conversion directions.
  */
 TRANS_INIT(cesu_8)
 {
    /* CESU-8 -> UTF-8 */
    rb_register_transcoder(&rb_from_CESU_8);
    /* UTF-8 -> CESU-8 */
    rb_register_transcoder(&rb_to_CESU_8);
 }
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@ -2116,6 +2116,28 @@ class TestTranscode < Test::Unit::TestCase
    check_both_ways("D\u00FCrst", "\xC4\xDC\x99\xA2\xA3", 'IBM037') # Dürst
  end
  def test_CESU_8
    cesu8 = 'CESU-8'

    # ASCII stays single-byte and is spelled the same in both encodings
    check_both_ways("aijrszAIJRSZ09", "aijrszAIJRSZ09", cesu8)

    # NUL gets its own check: CESU-8 and Java's modified UTF-8 strings
    # differ on this character
    check_both_ways("\0", "\0", cesu8)

    # two-byte sequences, identical in UTF-8 and CESU-8:
    # U+0080 U+00FC U+00FF U+0100 U+0400 U+0700 U+07FF
    bmp_two_byte = "\xC2\x80\x20\xC3\xBC\x20\xC3\xBF\x20\xC4\x80\x20\xD0\x80\x20\xDC\x80\x20\xDF\xBF"
    check_both_ways(bmp_two_byte, bmp_two_byte, cesu8)

    # three-byte sequences, identical in UTF-8 and CESU-8:
    # U+0800 U+2200 U+4E00 U+D7FF U+E000 U+FFFF
    bmp_three_byte = "\xE0\xA0\x80\x20\xE2\x88\x80\x20\xE4\xB8\x80\x20\xED\x9F\xBF\x20\xEE\x80\x80\x20\xEF\xBF\xBF"
    check_both_ways(bmp_three_byte, bmp_three_byte, cesu8)

    # code points beyond the BMP: four bytes in UTF-8, an encoded
    # surrogate pair (2 x 3 bytes) in CESU-8
    # U+10000 U+20000 U+50000 U+10FFFF
    supplementary_utf8 = "\xF0\x90\x80\x80 \xF0\xA0\x80\x80 \xF1\x90\x80\x80 \xF4\x8F\xBF\xBF"
    supplementary_cesu8 = "\xED\xA0\x80\xED\xB0\x80 \xED\xA1\x80\xED\xB0\x80 \xED\xA4\x80\xED\xB0\x80 \xED\xAF\xBF\xED\xBF\xBF"
    check_both_ways(supplementary_utf8, supplementary_cesu8, cesu8)
  end
  def test_nothing_changed
    a = "James".force_encoding("US-ASCII")
    b = a.encode("Shift_JIS")