mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* enc/big5.c: split CP950 from Big5.
* enc/big5.c: split CP951 from Big5-HKSCS. * enc/trans/big5.trans: import conversion table of Big5, Big5-HKSCS, CP950, and CP951 from ICU. they need fallback conversions. ref [ruby-core:33256] http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ * tool/transcode-tblgen.rb (import_ucm): add to import ucm files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29869 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
6ade3a4291
commit
60dfa6b655
7 changed files with 67 additions and 32103 deletions
13
ChangeLog
13
ChangeLog
|
@ -1,3 +1,16 @@
|
|||
Mon Nov 22 18:04:40 2010 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* enc/big5.c: split CP950 from Big5.
|
||||
|
||||
* enc/big5.c: split CP951 from Big5-HKSCS.
|
||||
|
||||
* enc/trans/big5.trans: import conversion table of Big5, Big5-HKSCS,
|
||||
CP950, and CP951 from ICU. they need fallback conversions.
|
||||
ref [ruby-core:33256]
|
||||
http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/
|
||||
|
||||
* tool/transcode-tblgen.rb (import_ucm): add to import ucm files.
|
||||
|
||||
Mon Nov 22 18:33:30 2010 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
|
||||
* string.c (rb_str_inspect): append for each chars instead of bulk
|
||||
|
|
16
enc/big5.c
16
enc/big5.c
|
@ -301,7 +301,12 @@ OnigEncodingDefine(big5, BIG5) = {
|
|||
big5_left_adjust_char_head,
|
||||
big5_is_allowed_reverse_match
|
||||
};
|
||||
ENC_ALIAS("CP950", "Big5")
|
||||
|
||||
/*
|
||||
* Name: CP950
|
||||
* Source: http://msdn.microsoft.com/en-us/goglobal/cc305155.aspx
|
||||
*/
|
||||
ENC_REPLICATE("CP950", "Big5")
|
||||
|
||||
/*
|
||||
* Name: Big5-HKSCS
|
||||
|
@ -327,7 +332,14 @@ OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = {
|
|||
big5_left_adjust_char_head,
|
||||
big5_is_allowed_reverse_match
|
||||
};
|
||||
ENC_ALIAS("CP951", "Big5-HKSCS")
|
||||
|
||||
/*
|
||||
* Name: CP951
|
||||
* Source: http://www.microsoft.com/hk/hkscs/default.aspx
|
||||
* Source: http://www.microsoft.com/downloads/en/details.aspx?FamilyID=0e6f5ac8-7baa-4571-b8e8-78b3b776afd7&DisplayLang=en
|
||||
* Source: http://blogs.msdn.com/b/shawnste/archive/2007/03/12/cp-951-hkscs.aspx
|
||||
*/
|
||||
ENC_REPLICATE("CP951", "Big5-HKSCS")
|
||||
|
||||
/*
|
||||
* Name: Big5-UAO [NOT registered by IANA!]
|
||||
|
|
File diff suppressed because it is too large
Load diff
13705
enc/trans/big5-tbl.rb
13705
enc/trans/big5-tbl.rb
File diff suppressed because it is too large
Load diff
|
@ -1,15 +1,23 @@
|
|||
#include "transcode_data.h"
|
||||
|
||||
<%
|
||||
require "big5-tbl"
|
||||
require "big5-hkscs-tbl"
|
||||
require "big5-uao-tbl"
|
||||
|
||||
transcode_tblgen "Big5", "UTF-8", [["{00-7f}", :nomap], *BIG5_TO_UCS_TBL]
|
||||
transcode_tblgen "UTF-8", "Big5", [["{00-7f}", :nomap], *BIG5_TO_UCS_TBL.map {|a,b| [b,a] }]
|
||||
tbls = import_ucm("glibc-BIG5-2.3.3.ucm")
|
||||
transcode_tblgen "Big5", "UTF-8", [["{00-7f}", :nomap]] + tbls[0]
|
||||
transcode_tblgen "UTF-8", "Big5", [["{00-7f}", :nomap]] + tbls[1]
|
||||
|
||||
transcode_tblgen "Big5-HKSCS", "UTF-8", [["{00-7f}", :nomap], *BIG5_HKSCS_TO_UCS_TBL], ValidEncoding('Big5')
|
||||
transcode_tblgen "UTF-8", "Big5-HKSCS", [["{00-7f}", :nomap], *BIG5_HKSCS_TO_UCS_TBL.map {|a,b| [b,a] }]
|
||||
tbls = import_ucm("windows-950-2000.ucm")
|
||||
transcode_tblgen "CP950", "UTF-8", [["{00-7f}", :nomap]] + tbls[0], ValidEncoding('Big5')
|
||||
transcode_tblgen "UTF-8", "CP950", [["{00-7f}", :nomap]] + tbls[1]
|
||||
|
||||
tbls = import_ucm("glibc-BIG5HKSCS-2.3.3.ucm")
|
||||
transcode_tblgen "Big5-HKSCS", "UTF-8", [["{00-7f}", :nomap]] + tbls[0], ValidEncoding('Big5')
|
||||
transcode_tblgen "UTF-8", "Big5-HKSCS", [["{00-7f}", :nomap]] + tbls[1]
|
||||
|
||||
tbls = import_ucm("windows-950_hkscs-2001.ucm")
|
||||
transcode_tblgen "CP951", "UTF-8", [["{00-7f}", :nomap]] + tbls[0], ValidEncoding('Big5')
|
||||
transcode_tblgen "UTF-8", "CP951", [["{00-7f}", :nomap]] + tbls[1]
|
||||
|
||||
transcode_tblgen "Big5-UAO", "UTF-8", [["{00-7f}", :nomap], *BIG5_UAO_TO_UCS_TBL], ValidEncoding('Big5')
|
||||
transcode_tblgen "UTF-8", "Big5-UAO", [["{00-7f}", :nomap], *BIG5_UAO_TO_UCS_TBL.map {|a,b| [b,a] }]
|
||||
|
|
|
@ -51,7 +51,7 @@ class TestTranscode < Test::Unit::TestCase
|
|||
end
|
||||
|
||||
def check_both_ways(utf8, raw, encoding)
|
||||
assert_equal(utf8.force_encoding('utf-8'), raw.encode('utf-8', encoding))
|
||||
assert_equal(utf8.force_encoding('utf-8'), raw.encode('utf-8', encoding),utf8.dump)
|
||||
assert_equal(raw.force_encoding(encoding), utf8.encode(encoding, 'utf-8'))
|
||||
end
|
||||
|
||||
|
@ -1794,9 +1794,9 @@ class TestTranscode < Test::Unit::TestCase
|
|||
check_both_ways("\u77AC", "\xC0\xFE", 'Big5') # 瞬
|
||||
check_both_ways("\u8B96", "\xC6\x40", 'Big5') # 讖
|
||||
check_both_ways("\u7C72", "\xC6\x7E", 'Big5') # 籲
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xC6\xA1".encode("utf-8", 'Big5') }
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xC6\xA1".encode("utf-8", 'Big5') }
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xC7\x40".encode("utf-8", 'Big5') }
|
||||
assert_raise(Encoding::UndefinedConversionError) { "\xC8\x40".encode("utf-8", 'Big5') }
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xC8\x40".encode("utf-8", 'Big5') }
|
||||
check_both_ways("\u4E42", "\xC9\x40", 'Big5') # 乂
|
||||
check_both_ways("\u6C15", "\xC9\x7E", 'Big5') # 氕
|
||||
check_both_ways("\u6C36", "\xC9\xA1", 'Big5') # 氶
|
||||
|
@ -1829,7 +1829,7 @@ class TestTranscode < Test::Unit::TestCase
|
|||
check_both_ways("\u9F0A", "\xF9\x7E", 'Big5') # 鼊
|
||||
check_both_ways("\u9FA4", "\xF9\xA1", 'Big5') # 龤
|
||||
check_both_ways("\u9F98", "\xF9\xD5", 'Big5') # 龘
|
||||
assert_raise(Encoding::UndefinedConversionError) { "\xF9\xD6".encode("utf-8", 'Big5') }
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xF9\xD6".encode("utf-8", 'Big5') }
|
||||
check_both_ways("\u795E\u6797\u7FA9\u535A", "\xAF\xAB\xAA\x4C\xB8\x71\xB3\xD5", 'Big5') # 神林義博
|
||||
end
|
||||
|
||||
|
@ -1896,7 +1896,7 @@ class TestTranscode < Test::Unit::TestCase
|
|||
check_both_ways("\u9F0A", "\xF9\x7E", 'Big5-HKSCS') # 鼊
|
||||
check_both_ways("\u9FA4", "\xF9\xA1", 'Big5-HKSCS') # 龤
|
||||
check_both_ways("\u9F98", "\xF9\xD5", 'Big5-HKSCS') # 龘
|
||||
check_both_ways("\u{23ED7}", "\x8E\x40", 'Big5-HKSCS') # 𣻗
|
||||
#check_both_ways("\u{23ED7}", "\x8E\x40", 'Big5-HKSCS') # 𣻗
|
||||
#assert_raise(Encoding::UndefinedConversionError) { "\xF9\xD6".encode("utf-8", 'Big5-HKSCS') }
|
||||
check_both_ways("\u795E\u6797\u7FA9\u535A", "\xAF\xAB\xAA\x4C\xB8\x71\xB3\xD5", 'Big5-HKSCS') # 神林義博
|
||||
end
|
||||
|
|
|
@ -748,6 +748,27 @@ def citrus_decode_mapsrc(ces, csid, mapsrcs)
|
|||
return table
|
||||
end
|
||||
|
||||
def import_ucm(path)
|
||||
to_ucs = []
|
||||
from_ucs = []
|
||||
File.foreach(File.join($srcdir, "ucm", path)) do |line|
|
||||
uc, bs, fb = nil
|
||||
if /^<U(\h+)>\s*([\+\hx\\]+)\s*\|(\d)/ =~ line
|
||||
uc = $1.hex
|
||||
bs = $2.delete('x\\')
|
||||
fb = $3.to_i
|
||||
next if uc < 128 && uc == bs.hex
|
||||
elsif /^([<U\h>+]+)\s*([\+\hx\\]+)\s*\|(\d)/ =~ line
|
||||
uc = $1.scan(/\h+>/).map(&:hex).pack("U*").unpack("H*")[0]
|
||||
bs = $2.delete('x\\')
|
||||
fb = $3.to_i
|
||||
end
|
||||
to_ucs << [bs, uc] if fb == 0 || fb == 3
|
||||
from_ucs << [uc, bs] if fb == 0 || fb == 1
|
||||
end
|
||||
[to_ucs, from_ucs]
|
||||
end
|
||||
|
||||
def encode_utf8(map)
|
||||
r = []
|
||||
map.each {|k, v|
|
||||
|
|
Loading…
Reference in a new issue