1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* tool/build-transcode: new file.

* tool/transcode-tblgen.rb: new file.

* enc/trans/make_transdb.rb: exclude *.erb.c.

* enc/depend: exclude *.erb.c.

* enc/trans/utf_16_32.erb.c: new file.

* enc/trans/single_byte.erb.c: new file.

* enc/trans/japanese.erb.c: new file.

* enc/trans/korean.erb.c: new file.

* enc/trans/iso-8859-2-tbl.rb: new file.

* enc/trans/iso-8859-3-tbl.rb: new file.

* enc/trans/iso-8859-4-tbl.rb: new file.

* enc/trans/iso-8859-5-tbl.rb: new file.

* enc/trans/iso-8859-6-tbl.rb: new file.

* enc/trans/iso-8859-7-tbl.rb: new file.

* enc/trans/iso-8859-8-tbl.rb: new file.

* enc/trans/iso-8859-9-tbl.rb: new file.

* enc/trans/iso-8859-10-tbl.rb: new file.

* enc/trans/iso-8859-11-tbl.rb: new file.

* enc/trans/iso-8859-13-tbl.rb: new file.

* enc/trans/iso-8859-14-tbl.rb: new file.

* enc/trans/iso-8859-15-tbl.rb: new file.

* enc/trans/eucjp-tbl.rb: new file.

* enc/trans/sjis-tbl.rb: new file.

* enc/trans/euckr-tbl.rb: new file.

* enc/trans/utf_16_32.c: regenerated.

* enc/trans/single_byte.c: regenerated.

* enc/trans/japanese.c: regenerated.

* enc/trans/korean.c: regenerated.

[ruby-dev:35730]


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18373 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-08-05 12:32:13 +00:00
parent 81577c26ee
commit f694ec83e8
30 changed files with 63539 additions and 33174 deletions

View file

@ -1,3 +1,63 @@
Tue Aug 5 20:46:20 2008 Tanaka Akira <akr@fsij.org>
* tool/build-transcode: new file.
* tool/transcode-tblgen.rb: new file.
* enc/trans/make_transdb.rb: exclude *.erb.c.
* enc/depend: exclude *.erb.c.
* enc/trans/utf_16_32.erb.c: new file.
* enc/trans/single_byte.erb.c: new file.
* enc/trans/japanese.erb.c: new file.
* enc/trans/korean.erb.c: new file.
* enc/trans/iso-8859-2-tbl.rb: new file.
* enc/trans/iso-8859-3-tbl.rb: new file.
* enc/trans/iso-8859-4-tbl.rb: new file.
* enc/trans/iso-8859-5-tbl.rb: new file.
* enc/trans/iso-8859-6-tbl.rb: new file.
* enc/trans/iso-8859-7-tbl.rb: new file.
* enc/trans/iso-8859-8-tbl.rb: new file.
* enc/trans/iso-8859-9-tbl.rb: new file.
* enc/trans/iso-8859-10-tbl.rb: new file.
* enc/trans/iso-8859-11-tbl.rb: new file.
* enc/trans/iso-8859-13-tbl.rb: new file.
* enc/trans/iso-8859-14-tbl.rb: new file.
* enc/trans/iso-8859-15-tbl.rb: new file.
* enc/trans/eucjp-tbl.rb: new file.
* enc/trans/sjis-tbl.rb: new file.
* enc/trans/euckr-tbl.rb: new file.
* enc/trans/utf_16_32.c: regenerated.
* enc/trans/single_byte.c: regenerated.
* enc/trans/japanese.c: regenerated.
* enc/trans/korean.c: regenerated.
[ruby-dev:35730]
Tue Aug 5 18:02:53 2008 Kazuhiro NISHIYAMA <zn@mbf.nifty.com>
* test/io/nonblock/test_flush.rb (TestIONonblock#test_flush):

View file

@ -3,7 +3,7 @@
% encs.each {|e| e.chomp!(".c")}
% alphanumeric_order = proc {|e| e.scan(/(\d+)|(\D+)/).map {|n,a| a||[n.size,n.to_i]}.flatten}
% encs = encs.sort_by(&alphanumeric_order)
% trans = Dir.open($srcdir+"/trans") {|d| d.select {|e| e.chomp!('.c')}}
% trans = Dir.open($srcdir+"/trans") {|d| d.select {|e| e.chomp!('.c') && /\.erb\z/ !~ e }}
% trans = trans.sort_by(&alphanumeric_order)
% trans.map! {|e| "trans/#{e}"}
% dependencies = encs + trans

8831
enc/trans/cp949-tbl.rb Normal file

File diff suppressed because it is too large Load diff

14803
enc/trans/eucjp-tbl.rb Normal file

File diff suppressed because it is too large Load diff

8228
enc/trans/euckr-tbl.rb Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,98 @@
# ISO-8859-10 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).
ISO_8859_10_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,  0x104, 0x112, 0x122, 0x12A, 0x128, 0x136, 0xA7,  0x13B, 0x110, 0x160, 0x166, 0x17D, 0xAD,   0x16A, 0x14A,
  0xB0,  0x105, 0x113, 0x123, 0x12B, 0x129, 0x137, 0xB7,  0x13C, 0x111, 0x161, 0x167, 0x17E, 0x2015, 0x16B, 0x14B,
  0x100, 0xC1,  0xC2,  0xC3,  0xC4,  0xC5,  0xC6,  0x12E, 0x10C, 0xC9,  0x118, 0xCB,  0x116, 0xCD,   0xCE,  0xCF,
  0xD0,  0x145, 0x14C, 0xD3,  0xD4,  0xD5,  0xD6,  0x168, 0xD8,  0x172, 0xDA,  0xDB,  0xDC,  0xDD,   0xDE,  0xDF,
  0x101, 0xE1,  0xE2,  0xE3,  0xE4,  0xE5,  0xE6,  0x12F, 0x10D, 0xE9,  0x119, 0xEB,  0x117, 0xED,   0xEE,  0xEF,
  0xF0,  0x146, 0x14D, 0xF3,  0xF4,  0xF5,  0xF6,  0x169, 0xF8,  0x173, 0xFA,  0xFB,  0xFC,  0xFD,   0xFE,  0x138
])

View file

@ -0,0 +1,90 @@
# ISO-8859-11 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# A0 maps to itself; bytes A1..DA and DF..FB map to the byte value shifted
# into the U+0E00 block (byte - 0xA0 + 0xE00).  DB..DE and FC..FF have no
# entry.
ISO_8859_11_TO_UCS_TBL = [["A0", 0xA0]] +
  ((0xA1..0xDA).to_a + (0xDF..0xFB).to_a).map do |byte|
    ["%02X" % byte, byte - 0xA0 + 0xE00]
  end

View file

@ -0,0 +1,98 @@
# ISO-8859-13 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).
ISO_8859_13_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,  0x201D, 0xA2,  0xA3,  0xA4,   0x201E, 0xA6,  0xA7,  0xD8,  0xA9,  0x156, 0xAB,  0xAC,  0xAD,  0xAE,  0xC6,
  0xB0,  0xB1,   0xB2,  0xB3,  0x201C, 0xB5,   0xB6,  0xB7,  0xF8,  0xB9,  0x157, 0xBB,  0xBC,  0xBD,  0xBE,  0xE6,
  0x104, 0x12E,  0x100, 0x106, 0xC4,   0xC5,   0x118, 0x112, 0x10C, 0xC9,  0x179, 0x116, 0x122, 0x136, 0x12A, 0x13B,
  0x160, 0x143,  0x145, 0xD3,  0x14C,  0xD5,   0xD6,  0xD7,  0x172, 0x141, 0x15A, 0x16A, 0xDC,  0x17B, 0x17D, 0xDF,
  0x105, 0x12F,  0x101, 0x107, 0xE4,   0xE5,   0x119, 0x113, 0x10D, 0xE9,  0x17A, 0x117, 0x123, 0x137, 0x12B, 0x13C,
  0x161, 0x144,  0x146, 0xF3,  0x14D,  0xF5,   0xF6,  0xF7,  0x173, 0x142, 0x15B, 0x16B, 0xFC,  0x17C, 0x17E, 0x2019
])

View file

@ -0,0 +1,98 @@
# ISO-8859-14 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).
ISO_8859_14_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,   0x1E02, 0x1E03, 0xA3,  0x10A,  0x10B,  0x1E0A, 0xA7,   0x1E80, 0xA9,   0x1E82, 0x1E0B, 0x1EF2, 0xAD,   0xAE,   0x178,
  0x1E1E, 0x1E1F, 0x120,  0x121, 0x1E40, 0x1E41, 0xB6,   0x1E56, 0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
  0xC0,   0xC1,   0xC2,   0xC3,  0xC4,   0xC5,   0xC6,   0xC7,   0xC8,   0xC9,   0xCA,   0xCB,   0xCC,   0xCD,   0xCE,   0xCF,
  0x174,  0xD1,   0xD2,   0xD3,  0xD4,   0xD5,   0xD6,   0x1E6A, 0xD8,   0xD9,   0xDA,   0xDB,   0xDC,   0xDD,   0x176,  0xDF,
  0xE0,   0xE1,   0xE2,   0xE3,  0xE4,   0xE5,   0xE6,   0xE7,   0xE8,   0xE9,   0xEA,   0xEB,   0xEC,   0xED,   0xEE,   0xEF,
  0x175,  0xF1,   0xF2,   0xF3,  0xF4,   0xF5,   0xF6,   0x1E6B, 0xF8,   0xF9,   0xFA,   0xFB,   0xFC,   0xFD,   0x177,  0xFF
])

View file

@ -0,0 +1,98 @@
# ISO-8859-15 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# Every byte A0..FF maps to the code point of the same value except the
# eight positions listed in the `case` below.
ISO_8859_15_TO_UCS_TBL = (0xA0..0xFF).map do |byte|
  ucs = case byte
        when 0xA4 then 0x20AC
        when 0xA6 then 0x160
        when 0xA8 then 0x161
        when 0xB4 then 0x17D
        when 0xB8 then 0x17E
        when 0xBC then 0x152
        when 0xBD then 0x153
        when 0xBE then 0x178
        else byte
        end
  ["%02X" % byte, ucs]
end

View file

@ -0,0 +1,98 @@
# ISO-8859-2 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).
ISO_8859_2_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,  0x104, 0x2D8, 0x141, 0xA4, 0x13D, 0x15A, 0xA7,  0xA8,  0x160, 0x15E, 0x164, 0x179, 0xAD,  0x17D, 0x17B,
  0xB0,  0x105, 0x2DB, 0x142, 0xB4, 0x13E, 0x15B, 0x2C7, 0xB8,  0x161, 0x15F, 0x165, 0x17A, 0x2DD, 0x17E, 0x17C,
  0x154, 0xC1,  0xC2,  0x102, 0xC4, 0x139, 0x106, 0xC7,  0x10C, 0xC9,  0x118, 0xCB,  0x11A, 0xCD,  0xCE,  0x10E,
  0x110, 0x143, 0x147, 0xD3,  0xD4, 0x150, 0xD6,  0xD7,  0x158, 0x16E, 0xDA,  0x170, 0xDC,  0xDD,  0x162, 0xDF,
  0x155, 0xE1,  0xE2,  0x103, 0xE4, 0x13A, 0x107, 0xE7,  0x10D, 0xE9,  0x119, 0xEB,  0x11B, 0xED,  0xEE,  0x10F,
  0x111, 0x144, 0x148, 0xF3,  0xF4, 0x151, 0xF6,  0xF7,  0x159, 0x16F, 0xFA,  0x171, 0xFC,  0xFD,  0x163, 0x2D9
])

View file

@ -0,0 +1,91 @@
# ISO-8859-3 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).  A nil cell marks a byte that has no
# entry in the table (A5, AE, BE, C3, D0, E3, F0); those cells are
# filtered out so the resulting array contains only mapped bytes.
ISO_8859_3_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0, 0x126, 0x2D8, 0xA3, 0xA4, nil,   0x124, 0xA7, 0xA8,  0x130, 0x15E, 0x11E, 0x134, 0xAD,  nil,   0x17B,
  0xB0, 0x127, 0xB2,  0xB3, 0xB4, 0xB5,  0x125, 0xB7, 0xB8,  0x131, 0x15F, 0x11F, 0x135, 0xBD,  nil,   0x17C,
  0xC0, 0xC1,  0xC2,  nil,  0xC4, 0x10A, 0x108, 0xC7, 0xC8,  0xC9,  0xCA,  0xCB,  0xCC,  0xCD,  0xCE,  0xCF,
  nil,  0xD1,  0xD2,  0xD3, 0xD4, 0x120, 0xD6,  0xD7, 0x11C, 0xD9,  0xDA,  0xDB,  0xDC,  0x16C, 0x15C, 0xDF,
  0xE0, 0xE1,  0xE2,  nil,  0xE4, 0x10B, 0x109, 0xE7, 0xE8,  0xE9,  0xEA,  0xEB,  0xEC,  0xED,  0xEE,  0xEF,
  nil,  0xF1,  0xF2,  0xF3, 0xF4, 0x121, 0xF6,  0xF7, 0x11D, 0xF9,  0xFA,  0xFB,  0xFC,  0x16D, 0x15D, 0x2D9
]).reject { |_key, ucs| ucs.nil? }

View file

@ -0,0 +1,98 @@
# ISO-8859-4 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).
ISO_8859_4_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,  0x104, 0x138, 0x156, 0xA4, 0x128, 0x13B, 0xA7,  0xA8,  0x160, 0x112, 0x122, 0x166, 0xAD,  0x17D, 0xAF,
  0xB0,  0x105, 0x2DB, 0x157, 0xB4, 0x129, 0x13C, 0x2C7, 0xB8,  0x161, 0x113, 0x123, 0x167, 0x14A, 0x17E, 0x14B,
  0x100, 0xC1,  0xC2,  0xC3,  0xC4, 0xC5,  0xC6,  0x12E, 0x10C, 0xC9,  0x118, 0xCB,  0x116, 0xCD,  0xCE,  0x12A,
  0x110, 0x145, 0x14C, 0x136, 0xD4, 0xD5,  0xD6,  0xD7,  0xD8,  0x172, 0xDA,  0xDB,  0xDC,  0x168, 0x16A, 0xDF,
  0x101, 0xE1,  0xE2,  0xE3,  0xE4, 0xE5,  0xE6,  0x12F, 0x10D, 0xE9,  0x119, 0xEB,  0x117, 0xED,  0xEE,  0x12B,
  0x111, 0x146, 0x14D, 0x137, 0xF4, 0xF5,  0xF6,  0xF7,  0xF8,  0x173, 0xFA,  0xFB,  0xFC,  0x169, 0x16B, 0x2D9
])

View file

@ -0,0 +1,98 @@
# ISO-8859-5 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# Most bytes map to byte + 0x360 (A1 -> U+0401 ... FF -> U+045F); the four
# exceptions are spelled out in the `case` below.
ISO_8859_5_TO_UCS_TBL = (0xA0..0xFF).map do |byte|
  ucs = case byte
        when 0xA0, 0xAD then byte
        when 0xF0 then 0x2116
        when 0xFD then 0xA7
        else byte + 0x360
        end
  ["%02X" % byte, ucs]
end

View file

@ -0,0 +1,53 @@
# ISO-8859-6 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# Only the bytes listed here have an entry.  A0, A4 and AD map to the code
# point of the same value; every other listed byte maps to byte + 0x560
# (AC -> U+060C, C1 -> U+0621, ... F2 -> U+0652).
ISO_8859_6_TO_UCS_TBL =
  ([0xA0, 0xA4, 0xAC, 0xAD, 0xBB, 0xBF] + (0xC1..0xDA).to_a + (0xE0..0xF2).to_a).map do |byte|
    same_value = [0xA0, 0xA4, 0xAD].include?(byte)
    ["%02X" % byte, same_value ? byte : byte + 0x560]
  end

View file

@ -0,0 +1,95 @@
# ISO-8859-7 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# The value grid below is laid out as one row per high nibble (A..F),
# one column per low nibble (0..F).  A nil cell marks a byte that has no
# entry in the table (AE, D2, FF); those cells are filtered out so the
# resulting array contains only mapped bytes.
ISO_8859_7_TO_UCS_TBL = (0xA0..0xFF).map { |byte| "%02X" % byte }.zip([
  0xA0,  0x2018, 0x2019, 0xA3,  0x20AC, 0x20AF, 0xA6,  0xA7,  0xA8,  0xA9,  0x37A, 0xAB,  0xAC,  0xAD,  nil,   0x2015,
  0xB0,  0xB1,   0xB2,   0xB3,  0x384,  0x385,  0x386, 0xB7,  0x388, 0x389, 0x38A, 0xBB,  0x38C, 0xBD,  0x38E, 0x38F,
  0x390, 0x391,  0x392,  0x393, 0x394,  0x395,  0x396, 0x397, 0x398, 0x399, 0x39A, 0x39B, 0x39C, 0x39D, 0x39E, 0x39F,
  0x3A0, 0x3A1,  nil,    0x3A3, 0x3A4,  0x3A5,  0x3A6, 0x3A7, 0x3A8, 0x3A9, 0x3AA, 0x3AB, 0x3AC, 0x3AD, 0x3AE, 0x3AF,
  0x3B0, 0x3B1,  0x3B2,  0x3B3, 0x3B4,  0x3B5,  0x3B6, 0x3B7, 0x3B8, 0x3B9, 0x3BA, 0x3BB, 0x3BC, 0x3BD, 0x3BE, 0x3BF,
  0x3C0, 0x3C1,  0x3C2,  0x3C3, 0x3C4,  0x3C5,  0x3C6, 0x3C7, 0x3C8, 0x3C9, 0x3CA, 0x3CB, 0x3CC, 0x3CD, 0x3CE, nil
]).reject { |_key, ucs| ucs.nil? }

View file

@ -0,0 +1,62 @@
# ISO-8859-8 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# Only the bytes listed here have an entry (A1, BF..DE, FB, FC and FF are
# absent).  Bytes not named in the `case` map to the code point of the
# same value; E0..FA map to byte + 0x4F0 (U+05D0 .. U+05EA).
ISO_8859_8_TO_UCS_TBL =
  ([0xA0] + (0xA2..0xBE).to_a + [0xDF] + (0xE0..0xFA).to_a + [0xFD, 0xFE]).map do |byte|
    ucs = case byte
          when 0xAA then 0xD7
          when 0xBA then 0xF7
          when 0xDF then 0x2017
          when 0xE0..0xFA then byte + 0x4F0
          when 0xFD then 0x200E
          when 0xFE then 0x200F
          else byte
          end
    ["%02X" % byte, ucs]
  end

View file

@ -0,0 +1,98 @@
# ISO-8859-9 -> Unicode mapping for the upper half of the byte range.
# Each entry is [byte as two uppercase hex digits, Unicode code point].
# Every byte A0..FF maps to the code point of the same value except the
# six positions listed in the `case` below.
ISO_8859_9_TO_UCS_TBL = (0xA0..0xFF).map do |byte|
  ucs = case byte
        when 0xD0 then 0x11E
        when 0xDD then 0x130
        when 0xDE then 0x15E
        when 0xF0 then 0x11F
        when 0xFD then 0x131
        when 0xFE then 0x15F
        else byte
        end
  ["%02X" % byte, ucs]
end

File diff suppressed because it is too large Load diff

251
enc/trans/japanese.erb.c Normal file
View file

@ -0,0 +1,251 @@
#include "transcode_data.h"
/*
 * ERB preamble: load the Ruby mapping tables (sjis-tbl, eucjp-tbl) that the
 * transcode_tblgen calls below splat into their mapping lists.
 */
<%
require 'sjis-tbl'
require 'eucjp-tbl'
%>
/*
 * One transcode_tblgen call per (source, destination) encoding pair.
 * Shift_JIS and Windows-31J share the SJIS tables; EUC-JP and CP51932 share
 * the EUCJP tables.  Every mapping list starts with the byte range {00-7f}
 * tagged :nomap -- presumably "pass through unchanged"; confirm against the
 * generator (transcode_tblgen is not defined in this file).
 */
<%= transcode_tblgen "Shift_JIS", "UTF-8", [["{00-7f}", :nomap], *SJIS_TO_UCS_TBL] %>
<%= transcode_tblgen "Windows-31J", "UTF-8", [["{00-7f}", :nomap], *SJIS_TO_UCS_TBL] %>
<%= transcode_tblgen "UTF-8", "Shift_JIS", [["{00-7f}", :nomap], *UCS_TO_SJIS_TBL] %>
<%= transcode_tblgen "UTF-8", "Windows-31J", [["{00-7f}", :nomap], *UCS_TO_SJIS_TBL] %>
<%= transcode_tblgen "EUC-JP", "UTF-8", [["{00-7f}", :nomap], *EUCJP_TO_UCS_TBL] %>
<%= transcode_tblgen "CP51932", "UTF-8", [["{00-7f}", :nomap], *EUCJP_TO_UCS_TBL] %>
<%= transcode_tblgen "UTF-8", "EUC-JP", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>
<%= transcode_tblgen "UTF-8", "CP51932", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>
/*
 * Pack an escape-sequence prefix and one further byte into a single int:
 * the prefix is shifted into the upper bits and the byte occupies the low
 * 8 bits.  Both parameters are parenthesized so that expression arguments
 * (e.g. a conditional or a dereference) expand with the intended
 * precedence.
 */
#define ISO_2022_ENCODING(escseq, byte) (((escseq)<<8)|(byte))

/*
 * Escape-sequence prefixes.  Single-character members are the byte itself;
 * the *DM* members and DOCS are two bytes packed with ISO_2022_ENCODING.
 */
enum ISO_2022_ESCSEQ {
    ISO_2022_CZD   = '!',
    ISO_2022_C1D   = '"',
    ISO_2022_GZD4  = '(',
    ISO_2022_G1D4  = ')',
    ISO_2022_G2D4  = '*',
    ISO_2022_G3D4  = '+',
    ISO_2022_G1D6  = '-',
    ISO_2022_G2D6  = '.',
    ISO_2022_G3D6  = '/',
    ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('),
    ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'),
    ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'),
    ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'),
    ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'),
    ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'),
    ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'),
    ISO_2022_DOCS  = ISO_2022_ENCODING('%','I'),
    ISO_2022_IRR   = '&'
};

/*
 * Mode values: a prefix from the enum above combined with the final byte of
 * the escape sequence.  GZD4-based modes fit in 16 bits; GZDM4-based modes
 * need 24 bits.
 */
#define ISO_2022_GZ_ASCII               ISO_2022_ENCODING(ISO_2022_GZD4, 'B')
#define ISO_2022_GZ_JIS_X_0201_Katakana ISO_2022_ENCODING(ISO_2022_GZD4, 'I')
#define ISO_2022_GZ_JIS_X_0201_Roman    ISO_2022_ENCODING(ISO_2022_GZD4, 'J')
#define ISO_2022_GZ_JIS_C_6226_1978     ISO_2022_ENCODING(ISO_2022_GZDM4,'@')
#define ISO_2022_GZ_JIS_X_0208_1983     ISO_2022_ENCODING(ISO_2022_GZDM4,'B')
#define ISO_2022_GZ_JIS_X_0212_1990     ISO_2022_ENCODING(ISO_2022_GZDM4,'D')
#define ISO_2022_GZ_JIS_X_0213_2000_1   ISO_2022_ENCODING(ISO_2022_GZDM4,'O')
#define ISO_2022_GZ_JIS_X_0213_2000_2   ISO_2022_ENCODING(ISO_2022_GZDM4,'P')
#define ISO_2022_GZ_JIS_X_0213_2004_1   ISO_2022_ENCODING(ISO_2022_GZDM4,'Q')

/* Exception class raised for escape sequences / modes this file rejects. */
#define UNSUPPORTED_MODE TRANSCODE_ERROR
/*
 * Decode the bytes that follow an ESC byte into a mode value.
 *
 * *in_pos points just past the ESC on entry and just past the recognised
 * escape sequence on return.  Recognised forms:
 *   ESC ( B|I|J          -> ISO_2022_GZD4 combined with the final byte
 *   ESC $ @|A|B          -> ISO_2022_GZDM4 combined with the final byte
 *   ESC $ ( D|O|P|Q      -> ISO_2022_GZDM4 combined with the final byte
 * Anything else raises UNSUPPORTED_MODE via rb_raise.
 *
 * NOTE(review): the function receives no end-of-input pointer, so it reads
 * the sequence bytes without a bounds check -- callers must guarantee the
 * sequence is complete.
 */
static int
get_iso_2022_mode(const unsigned char **in_pos)
{
    const unsigned char *cursor = *in_pos;
    unsigned char intro = *cursor++;
    unsigned char final_byte;
    int mode;

    if (intro == '(') {
        final_byte = *cursor++;
        if (final_byte == 'B' || final_byte == 'I' || final_byte == 'J') {
            mode = ISO_2022_ENCODING(ISO_2022_GZD4, final_byte);
        }
        else {
            rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( %c)", final_byte);
        }
    }
    else if (intro == '$') {
        final_byte = *cursor++;
        if (final_byte == '@' || final_byte == 'A' || final_byte == 'B') {
            mode = ISO_2022_ENCODING(ISO_2022_GZDM4, final_byte);
        }
        else if (final_byte == '(') {
            /* three-byte form: the real final byte follows the '(' */
            final_byte = *cursor++;
            if (final_byte == 'D' || final_byte == 'O' ||
                final_byte == 'P' || final_byte == 'Q') {
                mode = ISO_2022_ENCODING(ISO_2022_GZDM4, final_byte);
            }
            else {
                rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ ( %c)", final_byte);
            }
        }
        else {
            rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ %c)", final_byte);
        }
    }
    else {
        rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC %c)", intro);
    }

    *in_pos = cursor;
    return mode;
}
/*
 * Preprocessor for the ISO-2022-JP -> UTF-8 transcoder.
 *
 * Walks the input from *in_pos to in_stop, tracking the current mode
 * (starting in ISO_2022_GZ_ASCII and updated by every ESC sequence), and
 * rewrites each character into an 8-bit form: plain bytes for ASCII /
 * JIS X 0201 Roman, 0x8E + byte|0x80 for JIS X 0201 Katakana, two bytes
 * with the high bit set for JIS C 6226 / JIS X 0208, and the same preceded
 * by 0x8F for JIS X 0212.
 *
 * The output buffer is grown through my_transcoding->flush_func whenever
 * fewer than max_output bytes remain.  On return *in_pos and *out_pos are
 * advanced past the consumed input and produced output.
 *
 * Raises UNSUPPORTED_MODE for bytes 0x1E / 0x1F and TRANSCODE_ERROR for any
 * byte >= 0x80 or for a two-byte character whose second byte is missing.
 */
static void
from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos, unsigned char **out_pos,
                                         const unsigned char *in_stop, unsigned char *out_stop,
                                         rb_transcoding *my_transcoding)
{
    const rb_transcoder *my_transcoder = my_transcoding->transcoder;
    const unsigned char *in_p = *in_pos;
    unsigned char *out_p = *out_pos;
    int cur_mode = ISO_2022_GZ_ASCII;
    unsigned char c1;
    unsigned char *out_s = out_stop - my_transcoder->max_output + 1;

    while (in_p < in_stop) {
        if (out_p >= out_s) {
            /* not enough headroom for one more character: grow the buffer */
            int len = (out_p - *out_pos);
            int new_len = (len + my_transcoder->max_output) * 2;
            *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
            out_p = *out_pos + len;
            out_s = *out_pos + new_len - my_transcoder->max_output;
        }
        c1 = *in_p++;
        if (c1 == 0x1B) {
            /* NOTE(review): get_iso_2022_mode cannot see in_stop, so an
             * escape sequence truncated at the end of input is still read
             * without a bounds check. */
            cur_mode = get_iso_2022_mode(&in_p);
        }
        else if (c1 == 0x1E || c1 == 0x1F) {
            /* SHIFT */
            /* NOTE(review): SO/SI are 0x0E/0x0F in ISO 2022; 0x1E/0x1F are
             * RS/US.  Confirm which bytes were meant here. */
            rb_raise(UNSUPPORTED_MODE, "shift is not supported");
        }
        else if (c1 >= 0x80) {
            rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
        }
        else {
            switch (cur_mode) {
              case ISO_2022_GZ_ASCII:
              case ISO_2022_GZ_JIS_X_0201_Roman:
                *out_p++ = c1;
                break;
              case ISO_2022_GZ_JIS_X_0201_Katakana:
                *out_p++ = 0x8E;
                *out_p++ = c1 | 0x80;
                break;
              case ISO_2022_GZ_JIS_X_0212_1990:
              case ISO_2022_GZ_JIS_C_6226_1978:
              case ISO_2022_GZ_JIS_X_0208_1983:
                /* Two-byte character: make sure the second byte exists
                 * before reading it, instead of running past in_stop. */
                if (in_p >= in_stop) {
                    rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
                }
                if (cur_mode == ISO_2022_GZ_JIS_X_0212_1990) {
                    *out_p++ = 0x8F;
                }
                *out_p++ = c1 | 0x80;
                *out_p++ = *in_p++ | 0x80;
                break;
              /* NOTE(review): other modes accepted by get_iso_2022_mode
               * (e.g. the JIS X 0213 planes) have no case here, so their
               * bytes are consumed without producing output. */
            }
        }
    }
    /* cleanup */
    *in_pos = in_p;
    *out_pos = out_p;
}
/*
 * Write the escape sequence that selects new_mode at *out_pos and advance
 * *out_pos past it.  Returns new_mode so the caller can store it as the
 * current mode.
 *
 * Sequences produced:
 *   GZD4 modes   -> ESC ( F
 *   GZDM4 modes  -> ESC $ F        when F is '@', 'A' or 'B'
 *                   ESC $ ( F      otherwise
 * Any other prefix raises UNSUPPORTED_MODE (the ESC byte has already been
 * stored in the buffer at that point, but *out_pos is not advanced).
 */
static int
select_iso_2022_mode(unsigned char **out_pos, int new_mode)
{
    unsigned char *dst = *out_pos;
    const int prefix = new_mode >> 8;
    const int final_byte = new_mode & 0x7F;

    *dst++ = '\x1b';
    if (prefix == ISO_2022_GZD4) {
        *dst++ = prefix;
        *dst++ = final_byte;
    }
    else if (prefix == ISO_2022_GZDM4) {
        *dst++ = new_mode >> 16;
        if (final_byte != '@' && final_byte != 'A' && final_byte != 'B') {
            /* long form: the second prefix byte is emitted as well */
            *dst++ = prefix & 0x7F;
        }
        *dst++ = final_byte;
    }
    else {
        rb_raise(UNSUPPORTED_MODE, "this mode is not supported.");
    }
    *out_pos = dst;
    return new_mode;
}
/*
 * Postprocessor for the UTF-8 -> ISO-2022-JP transcoder.
 *
 * Walks the 8-bit input from *in_pos to in_stop, derives the mode each
 * character needs from its lead byte, emits an escape sequence (via
 * select_iso_2022_mode) whenever that mode differs from the current one,
 * and writes the character with the high bit stripped.  After the loop the
 * mode is switched back to ISO_2022_GZ_ASCII if it is not already there.
 * On return *in_pos and *out_pos are advanced past the consumed input and
 * produced output.
 *
 * NOTE(review): continuation bytes are read without checking in_stop;
 * presumably the input is always well-formed output of the preceding table
 * lookup -- confirm against the caller.
 */
static void
to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos, unsigned char **out_pos,
const unsigned char *in_stop, unsigned char *out_stop,
rb_transcoding *my_transcoding)
{
const rb_transcoder *my_transcoder = my_transcoding->transcoder;
const unsigned char *in_p = *in_pos;
unsigned char *out_p = *out_pos;
int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0;
unsigned char next_byte;
/* out_s: position at which fewer than max_output bytes remain before out_stop */
unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
while (in_p < in_stop) {
if (out_p >= out_s) {
/* grow the output buffer through flush_func and re-derive the pointers */
int len = (out_p - *out_pos);
int new_len = (len + my_transcoder->max_output) * 2;
*out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
out_p = *out_pos + len;
out_s = *out_pos + new_len - my_transcoder->max_output;
}
/* classify the character by its lead byte */
next_byte = *in_p++;
if (next_byte < 0x80) {
new_mode = ISO_2022_GZ_ASCII;
}
else if (next_byte == 0x8E) {
/* 0x8E prefix: drop it, the following byte is the Katakana byte */
new_mode = ISO_2022_GZ_JIS_X_0201_Katakana;
next_byte = *in_p++;
}
else if (next_byte == 0x8F) {
/* 0x8F prefix: drop it, a two-byte JIS X 0212 character follows */
new_mode = ISO_2022_GZ_JIS_X_0212_1990;
next_byte = *in_p++;
}
else {
new_mode = ISO_2022_GZ_JIS_X_0208_1983;
}
if (cur_mode != new_mode)
cur_mode = select_iso_2022_mode(&out_p, new_mode);
/* modes below 0xFFFF are the single-byte ones; larger values are two-byte sets */
if (cur_mode < 0xFFFF) {
*out_p++ = next_byte & 0x7F;
}
else {
*out_p++ = next_byte & 0x7F;
*out_p++ = *in_p++ & 0x7F;
}
}
/* leave the stream in ASCII mode */
if (cur_mode != ISO_2022_GZ_ASCII)
cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII);
/* cleanup */
*in_pos = in_p;
*out_pos = out_p;
}
/*
 * Transcoder descriptor for ISO-2022-JP -> UTF-8.  It reuses the
 * from_EUC_JP lookup table and installs
 * from_iso_2022_jp_transcoder_preprocessor in the first hook slot.
 * NOTE(review): the 8 is presumably max_output and the 0 a direction flag;
 * confirm against the rb_transcoder declaration in transcode_data.h.
 */
static const rb_transcoder
rb_from_ISO_2022_JP = {
"ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0,
&from_iso_2022_jp_transcoder_preprocessor, NULL,
};
/*
 * Transcoder descriptor for UTF-8 -> ISO-2022-JP.  It reuses the to_EUC_JP
 * lookup table and installs to_iso_2022_jp_transcoder_postprocessor in the
 * second hook slot.
 * NOTE(review): the 8 is presumably max_output and the 1 a direction flag;
 * confirm against the rb_transcoder declaration in transcode_data.h.
 */
static const rb_transcoder
rb_to_ISO_2022_JP = {
"UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1,
NULL, &to_iso_2022_jp_transcoder_postprocessor,
};
/*
 * Extension entry point: registers the transcoders of this file.
 * The ERB tag expands to generated registration code (presumably one call
 * per transcode_tblgen table above -- confirm against the generator); the
 * two hand-written ISO-2022-JP transcoders are registered explicitly.
 */
void
Init_japanese(void)
{
<%= transcode_register_code %>
rb_register_transcoder(&rb_from_ISO_2022_JP);
rb_register_transcoder(&rb_to_ISO_2022_JP);
}

File diff suppressed because it is too large Load diff

17
enc/trans/korean.erb.c Normal file
View file

@ -0,0 +1,17 @@
#include "transcode_data.h"
/*
 * ERB preamble: load the Ruby mapping tables (euckr-tbl, cp949-tbl) that the
 * transcode_tblgen calls below splat into their mapping lists.
 */
<%
require "euckr-tbl"
require "cp949-tbl"
%>
/*
 * One transcode_tblgen call per (source, destination) encoding pair.
 * Every mapping list starts with the byte range {00-7f} tagged :nomap --
 * presumably "pass through unchanged"; confirm against the generator
 * (transcode_tblgen is not defined in this file).
 */
<%= transcode_tblgen "UTF-8", "EUC-KR", [["{00-7f}", :nomap], *UCS_TO_EUCKR_TBL] %>
<%= transcode_tblgen "EUC-KR", "UTF-8", [["{00-7f}", :nomap], *EUCKR_TO_UCS_TBL] %>
<%= transcode_tblgen "UTF-8", "CP949", [["{00-7f}", :nomap], *UCS_TO_CP949_TBL] %>
<%= transcode_tblgen "CP949", "UTF-8", [["{00-7f}", :nomap], *CP949_TO_UCS_TBL] %>
/*
 * Extension entry point: the ERB tag expands to generated registration code
 * (presumably for the four tables above -- confirm against the generator).
 */
void
Init_korean(void)
{
<%= transcode_register_code %>
}

View file

@ -10,7 +10,7 @@ count = 0
converters = {}
transdir = ARGV[0]
outhdr = ARGV[1] || 'transdb.h'
Dir.open(transdir) {|d| d.grep(/.+\.[ch]\z/)}.sort_by {|e|
Dir.open(transdir) {|d| d.grep(/.+\.[ch]\z/).reject {|n| /\.erb\.c\z/ =~ n }}.sort_by {|e|
e.scan(/(\d+)|(\D+)/).map {|n,a| a||[n.size,n.to_i]}.flatten
}.each do |fn|
open(File.join(transdir,fn)) do |f|

View file

@ -1,5 +1,23 @@
/* autogenerated. */
/* src="single_byte.erb.c", len=2228, checksum=35690 */
/* src="iso-8859-2-tbl.rb", len=1525, checksum=18386 */
/* src="iso-8859-3-tbl.rb", len=1391, checksum=11560 */
/* src="iso-8859-4-tbl.rb", len=1518, checksum=18050 */
/* src="iso-8859-5-tbl.rb", len=1562, checksum=19680 */
/* src="iso-8859-6-tbl.rb", len=841, checksum=46155 */
/* src="iso-8859-7-tbl.rb", len=1505, checksum=17611 */
/* src="iso-8859-8-tbl.rb", len=961, checksum=53500 */
/* src="iso-8859-9-tbl.rb", len=1474, checksum=16589 */
/* src="iso-8859-10-tbl.rb", len=1516, checksum=18011 */
/* src="iso-8859-11-tbl.rb", len=1436, checksum=14115 */
/* src="iso-8859-13-tbl.rb", len=1525, checksum=18280 */
/* src="iso-8859-14-tbl.rb", len=1522, checksum=18993 */
/* src="iso-8859-15-tbl.rb", len=1478, checksum=16787 */
#include "transcode_data.h"
static const unsigned char
from_US_ASCII_offsets[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -25,49 +43,43 @@ from_US_ASCII_infos[2] = {
};
static const BYTE_LOOKUP
from_US_ASCII = {
/* used from from_US_ASCII */
/* used from to_US_ASCII */
/* used from to_ASCII_8BIT */
/* used from from_ASCII_8BIT */
from_US_ASCII_offsets,
from_US_ASCII_infos
};
static const rb_transcoder
rb_from_US_ASCII = {
"US-ASCII", "UTF-8", &from_US_ASCII, 1, 0,
NULL, NULL,
};
static const rb_transcoder
rb_to_US_ASCII = {
"UTF-8", "US-ASCII", &from_US_ASCII, 1, 1,
NULL, NULL,
};
static const rb_transcoder
rb_from_ASCII_8BIT = {
"ASCII-8BIT", "UTF-8", &from_US_ASCII, 1, 0,
NULL, NULL,
};
static const rb_transcoder
rb_to_ASCII_8BIT = {
"UTF-8", "ASCII-8BIT", &from_US_ASCII, 1, 1,
NULL, NULL,
};
static const unsigned char
from_ISO_8859_1_offsets[256] = {
/* used from from_ISO_8859_1 */
/* used from from_ISO_8859_2 */
/* used from from_ISO_8859_4 */
/* used from from_ISO_8859_5 */
/* used from from_ISO_8859_9 */
/* used from from_ISO_8859_10 */
/* used from from_ISO_8859_13 */
/* used from from_ISO_8859_14 */
/* used from from_ISO_8859_15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -127,6 +139,7 @@ from_ISO_8859_1 = {
from_ISO_8859_1_infos
};
static const rb_transcoder
rb_from_ISO_8859_1 = {
"ISO-8859-1", "UTF-8", &from_ISO_8859_1, 2, 0,
@ -135,8 +148,6 @@ rb_from_ISO_8859_1 = {
static const unsigned char
to_ISO_8859_1_C2_offsets[64] = {
/* used from to_ISO_8859_1_C2 */
/* used from to_ISO_8859_1_C3 */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
@ -163,8 +174,6 @@ to_ISO_8859_1_C2_infos[64] = {
};
static const BYTE_LOOKUP
to_ISO_8859_1_C2 = {
/* used from to_ISO_8859_1 */
/* used from to_ISO_8859_9 */
to_ISO_8859_1_C2_offsets,
to_ISO_8859_1_C2_infos
};
@ -190,8 +199,6 @@ to_ISO_8859_1_C3_infos[64] = {
};
static const BYTE_LOOKUP
to_ISO_8859_1_C3 = {
/* used from to_ISO_8859_1 */
/* used from to_ISO_8859_15 */
to_ISO_8859_1_C2_offsets,
to_ISO_8859_1_C3_infos
};
@ -226,6 +233,7 @@ to_ISO_8859_1 = {
to_ISO_8859_1_infos
};
static const rb_transcoder
rb_to_ISO_8859_1 = {
"UTF-8", "ISO-8859-1", &to_ISO_8859_1, 1, 1,
@ -274,6 +282,7 @@ from_ISO_8859_2 = {
from_ISO_8859_2_infos
};
static const rb_transcoder
rb_from_ISO_8859_2 = {
"ISO-8859-2", "UTF-8", &from_ISO_8859_2, 2, 0,
@ -398,9 +407,6 @@ to_ISO_8859_2_CB = {
static const unsigned char
to_ISO_8859_2_offsets[256] = {
/* used from to_ISO_8859_2 */
/* used from to_ISO_8859_3 */
/* used from to_ISO_8859_4 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -431,6 +437,7 @@ to_ISO_8859_2 = {
to_ISO_8859_2_infos
};
static const rb_transcoder
rb_to_ISO_8859_2 = {
"UTF-8", "ISO-8859-2", &to_ISO_8859_2, 1, 1,
@ -496,6 +503,7 @@ from_ISO_8859_3 = {
from_ISO_8859_3_infos
};
static const rb_transcoder
rb_from_ISO_8859_3 = {
"ISO-8859-3", "UTF-8", &from_ISO_8859_3, 2, 0,
@ -628,6 +636,7 @@ to_ISO_8859_3 = {
to_ISO_8859_3_infos
};
static const rb_transcoder
rb_to_ISO_8859_3 = {
"UTF-8", "ISO-8859-3", &to_ISO_8859_3, 1, 1,
@ -676,6 +685,7 @@ from_ISO_8859_4 = {
from_ISO_8859_4_infos
};
static const rb_transcoder
rb_from_ISO_8859_4 = {
"ISO-8859-4", "UTF-8", &from_ISO_8859_4, 2, 0,
@ -737,8 +747,6 @@ to_ISO_8859_4_C3 = {
static const unsigned char
to_ISO_8859_4_C4_offsets[64] = {
/* used from to_ISO_8859_4_C4 */
/* used from to_ISO_8859_10_C4 */
0, 1, 27, 27, 2, 3, 27, 27, 27, 27, 27, 27, 4, 5, 27, 27,
6, 7, 8, 9, 27, 27, 10, 11, 12, 13, 27, 27, 27, 27, 27, 27,
27, 27, 14, 15, 27, 27, 27, 27, 16, 17, 18, 19, 27, 27, 20, 21,
@ -812,6 +820,7 @@ to_ISO_8859_4 = {
to_ISO_8859_4_infos
};
static const rb_transcoder
rb_to_ISO_8859_4 = {
"UTF-8", "ISO-8859-4", &to_ISO_8859_4, 1, 1,
@ -892,6 +901,7 @@ from_ISO_8859_5 = {
from_ISO_8859_5_infos
};
static const rb_transcoder
rb_from_ISO_8859_5 = {
"ISO-8859-5", "UTF-8", &from_ISO_8859_5, 3, 0,
@ -1044,6 +1054,7 @@ to_ISO_8859_5 = {
to_ISO_8859_5_infos
};
static const rb_transcoder
rb_to_ISO_8859_5 = {
"UTF-8", "ISO-8859-5", &to_ISO_8859_5, 1, 1,
@ -1100,6 +1111,7 @@ from_ISO_8859_6 = {
from_ISO_8859_6_infos
};
static const rb_transcoder
rb_from_ISO_8859_6 = {
"ISO-8859-6", "UTF-8", &from_ISO_8859_6, 2, 0,
@ -1207,6 +1219,7 @@ to_ISO_8859_6 = {
to_ISO_8859_6_infos
};
static const rb_transcoder
rb_to_ISO_8859_6 = {
"UTF-8", "ISO-8859-6", &to_ISO_8859_6, 1, 1,
@ -1305,6 +1318,7 @@ from_ISO_8859_7 = {
from_ISO_8859_7_infos
};
static const rb_transcoder
rb_from_ISO_8859_7 = {
"ISO-8859-7", "UTF-8", &from_ISO_8859_7, 3, 0,
@ -1492,6 +1506,7 @@ to_ISO_8859_7 = {
to_ISO_8859_7_infos
};
static const rb_transcoder
rb_to_ISO_8859_7 = {
"UTF-8", "ISO-8859-7", &to_ISO_8859_7, 1, 1,
@ -1573,6 +1588,7 @@ from_ISO_8859_8 = {
from_ISO_8859_8_infos
};
static const rb_transcoder
rb_from_ISO_8859_8 = {
"ISO-8859-8", "UTF-8", &from_ISO_8859_8, 3, 0,
@ -1670,9 +1686,6 @@ to_ISO_8859_8_E2_80 = {
static const unsigned char
to_ISO_8859_8_E2_offsets[64] = {
/* used from to_ISO_8859_8_E2 */
/* used from to_ISO_8859_10_E2 */
/* used from to_ISO_8859_13_E2 */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -1719,6 +1732,7 @@ to_ISO_8859_8 = {
to_ISO_8859_8_infos
};
static const rb_transcoder
rb_to_ISO_8859_8 = {
"UTF-8", "ISO-8859-8", &to_ISO_8859_8, 1, 1,
@ -1767,6 +1781,7 @@ from_ISO_8859_9 = {
from_ISO_8859_9_infos
};
static const rb_transcoder
rb_from_ISO_8859_9 = {
"ISO-8859-9", "UTF-8", &from_ISO_8859_9, 2, 0,
@ -1870,6 +1885,7 @@ to_ISO_8859_9 = {
to_ISO_8859_9_infos
};
static const rb_transcoder
rb_to_ISO_8859_9 = {
"UTF-8", "ISO-8859-9", &to_ISO_8859_9, 1, 1,
@ -1950,6 +1966,7 @@ from_ISO_8859_10 = {
from_ISO_8859_10_infos
};
static const rb_transcoder
rb_from_ISO_8859_10 = {
"ISO-8859-10", "UTF-8", &from_ISO_8859_10, 3, 0,
@ -2076,8 +2093,6 @@ to_ISO_8859_10_E2 = {
static const unsigned char
to_ISO_8859_10_offsets[256] = {
/* used from to_ISO_8859_10 */
/* used from to_ISO_8859_13 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -2108,6 +2123,7 @@ to_ISO_8859_10 = {
to_ISO_8859_10_infos
};
static const rb_transcoder
rb_to_ISO_8859_10 = {
"UTF-8", "ISO-8859-10", &to_ISO_8859_10, 1, 1,
@ -2203,6 +2219,7 @@ from_ISO_8859_11 = {
from_ISO_8859_11_infos
};
static const rb_transcoder
rb_from_ISO_8859_11 = {
"ISO-8859-11", "UTF-8", &from_ISO_8859_11, 3, 0,
@ -2337,6 +2354,7 @@ to_ISO_8859_11 = {
to_ISO_8859_11_infos
};
static const rb_transcoder
rb_to_ISO_8859_11 = {
"UTF-8", "ISO-8859-11", &to_ISO_8859_11, 1, 1,
@ -2417,6 +2435,7 @@ from_ISO_8859_13 = {
from_ISO_8859_13_infos
};
static const rb_transcoder
rb_from_ISO_8859_13 = {
"ISO-8859-13", "UTF-8", &from_ISO_8859_13, 3, 0,
@ -2562,6 +2581,7 @@ to_ISO_8859_13 = {
to_ISO_8859_13_infos
};
static const rb_transcoder
rb_to_ISO_8859_13 = {
"UTF-8", "ISO-8859-13", &to_ISO_8859_13, 1, 1,
@ -2642,6 +2662,7 @@ from_ISO_8859_14 = {
from_ISO_8859_14_infos
};
static const rb_transcoder
rb_from_ISO_8859_14 = {
"ISO-8859-14", "UTF-8", &from_ISO_8859_14, 3, 0,
@ -2864,6 +2885,7 @@ to_ISO_8859_14 = {
to_ISO_8859_14_infos
};
static const rb_transcoder
rb_to_ISO_8859_14 = {
"UTF-8", "ISO-8859-14", &to_ISO_8859_14, 1, 1,
@ -2944,6 +2966,7 @@ from_ISO_8859_15 = {
from_ISO_8859_15_infos
};
static const rb_transcoder
rb_from_ISO_8859_15 = {
"ISO-8859-15", "UTF-8", &from_ISO_8859_15, 3, 0,
@ -3064,12 +3087,14 @@ to_ISO_8859_15 = {
to_ISO_8859_15_infos
};
static const rb_transcoder
rb_to_ISO_8859_15 = {
"UTF-8", "ISO-8859-15", &to_ISO_8859_15, 1, 1,
NULL, NULL,
};
void
Init_single_byte(void)
{
@ -3105,5 +3130,7 @@ Init_single_byte(void)
rb_register_transcoder(&rb_to_ISO_8859_14);
rb_register_transcoder(&rb_from_ISO_8859_15);
rb_register_transcoder(&rb_to_ISO_8859_15);
}
/* Footprint (bytes): gross: 27876, saved: 4544, net: 23332 */

View file

@ -0,0 +1,62 @@
#include "transcode_data.h"
<%
# Pattern table shared by the US-ASCII and ASCII-8BIT transcoders:
# bytes 00-7f need no mapping, bytes 80-ff have no defined mapping.
us_ascii_map = [
  ["{00-7f}", :nomap],
  ["{80-ff}", :undef],
]

# Identity tables: each byte, written as two upper-case hex digits, paired
# with the code point of the same numeric value.
ISO_8859_1_TO_UCS_TBL = (0x80..0xff).map {|code| [format("%02X", code), code] }
CONTROL1_TO_UCS_TBL = (0x80..0x9f).map {|code| [format("%02X", code), code] }

# Load the per-part mapping tables, in the same order as before.
%w[2 3 4 5 6 7 8 9 10 11 13 14 15].each {|part|
  require "iso-8859-#{part}-tbl"
}
%>
/* US-ASCII and ASCII-8BIT share one pattern table (us_ascii_map): bytes
 * 00-7f are :nomap and bytes 80-ff are :undef, in both directions. */
<%= transcode_tblgen "US-ASCII", "UTF-8", us_ascii_map %>
<%= transcode_tblgen "UTF-8", "US-ASCII", us_ascii_map %>
<%= transcode_tblgen "ASCII-8BIT", "UTF-8", us_ascii_map %>
<%= transcode_tblgen "UTF-8", "ASCII-8BIT", us_ascii_map %>
<%
# Generate both transcoders for one ISO-8859 part.
#
# name       - encoding name, e.g. "ISO-8859-2".
# tbl_to_ucs - array of [byte_pattern, code_point] pairs for the part's own
#              characters; the shared 80-9f entries of CONTROL1_TO_UCS_TBL
#              are prepended here.
#
# Returns the text produced by transcode_tblgen for name -> UTF-8, a newline,
# and the text for UTF-8 -> name (the same pairs with both sides swapped).
# Bytes 00-7f are :nomap in both directions.
def transcode_tblgen_iso8859(name, tbl_to_ucs)
  tbl_to_ucs = CONTROL1_TO_UCS_TBL + tbl_to_ucs
  # The former "name_ident" local was computed but never read, so it is gone.
  code = ''
  code << transcode_tblgen(name, "UTF-8", [["{00-7f}", :nomap], *tbl_to_ucs])
  code << "\n"
  code << transcode_tblgen("UTF-8", name, [["{00-7f}", :nomap], *tbl_to_ucs.map {|a,b| [b,a] }])
  code
end
%>
/* One pair of transcoders (to and from UTF-8) per ISO-8859 part.  Every
 * *_TO_UCS_TBL constant other than the ISO-8859-1 one is expected to come
 * from the iso-8859-N-tbl files required at the top of this template. */
<%= transcode_tblgen_iso8859("ISO-8859-1", ISO_8859_1_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-2", ISO_8859_2_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-3", ISO_8859_3_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-4", ISO_8859_4_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-5", ISO_8859_5_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-6", ISO_8859_6_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-7", ISO_8859_7_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-8", ISO_8859_8_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-9", ISO_8859_9_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-10", ISO_8859_10_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-11", ISO_8859_11_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-13", ISO_8859_13_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-14", ISO_8859_14_TO_UCS_TBL) %>
<%= transcode_tblgen_iso8859("ISO-8859-15", ISO_8859_15_TO_UCS_TBL) %>
/* Extension entry point.  The body is generated: transcode_register_code is
 * not defined in this template -- presumably it emits the registration calls
 * for every table built by transcode_tblgen above (confirm in
 * tool/transcode-tblgen.rb). */
void
Init_single_byte(void)
{
<%= transcode_register_code %>
}

14803
enc/trans/sjis-tbl.rb Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
/* Autogenerated, do not change */
/* Report bugs to Martin Duerst (duerst@it.aoyama.ac.jp) */
/* autogenerated. */
/* src="utf_16_32.erb.c", len=8014, checksum=26811 */
#include "transcode_data.h"
@ -181,16 +181,9 @@ fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
{
return 4;
}
static const unsigned char
from_UTF_16BE_00_offsets[256] = {
/* used by from_UTF_16BE_00 */
/* used by from_UTF_32BE_00_00_D8 */
/* used by from_UTF_32BE_00_01 */
/* used by from_UTF_32BE_00_11 */
/* used by from_UTF_16BE_D8 */
/* used by from_UTF_16LE */
/* used by from_UTF_32LE */
/* used by from_UTF_16LE_00_D8 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -210,42 +203,26 @@ from_UTF_16BE_00_offsets[256] = {
};
static const struct byte_lookup* const
from_UTF_16BE_00_infos[1] = {
/* used by from_UTF_16BE_00 */
/* used by to_UTF_32BE_C2 */
FUNso,
};
static const BYTE_LOOKUP
from_UTF_16BE_00 = {
/* used as from_UTF_16BE */
/* used as from_UTF_32BE_00_00 */
/* used as from_UTF_32BE_00_01 */
/* used as from_UTF_16BE_D8_00 */
/* used as from_UTF_32LE_00_00 */
/* used as from_UTF_32LE_00_D8 */
from_UTF_16BE_00_offsets,
from_UTF_16BE_00_infos
};
static const struct byte_lookup* const
from_UTF_32BE_00_00_D8_infos[1] = {
from_UTF_16BE_D8_00_00_infos[1] = {
INVALID,
};
static const BYTE_LOOKUP
from_UTF_32BE_00_00_D8 = {
/* used as from_UTF_32BE_00_00 */
/* used as from_UTF_32BE_00_11 */
/* used as from_UTF_16BE_D8_00 */
/* used as from_UTF_16BE */
/* used as from_UTF_32LE_00_00 */
/* used as from_UTF_32LE_00_D8 */
from_UTF_16BE_D8_00_00 = {
from_UTF_16BE_00_offsets,
from_UTF_32BE_00_00_D8_infos
from_UTF_16BE_D8_00_00_infos
};
static const unsigned char
from_UTF_16BE_D8_00_offsets[256] = {
/* used by from_UTF_16BE_D8_00 */
/* used by from_UTF_16LE_00_D8_00 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -265,9 +242,7 @@ from_UTF_16BE_D8_00_offsets[256] = {
};
static const struct byte_lookup* const
from_UTF_16BE_D8_00_infos[2] = {
/* used by from_UTF_16BE_D8_00 */
/* used by from_UTF_32LE_00_D8 */
&from_UTF_32BE_00_00_D8, &from_UTF_16BE_00,
&from_UTF_16BE_D8_00_00, &from_UTF_16BE_00,
};
static const BYTE_LOOKUP
from_UTF_16BE_D8_00 = {
@ -287,8 +262,6 @@ from_UTF_16BE_D8 = {
static const unsigned char
from_UTF_16BE_offsets[256] = {
/* used by from_UTF_16BE */
/* used by from_UTF_16LE_00 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -309,7 +282,7 @@ from_UTF_16BE_offsets[256] = {
static const struct byte_lookup* const
from_UTF_16BE_infos[3] = {
&from_UTF_16BE_00, &from_UTF_16BE_D8,
&from_UTF_32BE_00_00_D8,
&from_UTF_16BE_D8_00_00,
};
static const BYTE_LOOKUP
from_UTF_16BE = {
@ -317,6 +290,8 @@ from_UTF_16BE = {
from_UTF_16BE_infos
};
static const rb_transcoder
rb_from_UTF_16BE = {
"UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
@ -324,165 +299,94 @@ rb_from_UTF_16BE = {
};
static const unsigned char
to_UTF_32BE_C2_offsets[64] = {
/* used by to_UTF_32BE_C2 */
/* used by to_UTF_32BE_E1 */
/* used by to_UTF_32BE_F1 */
to_UTF_16BE_C2_offsets[64] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const BYTE_LOOKUP
to_UTF_32BE_C2 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_32BE_E0 */
/* used as to_UTF_16BE_E0 */
/* used as to_UTF_16BE_E1 */
/* used as to_UTF_32BE_E1 */
/* used as to_UTF_32BE_ED */
/* used as to_UTF_16BE_ED */
/* used as to_UTF_16BE_F0_90 */
/* used as to_UTF_32BE_F0_90 */
/* used as to_UTF_16BE_F1_80 */
/* used as to_UTF_32BE_F1_80 */
/* used as to_UTF_32BE_F4_80 */
/* used as to_UTF_16BE_F4_80 */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
/* used as to_UTF_16LE_E0 */
/* used as to_UTF_32LE_E0 */
/* used as to_UTF_32LE_E1 */
/* used as to_UTF_16LE_E1 */
/* used as to_UTF_32LE_ED */
/* used as to_UTF_16LE_ED */
/* used as to_UTF_16LE_F0_90 */
/* used as to_UTF_32LE_F0_90 */
/* used as to_UTF_16LE_F1_80 */
/* used as to_UTF_32LE_F1_80 */
/* used as to_UTF_16LE_F4_80 */
/* used as to_UTF_32LE_F4_80 */
to_UTF_32BE_C2_offsets,
to_UTF_16BE_C2 = {
to_UTF_16BE_C2_offsets,
from_UTF_16BE_00_infos
};
static const unsigned char
to_UTF_32BE_E0_offsets[64] = {
/* used by to_UTF_32BE_E0 */
/* used by to_UTF_32BE_ED */
to_UTF_16BE_E0_offsets[64] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
static const struct byte_lookup* const
to_UTF_32BE_E0_infos[2] = {
INVALID, &to_UTF_32BE_C2,
to_UTF_16BE_E0_infos[2] = {
INVALID, &to_UTF_16BE_C2,
};
static const BYTE_LOOKUP
to_UTF_32BE_E0 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_E0_offsets,
to_UTF_32BE_E0_infos
to_UTF_16BE_E0 = {
to_UTF_16BE_E0_offsets,
to_UTF_16BE_E0_infos
};
static const struct byte_lookup* const
to_UTF_32BE_E1_infos[1] = {
&to_UTF_32BE_C2,
to_UTF_16BE_E1_infos[1] = {
&to_UTF_16BE_C2,
};
static const BYTE_LOOKUP
to_UTF_32BE_E1 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_32BE_F0 */
/* used as to_UTF_16BE_F0 */
/* used as to_UTF_16BE_F1 */
/* used as to_UTF_32BE_F1 */
/* used as to_UTF_16BE_F4 */
/* used as to_UTF_32BE_F4 */
/* used as to_UTF_32LE */
/* used as to_UTF_16LE */
/* used as to_UTF_16LE_F0 */
/* used as to_UTF_32LE_F0 */
/* used as to_UTF_16LE_F1 */
/* used as to_UTF_32LE_F1 */
/* used as to_UTF_16LE_F4 */
/* used as to_UTF_32LE_F4 */
to_UTF_32BE_C2_offsets,
to_UTF_32BE_E1_infos
to_UTF_16BE_E1 = {
to_UTF_16BE_C2_offsets,
to_UTF_16BE_E1_infos
};
static const struct byte_lookup* const
to_UTF_32BE_ED_infos[2] = {
&to_UTF_32BE_C2, INVALID,
to_UTF_16BE_ED_infos[2] = {
&to_UTF_16BE_C2, INVALID,
};
static const BYTE_LOOKUP
to_UTF_32BE_ED = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_E0_offsets,
to_UTF_32BE_ED_infos
to_UTF_16BE_ED = {
to_UTF_16BE_E0_offsets,
to_UTF_16BE_ED_infos
};
static const unsigned char
to_UTF_32BE_F0_offsets[64] = {
/* used by to_UTF_32BE_F0 */
/* used by to_UTF_32BE_F4 */
to_UTF_16BE_F0_offsets[64] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
static const struct byte_lookup* const
to_UTF_32BE_F0_infos[2] = {
INVALID, &to_UTF_32BE_E1,
to_UTF_16BE_F0_infos[2] = {
INVALID, &to_UTF_16BE_E1,
};
static const BYTE_LOOKUP
to_UTF_32BE_F0 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_F0_offsets,
to_UTF_32BE_F0_infos
to_UTF_16BE_F0 = {
to_UTF_16BE_F0_offsets,
to_UTF_16BE_F0_infos
};
static const struct byte_lookup* const
to_UTF_32BE_F1_infos[1] = {
&to_UTF_32BE_E1,
to_UTF_16BE_F1_infos[1] = {
&to_UTF_16BE_E1,
};
static const BYTE_LOOKUP
to_UTF_32BE_F1 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_C2_offsets,
to_UTF_32BE_F1_infos
to_UTF_16BE_F1 = {
to_UTF_16BE_C2_offsets,
to_UTF_16BE_F1_infos
};
static const struct byte_lookup* const
to_UTF_32BE_F4_infos[2] = {
&to_UTF_32BE_E1, INVALID,
to_UTF_16BE_F4_infos[2] = {
&to_UTF_16BE_E1, INVALID,
};
static const BYTE_LOOKUP
to_UTF_32BE_F4 = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_F0_offsets,
to_UTF_32BE_F4_infos
to_UTF_16BE_F4 = {
to_UTF_16BE_F0_offsets,
to_UTF_16BE_F4_infos
};
static const unsigned char
to_UTF_32BE_offsets[256] = {
to_UTF_16BE_offsets[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -501,24 +405,22 @@ to_UTF_32BE_offsets[256] = {
6, 7, 7, 7, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
static const struct byte_lookup* const
to_UTF_32BE_infos[9] = {
FUNso, INVALID, &to_UTF_32BE_C2, &to_UTF_32BE_E0,
&to_UTF_32BE_E1, &to_UTF_32BE_ED, &to_UTF_32BE_F0, &to_UTF_32BE_F1,
&to_UTF_32BE_F4,
to_UTF_16BE_infos[9] = {
FUNso, INVALID, &to_UTF_16BE_C2, &to_UTF_16BE_E0,
&to_UTF_16BE_E1, &to_UTF_16BE_ED, &to_UTF_16BE_F0, &to_UTF_16BE_F1,
&to_UTF_16BE_F4,
};
static const BYTE_LOOKUP
to_UTF_32BE = {
/* used as to_UTF_32BE */
/* used as to_UTF_16BE */
/* used as to_UTF_16LE */
/* used as to_UTF_32LE */
to_UTF_32BE_offsets,
to_UTF_32BE_infos
to_UTF_16BE = {
to_UTF_16BE_offsets,
to_UTF_16BE_infos
};
static const rb_transcoder
rb_to_UTF_16BE = {
"UTF-8", "UTF-16BE", &to_UTF_32BE, 4, 1,
"UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be
};
@ -563,6 +465,8 @@ from_UTF_16LE = {
from_UTF_16LE_infos
};
static const rb_transcoder
rb_from_UTF_16LE = {
"UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
@ -571,14 +475,12 @@ rb_from_UTF_16LE = {
static const rb_transcoder
rb_to_UTF_16LE = {
"UTF-8", "UTF-16LE", &to_UTF_32BE, 4, 1,
"UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le
};
static const unsigned char
from_UTF_32BE_00_00_offsets[256] = {
/* used by from_UTF_32BE_00_00 */
/* used by from_UTF_32LE_00 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -598,9 +500,7 @@ from_UTF_32BE_00_00_offsets[256] = {
};
static const struct byte_lookup* const
from_UTF_32BE_00_00_infos[2] = {
/* used by from_UTF_32BE_00_00 */
/* used by from_UTF_32LE_00_00 */
&from_UTF_16BE_00, &from_UTF_32BE_00_00_D8,
&from_UTF_16BE_00, &from_UTF_16BE_D8_00_00,
};
static const BYTE_LOOKUP
from_UTF_32BE_00_00 = {
@ -620,7 +520,7 @@ from_UTF_32BE_00_01 = {
static const struct byte_lookup* const
from_UTF_32BE_00_11_infos[1] = {
&from_UTF_32BE_00_00_D8,
&from_UTF_16BE_D8_00_00,
};
static const BYTE_LOOKUP
from_UTF_32BE_00_11 = {
@ -658,6 +558,16 @@ from_UTF_32BE_00 = {
from_UTF_32BE_00_infos
};
static const struct byte_lookup* const
from_UTF_32BE_01_infos[1] = {
&from_UTF_32BE_00_11,
};
static const BYTE_LOOKUP
from_UTF_32BE_01 = {
from_UTF_16BE_00_offsets,
from_UTF_32BE_01_infos
};
static const unsigned char
from_UTF_32BE_offsets[256] = {
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -679,7 +589,7 @@ from_UTF_32BE_offsets[256] = {
};
static const struct byte_lookup* const
from_UTF_32BE_infos[2] = {
&from_UTF_32BE_00, INVALID,
&from_UTF_32BE_00, &from_UTF_32BE_01,
};
static const BYTE_LOOKUP
from_UTF_32BE = {
@ -687,6 +597,8 @@ from_UTF_32BE = {
from_UTF_32BE_infos
};
static const rb_transcoder
rb_from_UTF_32BE = {
"UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
@ -695,10 +607,20 @@ rb_from_UTF_32BE = {
static const rb_transcoder
rb_to_UTF_32BE = {
"UTF-8", "UTF-32BE", &to_UTF_32BE, 4, 1,
"UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be
};
static const struct byte_lookup* const
from_UTF_32LE_00_00_00_infos[2] = {
FUNso, INVALID,
};
static const BYTE_LOOKUP
from_UTF_32LE_00_00_00 = {
from_UTF_32BE_offsets,
from_UTF_32LE_00_00_00_infos
};
static const unsigned char
from_UTF_32LE_00_00_offsets[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -718,10 +640,14 @@ from_UTF_32LE_00_00_offsets[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
static const struct byte_lookup* const
from_UTF_32LE_00_00_infos[2] = {
&from_UTF_32LE_00_00_00, &from_UTF_16BE_D8_00_00,
};
static const BYTE_LOOKUP
from_UTF_32LE_00_00 = {
from_UTF_32LE_00_00_offsets,
from_UTF_32BE_00_00_infos
from_UTF_32LE_00_00_infos
};
static const unsigned char
@ -743,10 +669,14 @@ from_UTF_32LE_00_D8_offsets[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static const struct byte_lookup* const
from_UTF_32LE_00_D8_infos[2] = {
&from_UTF_16BE_D8_00_00, &from_UTF_32LE_00_00_00,
};
static const BYTE_LOOKUP
from_UTF_32LE_00_D8 = {
from_UTF_32LE_00_D8_offsets,
from_UTF_16BE_D8_00_infos
from_UTF_32LE_00_D8_infos
};
static const struct byte_lookup* const
@ -769,6 +699,8 @@ from_UTF_32LE = {
from_UTF_32LE_infos
};
static const rb_transcoder
rb_from_UTF_32LE = {
"UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
@ -777,7 +709,7 @@ rb_from_UTF_32LE = {
static const rb_transcoder
rb_to_UTF_32LE = {
"UTF-8", "UTF-32LE", &to_UTF_32BE, 4, 1,
"UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le
};
@ -793,4 +725,4 @@ Init_utf_16_32(void)
rb_register_transcoder(&rb_from_UTF_32LE);
rb_register_transcoder(&rb_to_UTF_32LE);
}
/* Footprint (bytes): gross: 13684, saved: 10796, net: 2888 */

310
enc/trans/utf_16_32.erb.c Normal file
View file

@ -0,0 +1,310 @@
#include "transcode_data.h"
/*
 * Convert one UTF-16BE character at s (a single 16-bit unit, or a surrogate
 * pair occupying s[0]..s[3]) to UTF-8 at o.  Returns the number of bytes
 * written to o (1 to 4).  No validation is done here.
 * NOTE(review): presumably the from_UTF_16BE lookup table has already
 * rejected malformed input before this is called -- confirm in the driver.
 */
static int
fun_so_from_utf_16be(const unsigned char* s, unsigned char* o)
{
/* high byte zero and low byte below 0x80: copy the low byte through */
if (!s[0] && s[1]<0x80) {
o[0] = s[1];
return 1;
}
/* value below 0x0800: two bytes, 110xxxxx 10xxxxxx */
else if (s[0]<0x08) {
o[0] = 0xC0 | (s[0]<<2) | (s[1]>>6);
o[1] = 0x80 | (s[1]&0x3F);
return 2;
}
/* any other unit whose high byte is not D8-DF: three bytes */
else if ((s[0]&0xF8)!=0xD8) {
o[0] = 0xE0 | (s[0]>>4);
o[1] = 0x80 | ((s[0]&0x0F)<<2) | (s[1]>>6);
o[2] = 0x80 | (s[1]&0x3F);
return 3;
}
/* surrogate pair: s[0..1] is the first unit, s[2..3] the second */
else {
/* u = four bits taken from the first unit, plus one (the pair encodes
 * the code point minus 0x10000); it becomes the top five output bits */
unsigned int u = (((s[0]&0x03)<<2)|(s[1]>>6)) + 1;
o[0] = 0xF0 | (u>>2);
o[1] = 0x80 | ((u&0x03)<<4) | ((s[1]>>2)&0x0F);
/* the remaining bits straddle both units */
o[2] = 0x80 | ((s[1]&0x03)<<4) | ((s[2]&0x03)<<2) | (s[3]>>6);
o[3] = 0x80 | (s[3]&0x3F);
return 4;
}
}
/*
 * Convert one UTF-8 sequence at s (1 to 4 bytes, selected by the lead byte)
 * to UTF-16BE at o.  Returns the number of bytes written: 2 for a single
 * unit, 4 for a surrogate pair.  No validation is done here.
 * NOTE(review): presumably the to_UTF_16BE lookup table has already
 * rejected ill-formed UTF-8 -- confirm in the driver.
 */
static int
fun_so_to_utf_16be(const unsigned char* s, unsigned char* o)
{
/* one-byte sequence: zero high byte, value in the low byte */
if (!(s[0]&0x80)) {
o[0] = 0x00;
o[1] = s[0];
return 2;
}
/* two-byte sequence (lead byte 110xxxxx) */
else if ((s[0]&0xE0)==0xC0) {
o[0] = (s[0]>>2)&0x07;
o[1] = ((s[0]&0x03)<<6) | (s[1]&0x3F);
return 2;
}
/* three-byte sequence (lead byte 1110xxxx) */
else if ((s[0]&0xF0)==0xE0) {
/* ^0x20 and ^0x80 clear the 10xxxxxx marker bit of a continuation byte
 * after shifting; excess high bits are dropped by the store to o[] */
o[0] = (s[0]<<4) | ((s[1]>>2)^0x20);
o[1] = (s[1]<<6) | (s[2]^0x80);
return 2;
}
/* four-byte sequence: emit a surrogate pair */
else {
/* top five bits of the code point minus one, matching the 0x10000
 * offset that surrogate pairs carry */
int w = (((s[0]&0x07)<<2) | ((s[1]>>4)&0x03)) - 1;
o[0] = 0xD8 | (w>>2);
/* (s[2]>>4)-8 strips the 10 marker from the top nibble of s[2] */
o[1] = (w<<6) | ((s[1]&0x0F)<<2) | ((s[2]>>4)-8);
o[2] = 0xDC | ((s[2]>>2)&0x03);
o[3] = (s[2]<<6) | (s[3]&~0x80);
return 4;
}
}
/*
 * Little-endian counterpart of fun_so_from_utf_16be: the same conversion
 * with the two bytes of every 16-bit unit swapped (s[1] is the high byte of
 * the first unit, s[3] the high byte of the second).  Returns the number of
 * UTF-8 bytes written to o (1 to 4).  No validation is done here.
 */
static int
fun_so_from_utf_16le(const unsigned char* s, unsigned char* o)
{
/* high byte zero and low byte below 0x80: copy the low byte through */
if (!s[1] && s[0]<0x80) {
o[0] = s[0];
return 1;
}
/* value below 0x0800: two bytes */
else if (s[1]<0x08) {
o[0] = 0xC0 | (s[1]<<2) | (s[0]>>6);
o[1] = 0x80 | (s[0]&0x3F);
return 2;
}
/* any other unit whose high byte is not D8-DF: three bytes */
else if ((s[1]&0xF8)!=0xD8) {
o[0] = 0xE0 | (s[1]>>4);
o[1] = 0x80 | ((s[1]&0x0F)<<2) | (s[0]>>6);
o[2] = 0x80 | (s[0]&0x3F);
return 3;
}
/* surrogate pair: s[0..1] is the first unit, s[2..3] the second */
else {
/* four bits from the first unit, plus one for the 0x10000 offset */
unsigned int u = (((s[1]&0x03)<<2)|(s[0]>>6)) + 1;
o[0] = 0xF0 | u>>2;
o[1] = 0x80 | ((u&0x03)<<4) | ((s[0]>>2)&0x0F);
o[2] = 0x80 | ((s[0]&0x03)<<4) | ((s[3]&0x03)<<2) | (s[2]>>6);
o[3] = 0x80 | (s[2]&0x3F);
return 4;
}
}
/*
 * Little-endian counterpart of fun_so_to_utf_16be: identical arithmetic, but
 * each 16-bit unit is stored low byte first (o[0]/o[1] and o[2]/o[3] are
 * swapped).  Returns 2 for a single unit, 4 for a surrogate pair.
 * No validation is done here.
 */
static int
fun_so_to_utf_16le(const unsigned char* s, unsigned char* o)
{
/* one-byte sequence */
if (!(s[0]&0x80)) {
o[1] = 0x00;
o[0] = s[0];
return 2;
}
/* two-byte sequence (lead byte 110xxxxx) */
else if ((s[0]&0xE0)==0xC0) {
o[1] = (s[0]>>2)&0x07;
o[0] = ((s[0]&0x03)<<6) | (s[1]&0x3F);
return 2;
}
/* three-byte sequence (lead byte 1110xxxx); the XORs clear the
 * continuation-byte marker bit */
else if ((s[0]&0xF0)==0xE0) {
o[1] = (s[0]<<4) | ((s[1]>>2)^0x20);
o[0] = (s[1]<<6) | (s[2]^0x80);
return 2;
}
/* four-byte sequence: emit a surrogate pair, each unit low byte first */
else {
int w = (((s[0]&0x07)<<2) | ((s[1]>>4)&0x03)) - 1;
o[1] = 0xD8 | (w>>2);
o[0] = (w<<6) | ((s[1]&0x0F)<<2) | ((s[2]>>4)-8);
o[3] = 0xDC | ((s[2]>>2)&0x03);
o[2] = (s[2]<<6) | (s[3]&~0x80);
return 4;
}
}
/*
 * Convert the UTF-32BE code unit at s[0..3] to UTF-8 at o and return the
 * number of bytes written (1 to 4).  s[0] is never read; the choice of
 * output length depends only on s[1], s[2] and s[3].
 */
static int
fun_so_from_utf_32be(const unsigned char* s, unsigned char* o)
{
    const unsigned int plane = s[1];  /* bits 16 and up */
    const unsigned int hi = s[2];     /* bits 8..15 */
    const unsigned int lo = s[3];     /* bits 0..7 */

    /* anything with a non-zero second byte needs four bytes */
    if (plane != 0) {
        o[0] = 0xF0 | (plane >> 2);
        o[1] = 0x80 | ((plane & 0x03) << 4) | (hi >> 4);
        o[2] = 0x80 | ((hi & 0x0F) << 2) | (lo >> 6);
        o[3] = 0x80 | (lo & 0x3F);
        return 4;
    }

    /* below 0x80: a single byte */
    if (hi == 0 && lo < 0x80) {
        o[0] = lo;
        return 1;
    }

    /* below 0x0800: two bytes */
    if (hi < 0x08) {
        o[0] = 0xC0 | (hi << 2) | (lo >> 6);
        o[1] = 0x80 | (lo & 0x3F);
        return 2;
    }

    /* everything else with a zero second byte: three bytes */
    o[0] = 0xE0 | (hi >> 4);
    o[1] = 0x80 | ((hi & 0x0F) << 2) | (lo >> 6);
    o[2] = 0x80 | (lo & 0x3F);
    return 3;
}
/*
 * Convert one UTF-8 sequence at s (1 to 4 bytes, selected by the lead byte)
 * to a UTF-32BE code unit at o[0..3].  Always returns 4.  o[0] is always
 * zero; the other three bytes are assembled from the payload bits.
 */
static int
fun_so_to_utf_32be(const unsigned char* s, unsigned char* o)
{
    const unsigned char lead = s[0];
    unsigned char plane = 0x00;  /* bits 16 and up */
    unsigned char hi = 0x00;     /* bits 8..15 */
    unsigned char lo;            /* bits 0..7 */

    if ((lead & 0x80) == 0) {
        /* one-byte sequence */
        lo = lead;
    }
    else if ((lead & 0xE0) == 0xC0) {
        /* two-byte sequence */
        hi = (lead >> 2) & 0x07;
        lo = ((lead & 0x03) << 6) | (s[1] & 0x3F);
    }
    else if ((lead & 0xF0) == 0xE0) {
        /* three-byte sequence; the XORs clear the continuation marker bit */
        hi = (lead << 4) | ((s[1] >> 2) ^ 0x20);
        lo = (s[1] << 6) | (s[2] ^ 0x80);
    }
    else {
        /* four-byte sequence */
        plane = ((lead & 0x07) << 2) | ((s[1] >> 4) & 0x03);
        hi = ((s[1] & 0x0F) << 4) | ((s[2] >> 2) & 0x0F);
        lo = ((s[2] & 0x03) << 6) | (s[3] & 0x3F);
    }

    o[0] = 0;
    o[1] = plane;
    o[2] = hi;
    o[3] = lo;
    return 4;
}
/*
 * Convert the UTF-32LE code unit at s[0..3] to UTF-8 at o and return the
 * number of bytes written (1 to 4).
 *
 * This used to be a stub that returned 1 without storing anything in o, so
 * the caller emitted whatever happened to be in the output buffer.  It is
 * now the byte-swapped mirror of fun_so_from_utf_32be: s[0] holds bits 0..7,
 * s[1] bits 8..15, s[2] bits 16 and up; s[3] is never read.
 */
static int
fun_so_from_utf_32le(const unsigned char* s, unsigned char* o)
{
    if (!s[2]) {
        if (s[1]==0 && s[0]<0x80) {
            /* below 0x80: a single byte */
            o[0] = s[0];
            return 1;
        }
        else if (s[1]<0x08) {
            /* below 0x0800: two bytes */
            o[0] = 0xC0 | (s[1]<<2) | (s[0]>>6);
            o[1] = 0x80 | (s[0]&0x3F);
            return 2;
        }
        else {
            /* rest of the values with a zero third byte: three bytes */
            o[0] = 0xE0 | (s[1]>>4);
            o[1] = 0x80 | ((s[1]&0x0F)<<2) | (s[0]>>6);
            o[2] = 0x80 | (s[0]&0x3F);
            return 3;
        }
    }
    else {
        /* non-zero third byte: four bytes */
        o[0] = 0xF0 | (s[2]>>2);
        o[1] = 0x80 | ((s[2]&0x03)<<4) | (s[1]>>4);
        o[2] = 0x80 | ((s[1]&0x0F)<<2) | (s[0]>>6);
        o[3] = 0x80 | (s[0]&0x3F);
        return 4;
    }
}
/*
 * Convert one UTF-8 sequence at s (1 to 4 bytes, selected by the lead byte)
 * to a UTF-32LE code unit at o[0..3].  Always returns 4.
 *
 * This used to be a stub that returned 4 without storing anything in o.  It
 * is now the byte-swapped mirror of fun_so_to_utf_32be: o[0] receives bits
 * 0..7, o[1] bits 8..15, o[2] bits 16 and up, and o[3] is always zero.
 */
static int
fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
{
    o[3] = 0;
    if (!(s[0]&0x80)) {
        /* one-byte sequence */
        o[2] = o[1] = 0x00;
        o[0] = s[0];
    }
    else if ((s[0]&0xE0)==0xC0) {
        /* two-byte sequence */
        o[2] = 0x00;
        o[1] = (s[0]>>2)&0x07;
        o[0] = ((s[0]&0x03)<<6) | (s[1]&0x3F);
    }
    else if ((s[0]&0xF0)==0xE0) {
        /* three-byte sequence; the XORs clear the continuation marker bit */
        o[2] = 0x00;
        o[1] = (s[0]<<4) | ((s[1]>>2)^0x20);
        o[0] = (s[1]<<6) | (s[2]^0x80);
    }
    else {
        /* four-byte sequence */
        o[2] = ((s[0]&0x07)<<2) | ((s[1]>>4)&0x03);
        o[1] = ((s[1]&0x0F)<<4) | ((s[2]>>2)&0x0F);
        o[0] = ((s[2]&0x03)<<6) | (s[3]&0x3F);
    }
    return 4;
}
<%=
# Byte patterns for UTF-16BE input; the first byte of each unit is its high
# byte.  A lone unit outside D8-DF, or a D8-DB unit followed by a DC-DF
# unit, is handed to the conversion function (:func_so).  A leading DC-DF
# unit, or a D8-DB unit not followed by DC-DF, is :invalid.
map = {}
map["{00-d7,e0-ff}{00-ff}"] = :func_so
map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so
map["{dc-df}{00-ff}"] = :invalid
map["{d8-db}{00-ff}{00-db,e0-ff}{00-ff}"] = :invalid
code = ''
ActionMap.parse(map).generate_node(code, "from_UTF_16BE", [])
code
%>
/* Pairs the from_UTF_16BE lookup table with fun_so_from_utf_16be.
 * NOTE(review): the meaning of the numeric and NULL fields is defined by
 * rb_transcoder in transcode_data.h, which is not visible here. */
static const rb_transcoder
rb_from_UTF_16BE = {
"UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be
};
<%=
# Byte patterns for UTF-8 input.  The :func_so entries enumerate the
# accepted sequences by lead byte; the :invalid entries reject stray
# continuation bytes and c0/c1/f5-ff as lead bytes, plus the second-byte
# ranges excluded after e0, ed, f0 and f4.
map = {}
map["{00-7f}"] = :func_so
map["{c2-df}{80-bf}"] = :func_so
map["e0{a0-bf}{80-bf}"] = :func_so
map["{e1-ec}{80-bf}{80-bf}"] = :func_so
map["ed{80-9f}{80-bf}"] = :func_so
map["{ee-ef}{80-bf}{80-bf}"] = :func_so
map["f0{90-bf}{80-bf}{80-bf}"] = :func_so
map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so
map["f4{80-8f}{80-bf}{80-bf}"] = :func_so
map["{80-c1,f5-ff}"] = :invalid
map["e0{80-9f}"] = :invalid
map["ed{a0-bf}"] = :invalid
map["f0{80-8f}"] = :invalid
map["f4{90-bf}"] = :invalid
code = ''
am = ActionMap.parse(map)
# The first table spans every byte value; each later level only spans the
# continuation-byte range 80-bf.
am.generate_node(code, "to_UTF_16BE", [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf])
code
%>
/* Pairs the to_UTF_16BE lookup table with fun_so_to_utf_16be.  The same
 * table is referenced by the other UTF-8 -> UTF-16/32 transcoders in this
 * file. */
static const rb_transcoder
rb_to_UTF_16BE = {
"UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be
};
<%=
# Byte patterns for UTF-16LE input: the same rules as the UTF-16BE table
# with the two bytes of every unit swapped, so the D8-DF tests apply to the
# second byte of each unit.
map = {}
map["{00-ff}{00-d7,e0-ff}"] = :func_so
map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so
map["{00-ff}{dc-df}"] = :invalid
map["{00-ff}{d8-db}{00-ff}{00-db,e0-ff}"] = :invalid
code = ''
ActionMap.parse(map).generate_node(code, "from_UTF_16LE", [])
code
%>
/* Pairs the from_UTF_16LE lookup table with fun_so_from_utf_16le. */
static const rb_transcoder
rb_from_UTF_16LE = {
"UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16le
};
/* No separate input table: this reuses to_UTF_16BE and differs from
 * rb_to_UTF_16BE only in the output function. */
static const rb_transcoder
rb_to_UTF_16LE = {
"UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le
};
<%=
# Byte patterns for UTF-32BE input (most significant byte first).  Accepted:
# 0000xxxx outside d8-df in the third byte, and second byte 01-10.
# Rejected: third byte d8-df with zero upper bytes, second byte 11-ff, and
# any non-zero first byte.
map = {}
map["0000{00-d7,e0-ff}{00-ff}"] = :func_so
map["00{01-10}{00-ff}{00-ff}"] = :func_so
map["00{11-ff}{00-ff}{00-ff}"] = :invalid
map["0000{d8-df}{00-ff}"] = :invalid
#map["{01-ff}"] = :invalid
map["{01-ff}{00-ff}{00-ff}{00-ff}"] = :invalid
code = ''
ActionMap.parse(map).generate_node(code, "from_UTF_32BE", [])
code
%>
/* Pairs the from_UTF_32BE lookup table with fun_so_from_utf_32be. */
static const rb_transcoder
rb_from_UTF_32BE = {
"UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32be
};
/* No separate input table: this reuses to_UTF_16BE and differs from
 * rb_to_UTF_16BE only in the output function. */
static const rb_transcoder
rb_to_UTF_32BE = {
"UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be
};
<%=
# Byte patterns for UTF-32LE input (least significant byte first): the
# UTF-32BE rules with the byte order reversed, so the zero/range tests apply
# to the third and fourth bytes and the d8-df test to the second byte.
map = {}
map["{00-ff}{00-d7,e0-ff}0000"] = :func_so
map["{00-ff}{00-ff}{01-10}00"] = :func_so
map["{00-ff}{00-ff}{00-ff}{01-ff}"] = :invalid
map["{00-ff}{00-ff}{11-ff}00"] = :invalid
map["{00-ff}{d8-df}0000"] = :invalid
code = ''
ActionMap.parse(map).generate_node(code, "from_UTF_32LE", [])
code
%>
/* Pairs the from_UTF_32LE lookup table with fun_so_from_utf_32le. */
static const rb_transcoder
rb_from_UTF_32LE = {
"UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32le
};
/* No separate input table: this reuses to_UTF_16BE and differs from
 * rb_to_UTF_16BE only in the output function. */
static const rb_transcoder
rb_to_UTF_32LE = {
"UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le
};
/*
 * Extension entry point: hands each of the eight transcoder descriptors
 * defined in this file (UTF-16BE/LE and UTF-32BE/LE, to and from UTF-8) to
 * rb_register_transcoder.
 */
void
Init_utf_16_32(void)
{
rb_register_transcoder(&rb_from_UTF_16BE);
rb_register_transcoder(&rb_to_UTF_16BE);
rb_register_transcoder(&rb_from_UTF_16LE);
rb_register_transcoder(&rb_to_UTF_16LE);
rb_register_transcoder(&rb_from_UTF_32BE);
rb_register_transcoder(&rb_to_UTF_32BE);
rb_register_transcoder(&rb_from_UTF_32LE);
rb_register_transcoder(&rb_to_UTF_32LE);
}

6
tool/build-transcode Executable file
View file

@ -0,0 +1,6 @@
#!/bin/sh
# Regenerate each enc/trans/*.c transcoder source from its .erb.c template.
# All paths are relative, so run this from the top of the source tree.
for name in single_byte utf_16_32 japanese korean
do
  ruby tool/transcode-tblgen.rb -vo "enc/trans/$name.c" "enc/trans/$name.erb.c"
done

565
tool/transcode-tblgen.rb Normal file
View file

@ -0,0 +1,565 @@
require 'optparse'
require 'erb'
# Escape table for C string literals, keyed by one-byte strings.
# Backslash, double quote and newline get their symbolic escapes; every
# other control byte and every byte from 0x7f up gets a three-digit octal
# escape.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}
(0x00..0x1f).each do |code|
  key = [code].pack("C")
  # keep the symbolic escape already registered for "\n"
  C_ESC[key] = format("\\%03o", code) unless C_ESC[key]
end
(0x7f..0xff).each do |code|
  C_ESC[[code].pack("C")] = format("\\%03o", code)
end
# Matches any single byte that has an entry in C_ESC.
C_ESC_PAT = Regexp.union(*C_ESC.keys)
# Return str as a double-quoted C string literal, replacing every byte that
# has an entry in C_ESC with its escape sequence.
def c_esc(str)
  escaped = str.gsub(C_ESC_PAT) {|ch| C_ESC[ch] }
  '"' + escaped + '"'
end
# A set of byte strings described by a compact pattern.
#
# Internally @pat is an array of sequences; each sequence is an array of
# byte sets; each byte set is an array of Integer ranges.  A string belongs
# to the set when it matches one of the sequences byte for byte.
class StrSet
  # Parse a pattern string into a StrSet.
  #
  # pattern - whitespace-separated sequences.  Each sequence is a run of
  #           two-digit hex bytes ("e0") and brace sets ("{80-bf,c2}").
  #           Hex digits are accepted in either case everywhere.
  #
  # Raises RuntimeError ("invalid range: ..." / "invalid sequence: ...")
  # when a piece of the pattern cannot be parsed.
  def self.parse(pattern)
    result = []
    pattern.scan(/\S+/) {|seq|
      seq_result = []
      while !seq.empty?
        if /\A([0-9a-f][0-9a-f])/i =~ seq
          byte = $1.to_i(16)
          seq_result << [byte..byte]
          seq = $'
        elsif /\A\{([^\}]+)\}/ =~ seq
          set = $1
          seq = $'
          set_result = []
          set.scan(/[^,]+/) {|range|
            # Case-insensitive like the bare-byte rule above; without /i
            # "{A0-FF}" was rejected although "A0" on its own was accepted.
            if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/i =~ range
              b = $1.to_i(16)
              e = $2.to_i(16)
              set_result << (b..e)
            elsif /\A([0-9a-f][0-9a-f])\z/i =~ range
              byte = $1.to_i(16)
              set_result << (byte..byte)
            else
              raise "invalid range: #{range.inspect}"
            end
          }
          seq_result << set_result
        else
          raise "invalid sequence: #{seq.inspect}"
        end
      end
      result << seq_result
    }
    self.new(result)
  end

  # pat - the nested array structure described on the class.
  def initialize(pat)
    @pat = pat
  end

  # hash and eql? are defined together so StrSets work as Hash keys.
  def hash
    @pat.hash
  end

  def eql?(other)
    self.class == other.class &&
    @pat == other.instance_eval { @pat }
  end

  alias == eql?

  # Render the set back into pattern syntax (lower-case hex).  The empty set
  # is shown as "(empset)" and the set holding only the empty string as
  # "(empstr)".
  def to_s
    if @pat.empty?
      "(empset)"
    elsif @pat == [[]]
      "(empstr)"
    else
      @pat.map {|seq|
        seq.map {|byteset|
          if byteset.length == 1 && byteset[0].begin == byteset[0].end
            "%02x" % byteset[0].begin
          else
            "{" +
            byteset.map {|range|
              if range.begin == range.end
                "%02x" % range.begin
              else
                "%02x-%02x" % [range.begin, range.end]
              end
            }.join(',') +
            "}"
          end
        }.join('')
      }.join(' ')
    end
  end

  def inspect
    "\#<#{self.class}: #{self.to_s}>"
  end

  # True when the empty string is a member (some sequence has no bytes).
  def emptyable?
    @pat.any? {|seq|
      seq.empty?
    }
  end

  # Sorted array of every byte value that can start a member string.
  def first_bytes
    result = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte|
          result[byte] = true
        }
      }
    }
    result.keys.sort
  end

  # Yield each possible first byte, in ascending order, together with the
  # StrSet of what may follow that byte.
  def each_firstbyte
    h = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte|
          (h[byte] ||= []) << seq[1..-1]
        }
      }
    }
    h.keys.sort.each {|byte|
      yield byte, StrSet.new(h[byte])
    }
  end
end
class ActionMap
def self.parse(hash)
h = {}
hash.each {|pat, action|
h[StrSet.parse(pat)] = action
}
self.new(h)
end
def initialize(h)
@map = h
@default_action = :undef
end
attr_accessor :default_action
def hash
hash = 0
@map.each {|k,v|
hash ^= k.hash ^ v.hash
}
hash
end
def eql?(other)
self.class == other.class &&
@map.eql?(other.instance_eval { @map })
end
alias == eql?
def inspect
"\#<#{self.class}:" +
@map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
">"
end
def empty_action
@map.each {|ss, action|
return action if ss.emptyable?
}
nil
end
def each_firstbyte
h = {}
@map.each {|ss, action|
if ss.emptyable?
raise "emptyable pattern"
else
ss.each_firstbyte {|byte, rest|
h[byte] ||= {}
if h[byte][rest]
raise "ambiguous"
else
h[byte][rest] = action
end
}
end
}
h.keys.sort.each {|byte|
am = ActionMap.new(h[byte])
am.default_action = @default_action
yield byte, am
}
end
OffsetsMemo = {}
InfosMemo = {}
def format_offsets(offsets)
code = "{\n"
0.step(offsets.length-1,16) {|i|
code << " "
code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
code << " "
code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
code << "\n"
}
code << '}'
code
end
def generate_info(info)
case info
when :nomap
"NOMAP"
when :undef
"UNDEF"
when :invalid
"INVALID"
when :func_so
"FUNso"
when /\A([0-9a-f][0-9a-f])\z/i
"o1(0x#$1)"
when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
"o2(0x#$1,0x#$2)"
when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
"o3(0x#$1,0x#$2,0x#$3)"
when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
"o4(0x#$1,0x#$2,0x#$3,0x#$4)"
else
info.to_s
end
end
def format_infos(infos)
infos = infos.map {|info| generate_info(info) }
maxlen = infos.map {|info| info.length }.max
columns = maxlen <= 16 ? 4 : 2
code = "{\n"
0.step(infos.length-1, columns) {|i|
code << " "
is = infos[i,columns]
is.each {|info|
code << sprintf(" %#{maxlen}s,", info)
}
code << "\n"
}
code << "}"
code
end
# Emits the C source for one BYTE_LOOKUP node.
#
# name::  C identifier of the node.
# table:: Array indexed by byte position; each entry is an action, or a
#         false/nil value for "no action".
#
# Distinct actions are collected into an infos array and every byte is
# given the index of its action in an offsets array. Bytes without an
# action all share one extra trailing infos slot holding @default_action.
# Offsets and infos arrays identical to ones already emitted (tracked in
# OffsetsMemo / InfosMemo) are referenced by name instead of emitted again.
#
# Returns the generated C code as a String.
def generate_lookup_node(name, table)
  offsets = []
  infos = []
  slot_of = {}
  unmapped = []
  table.each_with_index {|action, byte|
    if action
      slot = slot_of[action]
      if slot.nil?
        slot = slot_of[action] = infos.length
        infos << action
      end
      offsets[byte] = slot
    else
      unmapped << byte
    end
  }
  unless unmapped.empty?
    default_slot = infos.length
    unmapped.each {|byte| offsets[byte] = default_slot }
    infos << @default_action
  end

  offsets_name = OffsetsMemo[offsets]
  if offsets_name
    offsets_code = ''
  else
    offsets_name = "#{name}_offsets"
    offsets_code = <<"End"
static const unsigned char
#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)};
End
    OffsetsMemo[offsets] = offsets_name
  end

  infos_name = InfosMemo[infos]
  if infos_name
    infos_code = ''
  else
    infos_name = "#{name}_infos"
    infos_code = <<"End"
static const struct byte_lookup* const
#{infos_name}[#{infos.length}] = #{format_infos(infos)};
End
    InfosMemo[infos] = infos_name
  end

  node_code = <<"End"
static const BYTE_LOOKUP
#{name} = {
#{offsets_name},
#{infos_name}
};
End
  offsets_code + infos_code + node_code
end
# Memo tables and name counter used by generate_node:
#   PreMemo  maps an ActionMap (the receiver) to the node name already
#            generated for it.
#   PostMemo maps a finished lookup table to the node name already
#            generated for it.
#   NextName is the suffix used for the next auto-generated "fun_*" name;
#            generate_node advances it in place with String#succ!.
PreMemo = {}
PostMemo = {}
NextName = "a"
# Generates the lookup node for this map, and recursively for the maps of
# the following bytes, appending the generated C code to +code+.
#
# code::      String buffer that receives the generated C source.
# name_hint:: name to give the node; when nil a "fun_<NextName>" name is
#             allocated. Child nodes get "<name_hint>_<BYTE in hex>".
# ranges::    list of byte ranges, one per depth; the first applies to this
#             node and the rest are passed down. Empty means [0x00..0xff].
#
# Returns the name of the node representing this map. This may be the name
# of a previously generated node, found either by receiver (PreMemo) or by
# identical table contents (PostMemo); in that case nothing is appended.
# Raises "byte not in range" if a first byte falls outside the range.
def generate_node(code, name_hint=nil, ranges=[])
  ranges = [0x00..0xff] if ranges.empty?
  range = ranges.first
  known = PreMemo[self]
  return known if known

  table = Array.new(range.end - range.begin + 1)
  each_firstbyte {|byte, rest|
    raise "byte not in range" unless range === byte
    entry = rest.empty_action
    unless entry
      child_hint = name_hint ? "#{name_hint}_#{'%02X' % byte}" : nil
      entry = "&" + rest.generate_node(code, child_hint, ranges[1..-1])
    end
    table[byte-range.begin] = entry
  }

  known = PostMemo[table]
  return known if known

  unless name_hint
    name_hint = "fun_" + NextName.dup
    NextName.succ!
  end
  PreMemo[self] = PostMemo[table] = name_hint
  code << generate_lookup_node(name_hint, table)
  name_hint
end
end
# Normalizes a mapping so that every Integer key or value is replaced by
# the uppercase hex string of that code point's UTF-8 byte sequence.
#
# map:: Hash or Array of [key, value] pairs.
# Returns an Array of [key, value] pairs; entries that are not Integers
# are passed through unchanged.
def encode_utf8(map)
  # integer means UTF-8 encoded sequence.
  to_hex = lambda {|item|
    Integer === item ? [item].pack("U").unpack("H*")[0].upcase : item
  }
  map.map {|k, v| [to_hex.call(k), to_hex.call(v)] }
end
# Compiles a mapping into lookup-node C code.
#
# name:: name hint for the root node.
# from:: source encoding name; "UTF-8" restricts the second to fourth
#        bytes to the range 0x80..0xbf.
# map::  mapping accepted by encode_utf8; a later entry with the same key
#        replaces an earlier one.
#
# Returns [defined_name, code]: the root node's name as reported by
# ActionMap#generate_node, and the generated C source.
def transcode_compile_tree(name, from, map)
  h = {}
  encode_utf8(map).each {|pair| h[pair[0]] = pair[1] }
  am = ActionMap.parse(h)
  ranges = []
  ranges = [0x00..0xff] + [0x80..0xbf] * 3 if from == "UTF-8"
  code = ''
  defined_name = am.generate_node(code, name, ranges)
  [defined_name, code]
end
# Names of the transcoder structs emitted by transcode_tblgen, in call
# order; transcode_register_code turns this list into registration calls.
TRANSCODERS = []
# Generates the C source for one transcoder: its lookup tree followed by
# the rb_transcoder struct that refers to it.
#
# from:: source encoding name.
# to::   destination encoding name.
# map::  mapping accepted by encode_utf8.
#
# The tree is named "to_<to>" when converting from UTF-8, "from_<from>"
# when converting to UTF-8 and "from_<from>_to_<to>" otherwise, with every
# non-alphanumeric character of the encoding names replaced by "_". The
# struct is named "rb_<tree name>" and that name is appended to TRANSCODERS.
#
# Returns the generated C code as a String.
def transcode_tblgen(from, to, map)
  id_from, id_to = [from, to].map {|enc| enc.tr('^0-9A-Za-z', '_') }
  tree_name =
    if from == "UTF-8"
      "to_#{id_to}"
    elsif to == "UTF-8"
      "from_#{id_from}"
    else
      "from_#{id_from}_to_#{id_to}"
    end

  map = encode_utf8(map)
  real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)

  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name

  from_utf8 = (from == 'UTF-8') ? 1 : 0
  # String outputs are hex digits, two per byte; anything else counts as 1.
  output_sizes = map.map {|_key, value| String === value ? value.length/2 : 1 }
  max_output = output_sizes.max

  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
#{c_esc(from)}, #{c_esc(to)}, &#{real_tree_name}, #{max_output}, #{from_utf8},
NULL, NULL,
};
End
  tree_code + "\n" + transcoder_code
end
# Returns C statements, one per line, that call rb_register_transcoder
# for every name recorded in TRANSCODERS (in recorded order). Returns an
# empty string when nothing has been recorded.
def transcode_register_code
  TRANSCODERS.inject('') {|code, transcoder_name|
    code << " rb_register_transcoder(&#{transcoder_name});\n"
  }
end
# Byte-sequence patterns for each encoding, keyed by encoding name (plus
# the generic "singlebyte"/"doublebyte"/"quadruplebyte" shapes).
#
# Each value lists one pattern per line. A pattern is a sequence of byte
# positions: two hex digits denote a literal byte, and {a-b,c-d} denotes
# a set of inclusive hex ranges for that position.
Universe = {
"singlebyte" => "{00-ff}",
"doublebyte" => "{00-ff}{00-ff}",
"quadruplebyte" => "{00-ff}{00-ff}{00-ff}{00-ff}",
"US-ASCII" => "{00-7f}",
"EUC-JP" => <<-End,
{00-7f}
{a1-fe}{a1-fe}
8e{a1-fe}
8f{a1-fe}{a1-fe}
End
"EUC-KR" => <<-End,
{00-7f}
{a1-fe}{a1-fe}
End
"EUC-TW" => <<-End,
{00-7f}
{a1-fe}{a1-fe}
8e{a1-b0}{a1-fe}{a1-fe}
End
"Shift_JIS" => <<-End,
{00-7f}
{81-9f,e0-fc}{40-7e,80-fc}
{a1-df}
End
"Big5" => <<-End,
{00-7f}
{a1-fe}{40-7e,a1-fe}
End
"GBK" => <<-End,
{00-80}
{81-fe}{40-7e,80-fe}
End
"CP949" => <<-End,
{00-80}
{81-fe}{41-5a,61-7a,81-fe}
End
"UTF-8" => <<-End,
{00-7f}
{c2-df}{80-bf}
e0{a0-bf}{80-bf}
{e1-ec}{80-bf}{80-bf}
ed{80-9f}{80-bf}
{ee-ef}{80-bf}{80-bf}
f0{90-bf}{80-bf}{80-bf}
{f1-f3}{80-bf}{80-bf}{80-bf}
f4{80-8f}{80-bf}{80-bf}
End
# In GB18030 four-byte sequences the second and fourth bytes are the
# ASCII digits 0x30-0x39, which keeps them disjoint from the two-byte
# trail range 40-7e,80-fe.
"GB18030" => <<-End,
{00-7f}
{81-fe}{40-7e,80-fe}
{81-fe}{30-39}{81-fe}{30-39}
End
"UTF-16BE" => <<-End,
{00-d7,e0-ff}{00-ff}
{d8-db}{00-ff}{dc-df}{00-ff}
End
"UTF-16LE" => <<-End,
{00-ff}{00-d7,e0-ff}
{00-ff}{d8-db}{00-ff}{dc-df}
End
"UTF-32BE" => <<-End,
0000{00-d7,e0-ff}{00-ff}
00{01-10}{00-ff}{00-ff}
End
"UTF-32LE" => <<-End,
{00-ff}{00-d7,e0-ff}0000
{00-ff}{00-ff}{01-10}00
End
}
# Builds the signature text that identifies one source file by its quoted
# name (String#dump), its length and its String#sum checksum.
def make_signature(filename, src)
  "src=%s, len=%d, checksum=%d" % [filename.dump, src.length, src.sum]
end
# ---- Script entry point -------------------------------------------------
# Evaluates an ERB template (first non-option argument) and writes the
# result, prefixed with signature comments, to --output or to stdout.

# Option state, filled in by the OptionParser callbacks below.
output_filename = nil
verbose_mode = false
force_mode = false
op = OptionParser.new
op.def_option("--help", "show help message") { puts op; exit 0 }
op.def_option("--verbose", "verbose mode") { verbose_mode = true }
op.def_option("--force", "force table generation") { force_mode = true }
op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
op.parse!
# The first remaining argument is the template; its directory is put at the
# front of the load path unless it is already listed there.
arg = ARGV.shift
dir = File.dirname(arg)
$:.unshift dir unless $:.include? dir
src = File.read(arg)
src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
# Signature header for the template itself.
base_signature = "/* autogenerated. */\n"
base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
# Up-to-date check (skipped with --force): read the leading paragraph of the
# existing output, recompute a signature for the template and for every
# other source file named there that is still readable in dir, and compare.
if !force_mode && output_filename && File.readable?(output_filename)
old_signature = File.open(output_filename) {|f| f.gets("").chomp }
chk_signature = base_signature.dup
old_signature.each_line {|line|
if %r{/\* src="([0-9a-z_.-]+)",} =~ line
name = $1
# The template's own signature is already in base_signature.
next if name == File.basename(arg)
path = File.join(dir, name)
if File.readable? path
chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
end
end
}
if old_signature == chk_signature
# Nothing changed: only refresh the output's timestamps and stop.
now = Time.now
File.utime(now, now, output_filename)
STDERR.puts "#{output_filename} is already up-to-date." if verbose_mode
exit
end
end
if verbose_mode
if output_filename
STDERR.print "generate #{output_filename} ..."
end
end
# Snapshot $" (loaded features) before and after evaluating the template;
# the difference is the set of files loaded during evaluation.
libs1 = $".dup
erb_result = ERB.new(src, nil, '%').result(binding)
libs2 = $".dup
libs = libs2 - libs1
# Add a signature line for each newly loaded file that is readable in dir.
lib_sigs = ''
libs.each {|lib|
lib = File.basename(lib)
path = File.join(dir, lib)
if File.readable? path
lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
end
}
# Final output: signatures, a blank line, then the rendered template.
result = ''
result << base_signature
result << lib_sigs
result << "\n"
result << erb_result
result << "\n"
if output_filename
# Write to "<output>.new" first and rename it over the target afterwards.
new_filename = output_filename + ".new"
File.open(new_filename, "w") {|f| f << result }
File.rename(new_filename, output_filename)
STDERR.puts " done." if verbose_mode
else
print result
end