1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Add string encoding IBM720 with alias CP720 (#3803)

The mapping table is generated from the ICU project:
  https://github.com/unicode-org/icu/blob/master/icu4c/source/data/mappings/ibm-720_P100-1997.ucm

Fixes bug #16233: https://bugs.ruby-lang.org/issues/16233
This commit is contained in:
Lars Kanis 2020-11-22 14:23:40 +01:00 committed by GitHub
parent 2d112c346a
commit d403591b34
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
Notes: git 2020-11-22 22:24:04 +09:00
Merged-By: nurse <naruse@airemix.jp>
6 changed files with 154 additions and 1 deletions

View file

@ -61,6 +61,8 @@ OnigEncodingDefine(ascii, ASCII) = {
ENC_ALIAS("BINARY", "ASCII-8BIT") ENC_ALIAS("BINARY", "ASCII-8BIT")
ENC_REPLICATE("IBM437", "ASCII-8BIT") ENC_REPLICATE("IBM437", "ASCII-8BIT")
ENC_ALIAS("CP437", "IBM437") ENC_ALIAS("CP437", "IBM437")
ENC_REPLICATE("IBM720", "ASCII-8BIT")
ENC_ALIAS("CP720", "IBM720")
ENC_REPLICATE("IBM737", "ASCII-8BIT") ENC_REPLICATE("IBM737", "ASCII-8BIT")
ENC_ALIAS("CP737", "IBM737") ENC_ALIAS("CP737", "IBM737")
ENC_REPLICATE("IBM775", "ASCII-8BIT") ENC_REPLICATE("IBM775", "ASCII-8BIT")

122
enc/trans/ibm720-tbl.rb Normal file
View file

@ -0,0 +1,122 @@
# Mapping table from IBM720 (alias CP720) bytes to Unicode code points.
#
# Each entry is a two-element array: the IBM720 byte written as a two-digit
# uppercase hex string, followed by the Unicode code point as an Integer.
# Entries are listed in ascending code point order, not in byte order.
#
# Only bytes 0x82 and above appear here, and several bytes in that range
# (for example 0x84, 0x86, 0x8D-0x90) have no entry at all.
# NOTE(review): the 0x00-0x7F range is presumably supplied by the table
# generator that consumes this constant -- confirm against the caller.
#
# Per the originating commit, the data was generated from the ICU mapping
# file ibm-720_P100-1997.ucm.
IBM720_TO_UCS_TBL = [
# U+00A0..U+00FB: Latin-1 Supplement (symbols and accented Latin letters)
["FF",0xA0],
["9C",0xA3],
["94",0xA4],
["AE",0xAB],
["F8",0xB0],
["FD",0xB2],
["E6",0xB5],
["FA",0xB7],
["AF",0xBB],
["85",0xE0],
["83",0xE2],
["87",0xE7],
["8A",0xE8],
["82",0xE9],
["88",0xEA],
["89",0xEB],
["8C",0xEE],
["8B",0xEF],
["93",0xF4],
["97",0xF9],
["96",0xFB],
# U+0621..U+0652: Arabic block
["98",0x621],
["99",0x622],
["9A",0x623],
["9B",0x624],
["9D",0x625],
["9E",0x626],
["9F",0x627],
["A0",0x628],
["A1",0x629],
["A2",0x62A],
["A3",0x62B],
["A4",0x62C],
["A5",0x62D],
["A6",0x62E],
["A7",0x62F],
["A8",0x630],
["A9",0x631],
["AA",0x632],
["AB",0x633],
["AC",0x634],
["AD",0x635],
["E0",0x636],
["E1",0x637],
["E2",0x638],
["E3",0x639],
["E4",0x63A],
["95",0x640],
["E5",0x641],
["E7",0x642],
["E8",0x643],
["E9",0x644],
["EA",0x645],
["EB",0x646],
["EC",0x647],
["ED",0x648],
["EE",0x649],
["EF",0x64A],
["F1",0x64B],
["F2",0x64C],
["F3",0x64D],
["F4",0x64E],
["F5",0x64F],
["F6",0x650],
["91",0x651],
["92",0x652],
# U+207F and U+2219..U+2261: superscript n and mathematical operators
["FC",0x207F],
["F9",0x2219],
["FB",0x221A],
["F7",0x2248],
["F0",0x2261],
# U+2500..U+256C: Box Drawing block
["C4",0x2500],
["B3",0x2502],
["DA",0x250C],
["BF",0x2510],
["C0",0x2514],
["D9",0x2518],
["C3",0x251C],
["B4",0x2524],
["C2",0x252C],
["C1",0x2534],
["C5",0x253C],
["CD",0x2550],
["BA",0x2551],
["D5",0x2552],
["D6",0x2553],
["C9",0x2554],
["B8",0x2555],
["B7",0x2556],
["BB",0x2557],
["D4",0x2558],
["D3",0x2559],
["C8",0x255A],
["BE",0x255B],
["BD",0x255C],
["BC",0x255D],
["C6",0x255E],
["C7",0x255F],
["CC",0x2560],
["B5",0x2561],
["B6",0x2562],
["B9",0x2563],
["D1",0x2564],
["D2",0x2565],
["CB",0x2566],
["CF",0x2567],
["D0",0x2568],
["CA",0x2569],
["D8",0x256A],
["D7",0x256B],
["CE",0x256C],
# U+2580..U+2593: Block Elements; U+25A0: Geometric Shapes
["DF",0x2580],
["DC",0x2584],
["DB",0x2588],
["DD",0x258C],
["DE",0x2590],
["B0",0x2591],
["B1",0x2592],
["B2",0x2593],
["FE",0x25A0],
]

View file

@ -51,8 +51,9 @@
transcode_tblgen_singlebyte "WINDOWS-1256" transcode_tblgen_singlebyte "WINDOWS-1256"
transcode_tblgen_singlebyte "WINDOWS-1257" transcode_tblgen_singlebyte "WINDOWS-1257"
transcode_tblgen_singlebyte "IBM437" transcode_tblgen_singlebyte "IBM437"
transcode_tblgen_singlebyte "IBM775" transcode_tblgen_singlebyte "IBM720"
transcode_tblgen_singlebyte "IBM737" transcode_tblgen_singlebyte "IBM737"
transcode_tblgen_singlebyte "IBM775"
transcode_tblgen_singlebyte "IBM852" transcode_tblgen_singlebyte "IBM852"
transcode_tblgen_singlebyte "IBM855" transcode_tblgen_singlebyte "IBM855"
transcode_tblgen_singlebyte "IBM857" transcode_tblgen_singlebyte "IBM857"

View file

@ -507,6 +507,7 @@ static UINT ole_encoding2cp(rb_encoding *enc)
ENC_MACHING_CP(enc, "GB2312", 20936); ENC_MACHING_CP(enc, "GB2312", 20936);
ENC_MACHING_CP(enc, "GBK", 936); ENC_MACHING_CP(enc, "GBK", 936);
ENC_MACHING_CP(enc, "IBM437", 437); ENC_MACHING_CP(enc, "IBM437", 437);
ENC_MACHING_CP(enc, "IBM720", 720);
ENC_MACHING_CP(enc, "IBM737", 737); ENC_MACHING_CP(enc, "IBM737", 737);
ENC_MACHING_CP(enc, "IBM775", 775); ENC_MACHING_CP(enc, "IBM775", 775);
ENC_MACHING_CP(enc, "IBM852", 852); ENC_MACHING_CP(enc, "IBM852", 852);

View file

@ -100,6 +100,14 @@ describe "String#valid_encoding?" do
str.force_encoding('UTF8-MAC').valid_encoding?.should be_true str.force_encoding('UTF8-MAC').valid_encoding?.should be_true
end end
ruby_version_is '3.0' do
  it "returns true for IBM720 encoding self is valid in" do
    str = "\u{6754}"
    # Re-tag the same string under the encoding name and then its CP720 alias;
    # both tags are expected to report the bytes as valid.
    ['IBM720', 'CP720'].each do |enc_name|
      str.force_encoding(enc_name).valid_encoding?.should be_true
    end
  end
end
it "returns false if self is valid in one encoding, but invalid in the one it's tagged with" do it "returns false if self is valid in one encoding, but invalid in the one it's tagged with" do
str = "\u{8765}" str = "\u{8765}"
str.valid_encoding?.should be_true str.valid_encoding?.should be_true

View file

@ -469,6 +469,25 @@ class TestTranscode < Test::Unit::TestCase
check_both_ways("\u00A0", "\xFF", 'IBM437') # non-breaking space check_both_ways("\u00A0", "\xFF", 'IBM437') # non-breaking space
end end
def test_IBM720
  # Decoding these bytes from IBM720 is expected to raise, i.e. they have no
  # Unicode counterpart in the mapping.
  ["\x80", "\x8F", "\x90"].each do |unassigned|
    assert_raise(Encoding::UndefinedConversionError) { unassigned.encode("utf-8", 'IBM720') }
  end

  # [Unicode text, IBM720 byte] pairs taken at the x0 and xF positions of the
  # rows from 0x9F upward; each pair is handed to check_both_ways.
  [
    ["\u0627", "\x9F"], # ا
    ["\u0628", "\xA0"], # ب
    ["\u00BB", "\xAF"], # »
    ["\u2591", "\xB0"], # ░
    ["\u2510", "\xBF"], # ┐
    ["\u2514", "\xC0"], # └
    ["\u2567", "\xCF"], # ╧
    ["\u2568", "\xD0"], # ╨
    ["\u2580", "\xDF"], # ▀
    ["\u0636", "\xE0"], # ض
    ["\u064A", "\xEF"], # ي
    ["\u2261", "\xF0"], # ≡
    ["\u00A0", "\xFF"], # non-breaking space
  ].each do |unicode, ibm_byte|
    check_both_ways(unicode, ibm_byte, 'IBM720')
  end
end
def test_IBM775 def test_IBM775
check_both_ways("\u0106", "\x80", 'IBM775') # Ć check_both_ways("\u0106", "\x80", 'IBM775') # Ć
check_both_ways("\u00C5", "\x8F", 'IBM775') # Å check_both_ways("\u00C5", "\x8F", 'IBM775') # Å