# -*- encoding: ASCII-8BIT -*- # make sure this runs in binary mode
# some of the comments are in UTF-8
require 'test/unit'
# Tests for String#encode / String#encode!: argument handling, round trips
# between UTF-8 and a range of legacy encodings, UTF-16 / UTF-32 conversion,
# and the `invalid: :ignore` option.
#
# Raw byte strings are written with \x escapes and the matching UTF-8 text
# with \u escapes; the trailing comments show the text in its native script.
class TestTranscode < Test::Unit::TestCase
  # Encodes 'abc' from UTF-8 into every encoding listed below.
  # The return value is not used by anything in this class.
  def setup_really_needed? # trick to create all the necessary encodings
    all_encodings = [
      'ISO-8859-1',  'ISO-8859-2',
      'ISO-8859-3',  'ISO-8859-4',
      'ISO-8859-5',  'ISO-8859-6',
      'ISO-8859-7',  'ISO-8859-8',
      'ISO-8859-9',  'ISO-8859-10',
      'ISO-8859-11', 'ISO-8859-13',
      'ISO-8859-14', 'ISO-8859-15',
      'UTF-16BE'
    ]
    all_encodings.each do |enc|
      'abc'.encode(enc, 'UTF-8')
    end
  end

  # Missing arguments and unknown encoding names are expected to raise
  # ArgumentError; bytes with no mapping in the source encoding are
  # expected to raise RuntimeError.
  def test_errors
    assert_raise(ArgumentError) { 'abc'.encode }
    assert_raise(ArgumentError) { 'abc'.encode! }
    assert_raise(ArgumentError) { 'abc'.encode('foo', 'bar') }
    assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') }
    assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') }
    assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') }
    assert_raise(RuntimeError) { "\x80".encode('utf-8', 'ASCII-8BIT') }
    assert_raise(RuntimeError) { "\x80".encode('utf-8', 'US-ASCII') }
    assert_raise(RuntimeError) { "\xA5".encode('utf-8', 'iso-8859-3') }
  end

  # Exercises the accepted argument forms of encode / encode!.
  def test_arguments
    assert_equal('abc', 'abc'.force_encoding('utf-8').encode('iso-8859-1'))
    # check that encoding is kept when no conversion is done
    assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode('Shift_JIS'))
    assert_equal('abc'.force_encoding('Shift_JIS'), 'abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'))
    # assert that encoding is correctly set
    assert_equal("D\u00FCrst".encoding, "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding)
    # check that Encoding can be used as parameter
    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')))
    assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'))
    assert_equal("D\u00FCrst", "D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')))
  end

  # Converts strings of 20, 2000 and 200000 one-byte characters, each of
  # which becomes a multi-byte UTF-8 character, so the result is longer
  # than the input.
  def test_length
    assert_equal("\u20AC" * 20, ("\xA4" * 20).encode('utf-8', 'iso-8859-15'))
    assert_equal("\u20AC" * 20, ("\xA4" * 20).encode!('utf-8', 'iso-8859-15'))
    assert_equal("\u20AC" * 2000, ("\xA4" * 2000).encode('utf-8', 'iso-8859-15'))
    assert_equal("\u20AC" * 2000, ("\xA4" * 2000).encode!('utf-8', 'iso-8859-15'))
    assert_equal("\u20AC" * 200000, ("\xA4" * 200000).encode('utf-8', 'iso-8859-15'))
    assert_equal("\u20AC" * 200000, ("\xA4" * 200000).encode!('utf-8', 'iso-8859-15'))
  end

  # Asserts that +raw+ (bytes in +encoding+) converts to +utf8+ and that
  # +utf8+ converts back to +raw+.
  #
  # Note: force_encoding relabels +utf8+ and +raw+ in place, so callers
  # that want to reuse a string afterwards must pass a copy.
  def check_both_ways(utf8, raw, encoding)
    assert_equal(utf8.force_encoding('utf-8'), raw.encode('utf-8', encoding))
    assert_equal(raw.force_encoding(encoding), utf8.encode(encoding, 'utf-8'))
  end

  # Round-trips sample text through Japanese, Western/Central European,
  # Cyrillic, Arabic and Korean legacy encodings.
  def test_encodings
    check_both_ways("\u307E\u3064\u3082\u3068\u3086\u304D\u3072\u308D",
                    "\x82\xdc\x82\xc2\x82\xe0\x82\xc6\x82\xe4\x82\xab\x82\xd0\x82\xeb", 'shift_jis') # まつもと ゆきひろ
    check_both_ways("\u307E\u3064\u3082\u3068\u3086\u304D\u3072\u308D",
                    "\xa4\xde\xa4\xc4\xa4\xe2\xa4\xc8\xa4\xe6\xa4\xad\xa4\xd2\xa4\xed", 'euc-jp')
    check_both_ways("\u677E\u672C\u884C\u5F18", "\x8f\xbc\x96\x7b\x8d\x73\x8d\x4f", 'shift_jis') # 松本行弘
    check_both_ways("\u677E\u672C\u884C\u5F18", "\xbe\xbe\xcb\xdc\xb9\xd4\xb9\xb0", 'euc-jp')
    # Dürst: byte 0xFC is expected to map to U+00FC in each of these encodings.
    # Fresh literals are built per iteration because check_both_ways
    # relabels its arguments in place.
    %w[
      iso-8859-1 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-9
      iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15
    ].each do |enc|
      check_both_ways("D\u00FCrst", "D\xFCrst", enc)
    end
    check_both_ways("r\u00E9sum\u00E9", "r\xE9sum\xE9", 'iso-8859-1') # résumé
    check_both_ways("\u0065\u006C\u0151\u00ED\u0072\u00E1\u0073", "el\xF5\xEDr\xE1s", 'iso-8859-2') # előírás
    check_both_ways("\u043F\u0435\u0440\u0435\u0432\u043E\u0434",
                    "\xDF\xD5\xE0\xD5\xD2\xDE\xD4", 'iso-8859-5') # перевод
    check_both_ways("\u0643\u062A\u0628", "\xE3\xCA\xC8", 'iso-8859-6') # كتب
    check_both_ways("\u65E5\u8A18", "\x93\xFA\x8BL", 'shift_jis') # 日記
    check_both_ways("\u65E5\u8A18", "\xC6\xFC\xB5\xAD", 'euc-jp')
    check_both_ways("\uC560\uC778\uAD6C\uD568\u0020\u6734\uC9C0\uC778",
                    "\xBE\xD6\xC0\xCE\xB1\xB8\xC7\xD4\x20\xDA\xD3\xC1\xF6\xC0\xCE", 'euc-kr') # 애인구함 朴지인
    check_both_ways("\uC544\uD58F\uD58F\u0020\uB620\uBC29\uD6BD\uB2D8\u0020\uC0AC\uB791\uD716",
                    "\xBE\xC6\xC1\x64\xC1\x64\x20\x8C\x63\xB9\xE6\xC4\x4F\xB4\xD4\x20\xBB\xE7\xB6\xFB\xC5\x42", 'cp949') # 아햏햏 똠방횽님 사랑휖
  end

  # Converts between two non-UTF-8 encodings (ISO-8859-1 to ISO-8859-2).
  def test_twostep
    assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1'))
  end

  # Bytes 0..127 must survive a round trip through UTF-8 unchanged for
  # every listed encoding.
  def test_ascii_range
    encodings = [
      'US-ASCII', 'ASCII-8BIT',
      'ISO-8859-1',  'ISO-8859-2',  'ISO-8859-3',
      'ISO-8859-4',  'ISO-8859-5',  'ISO-8859-6',
      'ISO-8859-7',  'ISO-8859-8',  'ISO-8859-9',
      'ISO-8859-10', 'ISO-8859-11', 'ISO-8859-13',
      'ISO-8859-14', 'ISO-8859-15',
      'EUC-JP', 'SHIFT_JIS', 'EUC-KR'
    ]
    all_ascii = (0..127).to_a.pack('C*')
    encodings.each do |enc|
      # Relabel the result as binary so it is compared with the input byte for byte.
      round_tripped = all_ascii.encode('UTF-8', enc).encode(enc).force_encoding('ASCII-8BIT')
      assert_equal(all_ascii, round_tripped)
    end
  end

  # Bytes 0..255 must survive a round trip through UTF-8 unchanged for
  # the ISO-8859 variants that are not commented out below.
  def test_all_bytes
    encodings_8859 = [
      'ISO-8859-1', 'ISO-8859-2',
      #'ISO-8859-3', # not all bytes used
      'ISO-8859-4', 'ISO-8859-5',
      #'ISO-8859-6', # not all bytes used
      #'ISO-8859-7', # not all bytes used
      #'ISO-8859-8', # not all bytes used
      'ISO-8859-9', 'ISO-8859-10',
      #'ISO-8859-11', # not all bytes used
      #'ISO-8859-12', # not available
      'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15',
      #'ISO-8859-16', # not available
    ]
    all_bytes = (0..255).to_a.pack('C*')
    encodings_8859.each do |enc|
      # Relabel the result as binary so it is compared with the input byte for byte.
      round_tripped = all_bytes.encode('UTF-8', enc).encode(enc).force_encoding('ASCII-8BIT')
      assert_equal(all_bytes, round_tripped)
    end
  end

  # Checks +utf8+ against +raw+ as UTF-16BE, and against a copy of +raw+
  # with the two bytes of every 16-bit unit swapped as UTF-16LE.
  def check_utf_16_both_ways(utf8, raw)
    # Copy before check_both_ways relabels +raw+ in place.
    copy = raw.dup
    0.step(copy.length - 1, 2) { |i| copy[i + 1], copy[i] = copy[i], copy[i + 1] }
    check_both_ways(utf8, raw, 'utf-16be')
    check_both_ways(utf8, copy, 'utf-16le')
  end

  def test_utf_16
    check_utf_16_both_ways("abc", "\x00a\x00b\x00c")
    check_utf_16_both_ways("\u00E9", "\x00\xE9")
    check_utf_16_both_ways("\u00E9\u0070\u00E9\u0065", "\x00\xE9\x00\x70\x00\xE9\x00\x65") # épée
    check_utf_16_both_ways("\u677E\u672C\u884C\u5F18", "\x67\x7E\x67\x2C\x88\x4C\x5F\x18") # 松本行弘
    check_utf_16_both_ways("\u9752\u5C71\u5B66\u9662\u5927\u5B66", "\x97\x52\x5C\x71\x5B\x66\x96\x62\x59\x27\x5B\x66") # 青山学院大学
    check_utf_16_both_ways("Martin D\u00FCrst", "\x00M\x00a\x00r\x00t\x00i\x00n\x00 \x00D\x00\xFC\x00r\x00s\x00t") # Martin Dürst
    # BMP
    check_utf_16_both_ways("\u0000", "\x00\x00")
    check_utf_16_both_ways("\u007F", "\x00\x7F")
    check_utf_16_both_ways("\u0080", "\x00\x80")
    check_utf_16_both_ways("\u0555", "\x05\x55")
    check_utf_16_both_ways("\u04AA", "\x04\xAA")
    check_utf_16_both_ways("\u0333", "\x03\x33")
    check_utf_16_both_ways("\u04CC", "\x04\xCC")
    check_utf_16_both_ways("\u00F0", "\x00\xF0")
    check_utf_16_both_ways("\u070F", "\x07\x0F")
    check_utf_16_both_ways("\u07FF", "\x07\xFF")
    check_utf_16_both_ways("\u0800", "\x08\x00")
    check_utf_16_both_ways("\uD7FF", "\xD7\xFF")
    check_utf_16_both_ways("\uE000", "\xE0\x00")
    check_utf_16_both_ways("\uFFFF", "\xFF\xFF")
    check_utf_16_both_ways("\u5555", "\x55\x55")
    check_utf_16_both_ways("\uAAAA", "\xAA\xAA")
    check_utf_16_both_ways("\u3333", "\x33\x33")
    check_utf_16_both_ways("\uCCCC", "\xCC\xCC")
    check_utf_16_both_ways("\uF0F0", "\xF0\xF0")
    check_utf_16_both_ways("\u0F0F", "\x0F\x0F")
    check_utf_16_both_ways("\uFF00", "\xFF\x00")
    check_utf_16_both_ways("\u00FF", "\x00\xFF")
    # outer planes
    check_utf_16_both_ways("\u{10000}", "\xD8\x00\xDC\x00")
    check_utf_16_both_ways("\u{FFFFF}", "\xDB\xBF\xDF\xFF")
    check_utf_16_both_ways("\u{100000}", "\xDB\xC0\xDC\x00")
    check_utf_16_both_ways("\u{10FFFF}", "\xDB\xFF\xDF\xFF")
    check_utf_16_both_ways("\u{105555}", "\xDB\xD5\xDD\x55")
    check_utf_16_both_ways("\u{55555}", "\xD9\x15\xDD\x55")
    check_utf_16_both_ways("\u{AAAAA}", "\xDA\x6A\xDE\xAA")
    check_utf_16_both_ways("\u{33333}", "\xD8\x8C\xDF\x33")
    check_utf_16_both_ways("\u{CCCCC}", "\xDA\xF3\xDC\xCC")
    check_utf_16_both_ways("\u{8F0F0}", "\xD9\xFC\xDC\xF0")
    check_utf_16_both_ways("\u{F0F0F}", "\xDB\x83\xDF\x0F")
    check_utf_16_both_ways("\u{8FF00}", "\xD9\xFF\xDF\x00")
    check_utf_16_both_ways("\u{F00FF}", "\xDB\x80\xDC\xFF")
  end

  # Checks +utf8+ against +raw+ as UTF-32BE.
  #
  # A copy with the four bytes of every 32-bit unit reversed is still
  # built, but the UTF-32LE check that would use it is commented out.
  def check_utf_32_both_ways(utf8, raw)
    copy = raw.dup
    0.step(copy.length - 1, 4) do |i|
      copy[i + 3], copy[i + 2], copy[i + 1], copy[i] = copy[i], copy[i + 1], copy[i + 2], copy[i + 3]
    end
    check_both_ways(utf8, raw, 'utf-32be')
    #check_both_ways(utf8, copy, 'utf-32le')
  end

  def test_utf_32
    check_utf_32_both_ways("abc", "\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c")
    check_utf_32_both_ways("\u00E9", "\x00\x00\x00\xE9")
    check_utf_32_both_ways("\u00E9\u0070\u00E9\u0065",
                           "\x00\x00\x00\xE9\x00\x00\x00\x70\x00\x00\x00\xE9\x00\x00\x00\x65") # épée
    check_utf_32_both_ways("\u677E\u672C\u884C\u5F18",
                           "\x00\x00\x67\x7E\x00\x00\x67\x2C\x00\x00\x88\x4C\x00\x00\x5F\x18") # 松本行弘
    check_utf_32_both_ways("\u9752\u5C71\u5B66\u9662\u5927\u5B66",
                           "\x00\x00\x97\x52\x00\x00\x5C\x71\x00\x00\x5B\x66\x00\x00\x96\x62\x00\x00\x59\x27\x00\x00\x5B\x66") # 青山学院大学
    check_utf_32_both_ways("Martin D\u00FCrst",
                           "\x00\x00\x00M\x00\x00\x00a\x00\x00\x00r\x00\x00\x00t\x00\x00\x00i\x00\x00\x00n\x00\x00\x00 \x00\x00\x00D\x00\x00\x00\xFC\x00\x00\x00r\x00\x00\x00s\x00\x00\x00t") # Martin Dürst
    # BMP
    check_utf_32_both_ways("\u0000", "\x00\x00\x00\x00")
    check_utf_32_both_ways("\u007F", "\x00\x00\x00\x7F")
    check_utf_32_both_ways("\u0080", "\x00\x00\x00\x80")
    check_utf_32_both_ways("\u0555", "\x00\x00\x05\x55")
    check_utf_32_both_ways("\u04AA", "\x00\x00\x04\xAA")
    check_utf_32_both_ways("\u0333", "\x00\x00\x03\x33")
    check_utf_32_both_ways("\u04CC", "\x00\x00\x04\xCC")
    check_utf_32_both_ways("\u00F0", "\x00\x00\x00\xF0")
    check_utf_32_both_ways("\u070F", "\x00\x00\x07\x0F")
    check_utf_32_both_ways("\u07FF", "\x00\x00\x07\xFF")
    check_utf_32_both_ways("\u0800", "\x00\x00\x08\x00")
    check_utf_32_both_ways("\uD7FF", "\x00\x00\xD7\xFF")
    check_utf_32_both_ways("\uE000", "\x00\x00\xE0\x00")
    check_utf_32_both_ways("\uFFFF", "\x00\x00\xFF\xFF")
    check_utf_32_both_ways("\u5555", "\x00\x00\x55\x55")
    check_utf_32_both_ways("\uAAAA", "\x00\x00\xAA\xAA")
    check_utf_32_both_ways("\u3333", "\x00\x00\x33\x33")
    check_utf_32_both_ways("\uCCCC", "\x00\x00\xCC\xCC")
    check_utf_32_both_ways("\uF0F0", "\x00\x00\xF0\xF0")
    check_utf_32_both_ways("\u0F0F", "\x00\x00\x0F\x0F")
    check_utf_32_both_ways("\uFF00", "\x00\x00\xFF\x00")
    check_utf_32_both_ways("\u00FF", "\x00\x00\x00\xFF")
    # outer planes
    check_utf_32_both_ways("\u{10000}", "\x00\x01\x00\x00")
    check_utf_32_both_ways("\u{FFFFF}", "\x00\x0F\xFF\xFF")
    check_utf_32_both_ways("\u{100000}", "\x00\x10\x00\x00")
    check_utf_32_both_ways("\u{10FFFF}", "\x00\x10\xFF\xFF")
    check_utf_32_both_ways("\u{105555}", "\x00\x10\x55\x55")
    check_utf_32_both_ways("\u{55555}", "\x00\x05\x55\x55")
    check_utf_32_both_ways("\u{AAAAA}", "\x00\x0A\xAA\xAA")
    check_utf_32_both_ways("\u{33333}", "\x00\x03\x33\x33")
    check_utf_32_both_ways("\u{CCCCC}", "\x00\x0C\xCC\xCC")
    check_utf_32_both_ways("\u{8F0F0}", "\x00\x08\xF0\xF0")
    check_utf_32_both_ways("\u{F0F0F}", "\x00\x0F\x0F\x0F")
    check_utf_32_both_ways("\u{8FF00}", "\x00\x08\xFF\x00")
    check_utf_32_both_ways("\u{F00FF}", "\x00\x0F\x00\xFF")
  end

  # With `invalid: :ignore`, ill-formed input bytes are expected to be
  # dropped from the output instead of raising.
  def test_invalid_ignore
    # arguments only
    assert_nothing_raised { 'abc'.encode('utf-8', invalid: :ignore) }
    # check handling of UTF-8 ill-formed subsequences
    assert_equal("\x00\x41\x00\x3E\x00\x42".force_encoding('UTF-16BE'),
                 "\x41\xC2\x3E\x42".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
    assert_equal("\x00\x41\x00\xF1\x00\x42".force_encoding('UTF-16BE'),
                 "\x41\xC2\xC3\xB1\x42".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
    assert_equal("\x00\x42".force_encoding('UTF-16BE'),
                 "\xF0\x80\x80\x42".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
    assert_equal(''.force_encoding('UTF-16BE'),
                 "\x82\xAB".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
  end
end