# frozen_string_literal: true
# Copyright © 2016 Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
|
|
|
|
|
|
|
require "test/unit"
|
|
|
|
require 'unicode_normalize/normalize' # only for UNICODE_VERSION
|
|
|
|
|
|
|
|
# Describes one case-mapping scenario that the generated tests run.
#
# method_name - Symbol naming the String method that is sent to the source string.
# attributes  - Array of option Symbols passed on to that method.
# first_data  - table, indexed by character, giving the expected mapping of
#               the first character of the source string.
# follow_data - table used for the characters after the first one; when it is
#               not supplied, first_data is used for those as well.
class CaseTest
  attr_reader :method_name, :attributes, :first_data, :follow_data

  def initialize(method_name, attributes, first_data, follow_data=first_data)
    @method_name, @attributes = method_name, attributes
    @first_data, @follow_data = first_data, follow_data
  end
end
|
|
|
|
|
|
|
|
class TestComprehensiveCaseFold < Test::Unit::TestCase
|
|
|
|
# Unicode version as reported by UnicodeNormalize; used below to build the
# data directory path and to check the header line of the data files.
UNICODE_VERSION = UnicodeNormalize::UNICODE_VERSION
|
|
|
|
# Relative path of the directory holding the Unicode data files for
# UNICODE_VERSION; expand_filename resolves it against this file's directory.
UNICODE_DATA_PATH = "../../../enc/unicode/data/#{UNICODE_VERSION}"
|
|
|
|
|
|
|
|
# Converts a space-separated list of hexadecimal codepoints (e.g. "0053 0073")
# into the corresponding string, packing each number as a UTF-8 character.
def self.hex2utf8(s)
  codepoints = s.split(' ').map(&:hex)
  codepoints.pack('U*')
end
|
|
|
|
|
2016-05-28 06:08:37 -04:00
|
|
|
# Returns the absolute path of the data file "<basename>.txt" inside
# UNICODE_DATA_PATH, resolved relative to the directory of this source file.
def self.expand_filename(basename)
  relative_path = "#{UNICODE_DATA_PATH}/#{basename}.txt"
  File.expand_path(relative_path, __dir__)
end
|
|
|
|
|
2016-05-25 20:45:44 -04:00
|
|
|
# Reads the Unicode data file named by +filename+ (without ".txt") line by
# line in binary mode and yields each data record to the caller's block.
#
# The first line of every file except 'UnicodeData' must start with
# "# <filename>-<UNICODE_VERSION>.txt"; otherwise a RuntimeError
# ("File Version Mismatch") is raised.
#
# Lines starting with '#' or '@', blank lines, and lines mentioning
# "Surrogate" are skipped.
#
# Yields +code+ (the character for the hexadecimal codepoint in the first
# field, as a UTF-8 string) and +data+ (the line's fields, with any trailing
# '#' comment removed, split on ';' into at most 15 fields).
def self.read_data_file(filename)
  path = expand_filename(filename)
  IO.foreach(path, encoding: Encoding::ASCII_8BIT) do |line|
    # $. is the number of the line just read; only the first line carries the version header.
    if $. == 1 && filename != 'UnicodeData'
      expected_header = "# #{filename}-#{UNICODE_VERSION}.txt"
      raise "File Version Mismatch" unless line.start_with?(expected_header)
    end
    next if /\A(?:[\#@]|\s*\z)|Surrogate/.match?(line)

    fields = line.chomp.split('#')[0].split(/;\s*/, 15)
    yield fields[0].to_i(16).chr(Encoding::UTF_8), fields
  end
end
|
|
|
|
|
|
|
|
# Builds the expected-result tables for every case-mapping variant from the
# UnicodeData, CaseFolding and SpecialCasing data files.
#
# Side effect: resets @@codepoints and fills it with every character yielded
# for UnicodeData.
#
# Each table is a Hash whose default block supplies the fallback mapping for
# characters without an explicit entry: the character itself for the basic
# tables, the corresponding basic table for the Turkic tables, and an
# ASCII-letter-only lookup for the ASCII tables.
#
# Returns an Array of CaseTest objects, one per method/option combination.
def self.read_data
  @@codepoints = []

  identity = proc { |_table, char| char }
  downcase  = Hash.new(&identity)
  upcase    = Hash.new(&identity)
  titlecase = Hash.new(&identity)
  casefold  = Hash.new(&identity)
  swapcase  = Hash.new(&identity)

  turkic_upcase    = Hash.new { |_table, char| upcase[char] }
  turkic_downcase  = Hash.new { |_table, char| downcase[char] }
  turkic_titlecase = Hash.new { |_table, char| titlecase[char] }
  turkic_swapcase  = Hash.new { |_table, char| swapcase[char] }

  ascii_letter = /\A[a-zA-Z]\z/
  ascii_upcase    = Hash.new { |_table, char| ascii_letter.match?(char) ? upcase[char] : char }
  ascii_downcase  = Hash.new { |_table, char| ascii_letter.match?(char) ? downcase[char] : char }
  ascii_titlecase = Hash.new { |_table, char| ascii_letter.match?(char) ? titlecase[char] : char }
  ascii_swapcase  = Hash.new do |_table, char|
    if /\A[a-z]\z/.match?(char)
      upcase[char]
    elsif /\A[A-Z]\z/.match?(char)
      downcase[char]
    else
      char
    end
  end

  # Simple mappings: fields 12, 13 and 14 are read as the upper-, lower- and titlecase entries.
  read_data_file('UnicodeData') do |code, fields|
    @@codepoints << code
    upper, lower, title = fields[12], fields[13], fields[14]
    upcase[code]    = hex2utf8(upper) unless upper.empty?
    downcase[code]  = hex2utf8(lower) unless lower.empty?
    titlecase[code] = hex2utf8(title) unless title.empty?
  end

  # Only entries whose status field is exactly C or F are used for case folding.
  read_data_file('CaseFolding') do |code, fields|
    status, folded = fields[1], fields[2]
    casefold[code] = hex2utf8(folded) if /^[CF]$/.match?(status)
  end

  # Entries with an empty fifth field override the basic tables; 'tr' entries
  # (other than 'tr After_I') go into the Turkic tables.
  read_data_file('SpecialCasing') do |code, fields|
    lower, title, upper, condition = fields[1], fields[2], fields[3], fields[4]
    case condition
    when ''
      upcase[code]    = hex2utf8(upper)
      downcase[code]  = hex2utf8(lower)
      titlecase[code] = hex2utf8(title)
    when /\Atr\s*/
      unless condition == 'tr After_I'
        turkic_upcase[code]    = hex2utf8(upper)
        turkic_downcase[code]  = hex2utf8(lower)
        turkic_titlecase[code] = hex2utf8(title)
      end
    end
  end

  # Derive the swapcase tables from the upcase/downcase tables.
  @@codepoints.each do |c|
    changes_up   = upcase[c] != c
    changes_down = downcase[c] != c
    if changes_up && changes_down
      # Characters that change under both upcase and downcase.
      swapcase[c] = turkic_swapcase[c] =
        case c
        when "\u01C5" then "\u0064\u017D"
        when "\u01C8" then "\u006C\u004A"
        when "\u01CB" then "\u006E\u004A"
        when "\u01F2" then "\u0064\u005A"
        else # Greek
          downcase[upcase[c][0]] + "\u0399"
        end
    elsif changes_up
      swapcase[c] = upcase[c]
      turkic_swapcase[c] = turkic_upcase[c]
    elsif changes_down
      swapcase[c] = downcase[c]
      turkic_swapcase[c] = turkic_downcase[c]
    end
  end

  [
    CaseTest.new(:downcase,   [], downcase),
    CaseTest.new(:upcase,     [], upcase),
    CaseTest.new(:capitalize, [], titlecase, downcase),
    CaseTest.new(:swapcase,   [], swapcase),
    CaseTest.new(:downcase,   [:fold], casefold),
    CaseTest.new(:upcase,     [:turkic], turkic_upcase),
    CaseTest.new(:downcase,   [:turkic], turkic_downcase),
    CaseTest.new(:capitalize, [:turkic], turkic_titlecase, turkic_downcase),
    CaseTest.new(:swapcase,   [:turkic], turkic_swapcase),
    CaseTest.new(:upcase,     [:ascii], ascii_upcase),
    CaseTest.new(:downcase,   [:ascii], ascii_downcase),
    CaseTest.new(:capitalize, [:ascii], ascii_titlecase, ascii_downcase),
    CaseTest.new(:swapcase,   [:ascii], ascii_swapcase),
  ]
end
|
|
|
|
|
|
|
|
# Returns the list of CaseTest objects, building it with read_data on first
# use and caching it in @@tests. When a data file cannot be found
# (Errno::ENOENT), an empty list is cached and returned instead.
def self.all_tests
  begin
    @@tests ||= read_data
  rescue Errno::ENOENT
    @@tests ||= []
  end
end
|
|
|
|
|
2016-06-21 03:14:03 -04:00
|
|
|
# Defines one test method per CaseTest for the given Unicode +encoding+.
#
# encoding - name of the encoding (String) the source and expected strings
#            are converted to.
#
# Each generated method is named "test_<encoding>_<method>" followed, when the
# CaseTest has attributes, by "_" and the attributes joined with "-". It walks
# every character in @@codepoints, builds a source string of five copies of
# the character, sends the method under test with the CaseTest's attributes,
# and compares the outcome with the expected string: the first_data mapping
# once, followed by the follow_data mapping four times.
#
# Unlike generate_case_mapping_tests, no Encoding::UndefinedConversionError is
# rescued here.
def self.generate_unicode_case_mapping_tests(encoding)
  all_tests.each do |test|
    attributes = test.attributes.map(&:to_s).join '-'
    attributes.prepend '_' unless attributes.empty?
    define_method "test_#{encoding}_#{test.method_name}#{attributes}" do
      @@codepoints.each do |code|
        source = code.encode(encoding) * 5
        target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding)
        result = source.__send__(test.method_name, *test.attributes)
        # The actual result must be checked against the expectation; comparing
        # target with itself could never fail.
        assert_equal target, result,
          proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"}
      end
    end
  end
end
|
|
|
|
|
2016-06-07 02:28:47 -04:00
|
|
|
# Defines one test method per CaseTest for the given +encoding+, restricted to
# the characters that can be converted to that encoding.
#
# Each generated method is named "test_<encoding>_<method>" followed, when the
# CaseTest has attributes, by "_" and the attributes joined with "-". For
# every preselected character it builds a source string of five copies,
# applies the method under test, and compares with the expected string
# (first_data mapping once, follow_data mapping four times). Characters for
# which a conversion raises Encoding::UndefinedConversionError are skipped.
def self.generate_case_mapping_tests(encoding)
  # Called up front so that @@codepoints has been populated before it is filtered.
  all_tests
  # preselect codepoints to speed up testing for small encodings
  encodable = @@codepoints.select do |char|
    begin
      char.encode(encoding) # a String is always truthy, so the character is kept
    rescue Encoding::UndefinedConversionError
      nil
    end
  end
  all_tests.each do |test|
    joined = test.attributes.map(&:to_s).join('-')
    suffix = joined.empty? ? joined : "_#{joined}"
    define_method "test_#{encoding}_#{test.method_name}#{suffix}" do
      encodable.each do |code|
        begin
          source = code.encode(encoding) * 5
          target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding)
          result = source.send(test.method_name, *test.attributes)
          assert_equal target, result,
            proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"}
        rescue Encoding::UndefinedConversionError
          # the expected string is not representable in this encoding; skip the character
        end
      end
    end
  end
end
|
|
|
|
|
2016-06-07 03:13:39 -04:00
|
|
|
# temporary test to avoid regression when switching to primitives
|
|
|
|
# Defines test_<encoding>_upcase, _downcase, _capitalize and _swapcase for the
# given +encoding+. The expected value is computed with String#tr over the
# ASCII letter ranges only, so any character outside a-z/A-Z is expected to
# come back unchanged.
#
# Only characters convertible to +encoding+ are exercised; each one is tested
# as a source string of five copies.
def self.generate_ascii_only_case_mapping_tests(encoding)
  # Called up front so that @@codepoints has been populated before it is filtered.
  all_tests
  # preselect codepoints to speed up testing for small encodings
  encodable = @@codepoints.select do |char|
    begin
      char.encode(encoding) # a String is always truthy, so the character is kept
    rescue Encoding::UndefinedConversionError
      nil
    end
  end

  # String method under test => how to derive the expected result from the source.
  expectations = {
    upcase:     ->(s) { s.tr 'a-z', 'A-Z' },
    downcase:   ->(s) { s.tr 'A-Z', 'a-z' },
    capitalize: ->(s) { s[0].tr('a-z', 'A-Z') + s[1..-1].tr('A-Z', 'a-z') },
    swapcase:   ->(s) { s.tr('a-zA-Z', 'A-Za-z') },
  }

  expectations.each do |operation, expected_for|
    define_method "test_#{encoding}_#{operation}" do
      encodable.each do |code|
        begin
          source = code.encode(encoding) * 5
          target = expected_for.call(source)
          result = source.public_send(operation)
          assert_equal target, result,
            "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"
        rescue Encoding::UndefinedConversionError
          # conversion failed for this character; skip it
        end
      end
    end
  end
end
|
|
|
|
|
2016-05-28 06:08:37 -04:00
|
|
|
# Asserts that the data file for +filename+ (as resolved by
# expand_filename) exists, reporting its full path when it does not.
def check_file_available(filename)
  path = self.class.expand_filename(filename)
  assert File.exist?(path), "File #{path} missing."
end
|
|
|
|
|
|
|
|
# Verifies that all three Unicode data files used by this test are present.
def test_AAAAA_data_files_available   # AAAAA makes sure this test is run first
  %w[UnicodeData CaseFolding SpecialCasing].each do |basename|
    check_file_available(basename)
  end
end
|
|
|
|
|
2016-06-07 03:13:39 -04:00
|
|
|
# Encodings checked with the ASCII-only expectations.
%w[
  ISO-8859-2 ISO-8859-3 ISO-8859-4 ISO-8859-5 ISO-8859-7 ISO-8859-9
  ISO-8859-10 ISO-8859-13 ISO-8859-14 ISO-8859-15 ISO-8859-16
  KOI8-R KOI8-U
  Big5 EUC-JP EUC-KR GB18030 GB2312 GBK Shift_JIS Windows-31J
  Windows-1250 Windows-1251 Windows-1252 Windows-1253 Windows-1254
  Windows-1256 Windows-1257
].each do |encoding|
  generate_ascii_only_case_mapping_tests encoding
end

# Encodings checked against the data-driven tables, limited to the characters
# each encoding can represent.
%w[
  ISO-8859-1 US-ASCII ASCII-8BIT ISO-8859-11 ISO-8859-8 ISO-8859-6
  Windows-1255
].each do |encoding|
  generate_case_mapping_tests encoding
end

# Unicode encodings, checked against the data-driven tables for every character read.
%w[UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE].each do |encoding|
  generate_unicode_case_mapping_tests encoding
end
|
2016-05-25 20:45:44 -04:00
|
|
|
end
|