# ruby--ruby/test/ruby/enc/test_regex_casefold.rb

# Copyright Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

require "test/unit"

# Data-driven tests for case folding.
#
# Entries are read from the Unicode data file CaseFolding.txt; only entries of
# kind 'C' are used. For each listed encoding, three test methods are generated
# that check case-insensitive regexp matching on the transcoded characters, and
# test_downcase_fold checks String#downcase with the :fold flag.
class TestCaseFold < Test::Unit::TestCase

  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
  path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
  # Use the "ucd" subdirectory when it exists, otherwise the version directory itself.
  UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path
  # One parsed data line: source character, folded target string, entry kind,
  # and the 1-based line number in CaseFolding.txt.
  CaseTest = Struct.new :source, :target, :kind, :line

  # Asserts that +start+ downcases to +expected+ under +flags+ (both with
  # downcase and downcase!), and that +expected+ itself is already stable:
  # downcase leaves it unchanged and downcase! reports "no change" with nil.
  #
  # Neither argument is modified; the destructive calls run on copies because
  # the strings passed in are shared test data.
  def check_downcase_properties(expected, start, *flags)
    assert_equal expected, start.downcase(*flags)
    assert_equal expected, start.dup.downcase!(*flags)
    assert_equal expected, expected.downcase(*flags)
    # dup: if downcase! unexpectedly changed the string, the shared fixture
    # would otherwise be altered for all later tests.
    assert_nil   expected.dup.downcase!(*flags)
  end

  # Parses CaseFolding.txt from UNICODE_DATA_PATH and returns an Array of
  # CaseTest entries whose kind is 'C'.
  #
  # Comment lines (starting with '#') and empty lines are ignored; trailing
  # '#' comments are stripped. Raises Errno::ENOENT if the file is missing.
  def read_tests
    File.readlines("#{UNICODE_DATA_PATH}/CaseFolding.txt", encoding: Encoding::ASCII_8BIT)
        .map.with_index(1) { |raw, number| [number, raw.chomp] }
        .reject { |_number, text| text =~ /^(#|$)/ }
        .map do |number, text|
          fields, _comment = text.split(/#\s*/)
          code, kind, folded, _rest = fields.split(/;\s*/)
          CaseTest.new code.to_i(16).chr('UTF-8'),
                       folded.split(/ /).map { |hex| hex.to_i(16) }.pack('U*'),
                       kind, number
        end
        .select { |entry| entry.kind == 'C' }
  end

  # Returns the codepoints of +string+ as uppercase hex strings, zero-padded
  # to at least four digits (used in failure messages).
  def to_codepoints(string)
    string.codepoints.map { |cp| cp.to_s(16).upcase.rjust(4, '0') }
  end

  # Loads the test data once and caches it in @@tests. If the data file is
  # missing, the cache is set to an empty list and the current test is skipped
  # with the error message.
  def setup
    @@tests ||= read_tests
  rescue Errno::ENOENT => e
    @@tests ||= []
    skip e.message
  end

  # Yields each cached entry together with its source and target transcoded
  # to +encoding+ (as |test, source, target|). Entries for which transcoding,
  # or anything inside the block, raises Encoding::UndefinedConversionError
  # are silently skipped: the character does not exist in that encoding.
  def each_encoded_test(encoding)
    @@tests.each do |test|
      begin
        yield test, test.source.encode(encoding), test.target.encode(encoding)
      rescue Encoding::UndefinedConversionError
        # Not representable in this encoding; nothing to check.
      end
    end
  end
  private :each_encoded_test

  # Defines three test methods for +encoding+ (an encoding name):
  # test_mbc_case_fold_<encoding>, test_get_case_fold_codes_by_str_<encoding>
  # and test_apply_all_case_fold_<encoding>.
  def self.generate_test_casefold(encoding)
    # The folded target must be found by a case-insensitive regexp built from the source.
    define_method "test_mbc_case_fold_#{encoding}" do
      each_encoded_test(encoding) do |test, source, target|
        assert_equal 5, "12345#{target}67890" =~ /#{source}/i,
            "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " +
            "(CaseFolding.txt line #{test[:line]})"
      end
    end

    # The source must be found by a case-insensitive regexp built from the folded target.
    define_method "test_get_case_fold_codes_by_str_#{encoding}" do
      each_encoded_test(encoding) do |test, source, target|
        assert_equal 5, "12345#{source}67890" =~ /#{target}/i,
            "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " +
            "(CaseFolding.txt line #{test[:line]}), " +
            "error may also be triggered by mbc_case_fold"
      end
    end

    # The folded target must match \p{Upper} when case is ignored.
    define_method "test_apply_all_case_fold_#{encoding}" do
      reg = '\p{Upper}'
      # The pattern is identical for every entry, so compile it only once,
      # and only when the first representable entry is reached.
      regexpi = nil
      # The source is not used here, but it is still transcoded by the helper
      # so that entries with an unrepresentable source keep being skipped.
      each_encoded_test(encoding) do |test, _source, target|
        regexpi ||= Regexp.compile(reg.encode(encoding), Regexp::IGNORECASE)
        assert_equal 5, "12345#{target}67890" =~ regexpi,
            "12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " +
            "(CaseFolding.txt line #{test[:line]})"
      end
    end
  end

  # Every source must fold to its target with String#downcase(:fold).
  def test_downcase_fold
    @@tests.each do |test|
      check_downcase_properties test.target, test.source, :fold
    end
  end

  # start with good encodings only
  # NOTE(review): the commented-out encodings are presumably known to fail — confirm before enabling.
  generate_test_casefold 'US-ASCII'
  generate_test_casefold 'ISO-8859-1'
  generate_test_casefold 'ISO-8859-2'
  generate_test_casefold 'ISO-8859-3'
  generate_test_casefold 'ISO-8859-4'
  generate_test_casefold 'ISO-8859-5'
  generate_test_casefold 'ISO-8859-6'
  # generate_test_casefold 'ISO-8859-7'
  generate_test_casefold 'ISO-8859-8'
  generate_test_casefold 'ISO-8859-9'
  generate_test_casefold 'ISO-8859-10'
  generate_test_casefold 'ISO-8859-11'
  generate_test_casefold 'ISO-8859-13'
  generate_test_casefold 'ISO-8859-14'
  generate_test_casefold 'ISO-8859-15'
  generate_test_casefold 'ISO-8859-16'
  generate_test_casefold 'Windows-1250'
  # generate_test_casefold 'Windows-1251'
  generate_test_casefold 'Windows-1252'
  generate_test_casefold 'koi8-r'
  generate_test_casefold 'koi8-u'
end
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`# Copyright Kimihito Matsui (松井仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)`

			`require "test/unit"`

			`class TestCaseFold < Test::Unit::TestCase`

* test/ruby/enc/test_case_comprehensive.rb, test_regex_casefold.rb, test/test_unicode_normalize.rb: Replace UNICODE_VERSION from UnicodeNormalize with RbConfig::CONFIG['UNICODE_VERSION'] from feature 12460. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@55567 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-07-03 05:51:46 -04:00			`UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']`
fix unicode data directory * test/ruby/enc/test_regex_casefold.rb: fix searching unicode data directory, like as test_case_comprehensive.rb. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61417 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2017-12-22 19:30:33 -05:00			`path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)`
			`UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`CaseTest = Struct.new :source, :target, :kind, :line`

* test/ruby/enc/test_regex_casefold.rb: Added data-based testing for String#downcase :fold. * enc/unicode.c: Fixed a range error (lowest non-ASCII character affected by case operations is U+00B5, MICRO SIGN) * test/ruby/enc/test_case_mapping.rb: Explicit test for case folding of MICRO SIGN to Greek mu. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53749 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 01:18:38 -05:00			`def check_downcase_properties(expected, start, *flags)`
			`assert_equal expected, start.downcase(*flags)`
support multi-run for test/ruby/enc/test_regex_casefold.rb should not mutate test data. 2020-01-28 23:07:56 -05:00			`temp = start.dup`
* test/ruby/enc/test_regex_casefold.rb: Added data-based testing for String#downcase :fold. * enc/unicode.c: Fixed a range error (lowest non-ASCII character affected by case operations is U+00B5, MICRO SIGN) * test/ruby/enc/test_case_mapping.rb: Explicit test for case folding of MICRO SIGN to Greek mu. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53749 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 01:18:38 -05:00			`assert_equal expected, temp.downcase!(*flags)`
			`assert_equal expected, expected.downcase(*flags)`
			`temp = expected`
			`assert_nil temp.downcase!(*flags)`
			`end`

* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`def read_tests`
fix unicode data directory * test/ruby/enc/test_regex_casefold.rb: fix searching unicode data directory, like as test_case_comprehensive.rb. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61417 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2017-12-22 19:30:33 -05:00			`IO.readlines("#{UNICODE_DATA_PATH}/CaseFolding.txt", encoding: Encoding::ASCII_8BIT)`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`.collect.with_index { \|linedata, linenumber\| [linenumber.to_i+1, linedata.chomp] }`
			`.reject { \|number, data\| data =~ /^(#\|$)/ }`
			`.collect do \|linenumber, linedata\|`
test/ruby: suppress parser warnings git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53872 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-19 02:48:02 -05:00			`data, _ = linedata.split(/#\s*/)`
			`code, kind, result, _ = data.split(/;\s*/)`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`CaseTest.new code.to_i(16).chr('UTF-8'),`
			`result.split(/ /).collect { \|hex\| hex.to_i(16) }.pack('U*'),`
			`kind, linenumber`
			`end.select { \|test\| test.kind=='C' }`
			`end`

			`def to_codepoints(string)`
			`string.codepoints.collect { \|cp\| cp.to_s(16).upcase.rjust(4, '0') }`
			`end`

			`def setup`
			`@@tests \|\|= read_tests`
			`rescue Errno::ENOENT => e`
			`@@tests \|\|= []`
test_regex_casefold.rb: skip if no data file * test/ruby/enc/test_regex_casefold.rb (setup): skip with error message if CaseFolding.txt does not present, instead of printing the message, which causes unknown command in parallel test. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56017 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-08-26 12:08:40 -04:00			`skip e.message`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`end`

			`def self.generate_test_casefold(encoding)`
			`define_method "test_mbc_case_fold_#{encoding}" do`
			`@@tests.each do \|test\|`
			`begin`
			`source = test.source.encode encoding`
			`target = test.target.encode encoding`
			`assert_equal 5, "12345#{target}67890" =~ /#{source}/i,`
			`"12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " +`
			`"(CaseFolding.txt line #{test[:line]})"`
			`rescue Encoding::UndefinedConversionError`
			`end`
			`end`
			`end`

			`define_method "test_get_case_fold_codes_by_str_#{encoding}" do`
			`@@tests.each do \|test\|`
			`begin`
			`source = test.source.encode encoding`
			`target = test.target.encode encoding`
			`assert_equal 5, "12345#{source}67890" =~ /#{target}/i,`
			`"12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " +`
			`"(CaseFolding.txt line #{test[:line]}), " +`
			`"error may also be triggered by mbc_case_fold"`
			`rescue Encoding::UndefinedConversionError`
			`end`
			`end`
			`end`

			`define_method "test_apply_all_case_fold_#{encoding}" do`
			`@@tests.each do \|test\|`
			`begin`
			`source = test.source.encode encoding`
			`target = test.target.encode encoding`
			`reg = '\p{Upper}'`
			`regexp = Regexp.compile reg.encode(encoding)`
			`regexpi = Regexp.compile reg.encode(encoding), Regexp::IGNORECASE`
			`assert_equal 5, "12345#{target}67890" =~ regexpi,`
			`"12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " +`
			`"(CaseFolding.txt line #{test[:line]})"`
			`rescue Encoding::UndefinedConversionError`
test/ruby: suppress parser warnings git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53872 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-19 02:48:02 -05:00			`source = source`
			`regexp = regexp`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`end`
			`end`
			`end`
* test/ruby/enc/test_regex_casefold.rb: Added data-based testing for String#downcase :fold. * enc/unicode.c: Fixed a range error (lowest non-ASCII character affected by case operations is U+00B5, MICRO SIGN) * test/ruby/enc/test_case_mapping.rb: Explicit test for case folding of MICRO SIGN to Greek mu. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53749 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 01:18:38 -05:00			`end`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00
* test/ruby/enc/test_regex_casefold.rb: Added data-based testing for String#downcase :fold. * enc/unicode.c: Fixed a range error (lowest non-ASCII character affected by case operations is U+00B5, MICRO SIGN) * test/ruby/enc/test_case_mapping.rb: Explicit test for case folding of MICRO SIGN to Greek mu. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53749 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 01:18:38 -05:00			`def test_downcase_fold`
			`@@tests.each do \|test\|`
			`check_downcase_properties test.target, test.source, :fold`
			`end`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`end`

			`# start with good encodings only`
			`generate_test_casefold 'US-ASCII'`
			`generate_test_casefold 'ISO-8859-1'`
			`generate_test_casefold 'ISO-8859-2'`
			`generate_test_casefold 'ISO-8859-3'`
			`generate_test_casefold 'ISO-8859-4'`
			`generate_test_casefold 'ISO-8859-5'`
			`generate_test_casefold 'ISO-8859-6'`
			`# generate_test_casefold 'ISO-8859-7'`
			`generate_test_casefold 'ISO-8859-8'`
			`generate_test_casefold 'ISO-8859-9'`
			`generate_test_casefold 'ISO-8859-10'`
			`generate_test_casefold 'ISO-8859-11'`
			`generate_test_casefold 'ISO-8859-13'`
			`generate_test_casefold 'ISO-8859-14'`
			`generate_test_casefold 'ISO-8859-15'`
			`generate_test_casefold 'ISO-8859-16'`
			`generate_test_casefold 'Windows-1250'`
* test/ruby/enc/test_regex_casefold.rb: Add Windows-1251, KOI8-R, and KOI8-U to encodings; definitely removed EUC-JP. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@55485 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-06-21 06:44:57 -04:00			`# generate_test_casefold 'Windows-1251'`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`generate_test_casefold 'Windows-1252'`
* test/ruby/enc/test_regex_casefold.rb: Add Windows-1251, KOI8-R, and KOI8-U to encodings; definitely removed EUC-JP. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@55485 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-06-21 06:44:57 -04:00			`generate_test_casefold 'koi8-r'`
			`generate_test_casefold 'koi8-u'`
* test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e 2016-02-06 00:51:33 -05:00			`end`