From 7599b3f6c66036d235ff850d4e30a7ae10693fe7 Mon Sep 17 00:00:00 2001 From: duerst Date: Sat, 24 Nov 2018 12:10:25 +0000 Subject: [PATCH] add tests using Unicode test data for grapheme clusters Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster and \X extended grapheme cluster matcher in regular expressions against test data provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt). Some lines in the data file are ignored, as follows: - Lines with a surrogate, because Ruby doesn't handle these - The case of "\r\n", because there is a bug (#15337) in the implementation git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@65955 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- test/ruby/enc/test_grapheme_breaks.rb | 94 +++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 test/ruby/enc/test_grapheme_breaks.rb diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb new file mode 100644 index 0000000000..5a4a8326c9 --- /dev/null +++ b/test/ruby/enc/test_grapheme_breaks.rb @@ -0,0 +1,94 @@ +# frozen_string_literal: true +# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class BreakTest + attr_reader :clusters, :string, :comment, :line_number + + def initialize (line_number, data, comment) + @line_number = line_number + @comment = comment + @clusters = data.sub(/\A\s*÷\s*/, '') + .sub(/\s*÷\s*\z/, '') + .split(/\s*÷\s*/) + .map do |cl| + cl.split(/\s*×\s*/) + .map do |ch| + c = ch.to_i(16) + # eliminate cases with surrogates + raise ArgumentError if 0xD800 <= c and c <= 0xDFFF + c.chr('UTF-8') + end.join + end + @string = @clusters.join + # remove the following line once we have fixed bug #15337 + raise ArgumentError if @string == "\r\n" + end +end + +class TestGraphemeBreaksFromFile < Test::Unit::TestCase + UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] + path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__) + UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path + GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__) + + def self.file_available? + File.exist? GRAPHEME_BREAK_TEST_FILE + end + + def test_data_files_available + unless TestGraphemeBreaksFromFile.file_available? + skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}." + end + end +end + +TestGraphemeBreaksFromFile.file_available? and class TestGraphemeBreaksFromFile + def read_data + tests = [] + IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line| + if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt") + raise "File Version Mismatch" + end + next if /\A#/.match? line + tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever' + end + tests + end + + def all_tests + @@tests ||= read_data + rescue Errno::ENOENT + @@tests ||= [] + end + + def test_each_grapheme_cluster + all_tests.each do |test| + expected = test.clusters + actual = test.string.each_grapheme_cluster.to_a + assert_equal expected, actual, + "line #{test.line_number}, expected '#{expected}', " + + "but got '#{actual}', comment: #{test.comment}" + end + end + + def test_backslash_X + all_tests.each do |test| + clusters = test.clusters.dup + string = test.string.dup + removals = 0 + while string.sub!(/\A\X/, '') + removals += 1 + clusters.shift + expected = clusters.join + assert_equal expected, string, + "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + assert_equal expected, string, + "line #{test.line_number}, after last removal, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + end +end