From 7599b3f6c66036d235ff850d4e30a7ae10693fe7 Mon Sep 17 00:00:00 2001
From: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Sat, 24 Nov 2018 12:10:25 +0000
Subject: [PATCH] add tests using Unicode test data for grapheme clusters

Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster
and \X extended grapheme cluster matcher in regular expressions against test data
provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt).

Some lines in the data file are ignored, as follows:
- Lines with a surrogate, because Ruby doesn't handle these
- The case of "\r\n", because there is a bug (#15337) in the implementation

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@65955 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 test/ruby/enc/test_grapheme_breaks.rb | 94 +++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 test/ruby/enc/test_grapheme_breaks.rb

diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb
new file mode 100644
index 0000000000..5a4a8326c9
--- /dev/null
+++ b/test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+# One data line of GraphemeBreakTest.txt, decoded into its expected clusters.
+class BreakTest
+  attr_reader :clusters, :string, :comment, :line_number
+
+  # data holds hex code points; "÷" separates clusters, "×" joins code points within one.
+  # Raises ArgumentError for the lines excluded by the two checks below.
+  def initialize(line_number, data, comment)
+    @line_number = line_number
+    @comment = comment
+    trimmed = data.sub(/\A\s*÷\s*/, '').sub(/\s*÷\s*\z/, '')
+    @clusters = trimmed.split(/\s*÷\s*/).map do |cluster|
+      cluster.split(/\s*×\s*/).map do |hex|
+        codepoint = hex.to_i(16)
+        # eliminate cases with surrogates
+        raise ArgumentError if codepoint.between?(0xD800, 0xDFFF)
+        codepoint.chr('UTF-8')
+      end.join
+    end
+    @string = @clusters.join
+    # remove the following line once we have fixed bug #15337
+    raise ArgumentError if @string == "\r\n"
+  end
+end
+
+# Locates Unicode's GraphemeBreakTest.txt and reports a skip when it is missing.
+class TestGraphemeBreaksFromFile < Test::Unit::TestCase
+  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+  base = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+  UNICODE_DATA_PATH = ["#{base}/ucd/auxiliary"].find { |dir| File.directory?(dir) } || base
+  GRAPHEME_BREAK_TEST_FILE = File.expand_path('GraphemeBreakTest.txt', UNICODE_DATA_PATH)
+
+  def self.file_available?
+    File.exist?(GRAPHEME_BREAK_TEST_FILE)
+  end
+
+  def test_data_files_available
+    return if TestGraphemeBreaksFromFile.file_available?
+    skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
+  end
+end
+
+TestGraphemeBreaksFromFile.file_available? and class TestGraphemeBreaksFromFile
+  # Parses the data file into BreakTest objects; comment lines and lines BreakTest rejects are left out.
+  def read_data
+    tests = []
+    File.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8).with_index(1) do |line, number|
+      raise "File Version Mismatch" if number == 1 && !line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
+      next if line.start_with?('#')
+      tests << BreakTest.new(number, *line.chomp.split('#', 2)) # limit 2: keep a '#' inside the comment
+    rescue ArgumentError, RangeError # the line was rejected as untestable or malformed: skip it
+    end
+    tests
+  end
+
+  # Memoizes the parsed data for all test methods; a missing file (Errno::ENOENT) yields no tests.
+  def all_tests
+    @@tests ||= read_data
+  rescue Errno::ENOENT
+    @@tests ||= []
+  end
+
+  # String#each_grapheme_cluster must yield exactly the clusters listed in the data.
+  def test_each_grapheme_cluster
+    all_tests.each do |test|
+      expected = test.clusters
+      actual = test.string.each_grapheme_cluster.to_a
+      assert_equal expected, actual,
+        "line #{test.line_number}, expected '#{expected}', but got '#{actual}', comment: #{test.comment}"
+    end
+  end
+
+  # Repeatedly stripping /\A\X/ must remove one expected cluster at a time until nothing is left.
+  def test_backslash_X
+    all_tests.each do |test|
+      clusters, string = test.clusters.dup, test.string.dup
+      removals = 0
+      while string.sub!(/\A\X/, '')
+        removals += 1
+        clusters.shift
+        expected = clusters.join
+        assert_equal expected, string,
+          "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " +
+          "but got '#{string}', comment: #{test.comment}"
+      end
+      assert_equal '', string,
+        "line #{test.line_number}, after last removal, expected '', but got '#{string}', comment: #{test.comment}"
+    end
+  end
+end