From 3ca6d1fb4a3acc7a6dfff9ee39ee7f75fa71d0f4 Mon Sep 17 00:00:00 2001 From: Adam Roben Date: Wed, 13 Nov 2013 17:16:55 -0500 Subject: [PATCH] Support extended grapheme clusters and UAX 29 http://www.unicode.org/reports/tr29/tr29-21.html is the version of UAX 29 that corresponds to Unicode 6.2.0. Unicode.unpack_graphemes now implements all the rules listed there, including the ones for extended grapheme clusters. I added a new optional test, test/multibyte_grapheme_break_conformance.rb, that is heavily based on test/multibyte_normalization_conformance.rb, which runs the Unicode test suite. --- .../lib/active_support/multibyte/unicode.rb | 15 ++++ .../multibyte_grapheme_break_conformance.rb | 76 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 activesupport/test/multibyte_grapheme_break_conformance.rb diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index aea7709b55..d85ea3b5e6 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -94,6 +94,12 @@ module ActiveSupport # GB3. CR X LF if previous == database.boundary[:cr] and current == database.boundary[:lf] false + # GB4. (Control|CR|LF) ÷ + elsif previous and in_char_class?(previous, [:control,:cr,:lf]) + true + # GB5. ÷ (Control|CR|LF) + elsif in_char_class?(current, [:control,:cr,:lf]) + true # GB6. L X (L|V|LV|LVT) elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) false @@ -103,9 +109,18 @@ module ActiveSupport # GB8. (LVT|T) X (T) elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current false + # GB8a. Regional_Indicator X Regional_Indicator + elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current + false # GB9. X Extend elsif database.boundary[:extend] === current false + # GB9a. X SpacingMark + elsif database.boundary[:spacingmark] === current + false + # GB9b. Prepend X + elsif database.boundary[:prepend] === previous + false # GB10. Any ÷ Any else true diff --git a/activesupport/test/multibyte_grapheme_break_conformance.rb b/activesupport/test/multibyte_grapheme_break_conformance.rb new file mode 100644 index 0000000000..7d185e2cae --- /dev/null +++ b/activesupport/test/multibyte_grapheme_break_conformance.rb @@ -0,0 +1,76 @@ +# encoding: utf-8 + +require 'abstract_unit' + +require 'fileutils' +require 'open-uri' +require 'tmpdir' + +class Downloader + def self.download(from, to) + unless File.exist?(to) + $stderr.puts "Downloading #{from} to #{to}" + unless File.exist?(File.dirname(to)) + system "mkdir -p #{File.dirname(to)}" + end + open(from) do |source| + File.open(to, 'w') do |target| + source.each_line do |l| + target.write l + end + end + end + end + end +end + +class MultibyteGraphemeBreakConformanceTest < ActiveSupport::TestCase + TEST_DATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd/auxiliary" + TEST_DATA_FILE = '/GraphemeBreakTest.txt' + CACHE_DIR = File.join(Dir.tmpdir, 'cache') + + def setup + FileUtils.mkdir_p(CACHE_DIR) + Downloader.download(TEST_DATA_URL + TEST_DATA_FILE, CACHE_DIR + TEST_DATA_FILE) + end + + def test_breaks + each_line_of_break_tests do |*cols| + *clusters, comment = *cols + packed = ActiveSupport::Multibyte::Unicode.pack_graphemes(clusters) + assert_equal clusters, ActiveSupport::Multibyte::Unicode.unpack_graphemes(packed), comment + end + end + + protected + def each_line_of_break_tests(&block) + lines = 0 + max_test_lines = 0 # Don't limit below 21, because that's the header of the testfile + File.open(File.join(CACHE_DIR, TEST_DATA_FILE), 'r') do | f | + until f.eof? || (max_test_lines > 21 and lines > max_test_lines) + lines += 1 + line = f.gets.chomp! + next if (line.empty? || line =~ /^\#/) + + cols, comment = line.split("#") + # Cluster breaks are represented by ÷ + clusters = cols.split("÷").map{|e| e.strip}.reject{|e| e.empty? } + clusters = clusters.map do |cluster| + # Codepoints within each cluster are separated by × + codepoints = cluster.split("×").map{|e| e.strip}.reject{|e| e.empty? } + # codepoints are in hex in the test suite, pack wants them as integers + codepoints.map{|codepoint| codepoint.to_i(16)} + end + + # The tests contain a solitary U+D800 character, which Ruby does not allow to stand + # alone in a UTF-8 string. So we'll just skip it. + next if clusters.flatten.include?(0xd800) + + clusters << comment.strip + + yield(*clusters) + end + end + end +end