* tool/enc-unicode.rb,

enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add Age property to regexp. [ruby-core:33019] patched by Ammar Ali, tested by Run Paint Run Run git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29717 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2022-11-09 12:17:21 -05:00 · 2010-11-08 05:32:45 +00:00 · 2010-11-08 05:32:45 +00:00 · a0265b0662
commit a0265b0662
parent 294070d86a
8 changed files with 22339 additions and 2687 deletions
--- a/ChangeLog
+++ b/ChangeLog
@ -1,3 +1,11 @@
+Mon Nov  8 13:41:33 2010  NARUSE, Yui  <naruse@ruby-lang.org>
+
+	* tool/enc-unicode.rb,
+	  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
+	  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
+	  Add Age property to regexp. [ruby-core:33019]
+	  patched by Ammar Ali, tested by Run Paint Run Run
+
 Mon Nov  8 12:16:39 2010  Ben Walton  <bwalton@artsci.utoronto.ca>

 	* configure.in: support -h for solaris linker when gcc not used
--- a/NEWS
+++ b/NEWS
@ -83,7 +83,12 @@ with all sufficient information, see the ChangeLog file.

 === Language changes

-* Regexps now support Unicode 6.0 (new characters and scripts)
+* Regexps now support Unicode 6.0 (new characters and scripts).
+
+* [experimental] Regexps now support the Age property.
+  Unlike Perl, the current implementation follows the
+  interpretation of UTS #18.
+  http://www.unicode.org/reports/tr18/

 === Compatibility issues (excluding feature bug fixes)

--- a/enc/unicode/name2ctype.h
+++ b/enc/unicode/name2ctype.h
--- a/enc/unicode/name2ctype.h.blt
+++ b/enc/unicode/name2ctype.h.blt
--- a/enc/unicode/name2ctype.kwd
+++ b/enc/unicode/name2ctype.kwd
--- a/enc/unicode/name2ctype.src
+++ b/enc/unicode/name2ctype.src
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@ -798,6 +798,25 @@ class TestRegexp < Test::Unit::TestCase
    assert_nothing_raised { 0x7fffffff.chr("utf-8").size }
  end

+  def test_unicode_age
+    assert_match(/^\p{Age=6.0}$/u, "\u261c")
+    assert_match(/^\p{Age=1.1}$/u, "\u261c")
+    assert_no_match(/^\P{age=6.0}$/u, "\u261c")
+
+    assert_match(/^\p{age=6.0}$/u, "\u31f6")
+    assert_match(/^\p{age=3.2}$/u, "\u31f6")
+    assert_no_match(/^\p{age=3.1}$/u, "\u31f6")
+    assert_no_match(/^\p{age=3.0}$/u, "\u31f6")
+    assert_no_match(/^\p{age=1.1}$/u, "\u31f6")
+
+    assert_match(/^\p{age=6.0}$/u, "\u2754")
+    assert_no_match(/^\p{age=5.0}$/u, "\u2754")
+    assert_no_match(/^\p{age=4.0}$/u, "\u2754")
+    assert_no_match(/^\p{age=3.0}$/u, "\u2754")
+    assert_no_match(/^\p{age=2.0}$/u, "\u2754")
+    assert_no_match(/^\p{age=1.1}$/u, "\u2754")
+  end
+
  def test_matchdata
    a = "haystack".match(/hay/)
    b = "haystack".match(/hay/)
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@ -4,8 +4,8 @@
 # property names and POSIX character classes
 #
 # To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
-# PropertyAliases.txt, PropertyValueAliases.txt, and
-# DerivedCoreProperties.txt from unicode.org.
+# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
+# and DerivedAge.txt from unicode.org.
 # (http://unicode.org/Public/UNIDATA/) And run following command.
 # ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
 # You can get source file for gperf.  After this, simply make ruby.
@ -174,6 +174,32 @@ def parse_aliases(data)
  kv
 end

+# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
+# never involves any additions to the character repertoire." Versions
+# in DerivedAge.txt should always be /\d+\.\d+/
+def parse_age(data)
+  current = nil
+  last_constname = nil
+  cps = []
+  ages = []
+  IO.foreach(get_file('DerivedAge.txt')) do |line|
+    if /^# Total code points: / =~ line
+      constname = constantize_agename(current)
+      # each version also matches the code points of all previous versions
+      cps.concat(data[last_constname]) if last_constname
+      data[constname] = cps
+      make_const(constname, cps, "Derived Age #{current}")
+      ages << current
+      last_constname = constname
+      cps = []
+    elsif /^(\h+)(?:..(\h+))?\s*;\s*(\d+\.\d+)/ =~ line
+      current = $3
+      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+    end
+  end
+  ages
+end
+
 $const_cache = {}
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
@ -202,6 +228,10 @@ def normalize_propname(name)
  name
 end

+def constantize_agename(name)
+  "Age_#{name.sub(/\./, '_')}"
+end
+
 def get_file(name)
  File.join(ARGV[0], name)
 end
@ -224,6 +254,7 @@ end
 props.concat parse_scripts(data)
 puts '#endif /* USE_UNICODE_PROPERTIES */'
 aliases = parse_aliases(data)
+ages = parse_age(data)
 define_posix_props(data)
 POSIX_NAMES.each do |name|
  make_const(name, data[name], "[[:#{name}:]]")
@ -235,6 +266,7 @@ __HEREDOC
 POSIX_NAMES.each{|name|puts"  CR_#{name},"}
 puts "#ifdef USE_UNICODE_PROPERTIES"
 props.each{|name|puts"  CR_#{name},"}
+ages.each{|name| puts"  CR_#{constantize_agename(name)},"}

 puts(<<'__HEREDOC')
 #endif /* USE_UNICODE_PROPERTIES */
@ -268,6 +300,12 @@ aliases.each_pair do |k, v|
  next unless v = name_to_index[v]
  puts "%-40s %3d" % [k + ',', v]
 end
+ages.each do |name|
+  i += 1
+  name = "age=#{name}"
+  name_to_index[name] = i
+  puts "%-40s %3d" % [name + ',', i]
+end
 puts(<<'__HEREDOC')
 #endif /* USE_UNICODE_PROPERTIES */
 %%