Add derived core properties, binary properties, and property aliases.

* tool/enc-unicode.rb, enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add DerivedCoreProperties, PropList (Binary Property), PropertyAlias and PropertyValueAlias. Now users of tool/enc-unicode.rb should specify the directory of UCD files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2022-11-09 12:17:21 -05:00 · 2009-10-13 12:27:00 +00:00 · 2009-10-13 12:27:00 +00:00 · 181eb7d5c1
commit 181eb7d5c1
parent 391e5df571
6 changed files with 36662 additions and 1368 deletions
--- a/10
+++ b/10
@ -1,3 +1,13 @@
+Tue Oct 13 21:05:01 2009  NARUSE, Yui  <naruse@ruby-lang.org>
+
+	* tool/enc-unicode.rb,
+	  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
+	  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
+	  Add DerivedCoreProperties, PropList (Binary Property),
+	  PropertyAlias and PropertyValueAlias.
+	  Now users of tool/enc-unicode.rb should specify
+	  the directory of UCD files.
+
 Tue Oct 13 18:54:25 2009  Hidetoshi NAGAI  <nagai@ai.kyutech.ac.jp>

 	* ext/tk/variable.rb: bug fix. additional trace definition changes the
--- a/enc/unicode/name2ctype.h
+++ b/enc/unicode/name2ctype.h
--- a/enc/unicode/name2ctype.h.blt
+++ b/enc/unicode/name2ctype.h.blt
--- a/enc/unicode/name2ctype.kwd
+++ b/enc/unicode/name2ctype.kwd
--- a/enc/unicode/name2ctype.src
+++ b/enc/unicode/name2ctype.src
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@ -3,15 +3,15 @@
 # Creates the data structures needed by Onigurma to map Unicode codepoints to
 # property names and POSIX character classes
 #
-# To use this, get UnicodeData.txt and Scripts.txt from unicode.org.
+# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt, DerivedCoreProperties.txt, PropertyAliases.txt and PropertyValueAliases.txt from unicode.org
 # (http://unicode.org/Public/UNIDATA/)
 # And run following command.
-#   ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
+#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
 # You can get source file for gperf.
 # After this, simply make ruby.

-unless ARGV.size == 2
-  $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
+unless ARGV.size == 1
+  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
 end

@ -161,23 +161,49 @@ def parse_unicode_data(file)
 end


-def parse_scripts(file)
-  script = nil
+def parse_scripts
+  files = [
+    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
+    {fn: 'Scripts.txt', title: 'Script'},
+    {fn: 'PropList.txt', title: 'Binary Property'}
+  ]
+  current = nil
  data = []
  names = []
-  IO.foreach(file) do |line|
-    if /^# Total code points: / =~ line
-      make_const(script, pair_codepoints(data), 'Script')
-      names << script
-      data = []
-    elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line
-      script = $3
-      $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
+  files.each do |file|
+    IO.foreach(get_file(file[:fn])) do |line|
+      if /^# Total code points: / =~ line
+        make_const(current, pair_codepoints(data), file[:title])
+        names << current
+        data = []
+      elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
+        current = $3
+        $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
+      end
    end
  end
  names
 end

+def parse_aliases
+  kv = {}
+  IO.foreach(get_file('PropertyAliases.txt')) do |line|
+    next unless /^(\w+)\s*; (\w+)/ =~ line
+    kv[normalize_propname($1)] = normalize_propname($2)
+  end
+  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
+    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
+    if $1 == 'gc'
+      kv[normalize_propname($3)] = normalize_propname($2)
+      kv[normalize_propname($4)] = normalize_propname($2) if $4
+    else
+      kv[normalize_propname($2)] = normalize_propname($3)
+      kv[normalize_propname($4)] = normalize_propname($3) if $4
+    end
+  end
+  kv
+end
+
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
@ -195,17 +221,23 @@ end

 def normalize_propname(name)
  name = name.downcase
-  name.gsub!(/[- _]/, '')
+  name.delete!('- _')
  name
 end

+def get_file(name)
+  File.join(ARGV[0], name)
+end
+
+
+# Write Data
 puts '%{'
-gcps, data = parse_unicode_data(ARGV[0])
+props, data = parse_unicode_data(get_file('UnicodeData.txt'))
 POSIX_NAMES.each do |name|
  make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
 end
 print "\n#ifdef USE_UNICODE_PROPERTIES"
-gcps.each do |name|
+props.each do |name|
  category =
    case name.size
    when 1 then 'Major Category'
@ -214,18 +246,19 @@ gcps.each do |name|
    end
  make_const(name, pair_codepoints(data[name]), category)
 end
-scripts = parse_scripts(ARGV[1])
-puts "#endif /* USE_UNICODE_PROPERTIES */"
+props.concat parse_scripts
+puts(<<'__HEREDOC')
+#endif /* USE_UNICODE_PROPERTIES */

-puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
+static const OnigCodePoint* const CodeRanges[] = {
+__HEREDOC
 POSIX_NAMES.each{|name|puts"  CR_#{name},"}
 puts "#ifdef USE_UNICODE_PROPERTIES"
-gcps.each{|name|puts"  CR_#{name},"}
-scripts.each{|name|puts"  CR_#{name},"}
-puts "#endif /* USE_UNICODE_PROPERTIES */"
-puts "};"
+props.each{|name|puts"  CR_#{name},"}

 puts(<<'__HEREDOC')
+#endif /* USE_UNICODE_PROPERTIES */
+};
 struct uniname2ctype_struct {
  int name, ctype;
 };
@ -236,12 +269,27 @@ struct uniname2ctype_struct;
 %%
 __HEREDOC
 i = -1
-POSIX_NAMES.each  {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
+name_to_index = {}
+POSIX_NAMES.each do |name|
+  i += 1
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts"%-40s %3d" % [name + ',', i]
+end
 puts "#ifdef USE_UNICODE_PROPERTIES"
-gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
-scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
-puts "#endif /* USE_UNICODE_PROPERTIES */\n"
+props.each do |name|
+  i += 1
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts "%-40s %3d" % [name + ',', i]
+end
+parse_aliases.each_pair do |k, v|
+  next if name_to_index[k]
+  next unless v = name_to_index[v]
+  puts "%-40s %3d" % [k + ',', v]
+end
 puts(<<'__HEREDOC')
+#endif /* USE_UNICODE_PROPERTIES */
 %%
 static int
 uniname2ctype(const UChar *name, unsigned int len)