1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Add derived core and binary property and aliases.

* tool/enc-unicode.rb,
  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
  Add DerivedCoreProperties, PropList (Binary Property),
  PropertyAlias and PropertyValueAlias.
  Now users of tool/enc-unicode.rb should specify
  the directory of UCD files.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2009-10-13 12:27:00 +00:00
parent 391e5df571
commit 181eb7d5c1
6 changed files with 36662 additions and 1368 deletions

View file

@ -1,3 +1,13 @@
Tue Oct 13 21:05:01 2009 NARUSE, Yui <naruse@ruby-lang.org>
* tool/enc-unicode.rb,
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
Add DerivedCoreProperties, PropList (Binary Property),
PropertyAlias and PropertyValueAlias.
Now users of tool/enc-unicode.rb should specify
the directory of UCD files.
Tue Oct 13 18:54:25 2009 Hidetoshi NAGAI <nagai@ai.kyutech.ac.jp>
* ext/tk/variable.rb: bug fix. additional trace definition changes the

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -3,15 +3,15 @@
# Creates the data structures needed by Onigurma to map Unicode codepoints to
# property names and POSIX character classes
#
# To use this, get UnicodeData.txt and Scripts.txt from unicode.org.
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt from unicode.org.
# (http://unicode.org/Public/UNIDATA/)
# And run following command.
# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# You can get source file for gperf.
# After this, simply make ruby.
unless ARGV.size == 2
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
unless ARGV.size == 1
$stderr.puts "Usage: #{$0} data_directory"
exit(1)
end
@ -161,23 +161,49 @@ def parse_unicode_data(file)
end
def parse_scripts(file)
script = nil
def parse_scripts
files = [
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
{fn: 'Scripts.txt', title: 'Script'},
{fn: 'PropList.txt', title: 'Binary Property'}
]
current = nil
data = []
names = []
IO.foreach(file) do |line|
if /^# Total code points: / =~ line
make_const(script, pair_codepoints(data), 'Script')
names << script
data = []
elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line
script = $3
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
files.each do |file|
IO.foreach(get_file(file[:fn])) do |line|
if /^# Total code points: / =~ line
make_const(current, pair_codepoints(data), file[:title])
names << current
data = []
elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
current = $3
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
end
end
end
names
end
def parse_aliases
kv = {}
IO.foreach(get_file('PropertyAliases.txt')) do |line|
next unless /^(\w+)\s*; (\w+)/ =~ line
kv[normalize_propname($1)] = normalize_propname($2)
end
IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
if $1 == 'gc'
kv[normalize_propname($3)] = normalize_propname($2)
kv[normalize_propname($4)] = normalize_propname($2) if $4
else
kv[normalize_propname($2)] = normalize_propname($3)
kv[normalize_propname($4)] = normalize_propname($3) if $4
end
end
kv
end
# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
@ -195,17 +221,23 @@ end
def normalize_propname(name)
name = name.downcase
name.gsub!(/[- _]/, '')
name.delete!('- _')
name
end
def get_file(name)
File.join(ARGV[0], name)
end
# Write Data
puts '%{'
gcps, data = parse_unicode_data(ARGV[0])
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
POSIX_NAMES.each do |name|
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
end
print "\n#ifdef USE_UNICODE_PROPERTIES"
gcps.each do |name|
props.each do |name|
category =
case name.size
when 1 then 'Major Category'
@ -214,18 +246,19 @@ gcps.each do |name|
end
make_const(name, pair_codepoints(data[name]), category)
end
scripts = parse_scripts(ARGV[1])
puts "#endif /* USE_UNICODE_PROPERTIES */"
props.concat parse_scripts
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each{|name|puts" CR_#{name},"}
puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each{|name|puts" CR_#{name},"}
scripts.each{|name|puts" CR_#{name},"}
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"
props.each{|name|puts" CR_#{name},"}
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
int name, ctype;
};
@ -236,12 +269,27 @@ struct uniname2ctype_struct;
%%
__HEREDOC
i = -1
POSIX_NAMES.each {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
name_to_index = {}
POSIX_NAMES.each do |name|
i += 1
name = normalize_propname(name)
name_to_index[name] = i
puts"%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
puts "#endif /* USE_UNICODE_PROPERTIES */\n"
props.each do |name|
i += 1
name = normalize_propname(name)
name_to_index[name] = i
puts "%-40s %3d" % [name + ',', i]
end
parse_aliases.each_pair do |k, v|
next if name_to_index[k]
next unless v = name_to_index[v]
puts "%-40s %3d" % [k + ',', v]
end
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)