1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Add derived core and binary property and aliases.

* tool/enc-unicode.rb,
  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
  Add DerivedCoreProperties, PropList (Binary Property),
  PropertyAlias and PropertyValueAlias.
  Now users of tool/enc-unicode.rb should specify
  the directory of UCD files.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2009-10-13 12:27:00 +00:00
parent 391e5df571
commit 181eb7d5c1
6 changed files with 36662 additions and 1368 deletions

View file

@ -1,3 +1,13 @@
Tue Oct 13 21:05:01 2009 NARUSE, Yui <naruse@ruby-lang.org>
* tool/enc-unicode.rb,
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
Add DerivedCoreProperties, PropList (Binary Property),
PropertyAlias and PropertyValueAlias.
Now users of tool/enc-unicode.rb should specify
the directory of UCD files.
Tue Oct 13 18:54:25 2009 Hidetoshi NAGAI <nagai@ai.kyutech.ac.jp> Tue Oct 13 18:54:25 2009 Hidetoshi NAGAI <nagai@ai.kyutech.ac.jp>
* ext/tk/variable.rb: bug fix. additional trace definition changes the * ext/tk/variable.rb: bug fix. additional trace definition changes the

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -3,15 +3,15 @@
# Creates the data structures needed by Oniguruma to map Unicode codepoints to # Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes # property names and POSIX character classes
# #
# To use this, get UnicodeData.txt and Scripts.txt from unicode.org. # To use this, get UnicodeData.txt, Scripts.txt, PropList.txt from unicode.org.
# (http://unicode.org/Public/UNIDATA/) # (http://unicode.org/Public/UNIDATA/)
# And run the following command. # And run the following command.
# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd # ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# You can get source file for gperf. # You can get source file for gperf.
# After this, simply make ruby. # After this, simply make ruby.
unless ARGV.size == 2 unless ARGV.size == 1
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt" $stderr.puts "Usage: #{$0} data_directory"
exit(1) exit(1)
end end
@ -161,23 +161,49 @@ def parse_unicode_data(file)
end end
def parse_scripts(file) def parse_scripts
script = nil files = [
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
{fn: 'Scripts.txt', title: 'Script'},
{fn: 'PropList.txt', title: 'Binary Property'}
]
current = nil
data = [] data = []
names = [] names = []
IO.foreach(file) do |line| files.each do |file|
if /^# Total code points: / =~ line IO.foreach(get_file(file[:fn])) do |line|
make_const(script, pair_codepoints(data), 'Script') if /^# Total code points: / =~ line
names << script make_const(current, pair_codepoints(data), file[:title])
data = [] names << current
elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line data = []
script = $3 elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16)) current = $3
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
end
end end
end end
names names
end end
# parse_aliases: Reads PropertyAliases.txt and PropertyValueAliases.txt from
# the data directory and returns a Hash whose keys are normalized alias names
# and whose values are the normalized names those aliases stand for.
def parse_aliases
  aliases = {}
  # Normalizes both names and records the alias; a later entry with the same
  # normalized alias overwrites an earlier one.
  register = lambda do |alias_name, target|
    aliases[normalize_propname(alias_name)] = normalize_propname(target)
  end

  # Property aliases: the first field is recorded as an alias of the second.
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    matched = /^(\w+)\s*; (\w+)/.match(line)
    register.call(matched[1], matched[2]) if matched
  end

  # Property value aliases: only rows whose first field is "sc" or "gc" are
  # used; every other row is ignored.
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    matched = /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/.match(line)
    next unless matched

    kind, second, third, fourth = matched.captures
    # For "gc" rows the second field is the target and the third is its alias;
    # for "sc" rows the roles of the second and third fields are swapped.
    if kind == 'gc'
      target = second
      primary = third
    else
      target = third
      primary = second
    end
    register.call(primary, target)
    # The optional fourth field is one more alias for the same target.
    register.call(fourth, target) if fourth
  end

  aliases
end
# make_const(property, pairs, name): Prints a 'static const' structure for a # make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for # given property, group of paired codepoints, and a human-friendly name for
# the group # the group
@ -195,17 +221,23 @@ end
def normalize_propname(name) def normalize_propname(name)
name = name.downcase name = name.downcase
name.gsub!(/[- _]/, '') name.delete!('- _')
name name
end end
# get_file(name): Returns the path of the file called name inside the
# directory given as the first command-line argument.
def get_file(name)
  data_dir = ARGV.first
  File.join(data_dir, name)
end
# Write Data
puts '%{' puts '%{'
gcps, data = parse_unicode_data(ARGV[0]) props, data = parse_unicode_data(get_file('UnicodeData.txt'))
POSIX_NAMES.each do |name| POSIX_NAMES.each do |name|
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]") make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
end end
print "\n#ifdef USE_UNICODE_PROPERTIES" print "\n#ifdef USE_UNICODE_PROPERTIES"
gcps.each do |name| props.each do |name|
category = category =
case name.size case name.size
when 1 then 'Major Category' when 1 then 'Major Category'
@ -214,18 +246,19 @@ gcps.each do |name|
end end
make_const(name, pair_codepoints(data[name]), category) make_const(name, pair_codepoints(data[name]), category)
end end
scripts = parse_scripts(ARGV[1]) props.concat parse_scripts
puts "#endif /* USE_UNICODE_PROPERTIES */" puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {" static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each{|name|puts" CR_#{name},"} POSIX_NAMES.each{|name|puts" CR_#{name},"}
puts "#ifdef USE_UNICODE_PROPERTIES" puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each{|name|puts" CR_#{name},"} props.each{|name|puts" CR_#{name},"}
scripts.each{|name|puts" CR_#{name},"}
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"
puts(<<'__HEREDOC') puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct { struct uniname2ctype_struct {
int name, ctype; int name, ctype;
}; };
@ -236,12 +269,27 @@ struct uniname2ctype_struct;
%% %%
__HEREDOC __HEREDOC
i = -1 i = -1
POSIX_NAMES.each {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} name_to_index = {}
POSIX_NAMES.each do |name|
i += 1
name = normalize_propname(name)
name_to_index[name] = i
puts"%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES" puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} props.each do |name|
scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} i += 1
puts "#endif /* USE_UNICODE_PROPERTIES */\n" name = normalize_propname(name)
name_to_index[name] = i
puts "%-40s %3d" % [name + ',', i]
end
parse_aliases.each_pair do |k, v|
next if name_to_index[k]
next unless v = name_to_index[v]
puts "%-40s %3d" % [k + ',', v]
end
puts(<<'__HEREDOC') puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%% %%
static int static int
uniname2ctype(const UChar *name, unsigned int len) uniname2ctype(const UChar *name, unsigned int len)