mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Add derived core and binary property and aliases.
* tool/enc-unicode.rb, enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add DerivedCoreProperties, PropList (Binary Property), PropertyAlias and PropertyValueAlias. Now users of tool/enc-unicode.rb should specify the directory of UCD files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
391e5df571
commit
181eb7d5c1
6 changed files with 36662 additions and 1368 deletions
10
ChangeLog
10
ChangeLog
|
@ -1,3 +1,13 @@
|
||||||
|
Tue Oct 13 21:05:01 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||||
|
|
||||||
|
* tool/enc-unicode.rb,
|
||||||
|
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
|
||||||
|
enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
|
||||||
|
Add DerivedCoreProperties, PropList (Binary Property),
|
||||||
|
PropertyAlias and PropertyValueAlias.
|
||||||
|
Now users of tool/enc-unicode.rb should specify
|
||||||
|
the directory of UCD files.
|
||||||
|
|
||||||
Tue Oct 13 18:54:25 2009 Hidetoshi NAGAI <nagai@ai.kyutech.ac.jp>
|
Tue Oct 13 18:54:25 2009 Hidetoshi NAGAI <nagai@ai.kyutech.ac.jp>
|
||||||
|
|
||||||
* ext/tk/variable.rb: bug fix. additional trace definition changes the
|
* ext/tk/variable.rb: bug fix. additional trace definition changes the
|
||||||
|
|
10206
enc/unicode/name2ctype.h
10206
enc/unicode/name2ctype.h
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -3,15 +3,15 @@
|
||||||
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
||||||
# property names and POSIX character classes
|
# property names and POSIX character classes
|
||||||
#
|
#
|
||||||
# To use this, get UnicodeData.txt and Scripts.txt from unicode.org.
|
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt, DerivedCoreProperties.txt, PropertyAliases.txt and PropertyValueAliases.txt from unicode.org.
|
||||||
# (http://unicode.org/Public/UNIDATA/)
|
# (http://unicode.org/Public/UNIDATA/)
|
||||||
# Then run the following command.
|
# Then run the following command.
|
||||||
# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
|
# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
|
||||||
# This generates the source file for gperf.
|
# This generates the source file for gperf.
|
||||||
# After this, simply make ruby.
|
# After this, simply make ruby.
|
||||||
|
|
||||||
unless ARGV.size == 2
|
unless ARGV.size == 1
|
||||||
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
|
$stderr.puts "Usage: #{$0} data_directory"
|
||||||
exit(1)
|
exit(1)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -161,23 +161,49 @@ def parse_unicode_data(file)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
def parse_scripts(file)
|
def parse_scripts
|
||||||
script = nil
|
files = [
|
||||||
|
{fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
|
||||||
|
{fn: 'Scripts.txt', title: 'Script'},
|
||||||
|
{fn: 'PropList.txt', title: 'Binary Property'}
|
||||||
|
]
|
||||||
|
current = nil
|
||||||
data = []
|
data = []
|
||||||
names = []
|
names = []
|
||||||
IO.foreach(file) do |line|
|
files.each do |file|
|
||||||
if /^# Total code points: / =~ line
|
IO.foreach(get_file(file[:fn])) do |line|
|
||||||
make_const(script, pair_codepoints(data), 'Script')
|
if /^# Total code points: / =~ line
|
||||||
names << script
|
make_const(current, pair_codepoints(data), file[:title])
|
||||||
data = []
|
names << current
|
||||||
elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line
|
data = []
|
||||||
script = $3
|
elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
|
||||||
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
|
current = $3
|
||||||
|
$2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
names
|
names
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def parse_aliases
|
||||||
|
kv = {}
|
||||||
|
IO.foreach(get_file('PropertyAliases.txt')) do |line|
|
||||||
|
next unless /^(\w+)\s*; (\w+)/ =~ line
|
||||||
|
kv[normalize_propname($1)] = normalize_propname($2)
|
||||||
|
end
|
||||||
|
IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
|
||||||
|
next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
|
||||||
|
if $1 == 'gc'
|
||||||
|
kv[normalize_propname($3)] = normalize_propname($2)
|
||||||
|
kv[normalize_propname($4)] = normalize_propname($2) if $4
|
||||||
|
else
|
||||||
|
kv[normalize_propname($2)] = normalize_propname($3)
|
||||||
|
kv[normalize_propname($4)] = normalize_propname($3) if $4
|
||||||
|
end
|
||||||
|
end
|
||||||
|
kv
|
||||||
|
end
|
||||||
|
|
||||||
# make_const(property, pairs, name): Prints a 'static const' structure for a
|
# make_const(property, pairs, name): Prints a 'static const' structure for a
|
||||||
# given property, group of paired codepoints, and a human-friendly name for
|
# given property, group of paired codepoints, and a human-friendly name for
|
||||||
# the group
|
# the group
|
||||||
|
@ -195,17 +221,23 @@ end
|
||||||
|
|
||||||
def normalize_propname(name)
|
def normalize_propname(name)
|
||||||
name = name.downcase
|
name = name.downcase
|
||||||
name.gsub!(/[- _]/, '')
|
name.delete!('- _')
|
||||||
name
|
name
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def get_file(name)
|
||||||
|
File.join(ARGV[0], name)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Write Data
|
||||||
puts '%{'
|
puts '%{'
|
||||||
gcps, data = parse_unicode_data(ARGV[0])
|
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
|
||||||
POSIX_NAMES.each do |name|
|
POSIX_NAMES.each do |name|
|
||||||
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
|
make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
|
||||||
end
|
end
|
||||||
print "\n#ifdef USE_UNICODE_PROPERTIES"
|
print "\n#ifdef USE_UNICODE_PROPERTIES"
|
||||||
gcps.each do |name|
|
props.each do |name|
|
||||||
category =
|
category =
|
||||||
case name.size
|
case name.size
|
||||||
when 1 then 'Major Category'
|
when 1 then 'Major Category'
|
||||||
|
@ -214,18 +246,19 @@ gcps.each do |name|
|
||||||
end
|
end
|
||||||
make_const(name, pair_codepoints(data[name]), category)
|
make_const(name, pair_codepoints(data[name]), category)
|
||||||
end
|
end
|
||||||
scripts = parse_scripts(ARGV[1])
|
props.concat parse_scripts
|
||||||
puts "#endif /* USE_UNICODE_PROPERTIES */"
|
puts(<<'__HEREDOC')
|
||||||
|
#endif /* USE_UNICODE_PROPERTIES */
|
||||||
|
|
||||||
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
|
static const OnigCodePoint* const CodeRanges[] = {
|
||||||
|
__HEREDOC
|
||||||
POSIX_NAMES.each{|name|puts" CR_#{name},"}
|
POSIX_NAMES.each{|name|puts" CR_#{name},"}
|
||||||
puts "#ifdef USE_UNICODE_PROPERTIES"
|
puts "#ifdef USE_UNICODE_PROPERTIES"
|
||||||
gcps.each{|name|puts" CR_#{name},"}
|
props.each{|name|puts" CR_#{name},"}
|
||||||
scripts.each{|name|puts" CR_#{name},"}
|
|
||||||
puts "#endif /* USE_UNICODE_PROPERTIES */"
|
|
||||||
puts "};"
|
|
||||||
|
|
||||||
puts(<<'__HEREDOC')
|
puts(<<'__HEREDOC')
|
||||||
|
#endif /* USE_UNICODE_PROPERTIES */
|
||||||
|
};
|
||||||
struct uniname2ctype_struct {
|
struct uniname2ctype_struct {
|
||||||
int name, ctype;
|
int name, ctype;
|
||||||
};
|
};
|
||||||
|
@ -236,12 +269,27 @@ struct uniname2ctype_struct;
|
||||||
%%
|
%%
|
||||||
__HEREDOC
|
__HEREDOC
|
||||||
i = -1
|
i = -1
|
||||||
POSIX_NAMES.each {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
|
name_to_index = {}
|
||||||
|
POSIX_NAMES.each do |name|
|
||||||
|
i += 1
|
||||||
|
name = normalize_propname(name)
|
||||||
|
name_to_index[name] = i
|
||||||
|
puts"%-40s %3d" % [name + ',', i]
|
||||||
|
end
|
||||||
puts "#ifdef USE_UNICODE_PROPERTIES"
|
puts "#ifdef USE_UNICODE_PROPERTIES"
|
||||||
gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
|
props.each do |name|
|
||||||
scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
|
i += 1
|
||||||
puts "#endif /* USE_UNICODE_PROPERTIES */\n"
|
name = normalize_propname(name)
|
||||||
|
name_to_index[name] = i
|
||||||
|
puts "%-40s %3d" % [name + ',', i]
|
||||||
|
end
|
||||||
|
parse_aliases.each_pair do |k, v|
|
||||||
|
next if name_to_index[k]
|
||||||
|
next unless v = name_to_index[v]
|
||||||
|
puts "%-40s %3d" % [k + ',', v]
|
||||||
|
end
|
||||||
puts(<<'__HEREDOC')
|
puts(<<'__HEREDOC')
|
||||||
|
#endif /* USE_UNICODE_PROPERTIES */
|
||||||
%%
|
%%
|
||||||
static int
|
static int
|
||||||
uniname2ctype(const UChar *name, unsigned int len)
|
uniname2ctype(const UChar *name, unsigned int len)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue