mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
181eb7d5c1
* tool/enc-unicode.rb, enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add DerivedCoreProperties, PropList (Binary Property), PropertyAlias and PropertyValueAlias. Now users of tool/enc-unicode.rb should specify the directory of UCD files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
301 lines
8.7 KiB
Ruby
Executable file
301 lines
8.7 KiB
Ruby
Executable file
#!/usr/bin/env ruby
|
|
|
|
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes.
#
# To use this, get UnicodeData.txt, DerivedCoreProperties.txt, Scripts.txt,
# PropList.txt, PropertyAliases.txt and PropertyValueAliases.txt from
# unicode.org (http://unicode.org/Public/UNIDATA/) and put them in one
# directory. Then run the following command:
#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# This gives you the source file for gperf.
# After this, simply make ruby.
|
|
|
|
# The UCD data directory must be passed as the one and only argument;
# anything else prints the usage line and stops with exit status 1.
if ARGV.size != 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end
|
|
|
|
# Names of the POSIX bracket classes. The order matters: it is the order in
# which their code-range tables and ctype indexes are emitted. Frozen so the
# shared list cannot be altered by accident.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII].freeze
|
|
|
|
# Partitions a list of codepoints into inclusive [first, last] ranges.
#
# codepoints - Array of Integer codepoints in any order. Duplicates are
#              ignored. The array itself is left untouched.
#
# Returns an Array of two-element Arrays ordered by start codepoint. Note: It
# is intended that some ranges begin with the value with which they end, e.g.
# 0x0020 -> 0x0020. An empty input gives an empty Array.
def pair_codepoints(codepoints)
  # Sort a copy so the caller's array is not reordered behind its back.
  sorted = codepoints.sort

  pairs = []
  sorted.each do |codepoint|
    open_range = pairs.last
    if open_range && codepoint <= open_range[1] + 1
      # A duplicate of, or the direct successor to, the last codepoint seen:
      # the current range simply grows (or stays as it is).
      open_range[1] = codepoint
    else
      # The codepoint does not follow directly on from the last one, so the
      # previous range is complete and a new one starts here.
      pairs << [codepoint, codepoint]
    end
  end
  pairs
end
|
|
|
|
# Parses UnicodeData.txt and derives the General Category properties and the
# POSIX bracket classes from it.
#
# file - path of the UnicodeData.txt file. It is read with File.foreach so the
#        path is always opened as a plain file.
#
# Returns a two-element Array [gcps, data]:
#   gcps - sorted Array of the General Category property names: 'Any',
#          'Assigned', 'Cn', plus every General category (e.g. 'Lu') and
#          Major category (e.g. 'L') found in the file.
#   data - Hash mapping every property name and POSIX class name to the Array
#          of codepoints that belong to it.
def parse_unicode_data(file)
  last_cp = 0
  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
  beg_cp = nil
  File.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    case fields[1]
    when /\A<(.*),\s*First>\z/
      # Opening line of a "<..., First>" / "<..., Last>" pair: remember where
      # the range begins and wait for the matching Last line.
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints. If the current entry starts beyond last_cp.next we
    # have found a hole, so we add every codepoint in between to the Cn
    # category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn.
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder

  # Cn is a General category of the Major category C, so all of Cn (the holes
  # found while scanning as well as the remainder) belongs in C.
  data['C'] = (data['C'] || []) + data['Cn']

  # Define General Category properties. This is taken before the POSIX
  # classes are added so that only category names are returned.
  gcps = data.keys.sort

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #

  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']

  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']

  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a

  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]

  # cntrl    Control
  data['Cntrl'] = data['Cc']

  # digit    Decimal_Number
  data['Digit'] = data['Nd']

  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']

  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare this way to marry it with the
  # above definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']

  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #          0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]

  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']

  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a

  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']

  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  # Space and C are each removed from the candidate set; Control, Unassigned
  # and Surrogate are all General categories under the Major category C.
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
  data['Graph'] = data['Graph'] - data['Space'] - data['C']

  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]

  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a

  # Returns General Category Property names and the data
  [gcps, data]
end
|
|
|
|
|
|
# Emits a code-range table for every property listed in the UCD property
# files DerivedCoreProperties.txt, Scripts.txt and PropList.txt.
#
# The codepoints of a property are collected line by line; the
# "# Total code points:" line that follows them triggers make_const for the
# property named on the most recent data line.
#
# Returns an Array of the property names, in the order they were emitted.
def parse_scripts
  files = [
    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
    {fn: 'Scripts.txt', title: 'Script'},
    {fn: 'PropList.txt', title: 'Binary Property'}
  ]
  current = nil
  data = []
  names = []
  files.each do |file|
    File.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        make_const(current, pair_codepoints(data), file[:title])
        names << current
        data = []
      elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
        # Data line: "XXXX ; Name" or "XXXX..YYYY ; Name". The range
        # separator is matched literally so that no other two characters can
        # be mistaken for it.
        first_hex, last_hex, current = $~.captures
        if last_hex
          data.concat((first_hex.to_i(16)..last_hex.to_i(16)).to_a)
        else
          data.push(first_hex.to_i(16))
        end
      end
    end
  end
  names
end
|
|
|
|
# Builds the alias table from PropertyAliases.txt and PropertyValueAliases.txt.
#
# Every name is passed through normalize_propname. For PropertyAliases.txt the
# first field is mapped to the second. For PropertyValueAliases.txt only 'gc'
# and 'sc' lines are used: on a 'gc' line the third (and optional fourth)
# field map to the second, on an 'sc' line the second (and optional fourth)
# field map to the third.
#
# Returns a Hash of normalized alias => normalized target name.
def parse_aliases
  aliases = {}

  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    found = /^(\w+)\s*; (\w+)/.match(line)
    aliases[normalize_propname(found[1])] = normalize_propname(found[2]) if found
  end

  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    found = /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/.match(line)
    next if found.nil?

    kind, second, third, extra = found.captures
    # Which field is the target and which is the alias depends on the kind.
    target, other_name = kind == 'gc' ? [second, third] : [third, second]
    aliases[normalize_propname(other_name)] = normalize_propname(target)
    aliases[normalize_propname(extra)] = normalize_propname(target) if extra
  end

  aliases
end
|
|
|
|
# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
#
# prop  - property name; becomes the suffix of the C identifier CR_<prop>.
# pairs - Array of [first, last] codepoint pairs. It is only read, never
#         modified.
# name  - description placed in the comment above the table.
#
# Returns nothing of interest; the table is written to standard output.
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    # Format into fresh strings instead of overwriting the caller's integers.
    # Zero is spelled out because the '#' flag adds no 0x prefix to a zero
    # value.
    first, last = [pair.first, pair.last].map do |c|
      c == 0 ? '0x0000' : sprintf("%0#6x", c)
    end
    puts "\t#{first}, #{last},"
  end
  puts "}; /* CR_#{prop} */"
end
|
|
|
|
# Returns the lookup form of a property name: a new String that is
# lower-cased and has every hyphen, space and underscore removed.
def normalize_propname(name)
  name.downcase.delete('- _')
end
|
|
|
|
# Returns the path of the file +name+ inside the data directory that was
# given as the first command-line argument.
def get_file(name)
  data_dir = ARGV.first
  File.join(data_dir, name)
end
|
|
|
|
|
|
# Write Data
puts '%{'
props, data = parse_unicode_data(get_file('UnicodeData.txt'))

# One code-range table per POSIX bracket class.
POSIX_NAMES.each do |posix_name|
  make_const(posix_name, pair_codepoints(data[posix_name]), "[[:#{posix_name}:]]")
end

# One code-range table per General Category property. The label is chosen by
# the length of the property name; anything that is not one or two characters
# long gets '-'.
print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |prop|
  category = {1 => 'Major Category', 2 => 'General Category'}.fetch(prop.size, '-')
  make_const(prop, pair_codepoints(data[prop]), category)
end

# parse_scripts prints its own tables and returns their names, which are
# appended to the property list.
props.concat(parse_scripts)
|
|
# Close the property tables, then list every table in the CodeRanges array:
# POSIX classes first, all other properties inside the #ifdef.
puts <<'__HEREDOC'
#endif /* USE_UNICODE_PROPERTIES */

static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each do |posix_name|
  puts " CR_#{posix_name},"
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each do |prop|
  puts " CR_#{prop},"
end

# End of the array, followed by the declarations for the name lookup and the
# separator that starts the keyword section.
puts <<'__HEREDOC'
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC
|
|
# Keyword section: one "name, index" line per table. Indexes follow the order
# of the CodeRanges entries: POSIX classes first, then the properties.
name_to_index = {}
POSIX_NAMES.each_with_index do |posix_name, index|
  key = normalize_propname(posix_name)
  name_to_index[key] = index
  puts "%-40s %3d" % [key + ',', index]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each_with_index do |prop, offset|
  index = POSIX_NAMES.size + offset
  key = normalize_propname(prop)
  name_to_index[key] = index
  puts "%-40s %3d" % [key + ',', index]
end

# An alias gets a line of its own only when it is not already a known name and
# the name it stands for has an index.
parse_aliases.each_pair do |alias_name, target|
  next if name_to_index[alias_name]
  index = name_to_index[target]
  next unless index
  puts "%-40s %3d" % [alias_name + ',', index]
end

# Trailer: the C function that wraps the generated lookup.
puts <<'__HEREDOC'
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
if (p) return p->ctype;
return -1;
}
__HEREDOC
|