mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Update Oniguruma's UnicodeData to 5.1.
* tool/enc-unicode.rb: added for generate name2ctype.kwd. contributed by Run Paint Run Run [ruby-core:24775] use like following: ruby19 tool/enc-unicode.rb enc/unicode/UnicodeData.txt \ enc/unicode/Scripts.txt > enc/unicode/name2ctype.kwd * enc/unicode.c (CodeRanges): move definitions to name2ctype.h. * enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: updated to v5.1. * enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1. * Makefile.in: add rule to generate name2ctype.kwd from UnicodeData.txt and Scripts.txt. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24651 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
649f648ec1
commit
f1eff95745
9 changed files with 49881 additions and 9068 deletions
15
ChangeLog
15
ChangeLog
|
@ -1,3 +1,18 @@
|
|||
Tue Aug 25 23:51:07 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* tool/enc-unicode.rb: added for generate name2ctype.kwd.
|
||||
contributed by Run Paint Run Run [ruby-core:24775]
|
||||
|
||||
* enc/unicode.c (CodeRanges): move definitions to name2ctype.h.
|
||||
|
||||
* enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd,
|
||||
enc/unicode/name2ctype.src: updated to v5.1.
|
||||
|
||||
* enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1.
|
||||
|
||||
* Makefile.in: add rule to generate name2ctype.kwd from
|
||||
UnicodeData.txt and Scripts.txt.
|
||||
|
||||
Tue Aug 25 22:31:51 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* configure.in (MKDIR_P): Set 'mkdir -p' to MKDIR_P
|
||||
|
|
|
@ -207,6 +207,8 @@ lex.c: defs/keywords
|
|||
$(CP) $? $(srcdir)/defs/lex.c.src && \
|
||||
$(CP) $@ $(srcdir)/lex.c.blt; \
|
||||
fi
|
||||
# Regenerate the gperf keyword file from the Unicode data files.
# enc-unicode.rb requires exactly two arguments in a fixed order
# (UnicodeData.txt, then Scripts.txt). `$?` expands only to the prerequisites
# that are newer than the target, so it cannot be used here: touching a single
# input would pass one argument, fail the usage check and leave a truncated
# target behind because of the `> $@` redirection.
$(srcdir)/enc/unicode/name2ctype.kwd: enc/unicode/UnicodeData.txt enc/unicode/Scripts.txt
	$(BASERUBY) $(srcdir)/tool/enc-unicode.rb $(srcdir)/enc/unicode/UnicodeData.txt $(srcdir)/enc/unicode/Scripts.txt > $@
|
||||
|
||||
NAME2CTYPE_OPTIONS = -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p
|
||||
|
||||
|
|
8576
enc/unicode.c
8576
enc/unicode.c
File diff suppressed because it is too large
Load diff
1747
enc/unicode/Scripts.txt
Normal file
1747
enc/unicode/Scripts.txt
Normal file
File diff suppressed because it is too large
Load diff
19336
enc/unicode/UnicodeData.txt
Normal file
19336
enc/unicode/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
230
tool/enc-unicode.rb
Executable file
230
tool/enc-unicode.rb
Executable file
|
@ -0,0 +1,230 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
||||
# property names and POSIX character classes
|
||||
|
||||
# Both input files are mandatory: print a usage hint on standard error and
# exit with status 1 when the argument count is anything other than two.
abort "Usage: #{$0} UnicodeData.txt Scripts.txt" if ARGV.size != 2

# Names of the POSIX bracket classes. The rest of the script iterates this
# list in order when printing the range tables, the CodeRanges array and the
# numbered keyword list, so the order here fixes their ctype numbers.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII]
|
||||
|
||||
# Collapses a list of codepoints into inclusive [first, last] ranges.
#
# codepoints - Array of Integer codepoints, in any order; duplicates are
#              ignored. The argument itself is not modified.
#
# Returns an Array of two-element Arrays, sorted ascending, where each pair
# covers a run of consecutive codepoints. An isolated codepoint produces a
# pair that begins and ends on the same value, e.g. [0x0020, 0x0020].
# Returns an empty Array when no codepoints are given.
def pair_codepoints(codepoints)
  codepoints = codepoints.uniq.sort
  return [] if codepoints.empty?

  pairs = [[codepoints.first, codepoints.first]]
  codepoints.each do |codepoint|
    current = pairs.last
    if codepoint > current[1] + 1
      # A gap: the previous range is complete, start a new one here.
      pairs << [codepoint, codepoint]
    else
      # Same or directly following codepoint: extend the current range.
      current[1] = codepoint
    end
  end
  pairs
end
|
||||
|
||||
# Parses UnicodeData.txt and derives the codepoint lists for every General
# Category, every Major Category and the POSIX bracket classes.
#
# file - path to UnicodeData.txt (semicolon-separated; field 0 is the hex
#        codepoint, field 1 the name, field 2 the General Category).
#
# Returns a two-element Array [gcps, data]:
#   gcps - Array of property names: 'Any', 'Assigned', then the sorted
#          General/Major category names found in the file (plus 'C' and 'Cn').
#   data - Hash mapping each property or POSIX class name to an Array of
#          Integer codepoints (not necessarily sorted or unique).
def parse_unicode_data(file)
  # Start one below U+0000 so a file whose first entry is not 0000 still has
  # its leading gap recorded as unassigned.
  last_cp = -1
  range_first = nil
  data = {'Cn' => [], 'C' => []}

  File.foreach(file) do |line|
    fields = line.chomp.split(';')
    next if fields.size < 3 # blank or malformed line

    cp = fields[0].to_i(16)

    # Large blocks are listed as a pair of rows named "<..., First>" and
    # "<..., Last>"; every codepoint between the two belongs to the category
    # given on those rows and must not be mistaken for a hole.
    case fields[1]
    when /\A<.*, First>\z/
      range_first = cp
      next
    when /\A<.*, Last>\z/
      first = range_first || cp
      range_first = nil
    else
      range_first = nil
      first = cp
    end
    cps = (first..cp).to_a

    # The Cn category represents unassigned characters. These are not listed
    # in UnicodeData.txt, so they are derived from the 'holes' between the
    # last codepoint seen and the start of the current entry.
    data['Cn'].concat((last_cp.next...first).to_a)

    # The third field denotes the 'General' category, e.g. Lu
    general = fields[2]
    (data[general] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category,
    # e.g. 'Lu' -> 'L'
    (data[general[0, 1]] ||= []).concat(cps)
    last_cp = cp
  end

  # General Category property names. Taken before the POSIX classes are
  # added to the hash so that only real categories are listed.
  gcps = %w[Any Assigned]
  gcps.concat data.keys.sort

  # Everything after the last listed codepoint up to 0x10ffff is unassigned
  # as well. Cn is part of the major category C, so all of Cn (interior holes
  # included) is appended to C.
  data['Cn'].concat((last_cp.next..0x10ffff).to_a)
  data['C'].concat(data['Cn'])

  # Reads a category, treating one that never occurred in the file as empty.
  members = lambda { |name| data[name] || [] }

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #

  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = members['L'] + members['M'] + members['Nd']

  # alpha    Letter | Mark
  data['Alpha'] = members['L'] + members['M']

  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a

  # blank    Space_Separator | 0009
  data['Blank'] = members['Zs'] + [0x0009]

  # cntrl    Control
  data['Cntrl'] = members['Cc']

  # digit    Decimal_Number
  data['Digit'] = members['Nd']

  # lower    Lowercase_Letter
  data['Lower'] = members['Ll']

  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare this way to marry it with the
  # above definition.
  data['Punct'] = members['Pc'] + members['Pd'] + members['Pe'] + members['Pf'] +
    members['Pi'] + members['Po'] + members['Ps']

  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #          0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = members['Zs'] + members['Zl'] + members['Zp'] +
    [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]

  # upper    Uppercase_Letter
  data['Upper'] = members['Lu']

  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
    (0x0061..0x0066).to_a

  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = members['L'] + members['M'] + members['Nd'] + members['Pc']

  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  # Space and C are removed in two separate steps: written as one expression,
  # `Graph -= Space - C` would subtract (Space - C) instead.
  data['Graph'] = members['L'] + members['M'] + members['N'] + members['P'] + members['S']
  data['Graph'] -= data['Space']
  data['Graph'] -= data['C']

  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]

  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a

  # Assigned - Defined in unicode.c; interpreted as every character in the
  # Unicode range minus the unassigned characters
  data['Assigned'] = data['Any'] - data['Cn']

  # Returns General Category Property names and the data
  [gcps, data]
end
|
||||
|
||||
|
||||
# Parses Scripts.txt and prints one code-range table per script as it goes.
#
# Data lines have the form "XXXX ; Name" or "XXXX..YYYY ; Name" (hex
# codepoints). The codepoints collected for a script are flushed through
# make_const when a "# Total code points:" comment line is reached.
#
# file - path to Scripts.txt
#
# Returns an Array with the script names, in the order their tables were
# printed.
def parse_scripts(file)
  script = nil
  codepoints = []
  names = []
  File.foreach(file) do |line|
    if line =~ /^# Total code points: /
      # Ignore a totals line that is not preceded by any data line; there is
      # no script to emit a table for.
      next if codepoints.empty?
      make_const(script, pair_codepoints(codepoints), 'Script')
      names << script
      codepoints = []
    elsif line =~ /^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s*;\s*(\w+)/
      # The dots of the range separator are escaped so that only a literal
      # ".." is accepted between the two codepoints.
      first = $1.to_i(16)
      last = $2 ? $2.to_i(16) : first
      script = $3
      codepoints.concat((first..last).to_a)
    end
  end
  names
end
|
||||
|
||||
# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
#
# prop  - property name; used in the C identifier CR_<prop>.
# pairs - Array of [first, last] Integer pairs; read only, never modified.
# name  - description placed in the leading C comment.
#
# Writes to standard output. The return value is not meaningful.
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    # Hex with a 0x prefix and at least four digits, e.g. 0x0000, 0x0041,
    # 0x10ffff.
    printf("\t0x%04x, 0x%04x,\n", pair.first, pair.last)
  end
  puts "}; /* CR_#{prop} */"
end
|
||||
|
||||
# ---- Part 1: C declarations, wrapped in %{ ... %} -------------------------
puts '%{'
gcps, data = parse_unicode_data(ARGV[0])

# Range tables for the POSIX bracket classes come first and are emitted
# outside the USE_UNICODE_PROPERTIES guard.
POSIX_NAMES.each do |posix|
  make_const(posix, pair_codepoints(data[posix]), "[[:#{posix}:]]")
end

# Range tables for the general categories and the scripts are wrapped in
# #ifdef USE_UNICODE_PROPERTIES. The label is chosen by name length: one
# letter is a major category, two letters a general category.
print "\n#ifdef USE_UNICODE_PROPERTIES"
category_labels = {1 => 'Major Category', 2 => 'General Category'}
gcps.each do |gc|
  make_const(gc, pair_codepoints(data[gc]), category_labels.fetch(gc.size, '-'))
end
scripts = parse_scripts(ARGV[1])
puts "#endif /* USE_UNICODE_PROPERTIES */"

# Table of pointers to every range table, in the same order as the numbers
# handed out in the keyword list below.
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
POSIX_NAMES.each do |posix|
  puts " CR_#{posix},"
end
puts "#ifdef USE_UNICODE_PROPERTIES"
(gcps + scripts).each do |prop|
  puts " CR_#{prop},"
end
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"

puts <<'__DECLARATIONS'
struct uniname2ctype_struct {
int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__DECLARATIONS

# ---- Part 2: keyword list, one "name, number" line per property -----------
# Numbers start at 0 and increase by one per line across all three lists.
ctype = -1
emit_keyword = lambda do |prop|
  puts "%-21s %3d" % [prop + ',', ctype += 1]
end
POSIX_NAMES.each(&emit_keyword)
puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each(&emit_keyword)
scripts.each(&emit_keyword)
puts "#endif /* USE_UNICODE_PROPERTIES */\n"

# ---- Part 3: C lookup function appended after the keyword list ------------
puts <<'__LOOKUP'
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
if (p) return p->ctype;
return -1;
}
__LOOKUP
|
Loading…
Add table
Reference in a new issue