1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Update Oniguruma's UnicodeData to 5.1.

* tool/enc-unicode.rb: added to generate name2ctype.kwd.
  Contributed by Run Paint Run Run [ruby-core:24775].
  Use as follows:
    ruby19 tool/enc-unicode.rb enc/unicode/UnicodeData.txt \
      enc/unicode/Scripts.txt > enc/unicode/name2ctype.kwd

* enc/unicode.c (CodeRanges): move definitions to name2ctype.h.

* enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd,
  enc/unicode/name2ctype.src: updated to v5.1.

* enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1.

* Makefile.in: add rule to generate name2ctype.kwd from
  UnicodeData.txt and Scripts.txt.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24651 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
naruse 2009-08-25 16:15:38 +00:00
parent 649f648ec1
commit f1eff95745
9 changed files with 49881 additions and 9068 deletions

View file

@ -1,3 +1,18 @@
Tue Aug 25 23:51:07 2009 NARUSE, Yui <naruse@ruby-lang.org>
* tool/enc-unicode.rb: added to generate name2ctype.kwd.
contributed by Run Paint Run Run [ruby-core:24775]
* enc/unicode.c (CodeRanges): move definitions to name2ctype.h.
* enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd,
enc/unicode/name2ctype.src: updated to v5.1.
* enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1.
* Makefile.in: add rule to generate name2ctype.kwd from
UnicodeData.txt and Scripts.txt.
Tue Aug 25 22:31:51 2009 NARUSE, Yui <naruse@ruby-lang.org>
* configure.in (MKDIR_P): Set 'mkdir -p' to MKDIR_P

View file

@ -207,6 +207,8 @@ lex.c: defs/keywords
$(CP) $? $(srcdir)/defs/lex.c.src && \
$(CP) $@ $(srcdir)/lex.c.blt; \
fi
# Regenerate name2ctype.kwd from the Unicode data files. Both inputs are named
# explicitly in the recipe: $? expands only to the prerequisites that are newer
# than the target, but enc-unicode.rb insists on exactly two arguments
# (UnicodeData.txt first, Scripts.txt second).
$(srcdir)/enc/unicode/name2ctype.kwd: enc/unicode/UnicodeData.txt enc/unicode/Scripts.txt
	$(BASERUBY) $(srcdir)/tool/enc-unicode.rb $(srcdir)/enc/unicode/UnicodeData.txt $(srcdir)/enc/unicode/Scripts.txt > $@
NAME2CTYPE_OPTIONS = -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p

File diff suppressed because it is too large Load diff

1747
enc/unicode/Scripts.txt Normal file

File diff suppressed because it is too large Load diff

19336
enc/unicode/UnicodeData.txt Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

230
tool/enc-unicode.rb Executable file
View file

@ -0,0 +1,230 @@
#!/usr/bin/env ruby
# Generates the data structures Oniguruma needs in order to map Unicode
# codepoints to property names and POSIX character classes.

# Exactly two input files are required; otherwise print a usage hint and fail.
if ARGV.size != 2
  $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
  exit(1)
end

# POSIX bracket class names, in the order the rest of this script emits them.
POSIX_NAMES = %w[
  NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit
  Word Alnum ASCII
]
# Partitions a list of codepoints into inclusive [first, last] ranges of
# consecutive values.
#
# codepoints:: Array of Integers; it may be unsorted and contain duplicates.
#              The argument itself is not modified.
#
# Returns an Array of two-element Arrays sorted by starting codepoint. A
# codepoint with no neighbours yields a range that starts and ends on the
# same value, e.g. [0x0020, 0x0020]. An empty input yields an empty Array.
def pair_codepoints(codepoints)
  pairs = []
  codepoints.uniq.sort.each do |codepoint|
    if pairs.empty? || pairs.last[1].next != codepoint
      # Not adjacent to the range being built: open a new one.
      pairs << [codepoint, codepoint]
    else
      # Directly follows the previous codepoint: extend the current range.
      pairs.last[1] = codepoint
    end
  end
  pairs
end
# Reads UnicodeData.txt and builds, for every General Category, Major Category
# and POSIX bracket class, the list of codepoints that belong to it.
#
# file:: path to UnicodeData.txt (semicolon-separated; field 0 is the hex
#        codepoint, field 1 the name, field 2 the General Category).
#
# Returns [gcps, data]:
# gcps:: 'Any', 'Assigned', then the sorted General/Major Category names
#        found in the file (plus 'Cn').
# data:: Hash mapping every name in gcps and every POSIX class name to an
#        Array of codepoints (not necessarily sorted or unique).
#
# Raises if the file lacks a category that the POSIX definitions below rely
# on (e.g. no 'Zs' entry at all).
def parse_unicode_data(file)
  # Start one below U+0000 so the hole detection also covers a file whose
  # first entry is not codepoint 0.
  last_cp = -1
  data = {'Cn' => []}
  range_start = nil
  IO.foreach(file) do |line|
    fields = line.split(';')
    # Ignore blank or truncated lines that carry no General Category field
    next if fields.size < 3
    cp = fields[0].to_i(16)

    # Large blocks are listed as a pair of lines named "<..., First>" and
    # "<..., Last>"; every codepoint between the two is assigned and shares
    # the pair's category.
    case fields[1]
    when /\A<.*, First>\z/
      range_start = cp
      next
    when /\A<.*, Last>\z/
      first_cp = range_start || cp
    else
      first_cp = cp
    end
    range_start = nil

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' between the
    # last codepoint seen and the start of the current entry.
    while ((last_cp = last_cp.next) < first_cp)
      data['Cn'] << last_cp
    end

    cps = (first_cp..cp).to_a
    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)
    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # General Category property names. Collected now, before the POSIX classes
  # are added to the hash, so that only categories end up in the list.
  gcps = %w[Any Assigned]
  gcps.concat data.keys.sort

  # Everything after the last listed codepoint up to 0x10ffff is unassigned.
  # Measure from the last *listed* codepoint, not from the last hole, so that
  # assigned codepoints at the end of the file are not re-marked as Cn.
  data['Cn'].concat((last_cp.next..0x10ffff).to_a)
  # The major category C covers every unassigned codepoint, not just the tail.
  data['C'] += data['Cn']

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #
  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']
  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']
  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a
  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]
  # cntrl    Control
  data['Cntrl'] = data['Cc']
  # digit    Decimal_Number
  data['Digit'] = data['Nd']
  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']
  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare it this way to marry it with the
  # above definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']
  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #          0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']
  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a
  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
  # NOTE: this parses as Graph - (Space - C), i.e. it removes the space
  # characters that are not in C from Graph.
  data['Graph'] -= data['Space'] - data['C']
  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]
  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a
  # Assigned - Defined in unicode.c; interpreted as every character in the
  # Unicode range minus the unassigned characters
  data['Assigned'] = data['Any'] - data['Cn']

  # Returns General Category Property names and the data
  [gcps, data]
end
# Reads Scripts.txt and prints one CR_<script> table per script via
# make_const, as a side effect of parsing.
#
# file:: path to Scripts.txt. Data lines have the form
#        "XXXX ; Script" or "XXXX..YYYY ; Script" (hex codepoints); a line
#        starting with "# Total code points: " closes the current script.
#
# Returns the Array of script names in the order their tables were printed.
def parse_scripts(file)
  script = nil
  data = []
  names = []
  IO.foreach(file) do |line|
    if /^# Total code points: / =~ line
      # End of a script's listing: emit everything collected for it.
      make_const(script, pair_codepoints(data), 'Script')
      names << script
      data = []
    elsif (m = /^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s*;\s*(\w+)/.match(line))
      # The range separator is a literal "..", so the dots are escaped; a
      # bare ".." would accept any two characters between the codepoints.
      script = m[3]
      first_cp = m[1].to_i(16)
      last_cp = m[2] ? m[2].to_i(16) : first_cp
      data.concat((first_cp..last_cp).to_a)
    end
  end
  names
end
# make_const(prop, pairs, name): Prints, on standard output, a 'static const'
# OnigCodePoint array named CR_<prop>.
#
# prop::  property name; used in the C identifier and the comments
# pairs:: Array of [first, last] Integer codepoint ranges; left unmodified
# name::  human-friendly description of the group, printed in the leading
#         comment
#
# Returns nil (the result of the final puts).
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    # A literal "0x" prefix with at least four zero-padded hex digits, so
    # codepoint 0 needs no special case. Format into new strings rather than
    # rewriting the caller's pairs in place.
    first, last = [pair.first, pair.last].map { |c| sprintf("0x%04x", c) }
    puts "\t#{first}, #{last},"
  end
  puts "}; /* CR_#{prop} */"
end
# ---- Main: write the keyword file to standard output ----
# The output uses a "%{ ... %}" declarations part followed by "%%"-delimited
# sections (presumably gperf input; confirm against the Makefile rule that
# consumes name2ctype.kwd).

# Declarations part: one code-range table per class/property.
puts '%{'
gcps, data = parse_unicode_data(ARGV[0])

# Tables for the POSIX bracket classes come first and are unconditional.
POSIX_NAMES.each do |posix|
  make_const(posix, pair_codepoints(data[posix]), "[[:#{posix}:]]")
end

# Tables for the Unicode properties are guarded by USE_UNICODE_PROPERTIES.
print "\n#ifdef USE_UNICODE_PROPERTIES"
gcps.each do |prop|
  # One-letter names are major categories, two-letter names general ones.
  label =
    if prop.size == 1
      'Major Category'
    elsif prop.size == 2
      'General Category'
    else
      '-'
    end
  make_const(prop, pair_codepoints(data[prop]), label)
end
# parse_scripts prints each script's table itself and returns the names.
scripts = parse_scripts(ARGV[1])
puts "#endif /* USE_UNICODE_PROPERTIES */"

# CodeRanges: pointers to every table, in the same order as the ctype numbers
# assigned further down.
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
POSIX_NAMES.each do |posix|
  puts " CR_#{posix},"
end
puts "#ifdef USE_UNICODE_PROPERTIES"
(gcps + scripts).each do |prop|
  puts " CR_#{prop},"
end
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"

# Lookup struct declaration; closes the declarations part and opens the
# keyword section.
puts(<<'__HEREDOC')
struct uniname2ctype_struct {
int name, ctype;
};
static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC

# Keyword section: "name, index" rows, numbered consecutively from 0 across
# POSIX classes, general categories and scripts.
ctype = 0
POSIX_NAMES.each do |posix|
  puts "%-21s %3d" % [posix + ',', ctype]
  ctype += 1
end
puts "#ifdef USE_UNICODE_PROPERTIES"
(gcps + scripts).each do |prop|
  puts "%-21s %3d" % [prop + ',', ctype]
  ctype += 1
end
puts "#endif /* USE_UNICODE_PROPERTIES */\n"

# Trailer: C wrapper that turns a name lookup into a ctype index, or -1.
puts(<<'__HEREDOC')
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
if (p) return p->ctype;
return -1;
}
__HEREDOC