From 5a4ce608e28068ffc149e869a4bcf6d8c48ca45d Mon Sep 17 00:00:00 2001 From: naruse Date: Thu, 8 Oct 2009 18:07:08 +0000 Subject: [PATCH] * tool/enc-unicode.rb: optimized. * enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: U+100000-U+10FFFD is assigned, not Cn. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25271 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 8 ++++++++ enc/unicode/name2ctype.h.blt | 8 +++++--- enc/unicode/name2ctype.kwd | 8 +++++--- enc/unicode/name2ctype.src | 8 +++++--- tool/enc-unicode.rb | 29 ++++++++++++++++++----------- 5 files changed, 41 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8a1d0b322a..8d671d556e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Fri Oct 9 02:58:18 2009 NARUSE, Yui + + * tool/enc-unicode.rb: optimized. + + * enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, + enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: + U+100000-U+10FFFD is assigned, not Cn. 
+ Fri Oct 9 02:12:02 2009 Marc-Andre Lafortune * ext/curses/curses.c: Many functions of module Curses could cause a diff --git a/enc/unicode/name2ctype.h.blt b/enc/unicode/name2ctype.h.blt index bbc19caf4d..9fcd60a135 100644 --- a/enc/unicode/name2ctype.h.blt +++ b/enc/unicode/name2ctype.h.blt @@ -3959,7 +3959,7 @@ static const OnigCodePoint CR_Any[] = { /* 'Assigned': - */ static const OnigCodePoint CR_Assigned[] = { - 484, + 485, 0x0000, 0x0377, 0x037a, 0x037e, 0x0384, 0x038a, @@ -4444,6 +4444,7 @@ static const OnigCodePoint CR_Assigned[] = { 0xe0020, 0xe007f, 0xe0100, 0xe01ef, 0xf0000, 0xffffd, + 0x100000, 0x10fffd, }; /* CR_Assigned */ /* 'C': Major Category */ @@ -4500,7 +4501,7 @@ static const OnigCodePoint CR_Cf[] = { /* 'Cn': General Category */ static const OnigCodePoint CR_Cn[] = { - 484, + 485, 0x0378, 0x0379, 0x037f, 0x0383, 0x038b, 0x038b, @@ -4984,7 +4985,8 @@ static const OnigCodePoint CR_Cn[] = { 0xe0002, 0xe001f, 0xe0080, 0xe00ff, 0xe01f0, 0xeffff, - 0xffffe, 0x10ffff, + 0xffffe, 0xfffff, + 0x10fffe, 0x10ffff, }; /* CR_Cn */ /* 'Co': General Category */ diff --git a/enc/unicode/name2ctype.kwd b/enc/unicode/name2ctype.kwd index 46058a8341..42e1244fe6 100644 --- a/enc/unicode/name2ctype.kwd +++ b/enc/unicode/name2ctype.kwd @@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = { /* 'Assigned': - */ static const OnigCodePoint CR_Assigned[] = { - 484, + 485, 0x0000, 0x0377, 0x037a, 0x037e, 0x0384, 0x038a, @@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = { 0xe0020, 0xe007f, 0xe0100, 0xe01ef, 0xf0000, 0xffffd, + 0x100000, 0x10fffd, }; /* CR_Assigned */ /* 'C': Major Category */ @@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = { /* 'Cn': General Category */ static const OnigCodePoint CR_Cn[] = { - 484, + 485, 0x0378, 0x0379, 0x037f, 0x0383, 0x038b, 0x038b, @@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = { 0xe0002, 0xe001f, 0xe0080, 0xe00ff, 0xe01f0, 0xeffff, - 0xffffe, 0x10ffff, + 0xffffe, 0xfffff, + 0x10fffe, 
0x10ffff, }; /* CR_Cn */ /* 'Co': General Category */ diff --git a/enc/unicode/name2ctype.src b/enc/unicode/name2ctype.src index 46058a8341..42e1244fe6 100644 --- a/enc/unicode/name2ctype.src +++ b/enc/unicode/name2ctype.src @@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = { /* 'Assigned': - */ static const OnigCodePoint CR_Assigned[] = { - 484, + 485, 0x0000, 0x0377, 0x037a, 0x037e, 0x0384, 0x038a, @@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = { 0xe0020, 0xe007f, 0xe0100, 0xe01ef, 0xf0000, 0xffffd, + 0x100000, 0x10fffd, }; /* CR_Assigned */ /* 'C': Major Category */ @@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = { /* 'Cn': General Category */ static const OnigCodePoint CR_Cn[] = { - 484, + 485, 0x0378, 0x0379, 0x037f, 0x0383, 0x038b, 0x038b, @@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = { 0xe0002, 0xe001f, 0xe0080, 0xe00ff, 0xe01f0, 0xeffff, - 0xffffe, 0x10ffff, + 0xffffe, 0xfffff, + 0x10fffe, 0x10ffff, }; /* CR_Cn */ /* 'Co': General Category */ diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb index 57edb3b3e5..6b14963217 100755 --- a/tool/enc-unicode.rb +++ b/tool/enc-unicode.rb @@ -2,6 +2,13 @@ # Creates the data structures needed by Onigurma to map Unicode codepoints to # property names and POSIX character classes +# +# To use this, get UnicodeData.txt and Scripts.txt from unicode.org +# (http://unicode.org/Public/UNIDATA/). +# Then run the following command: +# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd +# This generates the source file for gperf. +# After this, simply run make to build ruby. unless ARGV.size == 2 $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt" @@ -17,10 +24,11 @@ def pair_codepoints(codepoints) # codepoints with property _property_. Note: It is intended that some ranges # will begin with the value with which they end, e.g. 0x0020 -> 0x0020 - codepoints = codepoints.uniq.sort + codepoints.sort!
last_cp = codepoints.first pairs = [[last_cp, nil]] codepoints[1..-1].each do |codepoint| + next if last_cp == codepoint # If the current codepoint does not follow directly on from the last # codepoint, the last codepoint represents the end of the current range, @@ -39,7 +47,7 @@ end def parse_unicode_data(file) last_cp = 0 - data = {'Cn' => []} + data = {'Any' => [], 'Assigned' => [], 'Cn' => []} beg_cp = nil IO.foreach(file) do |line| fields = line.split(';') @@ -64,6 +72,10 @@ def parse_unicode_data(file) # Cn category. data['Cn'].concat((last_cp.next...beg_cp).to_a) + # Assigned - Defined in unicode.c; interpreted as every character in the + # Unicode range minus the unassigned characters + data['Assigned'].concat(cps) + # The third field denotes the 'General' category, e.g. Lu (data[fields[2]] ||= []).concat(cps) @@ -73,16 +85,15 @@ def parse_unicode_data(file) last_cp = cp end - # General Category property - gcps = %w[Any Assigned] - gcps.concat data.keys.sort - # The last Cn codepoint should be 0x10ffff. If it's not, append the missing # codepoints to Cn and C - cn_remainder = (data['Cn'].last.next..0x10ffff).to_a + cn_remainder = (last_cp.next..0x10ffff).to_a data['Cn'] += cn_remainder data['C'] += cn_remainder + # Define General Category properties + gcps = data.keys.sort + # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]] # @@ -145,10 +156,6 @@ def parse_unicode_data(file) # Any - Defined in unicode.c data['Any'] = (0x0000..0x10ffff).to_a - # Assigned - Defined in unicode.c; interpreted as every character in the - # Unicode range minus the unassigned characters - data['Assigned'] = data['Any'] - data['Cn'] - # Returns General Category Property names and the data [gcps, data] end