mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	Update Oniguruma's UnicodeData to 5.1.
* tool/enc-unicode.rb: added to generate name2ctype.kwd.
  Contributed by Run Paint Run Run [ruby-core:24775].
  Use as follows:
    ruby19 tool/enc-unicode.rb enc/unicode/UnicodeData.txt \
      enc/unicode/Scripts.txt > enc/unicode/name2ctype.kwd
* enc/unicode.c (CodeRanges): move definitions to name2ctype.h.
* enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd,
  enc/unicode/name2ctype.src: updated to v5.1.
* enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1.
* Makefile.in: add rule to generate name2ctype.kwd from
  UnicodeData.txt and Scripts.txt.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24651 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
			
			
This commit is contained in:
		
							parent
							
								
									649f648ec1
								
							
						
					
					
						commit
						f1eff95745
					
				
					 9 changed files with 49881 additions and 9068 deletions
				
			
		
							
								
								
									
										230
									
								
								tool/enc-unicode.rb
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										230
									
								
								tool/enc-unicode.rb
									
										
									
									
									
										Executable file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,230 @@
 | 
			
		|||
#!/usr/bin/env ruby
 | 
			
		||||
 | 
			
		||||
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
 | 
			
		||||
# property names and POSIX character classes
 | 
			
		||||
 | 
			
		||||
# Exactly two arguments are required: the paths to UnicodeData.txt and
# Scripts.txt, in that order. Anything else prints the usage line to stderr
# and exits with status 1.
if ARGV.size != 2
  $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
  exit 1
end

# POSIX bracket class names. They are emitted first, in this order, both as
# CR_<name> tables and as the lowest-numbered entries of the keyword list.
POSIX_NAMES = %w[
  NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit
  Word Alnum ASCII
]
 | 
			
		||||
 | 
			
		||||
# Collapses a list of codepoints into inclusive [first, last] ranges.
#
# The input may be unsorted and may contain duplicates; a sorted,
# de-duplicated copy is used, so the caller's Array is left untouched.
# A codepoint with no adjacent neighbour yields a range that starts and ends
# on the same value, e.g. [0x0020, 0x0020].
#
# codepoints - Array of Integer codepoints.
#
# Returns an Array of two-element Arrays ordered by start codepoint, or an
# empty Array when codepoints is empty.
def pair_codepoints(codepoints)
  pairs = []
  codepoints.uniq.sort.each do |codepoint|
    if pairs.empty? || pairs.last[1].next != codepoint
      # Either the very first codepoint, or a gap: open a new range that (for
      # now) ends where it starts.
      pairs << [codepoint, codepoint]
    else
      # Directly follows the previous codepoint: extend the open range.
      pairs.last[1] = codepoint
    end
  end
  pairs
end
 | 
			
		||||
 | 
			
		||||
# Parses UnicodeData.txt and derives every codepoint list the generator needs.
#
# file - path to UnicodeData.txt. Each row is semicolon-separated: field 0 is
#        the hexadecimal codepoint, field 1 the character name and field 2 the
#        General Category (e.g. Lu).
#
# Returns a two-element Array [gcps, data]:
#   gcps - property names in emission order: 'Any', 'Assigned', then every
#          General/Major Category key, sorted.
#   data - Hash mapping each category, derived property and POSIX class name
#          to an Array of Integer codepoints.
#
# Raises NoMethodError when a General Category that the POSIX derivations
# below rely on (L, M, N, P, S, Nd, Zs, Zl, Zp, Cc, Ll, Lu, Pc, Pd, Pe, Pf,
# Pi, Po, Ps) does not occur in the file.
def parse_unicode_data(file)
  # Start below zero so that a file not beginning at U+0000 still reports the
  # leading codepoints as unassigned.
  last_cp = -1
  range_start = nil
  data = {'Cn' => [], 'C' => []}
  File.foreach(file) do |line|
    fields = line.chomp.split(';')
    # Skip blank or truncated rows instead of failing on a missing category.
    next if fields.size < 3
    cp = fields[0].to_i(16)

    # Large blocks are listed as two rows only, named "<..., First>" and
    # "<..., Last>"; every codepoint between them shares the same category
    # and is assigned, not a hole.
    case fields[1]
    when /\A<.*, First>\z/
      range_start = cp
      next
    when /\A<.*, Last>\z/
      first_cp = range_start || cp
    else
      first_cp = cp
    end
    range_start = nil
    cps = (first_cp..cp).to_a

    # The Cn category represents unassigned characters. These are not listed
    # in UnicodeData.txt, so they are derived from the 'holes' between the
    # previous row and the start of the current row (or range).
    data['Cn'].concat((last_cp.next...first_cp).to_a)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0, 1]] ||= []).concat(cps)
    last_cp = cp
  end

  # General Category property names. Collected before the POSIX classes are
  # added to data so that only real categories are listed.
  gcps = %w[Any Assigned]
  gcps.concat data.keys.sort

  # Everything after the last listed codepoint, up to 0x10ffff, is unassigned.
  data['Cn'].concat((last_cp.next..0x10ffff).to_a)
  # Cn is a member of the major category C, so C receives every unassigned
  # codepoint (holes as well as the tail).
  data['C'] += data['Cn']

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #

  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']

  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']

  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a

  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]

  # cntrl    Control
  data['Cntrl'] = data['Cc']

  # digit    Decimal_Number
  data['Digit'] = data['Nd']

  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']

  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare it this way to marry it with the
  # above definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']

  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #               0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]

  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']

  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a

  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']

  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  # Space and C are each removed from the candidate set. (Written as
  # "Graph -= Space - C" this would group as Graph - (Space - C) and leave
  # members of C in place.)
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S'] -
                  data['Space'] - data['C']

  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]

  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a

  # Assigned - Defined in unicode.c; interpreted as every character in the
  # Unicode range minus the unassigned characters
  data['Assigned'] = data['Any'] - data['Cn']

  # Returns General Category Property names and the data
  [gcps, data]
end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Parses Scripts.txt, printing one CR_<script> table per script as it goes.
#
# Data rows look like "0041..005A ; Latin # ..." (a range) or
# "00AA ; Latin # ..." (a single codepoint). Codepoints are accumulated until
# a "# Total code points:" comment line is reached, at which point the table
# for the most recently seen script name is printed via make_const and the
# accumulator is reset.
#
# file - path to Scripts.txt.
#
# Returns an Array of script names in the order their tables were printed.
def parse_scripts(file)
  script = nil
  data = []
  names = []
  File.foreach(file) do |line|
    if /^# Total code points: / =~ line
      # Nothing collected since the last total line: there is no table to
      # print, and pair_codepoints has nothing to pair.
      next if data.empty?
      make_const(script, pair_codepoints(data), 'Script')
      names << script
      data = []
    elsif (m = /^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s*;\s*(\w+)/.match(line))
      # The range separator is a literal "..", so both dots are escaped.
      first_cp = m[1].to_i(16)
      last_cp = m[2] ? m[2].to_i(16) : first_cp
      script = m[3]
      data.concat((first_cp..last_cp).to_a)
    end
  end
  names
end
 | 
			
		||||
 | 
			
		||||
# make_const(prop, pairs, name): Prints a C 'static const OnigCodePoint'
# array for one property to standard output.
#
# prop  - property name; the emitted array is called CR_<prop>.
# pairs - Array of [first, last] inclusive codepoint ranges (Integers), in the
#         form returned by pair_codepoints. It is not modified.
# name  - human-friendly description placed in the leading C comment.
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    # '0x%04x' pads to at least four hex digits and renders 0 as 0x0000, so
    # zero needs no special case and the caller's integers are left intact.
    puts format("\t0x%04x, 0x%04x,", pair.first, pair.last)
  end
  puts "}; /* CR_#{prop} */"
end
 | 
			
		||||
 | 
			
		||||
# --- C declarations section of the gperf input ------------------------------
# Everything between '%{' and '%}' is C code: first the CR_<name> code-range
# tables, then the CodeRanges index, then the lookup struct declaration.
puts '%{'
gcps, data = parse_unicode_data(ARGV[0])

# POSIX bracket tables are always emitted.
POSIX_NAMES.each do |posix|
  make_const(posix, pair_codepoints(data[posix]), "[[:#{posix}:]]")
end

# General Category and script tables are emitted under
# USE_UNICODE_PROPERTIES. A one-letter name is a major category, a two-letter
# name a general category; anything else ('Any', 'Assigned') gets '-'.
print "\n#ifdef USE_UNICODE_PROPERTIES"
category_labels = { 1 => 'Major Category', 2 => 'General Category' }
gcps.each do |prop|
  make_const(prop, pair_codepoints(data[prop]), category_labels.fetch(prop.size, '-'))
end
# parse_scripts prints the script tables itself and returns their names.
scripts = parse_scripts(ARGV[1])
puts "#endif /* USE_UNICODE_PROPERTIES */"

# Index of all tables, in the same order as the ctype numbers assigned below.
property_names = gcps + scripts
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
POSIX_NAMES.each { |posix| puts "  CR_#{posix}," }
puts "#ifdef USE_UNICODE_PROPERTIES"
property_names.each { |prop| puts "  CR_#{prop}," }
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"

puts <<'GPERF_DECLARATIONS'
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
GPERF_DECLARATIONS

# --- gperf keyword section ---------------------------------------------------
# One "name, number" row per property; numbers count up from 0 and continue
# across the POSIX names, the General Category names and the script names.
ctype = -1
keyword_row = lambda { |name| "%-21s %3d" % [name + ',', ctype += 1] }
POSIX_NAMES.each { |posix| puts keyword_row.call(posix) }
puts "#ifdef USE_UNICODE_PROPERTIES"
property_names.each { |prop| puts keyword_row.call(prop) }
puts "#endif /* USE_UNICODE_PROPERTIES */\n"

# --- gperf functions section -------------------------------------------------
puts <<'GPERF_FUNCTIONS'
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
GPERF_FUNCTIONS
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue