mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	* tool/enc-unicode.rb,
enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: use UTS#18 for POSIX character class. http://rubyspec.org/issues/show/161 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25338 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
		
							parent
							
								
									6dd93ff60d
								
							
						
					
					
						commit
						d5537936ab
					
				
					 6 changed files with 8268 additions and 15741 deletions
				
			
		| 
						 | 
				
			
			@ -1,3 +1,11 @@
 | 
			
		|||
Thu Oct 15 00:47:42 2009  NARUSE, Yui  <naruse@ruby-lang.org>
 | 
			
		||||
 | 
			
		||||
	* tool/enc-unicode.rb,
 | 
			
		||||
	  enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
 | 
			
		||||
	  enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
 | 
			
		||||
	  use UTS#18 for POSIX character class.
 | 
			
		||||
	  http://rubyspec.org/issues/show/161
 | 
			
		||||
 | 
			
		||||
Thu Oct 15 00:26:07 2009  Tanaka Akira  <akr@fsij.org>
 | 
			
		||||
 | 
			
		||||
	* ext/socket/init.c (rsock_init_sock): validate file descriptor.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							| 
						 | 
				
			
			@ -47,7 +47,8 @@ end
 | 
			
		|||
 | 
			
		||||
def parse_unicode_data(file)
 | 
			
		||||
  last_cp = 0
 | 
			
		||||
  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
 | 
			
		||||
  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
 | 
			
		||||
    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
 | 
			
		||||
  beg_cp = nil
 | 
			
		||||
  IO.foreach(file) do |line|
 | 
			
		||||
    fields = line.split(';')
 | 
			
		||||
| 
						 | 
				
			
			@ -92,111 +93,76 @@ def parse_unicode_data(file)
 | 
			
		|||
  data['C'] += cn_remainder
 | 
			
		||||
 | 
			
		||||
  # Define General Category properties
 | 
			
		||||
  gcps = data.keys.sort
 | 
			
		||||
 | 
			
		||||
  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
 | 
			
		||||
  #
 | 
			
		||||
 | 
			
		||||
  # alnum    Letter | Mark | Decimal_Number
 | 
			
		||||
  data['Alnum'] = data['L'] + data['M'] + data['Nd']
 | 
			
		||||
 | 
			
		||||
  # alpha    Letter | Mark
 | 
			
		||||
  data['Alpha'] = data['L'] + data['M']
 | 
			
		||||
 | 
			
		||||
  # ascii    0000 - 007F
 | 
			
		||||
  data['ASCII'] = (0..0x007F).to_a
 | 
			
		||||
 | 
			
		||||
  # blank    Space_Separator | 0009
 | 
			
		||||
  data['Blank'] = data['Zs'] + [0x0009]
 | 
			
		||||
 | 
			
		||||
  # cntrl    Control
 | 
			
		||||
  data['Cntrl'] = data['Cc']
 | 
			
		||||
 | 
			
		||||
  # digit    Decimal_Number
 | 
			
		||||
  data['Digit'] = data['Nd']
 | 
			
		||||
 | 
			
		||||
  # lower    Lowercase_Letter
 | 
			
		||||
  data['Lower'] = data['Ll']
 | 
			
		||||
 | 
			
		||||
  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
 | 
			
		||||
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
 | 
			
		||||
  #          Open_Punctuation
 | 
			
		||||
  # NOTE: This definition encompasses the entire P category, and the current
 | 
			
		||||
  # mappings agree, but we explicitly declare this way to marry it with the above
 | 
			
		||||
  # definition.
 | 
			
		||||
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
 | 
			
		||||
                  data['Pi'] + data['Po'] + data['Ps']
 | 
			
		||||
 | 
			
		||||
  # space    Space_Separator | Line_Separator | Paragraph_Separator |
 | 
			
		||||
  #               0009 | 000A | 000B | 000C | 000D | 0085
 | 
			
		||||
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
 | 
			
		||||
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
 | 
			
		||||
 | 
			
		||||
  # upper    Uppercase_Letter
 | 
			
		||||
  data['Upper'] = data['Lu']
 | 
			
		||||
 | 
			
		||||
  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
 | 
			
		||||
  #          (0-9, a-f, A-F)
 | 
			
		||||
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
 | 
			
		||||
                   (0x0061..0x0066).to_a
 | 
			
		||||
 | 
			
		||||
  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
 | 
			
		||||
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
 | 
			
		||||
 | 
			
		||||
  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
 | 
			
		||||
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
 | 
			
		||||
  data['Graph'] -= data['Space'] - data['C']
 | 
			
		||||
 | 
			
		||||
  # print    [[:graph:]] | [[:space:]]
 | 
			
		||||
  data['Print'] = data['Graph'] + data['Space']
 | 
			
		||||
 | 
			
		||||
  # NEWLINE - This was defined in unicode.c
 | 
			
		||||
  data['NEWLINE'] = [0x000a]
 | 
			
		||||
 | 
			
		||||
  # Any - Defined in unicode.c
 | 
			
		||||
  data['Any'] = (0x0000..0x10ffff).to_a
 | 
			
		||||
  gcps = data.keys.sort - POSIX_NAMES
 | 
			
		||||
 | 
			
		||||
  # Returns General Category Property names and the data
 | 
			
		||||
  [gcps, data]
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
def define_posix_props(data)
 | 
			
		||||
  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
 | 
			
		||||
  #
 | 
			
		||||
 | 
			
		||||
def parse_scripts
 | 
			
		||||
  data['Alpha'] = data['Alphabetic']
 | 
			
		||||
  data['Upper'] = data['Uppercase']
 | 
			
		||||
  data['Lower'] = data['Lowercase']
 | 
			
		||||
  data['Punct'] = data['Punctuation']
 | 
			
		||||
  data['Digit'] = data['Decimal_Number']
 | 
			
		||||
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
 | 
			
		||||
                   (0x0061..0x0066).to_a
 | 
			
		||||
  data['Alnum'] = data['Alpha'] + data['Digit']
 | 
			
		||||
  data['Space'] = data['White_Space']
 | 
			
		||||
  data['Blank'] = data['White_Space'] - [0x0A, 0x0B, 0x0C, 0x0D, 0x85] -
 | 
			
		||||
    data['Line_Separator'] - data['Paragraph_Separator']
 | 
			
		||||
  data['Cntrl'] = data['Cc']
 | 
			
		||||
  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
 | 
			
		||||
  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
 | 
			
		||||
    data['Surrogate'] - data['Unassigned']
 | 
			
		||||
  data['Print'] = data['Graph'] + data['Blank'] - data['Cntrl']
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
def parse_scripts(data)
 | 
			
		||||
  files = [
 | 
			
		||||
    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
 | 
			
		||||
    {fn: 'Scripts.txt', title: 'Script'},
 | 
			
		||||
    {fn: 'PropList.txt', title: 'Binary Property'}
 | 
			
		||||
  ]
 | 
			
		||||
  current = nil
 | 
			
		||||
  data = []
 | 
			
		||||
  cps = []
 | 
			
		||||
  names = []
 | 
			
		||||
  files.each do |file|
 | 
			
		||||
    IO.foreach(get_file(file[:fn])) do |line|
 | 
			
		||||
      if /^# Total code points: / =~ line
 | 
			
		||||
        make_const(current, pair_codepoints(data), file[:title])
 | 
			
		||||
        data[current] = cps
 | 
			
		||||
        make_const(current, cps, file[:title])
 | 
			
		||||
        names << current
 | 
			
		||||
        data = []
 | 
			
		||||
        cps = []
 | 
			
		||||
      elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
 | 
			
		||||
        current = $3
 | 
			
		||||
        $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
 | 
			
		||||
        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
 | 
			
		||||
      end
 | 
			
		||||
    end
 | 
			
		||||
  end
 | 
			
		||||
  names
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
def parse_aliases
 | 
			
		||||
def parse_aliases(data)
 | 
			
		||||
  kv = {}
 | 
			
		||||
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
 | 
			
		||||
    next unless /^(\w+)\s*; (\w+)/ =~ line
 | 
			
		||||
    data[$1] = data[$2]
 | 
			
		||||
    kv[normalize_propname($1)] = normalize_propname($2)
 | 
			
		||||
  end
 | 
			
		||||
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
 | 
			
		||||
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
 | 
			
		||||
    if $1 == 'gc'
 | 
			
		||||
      data[$3] = data[$2]
 | 
			
		||||
      data[$4] = data[$2]
 | 
			
		||||
      kv[normalize_propname($3)] = normalize_propname($2)
 | 
			
		||||
      kv[normalize_propname($4)] = normalize_propname($2) if $4
 | 
			
		||||
    else
 | 
			
		||||
      data[$2] = data[$3]
 | 
			
		||||
      data[$4] = data[$3]
 | 
			
		||||
      kv[normalize_propname($2)] = normalize_propname($3)
 | 
			
		||||
      kv[normalize_propname($4)] = normalize_propname($3) if $4
 | 
			
		||||
    end
 | 
			
		||||
| 
						 | 
				
			
			@ -204,19 +170,26 @@ def parse_aliases
 | 
			
		|||
  kv
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
$const_cache = {}
 | 
			
		||||
# make_const(property, pairs, name): Prints a 'static const' structure for a
 | 
			
		||||
# given property, group of paired codepoints, and a human-friendly name for
 | 
			
		||||
# the group
 | 
			
		||||
def make_const(prop, pairs, name)
 | 
			
		||||
def make_const(prop, data, name)
 | 
			
		||||
  puts "\n/* '#{prop}': #{name} */"
 | 
			
		||||
  puts "static const OnigCodePoint CR_#{prop}[] = {"
 | 
			
		||||
  # The first element of the constant is the number of pairs of codepoints
 | 
			
		||||
  puts "\t#{pairs.size},"
 | 
			
		||||
  pairs.each do |pair|
 | 
			
		||||
    pair.map! { |c|  c == 0 ? '0x0000' : sprintf("%0#6x", c) }
 | 
			
		||||
    puts "\t#{pair.first}, #{pair.last},"
 | 
			
		||||
  if origprop = $const_cache.key(data)
 | 
			
		||||
    puts "#define CR_#{prop} CR_#{origprop}"
 | 
			
		||||
  else
 | 
			
		||||
    $const_cache[prop] = data
 | 
			
		||||
    pairs = pair_codepoints(data)
 | 
			
		||||
    puts "static const OnigCodePoint CR_#{prop}[] = {"
 | 
			
		||||
    # The first element of the constant is the number of pairs of codepoints
 | 
			
		||||
    puts "\t#{pairs.size},"
 | 
			
		||||
    pairs.each do |pair|
 | 
			
		||||
      pair.map! { |c|  c == 0 ? '0x0000' : sprintf("%0#6x", c) }
 | 
			
		||||
      puts "\t#{pair.first}, #{pair.last},"
 | 
			
		||||
    end
 | 
			
		||||
    puts "}; /* CR_#{prop} */"
 | 
			
		||||
  end
 | 
			
		||||
  puts "}; /* CR_#{prop} */"
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
def normalize_propname(name)
 | 
			
		||||
| 
						 | 
				
			
			@ -233,9 +206,6 @@ end
 | 
			
		|||
# Write Data
 | 
			
		||||
puts '%{'
 | 
			
		||||
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
 | 
			
		||||
POSIX_NAMES.each do |name|
 | 
			
		||||
  make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
 | 
			
		||||
end
 | 
			
		||||
print "\n#ifdef USE_UNICODE_PROPERTIES"
 | 
			
		||||
props.each do |name|
 | 
			
		||||
  category =
 | 
			
		||||
| 
						 | 
				
			
			@ -244,11 +214,16 @@ props.each do |name|
 | 
			
		|||
    when 2 then 'General Category'
 | 
			
		||||
    else        '-'
 | 
			
		||||
    end
 | 
			
		||||
  make_const(name, pair_codepoints(data[name]), category)
 | 
			
		||||
  make_const(name, data[name], category)
 | 
			
		||||
end
 | 
			
		||||
props.concat parse_scripts(data)
 | 
			
		||||
puts '#endif /* USE_UNICODE_PROPERTIES */'
 | 
			
		||||
aliases = parse_aliases(data)
 | 
			
		||||
define_posix_props(data)
 | 
			
		||||
POSIX_NAMES.each do |name|
 | 
			
		||||
  make_const(name, data[name], "[[:#{name}:]]")
 | 
			
		||||
end
 | 
			
		||||
props.concat parse_scripts
 | 
			
		||||
puts(<<'__HEREDOC')
 | 
			
		||||
#endif /* USE_UNICODE_PROPERTIES */
 | 
			
		||||
 | 
			
		||||
static const OnigCodePoint* const CodeRanges[] = {
 | 
			
		||||
__HEREDOC
 | 
			
		||||
| 
						 | 
				
			
			@ -283,7 +258,7 @@ props.each do |name|
 | 
			
		|||
  name_to_index[name] = i
 | 
			
		||||
  puts "%-40s %3d" % [name + ',', i]
 | 
			
		||||
end
 | 
			
		||||
parse_aliases.each_pair do |k, v|
 | 
			
		||||
aliases.each_pair do |k, v|
 | 
			
		||||
  next if name_to_index[k]
 | 
			
		||||
  next unless v = name_to_index[v]
 | 
			
		||||
  puts "%-40s %3d" % [k + ',', v]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue