mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	 0c10564dd9
			
		
	
	
		0c10564dd9
		
	
	
	
	
		
			
			* lib/unicode_normalize/normalize.rb: [DOC] nodoc the internal UnicodeNormalize module. * lib/unicode_normalize/tables.rb: ditto. * template/unicode_norm_gen.tmpl: ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58329 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
		
			
				
	
	
		
			220 lines
		
	
	
	
		
			6.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			220 lines
		
	
	
	
		
			6.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| %# -*- mode: ruby; coding: utf-8 -*-
 | |
| <%
 | |
| # Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
 | |
| 
 | |
| # Script to generate Ruby data structures used in implementing
 | |
| # String#unicode_normalize,...
 | |
| 
 | |
| # Constants for input and output directory
 | |
| InputDataDir = ARGV[0] || 'enc/unicode/data'  # data directory: first command-line argument, else this default
 | |
| unicode_version = InputDataDir[/[\d.]+\z/]  # digits/dots at the very end of the directory name, or nil if there are none
 | |
| 
 | |
| # convenience methods
 | |
class Integer
  # Renders this code point as text that can be placed between the double
  # quotes of a Ruby string literal, taking legibility into account:
  # * above 0xFFFF        -> \u{HEX}  (uppercase hex digits, no padding)
  # * above 0x7f          -> \uHHHH   (uppercase hex, zero-padded to 4 digits)
  # * 0x7f and below      -> the character itself, with a backslash placed in
  #                          front of a double quote or a backslash
  def to_UTF8
    hex = to_s(16).upcase
    return "\\u{#{hex}}" if self > 0xFFFF
    return "\\u#{hex.rjust(4, '0')}" if self > 0x7f

    # Only these two characters would end or corrupt the surrounding literal.
    chr.sub(/["\\]/) { |special| "\\#{special}" }
  end
end
 | |
| 
 | |
module Enumerable
  # Fallback definition, installed only when Enumerable has no each_slice
  # of its own.
  unless method_defined?(:each_slice)
    # Yields the elements in consecutive arrays of +n+ items; a shorter final
    # array is yielded when elements are left over. Returns the receiver.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk.push(item)
        next if chunk.size < n

        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end
 | |
| 
 | |
class Array
  # Concatenates the to_UTF8 rendering of every element.
  def to_UTF8
    map { |element| element.to_UTF8 }.join('')
  end

  # Converts an array of Integers to character ranges and yields them as
  # strings, +n+ range expressions per yielded string.
  #
  # The values are sorted, then runs of equal or consecutive values are merged
  # into [low, high] pairs. A pair is rendered as a single character, as two
  # adjacent characters, or as "low-high" when it covers three or more values.
  def each_regexp_chars(n = 8)
    runs = []
    sort.each do |code|
      tail = runs.last
      if tail && tail[1] + 1 >= code
        tail[1] = code # extend the current run
      else
        runs << [code, code] # start a new run
      end
    end

    pieces = runs.map do |low, high|
      span = high - low
      if span == 0
        low.to_UTF8
      elsif span == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    pieces.each_slice(n) { |group| yield group.join('') }
  end
end
 | |
| 
 | |
| # read the file 'CompositionExclusions.txt': check the version in its first line, then collect the listed code points
 | |
| composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') {|f|  # presumably returns the block's value, like File.open -- verify against vpath
 | |
|   base = Regexp.quote(File.basename(f.path, '.*'))  # file name without its extension, escaped for use in a regexp
 | |
|   ext = Regexp.quote(File.extname(f.path))  # file extension, escaped likewise
 | |
|   version = (line = f.gets)[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1] or  # first line is expected to read "# <base>-<version><ext>"
 | |
|     abort "No file version in #{f.path}: #{line}"
 | |
|   (unicode_version ||= version) == version or  # adopt the file's version when the directory name supplied none
 | |
|     abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
 | |
|   f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}  # remaining lines that start with 4-5 such characters, converted with String#hex
 | |
| }
 | |
| 
 | |
| decomposition_table = {}  # code point => canonical decomposition (Array of code points)
 | |
| kompatible_table = {}  # code point => kompatibility decomposition (Array of code points)
 | |
| combining_class = {}  # code point => combining class as Integer; entries exist only for classes other than "0"
 | |
| 
 | |
| # read the file 'UnicodeData.txt', one semicolon-separated record per line
 | |
| vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
 | |
|   codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
 | |
| 
 | |
|   case decomposition
 | |
|   when /^[0-9A-F]/  # field starts with a hex digit: a plain list of code points
 | |
|     decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
 | |
|   when /^</  # field starts with a "<...>" tag: drop the tag, keep the code points after it
 | |
|     kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
 | |
|   end
 | |
|   combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
 | |
| 
 | |
|   if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")  # name ends in "First>" or "Last>" yet the record carries normalization data
 | |
|     warn "Unexpected: Character range with data relevant to normalization!"
 | |
|   end
 | |
| end
 | |
| 
 | |
| # calculate compositions from decompositions (entries matching any condition below are dropped, the rest is inverted)
 | |
| composition_table = decomposition_table.reject do |character, decomposition|
 | |
|   composition_exclusions.member? character or # predefined composition exclusion
 | |
|     decomposition.length<=1 or                # Singleton Decomposition
 | |
|     combining_class[character] or             # character is not a Starter
 | |
|     combining_class[decomposition.first]      # decomposition begins with a character that is not a Starter
 | |
| end.invert  # now maps decomposition (Array of code points) => composed code point
 | |
| 
 | |
| # recalculate composition_exclusions: every code point with a decomposition that is not a value of composition_table
 | |
| composition_exclusions = decomposition_table.keys - composition_table.values
 | |
| 
 | |
| accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last}  # code points with a stored combining class, plus the last element of every composition_table key
 | |
| 
 | |
| composition_starters = composition_table.keys.collect {|key| key.first}  # first element of every composition_table key
 | |
| 
 | |
| hangul_no_trailing = []
 | |
| 0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c}  # every 28th code point from 0xAC00 up to 0xD7A3
 | |
| 
 | |
| # expand decomposition table values until no element has a decomposition of its own
 | |
| decomposition_table.each do |key, value|
 | |
|   position = 0
 | |
|   while position < value.length
 | |
|     if decomposition = decomposition_table[value[position]]
 | |
|       decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
 | |
|       value[position, 1] = decomposition  # splice in place; position is not advanced, so the spliced-in elements are examined too
 | |
|     else
 | |
|       position += 1
 | |
|     end
 | |
|   end
 | |
| end
 | |
| 
 | |
| # deal with relationship between canonical and kompatibility decompositions: a canonical decomposition with kompatibility-decomposable elements also gets a kompatible_table entry
 | |
| decomposition_table.each do |key, value|
 | |
|   value = value.dup  # work on a copy so the array stored in decomposition_table is not modified
 | |
|   expanded = false
 | |
|   position = 0
 | |
|   while position < value.length
 | |
|     if decomposition = kompatible_table[value[position]]
 | |
|       value[position, 1] = decomposition
 | |
|       expanded = true
 | |
|     else
 | |
|       position += 1
 | |
|     end
 | |
|   end
 | |
|   kompatible_table[key] = value if expanded  # stored only when at least one element was replaced
 | |
| end
 | |
| # repeat a one-level expansion of kompatible_table values until a full pass changes nothing
 | |
| while kompatible_table.any? {|key, value|
 | |
|         expanded = value.map {|v| kompatible_table[v] || v}.flatten
 | |
|         kompatible_table[key] = expanded unless value == expanded  # truthy only when the entry changed, which ends this any? pass and restarts the loop
 | |
|       }
 | |
| end
 | |
| 
 | |
| # generate normalization tables file (the text after the closing tag on the next line is the template's output)
 | |
| %># coding: us-ascii
 | |
| # frozen_string_literal: true
 | |
| %# >
 | |
| 
 | |
| # automatically generated by template/unicode_norm_gen.tmpl
 | |
| 
 | |
| module UnicodeNormalize  # :nodoc:
 | |
|   accents = "" \
 | |
|     "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>]"
 | |
|   ACCENTS = accents
 | |
|   REGEXP_D_STRING = "#{''  # composition starters and composition exclusions
 | |
|     }" \
 | |
|     "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>]#{accents}*" \
 | |
|     "|#{''  # characters that can be the result of a composition, except composition starters
 | |
|     }" \
 | |
|     "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>]?#{accents}+" \
 | |
|     "|#{''  # precomposed Hangul syllables
 | |
|     }" \
 | |
|     "[\u{AC00}-\u{D7A4}]"
 | |
|   REGEXP_C_STRING = "#{''  # composition exclusions
 | |
|     }" \
 | |
|     "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>]#{accents}*" \
 | |
|     "|#{''  # composition starters and characters that can be the result of a composition
 | |
|     }" \
 | |
|     "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>]?#{accents}+" \
 | |
|     "|#{''  # Hangul syllables with separate trailer
 | |
|     }" \
 | |
|     "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<% end%>][\u11A8-\u11C2]" \
 | |
|     "|#{''  # decomposed Hangul syllables
 | |
|     }" \
 | |
|     "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
 | |
|   REGEXP_K_STRING = "" \
 | |
|     "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
 | |
|     "<%end%>]"
 | |
| 
 | |
|   class_table = {
 | |
| % combining_class.each_slice(8)  do |slice|  # eight entries per generated line
 | |
|    <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
 | |
| % end
 | |
|   }
 | |
|   class_table.default = 0
 | |
|   CLASS_TABLE = class_table.freeze
 | |
| 
 | |
|   DECOMPOSITION_TABLE = {
 | |
| % decomposition_table.each_slice(8) do |slice|  # eight entries per generated line
 | |
|    <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
 | |
| % end
 | |
|   }.freeze
 | |
| 
 | |
|   KOMPATIBLE_TABLE = {
 | |
| % kompatible_table.each_slice(8)  do |slice|  # eight entries per generated line
 | |
|    <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
 | |
| % end
 | |
|    }.freeze
 | |
| 
 | |
|   COMPOSITION_TABLE = {
 | |
| % composition_table.each_slice(8)  do |slice|  # eight entries per generated line
 | |
|    <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
 | |
| % end
 | |
|    }.freeze
 | |
| end
 |