%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...

# Directory holding the Unicode data files; the first command-line argument
# overrides the built-in default.
InputDataDir = ARGV[0] || 'enc/unicode/data'
# Version number (x.y.z) found as a path component of InputDataDir, or nil
# when the path contains none.
unicode_version = InputDataDir[/.*\/(\d+\.\d+\.\d+)(?=\/|\z)/, 1]

# convenience methods
class Integer
  # Spells this code point the way it is written inside a double-quoted
  # string literal, favouring legibility:
  #   above 0xFFFF -> \u{HEX}
  #   above 0x7F   -> \uHHHH (zero-padded to four digits)
  #   otherwise    -> the character itself, with a backslash placed in front
  #                   of a backslash or a double quote
  def to_UTF8()
    hex = to_s(16).upcase
    return "\\u{#{hex}}" if self > 0xFFFF
    return "\\u#{hex.rjust(4, '0')}" if self > 0x7f

    chr.sub(/[\\\"]/, "\\\\\\\&")
  end
end

module Enumerable
  # Fallback used only when Enumerable does not already provide each_slice.
  unless method_defined?(:each_slice)
    # Yields the elements in consecutive arrays of at most n items and
    # returns the receiver.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk << item
        next if chunk.size < n

        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end

class Array
  # Concatenates the to_UTF8 spelling of every element.
  def to_UTF8()
    map {|code| code.to_UTF8}.join('')
  end

  # Treats the receiver as a list of Integer code points and turns it into
  # regexp character-class fragments: the values are sorted, runs of
  # adjacent (or repeated) values are merged, and each run is spelled as a
  # single character, two characters, or "first-last".  The fragments are
  # yielded to the block joined in groups of n.
  def each_regexp_chars(n = 1)
    runs = []
    sort.each do |code|
      current = runs.last
      if current && current[1] + 1 >= code
        current[1] = code   # extend the run in progress
      else
        runs << [code, code]
      end
    end

    fragments = runs.map do |low, high|
      case high - low
      when 0 then low.to_UTF8
      when 1 then low.to_UTF8 + high.to_UTF8
      else        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    fragments.each_slice(n) {|group| yield group.join('')}
  end
end

# read the file 'CompositionExclusions.txt'
# NOTE(review): vpath is not defined in this template; presumably the
# program that renders the template supplies it -- confirm.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') do |file|
  stem = Regexp.quote(File.basename(file.path, '.*'))
  suffix = Regexp.quote(File.extname(file.path))

  # The first line is expected to look like "# <basename>-<version><ext>".
  header = file.gets
  version = header[/^# *#{stem}-([\d.]+)#{suffix}\s*$/, 1]
  abort "No file version in #{file.path}: #{header}" unless version

  # Use the file's version when the directory name gave none; otherwise
  # the two have to agree.
  unicode_version ||= version
  unless unicode_version == version
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  end

  # Every line that starts with a code point contributes that code point.
  file.grep(/^[A-Z0-9]{4,5}/) {|entry| entry.hex}
end

decomposition_table = {}  # code point => decomposition given without a <tag>
kompatible_table = {}     # code point => decomposition given with a <tag>
combining_class = {}      # code point => combining class, stored only when it is not "0"

# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line| codepoint, name, _, char_class, _, decomposition, *_rest = line.split(";") case decomposition when /^[0-9A-F]/ decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex} when /^</ kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex} end combining_class[codepoint.hex] = char_class.to_i if char_class != "0" if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="") warn "Unexpected: Character range with data relevant to normalization!" end end # calculate compositions from decompositions composition_table = decomposition_table.reject do |character, decomposition| composition_exclusions.member? character or # predefined composition exclusion decomposition.length<=1 or # Singleton Decomposition combining_class[character] or # character is not a Starter combining_class[decomposition.first] # decomposition begins with a character that is not a Starter end.invert # recalculate composition_exclusions composition_exclusions = decomposition_table.keys - composition_table.values accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last} composition_starters = composition_table.keys.collect {|key| key.first} hangul_no_trailing = [] 0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c} # expand decomposition table values decomposition_table.each do |key, value| position = 0 while position < value.length if decomposition = decomposition_table[value[position]] decomposition_table[key] = value = value.dup # avoid overwriting composition_table key value[position, 1] = decomposition else position += 1 end end end # deal with relationship between canonical and kompatibility decompositions decomposition_table.each do |key, value| value = value.dup expanded = false position = 0 while position < value.length if decomposition = kompatible_table[value[position]] value[position, 1] = decomposition expanded = true else position += 1 end end 
kompatible_table[key] = value if expanded end while kompatible_table.any? {|key, value| expanded = value.map {|v| kompatible_table[v] || v}.flatten kompatible_table[key] = expanded unless value == expanded } end # generate normalization tables file %># coding: us-ascii # frozen_string_literal: true %# > # automatically generated by template/unicode_norm_gen.tmpl module UnicodeNormalize # :nodoc: accents = "" \ "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>]" ACCENTS = accents REGEXP_D_STRING = "#{'' # composition starters and composition exclusions }" \ "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>]#{accents}*" \ "|#{'' # characters that can be the result of a composition, except composition starters }" \ "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>]?#{accents}+" \ "|#{'' # precomposed Hangul syllables }" \ "[\u{AC00}-\u{D7A4}]" REGEXP_C_STRING = "#{'' # composition exclusions }" \ "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>]#{accents}*" \ "|#{'' # composition starters and characters that can be the result of a composition }" \ "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>]?#{accents}+" \ "|#{'' # Hangul syllables with separate trailer }" \ "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \ "<% end%>][\u11A8-\u11C2]" \ "|#{'' # decomposed Hangul syllables }" \ "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" 
REGEXP_K_STRING = "" \ "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \ "<%end%>]" class_table = { % combining_class.each do |key, value| "<%=key.to_UTF8%>"=><%=value%><%=%>, % end } class_table.default = 0 CLASS_TABLE = class_table.freeze DECOMPOSITION_TABLE = { % decomposition_table.each do |key, value| "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>, % end }.freeze KOMPATIBLE_TABLE = { % kompatible_table.each do |key, value| "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>, % end }.freeze COMPOSITION_TABLE = { % composition_table.each do |key, value| "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>, % end }.freeze end