diff --git a/common.mk b/common.mk index 40a227b047..4a8aa394d6 100644 --- a/common.mk +++ b/common.mk @@ -1120,10 +1120,14 @@ UPDATE_UNICODE_FILES_DEPS = $(ALWAYS_UPDATE_UNICODE:yes=PHONY) UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt @exit > .update-unicode.time -$(srcdir)/lib/unicode_normalize/tables.rb: \ - $(srcdir)/tool/unicode_norm_gen.rb $(UNICODE_FILES) - $(BASERUBY) -s -C "$(srcdir)" tool/unicode_norm_gen.rb \ - -input=enc/unicode/data -ouput=lib/unicode_normalize +$(srcdir)/lib/unicode_normalize/tables.rb: ./.unicode-tables.time + +./.unicode-tables.time: $(srcdir)/tool/generic_erb.rb \ + $(srcdir)/template/unicode_norm_gen.tmpl $(UNICODE_FILES) + $(Q) $(BASERUBY) $(srcdir)/tool/generic_erb.rb \ + -c -t$@ -o $(srcdir)/lib/unicode_normalize/tables.rb \ + -I $(srcdir) \ + $(srcdir)/template/unicode_norm_gen.tmpl enc/unicode/data lib/unicode_normalize info: info-program info-libruby_a info-libruby_so info-arch info-program: PHONY diff --git a/tool/unicode_norm_gen.rb b/template/unicode_norm_gen.tmpl similarity index 53% rename from tool/unicode_norm_gen.rb rename to template/unicode_norm_gen.tmpl index 766be26dc4..332cb156ef 100644 --- a/tool/unicode_norm_gen.rb +++ b/template/unicode_norm_gen.tmpl @@ -1,13 +1,13 @@ -# coding: utf-8 - +%# -*- mode: ruby; coding: utf-8 -*- +<% # Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp) # Script to generate Ruby data structures used in implementing # String#unicode_normalize,... # Constants for input and ouput directory -InputDataDir = $input || 'enc/unicode/data' -OuputDataDir = $ouput || 'lib/unicode_normalize' +InputDataDir = ARGV[0] || 'enc/unicode/data' +OuputDataDir = ARGV[1] || 'lib/unicode_normalize' # convenience methods class Integer @@ -22,18 +22,27 @@ class Integer end end -class Array - def line_slice(new_line) # joins items, 8 items per line - ary = [] - 0.step(size-1, 8) {|i| - ary << self[i, 8].join('') - } - ary.join(new_line).gsub(/ +$/, '') +module Enumerable + unless method_defined?(:each_slice) + def each_slice(n) + ary = [] + each do |i| + ary << i + if ary.size >= n + yield ary + ary = [] + end + end + yield ary unless ary.empty? + self + end end +end +class Array def to_UTF8() collect {|c| c.to_UTF8}.join('') end - def to_regexp_chars # converts an array of Integers to character ranges + def each_regexp_chars(n = 8) # converts an array of Integers to character ranges sort.inject([]) do |ranges, value| if ranges.last and ranges.last[1]+1>=value ranges.last[1] = value @@ -50,29 +59,23 @@ class Array else first.to_UTF8 + '-' + last.to_UTF8 end - end.line_slice "\" \\\n \"" - end -end - -class Hash - def to_hash_string - collect do |key, value| - "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\".freeze, " - end.line_slice "\n " + end.each_slice(n) do |slice| + yield slice.join('') + end end end # read the file 'CompositionExclusions.txt' -composition_exclusions = File.open("#{InputDataDir}/CompositionExclusions.txt") {|f| +composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f| f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex} } decomposition_table = {} kompatible_table = {} -CombiningClass = {} # constant to allow use in Integer#to_UTF8 +combining_class = {} # constant to allow use in Integer#to_UTF8 # read the file 'UnicodeData.txt' -IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line| +vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line| codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") case decomposition @@ -81,7 +84,7 @@ IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line| when /^$/ and (char_class!="0" or decomposition!="") warn "Unexpected: Character range with data relevant to normalization!" @@ -92,14 +95,14 @@ end composition_table = decomposition_table.reject do |character, decomposition| composition_exclusions.member? character or # predefined composition exclusion decomposition.length<=1 or # Singleton Decomposition - CombiningClass[character] or # character is not a Starter - CombiningClass[decomposition.first] # decomposition begins with a character that is not a Starter + combining_class[character] or # character is not a Starter + combining_class[decomposition.first] # decomposition begins with a character that is not a Starter end.invert # recalculate composition_exclusions composition_exclusions = decomposition_table.keys - composition_table.values -accent_array = CombiningClass.keys + composition_table.keys.collect {|key| key.last} +accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last} composition_starters = composition_table.keys.collect {|key| key.first} @@ -135,64 +138,74 @@ decomposition_table.each do |key, value| kompatible_table[key] = value if expanded end -class_table_str = CombiningClass.collect do |key, value| - "\"#{key.to_UTF8}\"=>#{value}, " -end.line_slice "\n " - # generate normalization tables file -open("#{OuputDataDir}/tables.rb", "w").print <# coding: us-ascii +%# > # automatically generated by tool/unicode_norm_gen.rb module UnicodeNormalize - accents = "" \\ - "[#{accent_array.to_regexp_chars}]" \\ + accents = "" \ + "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>]" \ "".freeze ACCENTS = accents - REGEXP_D_STRING = "\#{'' # composition starters and composition exclusions - }" \\ - "[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{accents}*" \\ - "|\#{'' # characters that can be the result of a composition, except composition starters - }" \\ - "[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{accents}+" \\ - "|\#{'' # precomposed Hangul syllables - }" \\ - "[\\u{AC00}-\\u{D7A4}]" \\ + REGEXP_D_STRING = "#{'' # composition starters and composition exclusions + }" \ + "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>]#{accents}*" \ + "|#{'' # characters that can be the result of a composition, except composition starters + }" \ + "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>]?#{accents}+" \ + "|#{'' # precomposed Hangul syllables + }" \ + "[\u{AC00}-\u{D7A4}]" \ "".freeze - REGEXP_C_STRING = "\#{'' # composition exclusions - }" \\ - "[#{composition_exclusions.to_regexp_chars}]\#{accents}*" \\ - "|\#{'' # composition starters and characters that can be the result of a composition - }" \\ - "[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{accents}+" \\ - "|\#{'' # Hangul syllables with separate trailer - }" \\ - "[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]" \\ - "|\#{'' # decomposed Hangul syllables - }" \\ - "[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?" \\ + REGEXP_C_STRING = "#{'' # composition exclusions + }" \ + "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>]#{accents}*" \ + "|#{'' # composition starters and characters that can be the result of a composition + }" \ + "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>]?#{accents}+" \ + "|#{'' # Hangul syllables with separate trailer + }" \ + "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \ + "<% end%>][\u11A8-\u11C2]" \ + "|#{'' # decomposed Hangul syllables + }" \ + "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \ "".freeze - REGEXP_K_STRING = "" \\ - "[#{kompatible_table.keys.to_regexp_chars}]" \\ + REGEXP_K_STRING = "" \ + "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \ + "<%end%>]" \ "".freeze class_table = { - #{class_table_str} +% combining_class.each_slice(8) do |slice| + <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%> +% end } class_table.default = 0 CLASS_TABLE = class_table.freeze DECOMPOSITION_TABLE = { - #{decomposition_table.to_hash_string} +% decomposition_table.each_slice(8) do |slice| + <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%> +% end }.freeze KOMPATIBLE_TABLE = { - #{kompatible_table.to_hash_string} +% kompatible_table.each_slice(8) do |slice| + <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%> +% end }.freeze COMPOSITION_TABLE = { - #{composition_table.to_hash_string} +% composition_table.each_slice(8) do |slice| + <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%> +% end }.freeze end -MAPPING_TABLE_FILE_END