mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
template/unicode_norm_gen.tmpl: from tool/unicode_norm_gen.rb
* template/unicode_norm_gen.tmpl: use generic_erb.rb to regenerate the output only if it has changed and to manage the timestamp file, so that a source tree on a read-only filesystem works. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48129 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
67a19e7a59
commit
9b581e0d0b
2 changed files with 85 additions and 68 deletions
12
common.mk
12
common.mk
|
@ -1120,10 +1120,14 @@ UPDATE_UNICODE_FILES_DEPS = $(ALWAYS_UPDATE_UNICODE:yes=PHONY)
|
||||||
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt
|
UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt
|
||||||
@exit > .update-unicode.time
|
@exit > .update-unicode.time
|
||||||
|
|
||||||
$(srcdir)/lib/unicode_normalize/tables.rb: \
|
$(srcdir)/lib/unicode_normalize/tables.rb: ./.unicode-tables.time
|
||||||
$(srcdir)/tool/unicode_norm_gen.rb $(UNICODE_FILES)
|
|
||||||
$(BASERUBY) -s -C "$(srcdir)" tool/unicode_norm_gen.rb \
|
./.unicode-tables.time: $(srcdir)/tool/generic_erb.rb \
|
||||||
-input=enc/unicode/data -ouput=lib/unicode_normalize
|
$(srcdir)/template/unicode_norm_gen.tmpl $(UNICODE_FILES)
|
||||||
|
$(Q) $(BASERUBY) $(srcdir)/tool/generic_erb.rb \
|
||||||
|
-c -t$@ -o $(srcdir)/lib/unicode_normalize/tables.rb \
|
||||||
|
-I $(srcdir) \
|
||||||
|
$(srcdir)/template/unicode_norm_gen.tmpl enc/unicode/data lib/unicode_normalize
|
||||||
|
|
||||||
info: info-program info-libruby_a info-libruby_so info-arch
|
info: info-program info-libruby_a info-libruby_so info-arch
|
||||||
info-program: PHONY
|
info-program: PHONY
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
# coding: utf-8
|
%# -*- mode: ruby; coding: utf-8 -*-
|
||||||
|
<%
|
||||||
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
||||||
|
|
||||||
# Script to generate Ruby data structures used in implementing
|
# Script to generate Ruby data structures used in implementing
|
||||||
# String#unicode_normalize,...
|
# String#unicode_normalize,...
|
||||||
|
|
||||||
# Constants for input and output directory
|
# Constants for input and output directory
|
||||||
InputDataDir = $input || 'enc/unicode/data'
|
InputDataDir = ARGV[0] || 'enc/unicode/data'
|
||||||
OuputDataDir = $ouput || 'lib/unicode_normalize'
|
OuputDataDir = ARGV[1] || 'lib/unicode_normalize'
|
||||||
|
|
||||||
# convenience methods
|
# convenience methods
|
||||||
class Integer
|
class Integer
|
||||||
|
@ -22,18 +22,27 @@ class Integer
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Array
|
module Enumerable
|
||||||
def line_slice(new_line) # joins items, 8 items per line
|
unless method_defined?(:each_slice)
|
||||||
|
def each_slice(n)
|
||||||
ary = []
|
ary = []
|
||||||
0.step(size-1, 8) {|i|
|
each do |i|
|
||||||
ary << self[i, 8].join('')
|
ary << i
|
||||||
}
|
if ary.size >= n
|
||||||
ary.join(new_line).gsub(/ +$/, '')
|
yield ary
|
||||||
|
ary = []
|
||||||
|
end
|
||||||
|
end
|
||||||
|
yield ary unless ary.empty?
|
||||||
|
self
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
class Array
|
||||||
def to_UTF8() collect {|c| c.to_UTF8}.join('') end
|
def to_UTF8() collect {|c| c.to_UTF8}.join('') end
|
||||||
|
|
||||||
def to_regexp_chars # converts an array of Integers to character ranges
|
def each_regexp_chars(n = 8) # converts an array of Integers to character ranges
|
||||||
sort.inject([]) do |ranges, value|
|
sort.inject([]) do |ranges, value|
|
||||||
if ranges.last and ranges.last[1]+1>=value
|
if ranges.last and ranges.last[1]+1>=value
|
||||||
ranges.last[1] = value
|
ranges.last[1] = value
|
||||||
|
@ -50,29 +59,23 @@ class Array
|
||||||
else
|
else
|
||||||
first.to_UTF8 + '-' + last.to_UTF8
|
first.to_UTF8 + '-' + last.to_UTF8
|
||||||
end
|
end
|
||||||
end.line_slice "\" \\\n \""
|
end.each_slice(n) do |slice|
|
||||||
|
yield slice.join('')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
class Hash
|
|
||||||
def to_hash_string
|
|
||||||
collect do |key, value|
|
|
||||||
"\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\".freeze, "
|
|
||||||
end.line_slice "\n "
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# read the file 'CompositionExclusions.txt'
|
# read the file 'CompositionExclusions.txt'
|
||||||
composition_exclusions = File.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
|
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
|
||||||
f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex}
|
f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex}
|
||||||
}
|
}
|
||||||
|
|
||||||
decomposition_table = {}
|
decomposition_table = {}
|
||||||
kompatible_table = {}
|
kompatible_table = {}
|
||||||
CombiningClass = {} # constant to allow use in Integer#to_UTF8
|
combining_class = {} # constant to allow use in Integer#to_UTF8
|
||||||
|
|
||||||
# read the file 'UnicodeData.txt'
|
# read the file 'UnicodeData.txt'
|
||||||
IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
|
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
|
||||||
codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
|
codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
|
||||||
|
|
||||||
case decomposition
|
case decomposition
|
||||||
|
@ -81,7 +84,7 @@ IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
|
||||||
when /^</
|
when /^</
|
||||||
kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
|
kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
|
||||||
end
|
end
|
||||||
CombiningClass[codepoint.hex] = char_class.to_i if char_class != "0"
|
combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
|
||||||
|
|
||||||
if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
|
if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
|
||||||
warn "Unexpected: Character range with data relevant to normalization!"
|
warn "Unexpected: Character range with data relevant to normalization!"
|
||||||
|
@ -92,14 +95,14 @@ end
|
||||||
composition_table = decomposition_table.reject do |character, decomposition|
|
composition_table = decomposition_table.reject do |character, decomposition|
|
||||||
composition_exclusions.member? character or # predefined composition exclusion
|
composition_exclusions.member? character or # predefined composition exclusion
|
||||||
decomposition.length<=1 or # Singleton Decomposition
|
decomposition.length<=1 or # Singleton Decomposition
|
||||||
CombiningClass[character] or # character is not a Starter
|
combining_class[character] or # character is not a Starter
|
||||||
CombiningClass[decomposition.first] # decomposition begins with a character that is not a Starter
|
combining_class[decomposition.first] # decomposition begins with a character that is not a Starter
|
||||||
end.invert
|
end.invert
|
||||||
|
|
||||||
# recalculate composition_exclusions
|
# recalculate composition_exclusions
|
||||||
composition_exclusions = decomposition_table.keys - composition_table.values
|
composition_exclusions = decomposition_table.keys - composition_table.values
|
||||||
|
|
||||||
accent_array = CombiningClass.keys + composition_table.keys.collect {|key| key.last}
|
accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last}
|
||||||
|
|
||||||
composition_starters = composition_table.keys.collect {|key| key.first}
|
composition_starters = composition_table.keys.collect {|key| key.first}
|
||||||
|
|
||||||
|
@ -135,64 +138,74 @@ decomposition_table.each do |key, value|
|
||||||
kompatible_table[key] = value if expanded
|
kompatible_table[key] = value if expanded
|
||||||
end
|
end
|
||||||
|
|
||||||
class_table_str = CombiningClass.collect do |key, value|
|
|
||||||
"\"#{key.to_UTF8}\"=>#{value}, "
|
|
||||||
end.line_slice "\n "
|
|
||||||
|
|
||||||
# generate normalization tables file
|
# generate normalization tables file
|
||||||
open("#{OuputDataDir}/tables.rb", "w").print <<MAPPING_TABLE_FILE_END
|
%># coding: us-ascii
|
||||||
# coding: us-ascii
|
%# >
|
||||||
|
|
||||||
# automatically generated by tool/unicode_norm_gen.rb
|
# automatically generated by tool/unicode_norm_gen.rb
|
||||||
|
|
||||||
module UnicodeNormalize
|
module UnicodeNormalize
|
||||||
accents = "" \\
|
accents = "" \
|
||||||
"[#{accent_array.to_regexp_chars}]" \\
|
"[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
|
"<% end%>]" \
|
||||||
"".freeze
|
"".freeze
|
||||||
ACCENTS = accents
|
ACCENTS = accents
|
||||||
REGEXP_D_STRING = "\#{'' # composition starters and composition exclusions
|
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
|
||||||
}" \\
|
}" \
|
||||||
"[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{accents}*" \\
|
"[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
"|\#{'' # characters that can be the result of a composition, except composition starters
|
"<% end%>]#{accents}*" \
|
||||||
}" \\
|
"|#{'' # characters that can be the result of a composition, except composition starters
|
||||||
"[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{accents}+" \\
|
}" \
|
||||||
"|\#{'' # precomposed Hangul syllables
|
"[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
}" \\
|
"<% end%>]?#{accents}+" \
|
||||||
"[\\u{AC00}-\\u{D7A4}]" \\
|
"|#{'' # precomposed Hangul syllables
|
||||||
|
}" \
|
||||||
|
"[\u{AC00}-\u{D7A4}]" \
|
||||||
"".freeze
|
"".freeze
|
||||||
REGEXP_C_STRING = "\#{'' # composition exclusions
|
REGEXP_C_STRING = "#{'' # composition exclusions
|
||||||
}" \\
|
}" \
|
||||||
"[#{composition_exclusions.to_regexp_chars}]\#{accents}*" \\
|
"[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
"|\#{'' # composition starters and characters that can be the result of a composition
|
"<% end%>]#{accents}*" \
|
||||||
}" \\
|
"|#{'' # composition starters and characters that can be the result of a composition
|
||||||
"[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{accents}+" \\
|
}" \
|
||||||
"|\#{'' # Hangul syllables with separate trailer
|
"[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
}" \\
|
"<% end%>]?#{accents}+" \
|
||||||
"[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]" \\
|
"|#{'' # Hangul syllables with separate trailer
|
||||||
"|\#{'' # decomposed Hangul syllables
|
}" \
|
||||||
}" \\
|
"[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
"[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?" \\
|
"<% end%>][\u11A8-\u11C2]" \
|
||||||
|
"|#{'' # decomposed Hangul syllables
|
||||||
|
}" \
|
||||||
|
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
|
||||||
"".freeze
|
"".freeze
|
||||||
REGEXP_K_STRING = "" \\
|
REGEXP_K_STRING = "" \
|
||||||
"[#{kompatible_table.keys.to_regexp_chars}]" \\
|
"[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
|
||||||
|
"<%end%>]" \
|
||||||
"".freeze
|
"".freeze
|
||||||
|
|
||||||
class_table = {
|
class_table = {
|
||||||
#{class_table_str}
|
% combining_class.each_slice(8) do |slice|
|
||||||
|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
|
||||||
|
% end
|
||||||
}
|
}
|
||||||
class_table.default = 0
|
class_table.default = 0
|
||||||
CLASS_TABLE = class_table.freeze
|
CLASS_TABLE = class_table.freeze
|
||||||
|
|
||||||
DECOMPOSITION_TABLE = {
|
DECOMPOSITION_TABLE = {
|
||||||
#{decomposition_table.to_hash_string}
|
% decomposition_table.each_slice(8) do |slice|
|
||||||
|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
|
||||||
|
% end
|
||||||
}.freeze
|
}.freeze
|
||||||
|
|
||||||
KOMPATIBLE_TABLE = {
|
KOMPATIBLE_TABLE = {
|
||||||
#{kompatible_table.to_hash_string}
|
% kompatible_table.each_slice(8) do |slice|
|
||||||
|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
|
||||||
|
% end
|
||||||
}.freeze
|
}.freeze
|
||||||
|
|
||||||
COMPOSITION_TABLE = {
|
COMPOSITION_TABLE = {
|
||||||
#{composition_table.to_hash_string}
|
% composition_table.each_slice(8) do |slice|
|
||||||
|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
|
||||||
|
% end
|
||||||
}.freeze
|
}.freeze
|
||||||
end
|
end
|
||||||
MAPPING_TABLE_FILE_END
|
|
Loading…
Reference in a new issue