mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
tool/unicode_norm_gen.rb: Data generation script imported from
https://github.com/duerst/eprun/blob/master/lib/generate.rb git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@47808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
0fb67d59b2
commit
33447b80d5
2 changed files with 183 additions and 0 deletions
|
@ -1,3 +1,8 @@
|
|||
Mon Oct 6 10:27:27 2014 Martin Duerst <duerst@it.aoyama.ac.jp>
|
||||
|
||||
* tool/unicode_norm_gen.rb: Data generation script imported from
|
||||
https://github.com/duerst/eprun/blob/master/lib/generate.rb
|
||||
|
||||
Mon Oct 6 10:15:15 2014 Martin Duerst <duerst@it.aoyama.ac.jp>
|
||||
|
||||
* tool/downloader.rb: Adjust example in documentation for
|
||||
|
|
178
tool/unicode_norm_gen.rb
Normal file
178
tool/unicode_norm_gen.rb
Normal file
|
@ -0,0 +1,178 @@
|
|||
# coding: utf-8
|
||||
|
||||
# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
||||
# available under the same licence as Ruby itself
|
||||
# (see http://www.ruby-lang.org/en/LICENSE.txt)
|
||||
|
||||
class Integer
  # Renders this code point as text that can be embedded in a double-quoted
  # string literal of the generated tables file.
  #
  # * code points above U+FFFF are written as "\u{X...}" escapes
  # * code points that have an entry in CombiningClass, as well as the
  #   backslash and the double quote, are written as four-digit "\uXXXX" escapes
  # * every other code point is written as the UTF-8 character itself
  #
  # Returns a String.
  def to_UTF8
    hex_digits = to_s(16).upcase
    return "\\u{#{hex_digits}}" if self > 0xFFFF

    # CombiningClass is filled in later by the script, before this is called.
    escape_needed = CombiningClass[self] || self == '\\'.ord || self == '"'.ord
    return "\\u#{hex_digits.rjust(4, '0')}" if escape_needed

    chr(Encoding::UTF_8)
  end
end
|
||||
|
||||
class Array
  # Joins the items in groups of 16 and separates the groups with +new_line+,
  # i.e. produces "lines" of 16 items each.
  def line_slice(new_line)
    each_slice(16).map { |group| group.join }.join(new_line)
  end

  # Concatenates the to_UTF8 representation of every item.
  def to_UTF8
    map { |item| item.to_UTF8 }.join
  end

  # Converts an array of Integers (code points) to the text of regexp
  # character ranges.
  #
  # The code points are sorted; duplicates and directly adjacent values are
  # merged into [first, last] spans. A span is then written as a single
  # character, as two characters (when it covers exactly two code points), or
  # as "first-last". The pieces are laid out 16 per line via line_slice.
  def to_regexp_chars
    spans = []
    sort.each do |codepoint|
      current = spans.last
      if current && codepoint <= current[1] + 1
        current[1] = codepoint # extend the running span
      else
        spans << [codepoint, codepoint]
      end
    end

    pieces = spans.map do |low, high|
      if low == high
        low.to_UTF8
      elsif high - low == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    # the separator closes the current string literal and opens a new one
    pieces.line_slice "\" +\n \""
  end
end
|
||||
|
||||
class Hash
  # Renders the entries as the inside of a Ruby Hash literal whose keys and
  # values are double-quoted strings: each entry becomes
  # "<key.to_UTF8>"=>"<value.to_UTF8>", (with a trailing comma and space),
  # and the entries are laid out 16 per line via Array#line_slice.
  def to_hash_string
    entries = map do |key, value|
      "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\", "
    end
    entries.line_slice "\n "
  end
end
|
||||
|
||||
# read the file 'CompositionExclusions.txt'
# Only lines that start with 4 or 5 characters from [A-Z0-9] are data lines;
# the first whitespace-separated token of such a line is parsed as a
# hexadecimal code point.
composition_exclusions =
  IO.readlines("../data/CompositionExclusions.txt")
    .grep(/^[A-Z0-9]{4,5}/)
    .map { |line| line.split(' ').first.hex }
|
||||
|
||||
# Tables filled from 'UnicodeData.txt'; all are keyed by Integer code point.
decomposition_table = {} # untagged decomposition mappings (Arrays of code points)
kompatible_table = {}    # tagged ("<...>") decomposition mappings, tag removed
CombiningClass = {} # constant to allow use in Integer#to_UTF8

# read the file 'UnicodeData.txt'
# Each line is split on ';'. Fields 0, 1, 3 and 5 are used: the code point
# (hex), the name, the combining class and the decomposition mapping.
IO.foreach("../data/UnicodeData.txt") do |line|
  codepoint, name, char_class, decomposition = line.split(";").values_at(0, 1, 3, 5)

  case decomposition
  when /^[0-9A-F]/
    # mapping starts directly with a code point
    decomposition_table[codepoint.hex] = decomposition.split(' ').map(&:hex)
  when /^</
    # mapping starts with a "<tag>"; drop the tag and keep the code points
    kompatible_table[codepoint.hex] = decomposition.split(' ').drop(1).map(&:hex)
  end

  # only non-zero combining classes are recorded
  CombiningClass[codepoint.hex] = char_class.to_i unless char_class == "0"

  # Range markers ("..., First>" / "..., Last>") stand for many code points
  # that this script does not enumerate, so they must not carry such data.
  is_range_marker = name =~ /(First|Last)>$/
  if is_range_marker && (char_class != "0" || decomposition != "")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end
|
||||
|
||||
# calculate compositions from decompositions
# A decomposition is kept (and inverted into decomposition => character)
# only when all of the following hold.
composition_table = decomposition_table.select do |character, decomposition|
  !composition_exclusions.member?(character) && # not a predefined composition exclusion
    decomposition.length > 1 &&                 # not a Singleton Decomposition
    !CombiningClass[character] &&               # character is a Starter
    !CombiningClass[decomposition.first]        # decomposition begins with a Starter
end.invert

# recalculate composition_exclusions
# (every character with a decomposition that did not end up as a composition result)
composition_exclusions = decomposition_table.keys - composition_table.values

# characters with a combining class, plus the last element of every composable sequence
accent_array = CombiningClass.keys + composition_table.keys.map(&:last)

# the first element of every composable sequence
composition_starters = composition_table.keys.map(&:first)

# every 28th code point from U+AC00 up to U+D7A3
hangul_no_trailing = (0xAC00..0xD7A3).step(28).to_a
|
||||
|
||||
# expand decomposition table values
# Whenever an element of a mapping has a mapping of its own, it is replaced in
# place by that mapping. The index is not advanced after a replacement, so the
# inserted elements are examined (and expanded) as well.
decomposition_table.each do |character, sequence|
  index = 0
  until index >= sequence.length
    replacement = decomposition_table[sequence[index]]
    if replacement
      # avoid overwriting composition_table key: that table was built by
      # inverting this one, so its keys are these very Array objects
      sequence = sequence.dup
      decomposition_table[character] = sequence
      sequence[index, 1] = replacement
    else
      index += 1
    end
  end
end
|
||||
|
||||
# deal with relationship between canonical and kompatibility decompositions
# For every entry of decomposition_table, work on a copy of its mapping and
# replace each element that has an entry in kompatible_table by that entry.
# If at least one replacement happened, the result is stored in
# kompatible_table under the same character.
decomposition_table.each do |character, canonical|
  sequence = canonical.dup
  replaced_any = false
  index = 0
  while index < sequence.length
    replacement = kompatible_table[sequence[index]]
    if replacement
      # do not advance: the inserted elements are examined as well
      sequence[index, 1] = replacement
      replaced_any = true
    else
      index += 1
    end
  end
  kompatible_table[character] = sequence if replaced_any
end
|
||||
|
||||
# Text for the inside of the CLASS_TABLE literal: one
# "<character>"=><combining class>, item per entry, 16 items per line.
class_table_str = CombiningClass.map { |codepoint, combining_class|
  "\"#{codepoint.to_UTF8}\"=>#{combining_class}, "
}.line_slice "\n "
|
||||
|
||||
# generate normalization tables file
#
# Writes 'normalize_tables.rb' into the current directory. The heredoc below
# is the template of that file; "#{...}" is filled in here, while "\#{...}"
# and "\\u" are passed through so that they are evaluated when the generated
# file is loaded.
#
# The block form of File.open is used so that the file is flushed and closed
# as soon as the text has been written, instead of leaving the handle open
# until the interpreter exits.
#
# The range of precomposed Hangul syllables ends at U+D7A3 (the same upper
# bound that is used for hangul_no_trailing); U+D7A4 is not a Hangul syllable.
File.open("normalize_tables.rb", "w") do |tables_file|
  tables_file.print <<MAPPING_TABLE_FILE_END
# coding: utf-8

# automatically generated by generate.rb

module Normalize
ACCENTS = "
[#{accent_array.to_regexp_chars}]
"
REGEXP_D_STRING = " # composition starters and composition exclusions
[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{ACCENTS}*
| # characters that can be the result of a composition, except composition starters
[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{ACCENTS}+
| # precomposed Hangul syllables
[\\u{AC00}-\\u{D7A3}]
"
REGEXP_C_STRING = " # composition exclusions
[#{composition_exclusions.to_regexp_chars}]\#{ACCENTS}*
| # composition starters and characters that can be the result of a composition
[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{ACCENTS}+
| # Hangul syllables with separate trailer
[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]
| # decomposed Hangul syllables
[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?
"
REGEXP_K_STRING = "
[#{kompatible_table.keys.to_regexp_chars}]
"

CLASS_TABLE = {
#{class_table_str}
}
CLASS_TABLE.default = 0

DECOMPOSITION_TABLE = {
#{decomposition_table.to_hash_string}
}

KOMPATIBLE_TABLE = {
#{kompatible_table.to_hash_string}
}

COMPOSITION_TABLE = {
#{composition_table.to_hash_string}
}
end
MAPPING_TABLE_FILE_END
end
|
Loading…
Reference in a new issue