2014-10-25 03:20:15 -04:00
|
|
|
%# -*- mode: ruby; coding: utf-8 -*-
|
|
|
|
<%
|
2014-10-05 22:21:23 -04:00
|
|
|
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
2014-10-05 21:27:34 -04:00
|
|
|
|
2014-10-05 22:21:23 -04:00
|
|
|
# Script to generate Ruby data structures used in implementing
|
|
|
|
# String#unicode_normalize,...
|
|
|
|
|
2018-01-12 08:05:09 -05:00
|
|
|
# Directory holding the Unicode data files: the first command-line argument
# when one is given, otherwise the default location below.
InputDataDir = ARGV.first || 'enc/unicode/data'
# Unicode version taken from the last "x.y.z" path component of InputDataDir;
# nil when the path has no such component.
unicode_version = InputDataDir.slice(/.*\/(\d+\.\d+\.\d+)(?=\/|\z)/, 1)
|
2014-10-05 21:58:01 -04:00
|
|
|
|
2014-10-05 22:21:23 -04:00
|
|
|
# convenience methods
|
2014-10-05 21:27:34 -04:00
|
|
|
class Integer
  # Render this code point as text suitable for embedding in a double-quoted
  # Ruby string literal, keeping the result readable:
  # * above 0xFFFF         -> "\u{HEX}"
  # * above 0x7F           -> "\uHHHH" (zero-padded to four digits)
  # * otherwise            -> the character itself, with a backslash put in
  #                           front of a double quote or a backslash
  def to_UTF8
    hex = to_s(16).upcase
    return "\\u{#{hex}}" if self > 0xFFFF
    return "\\u#{hex.rjust(4, '0')}" if self > 0x7f

    chr.sub(/[\\\"]/, "\\\\\\\&")
  end
end
|
|
|
|
|
2014-10-25 03:20:15 -04:00
|
|
|
module Enumerable
  # Only define each_slice when Enumerable does not already provide it.
  unless method_defined?(:each_slice)
    # Yield the elements in consecutive arrays of +n+ items; a shorter final
    # array is yielded when elements are left over. Returns self.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk.push(item)
        next if chunk.size < n

        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end
|
2014-10-05 21:27:43 -04:00
|
|
|
|
2014-10-25 03:20:15 -04:00
|
|
|
class Array
  # Concatenate the to_UTF8 rendering of every element.
  def to_UTF8
    map { |element| element.to_UTF8 }.join('')
  end

  # Treat the receiver as a list of Integers, merge equal or consecutive values
  # into runs, and render each run for use inside a regexp character class:
  # a single value as itself, two neighbours side by side, and a longer run as
  # "first-last". The rendered pieces are yielded joined in groups of +n+.
  def each_regexp_chars(n = 1)
    runs = []
    sort.each do |code|
      current = runs.last
      if current and current[1] + 1 >= code
        current[1] = code # extend the run (also absorbs duplicates)
      else
        runs << [code, code]
      end
    end

    pieces = runs.collect do |low, high|
      span = high - low
      if span == 0
        low.to_UTF8
      elsif span == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    pieces.each_slice(n) { |group| yield group.join('') }
  end
end
|
|
|
|
|
|
|
|
# Read 'CompositionExclusions.txt': check the version named on its first line
# against the version taken from the directory name (adopting the file's
# version when the directory gave none), then collect the listed code points.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') { |f|
  base = Regexp.quote(File.basename(f.path, '.*'))
  ext = Regexp.quote(File.extname(f.path))

  # The first line is expected to look like "# <basename>-<version><ext>".
  line = f.gets
  version = line[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1]
  abort "No file version in #{f.path}: #{line}" unless version

  unicode_version ||= version
  unless unicode_version == version
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  end

  # Every remaining line that starts with a hex code point contributes its value.
  f.grep(/^[A-Z0-9]{4,5}/) { |code| code.hex }
}
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
decomposition_table = {}
|
|
|
|
kompatible_table = {}
|
2014-10-25 03:20:15 -04:00
|
|
|
combining_class = {} # constant to allow use in Integer#to_UTF8
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
# read the file 'UnicodeData.txt'
|
2014-10-25 03:20:15 -04:00
|
|
|
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
|
2014-10-05 21:27:34 -04:00
|
|
|
codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
|
2014-10-05 21:27:43 -04:00
|
|
|
|
2014-10-05 21:27:34 -04:00
|
|
|
case decomposition
|
|
|
|
when /^[0-9A-F]/
|
2014-10-19 23:42:42 -04:00
|
|
|
decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
|
2014-10-05 21:27:34 -04:00
|
|
|
when /^</
|
2014-10-19 23:42:42 -04:00
|
|
|
kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
|
2014-10-05 21:27:34 -04:00
|
|
|
end
|
2014-10-25 03:20:15 -04:00
|
|
|
combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
|
2014-10-05 21:27:43 -04:00
|
|
|
|
2014-10-05 21:27:34 -04:00
|
|
|
if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
|
|
|
|
warn "Unexpected: Character range with data relevant to normalization!"
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
# Calculate compositions from decompositions: keep only the decompositions that
# may be recomposed, then invert the mapping (decomposition => character).
composition_table = decomposition_table.reject { |character, decomposition|
  composition_exclusions.member?(character) ||  # predefined composition exclusion
    decomposition.length <= 1 ||                # Singleton Decomposition
    combining_class[character] ||               # character is not a Starter
    combining_class[decomposition.first]        # decomposition begins with a non-Starter
}.invert

# Recalculate composition_exclusions: every character that has a decomposition
# but is not the result of any composition.
composition_exclusions = decomposition_table.keys - composition_table.values

# Characters with a recorded combining class, plus the last element of every
# composition key.
trailing_elements = composition_table.keys.collect { |pair| pair.last }
accent_array = combining_class.keys + trailing_elements

# First element of every composition key.
composition_starters = composition_table.keys.collect { |pair| pair.first }

# Every 28th code point from 0xAC00 up to 0xD7A3.
hangul_no_trailing = []
(0xAC00..0xD7A3).step(28) { |syllable| hangul_no_trailing.push(syllable) }
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
# Expand decomposition table values: replace every element that itself has a
# decomposition until no element can be expanded any further.
decomposition_table.each do |code, sequence|
  index = 0
  until index >= sequence.length
    replacement = decomposition_table[sequence[index]]
    if replacement
      # Work on a copy: the original array object is also used as a
      # composition_table key and must not be overwritten.
      decomposition_table[code] = sequence = sequence.dup
      # Splice in place without advancing, so the inserted elements are
      # examined as well.
      sequence[index, 1] = replacement
    else
      index += 1
    end
  end
end
|
|
|
|
|
|
|
|
# Deal with the relationship between canonical and kompatibility
# decompositions: when an element of a canonical decomposition has a
# kompatibility decomposition, the character gets a kompatibility entry made
# from its canonical decomposition with those elements expanded.
decomposition_table.each do |code, canonical|
  sequence = canonical.dup # never modify the canonical entry itself
  touched = false
  index = 0
  until index >= sequence.length
    replacement = kompatible_table[sequence[index]]
    if replacement
      sequence[index, 1] = replacement # re-examined on the next pass
      touched = true
    else
      index += 1
    end
  end
  kompatible_table[code] = sequence if touched
end
|
|
|
|
|
2014-11-06 10:00:24 -05:00
|
|
|
# Keep expanding kompatibility decompositions until none of them changes.
# The block's value is the new entry when one was rewritten (truthy) and nil
# otherwise, so any? stops at the first rewritten entry and the scan restarts.
loop do
  changed = kompatible_table.any? { |code, sequence|
    flattened = sequence.map { |element| kompatible_table[element] || element }.flatten
    kompatible_table[code] = flattened unless sequence == flattened
  }
  break unless changed
end
|
|
|
|
|
2014-10-05 21:27:34 -04:00
|
|
|
# generate normalization tables file
|
2014-10-25 03:20:15 -04:00
|
|
|
%># coding: us-ascii
|
2016-01-18 07:48:41 -05:00
|
|
|
# frozen_string_literal: true
|
2014-10-25 03:20:15 -04:00
|
|
|
%# >
|
2014-10-05 21:27:34 -04:00
|
|
|
|
2014-10-25 05:41:44 -04:00
|
|
|
# automatically generated by template/unicode_norm_gen.tmpl
|
2014-10-05 21:27:34 -04:00
|
|
|
|
2017-04-12 14:07:32 -04:00
|
|
|
module UnicodeNormalize # :nodoc:
|
2014-10-25 03:20:15 -04:00
|
|
|
accents = "" \
|
|
|
|
"[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
|
2015-09-29 03:54:05 -04:00
|
|
|
"<% end%>]"
|
2014-10-05 23:08:09 -04:00
|
|
|
ACCENTS = accents
|
2014-10-25 03:20:15 -04:00
|
|
|
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
|
|
|
|
}" \
|
|
|
|
"[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
|
|
|
|
"<% end%>]#{accents}*" \
|
|
|
|
"|#{'' # characters that can be the result of a composition, except composition starters
|
|
|
|
}" \
|
|
|
|
"[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
|
|
|
|
"<% end%>]?#{accents}+" \
|
|
|
|
"|#{'' # precomposed Hangul syllables
|
|
|
|
}" \
|
2015-09-29 03:54:05 -04:00
|
|
|
"[\u{AC00}-\u{D7A4}]"
|
2014-10-25 03:20:15 -04:00
|
|
|
REGEXP_C_STRING = "#{'' # composition exclusions
|
|
|
|
}" \
|
|
|
|
"[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
|
|
|
|
"<% end%>]#{accents}*" \
|
|
|
|
"|#{'' # composition starters and characters that can be the result of a composition
|
|
|
|
}" \
|
|
|
|
"[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
|
|
|
|
"<% end%>]?#{accents}+" \
|
|
|
|
"|#{'' # Hangul syllables with separate trailer
|
|
|
|
}" \
|
|
|
|
"[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
|
|
|
|
"<% end%>][\u11A8-\u11C2]" \
|
|
|
|
"|#{'' # decomposed Hangul syllables
|
|
|
|
}" \
|
2015-09-29 03:54:05 -04:00
|
|
|
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
|
2014-10-25 03:20:15 -04:00
|
|
|
REGEXP_K_STRING = "" \
|
|
|
|
"[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
|
2015-09-29 03:54:05 -04:00
|
|
|
"<%end%>]"
|
2014-10-05 23:08:09 -04:00
|
|
|
|
|
|
|
class_table = {
|
2019-04-04 19:40:48 -04:00
|
|
|
% combining_class.each do |key, value|
|
|
|
|
"<%=key.to_UTF8%>"=><%=value%><%=%>,
|
2014-10-25 03:20:15 -04:00
|
|
|
% end
|
2014-10-05 21:27:34 -04:00
|
|
|
}
|
2014-10-05 23:08:09 -04:00
|
|
|
class_table.default = 0
|
2014-10-05 23:08:25 -04:00
|
|
|
CLASS_TABLE = class_table.freeze
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
DECOMPOSITION_TABLE = {
|
2019-04-04 19:40:48 -04:00
|
|
|
% decomposition_table.each do |key, value|
|
|
|
|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
|
2014-10-25 03:20:15 -04:00
|
|
|
% end
|
2014-10-05 23:08:25 -04:00
|
|
|
}.freeze
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
KOMPATIBLE_TABLE = {
|
2019-04-04 19:40:48 -04:00
|
|
|
% kompatible_table.each do |key, value|
|
|
|
|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
|
2014-10-25 03:20:15 -04:00
|
|
|
% end
|
2014-10-05 23:08:25 -04:00
|
|
|
}.freeze
|
2014-10-05 21:27:34 -04:00
|
|
|
|
|
|
|
COMPOSITION_TABLE = {
|
2019-04-04 19:40:48 -04:00
|
|
|
% composition_table.each do |key, value|
|
|
|
|
"<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,
|
2014-10-25 03:20:15 -04:00
|
|
|
% end
|
2014-10-05 23:08:25 -04:00
|
|
|
}.freeze
|
2014-10-05 21:27:34 -04:00
|
|
|
end
|