2014-05-30 19:49:54 -04:00
|
|
|
#!/usr/bin/ruby
|
|
|
|
|
2016-02-07 08:10:20 -05:00
|
|
|
# Usage (for case folding only):
|
2014-05-30 19:49:54 -04:00
|
|
|
# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
|
2014-06-02 09:46:33 -04:00
|
|
|
# $ ruby case-folding.rb CaseFolding.txt -o casefold.h
|
2016-02-07 08:10:20 -05:00
|
|
|
# or (for case folding and case mapping):
|
2016-02-06 20:39:26 -05:00
|
|
|
# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
|
2016-02-07 08:10:20 -05:00
|
|
|
# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
|
|
|
|
# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
2016-02-06 20:39:26 -05:00
|
|
|
# $ ruby case-folding.rb -m . -o casefold.h
|
2016-02-15 00:43:55 -05:00
|
|
|
# using -d or --debug will include UTF-8 characters in comments for debugging
|
2014-05-30 19:49:54 -04:00
|
|
|
|
2014-05-30 19:55:00 -04:00
|
|
|
# Reads a CaseFolding.txt-style file and writes C tables plus gperf-based
# lookup functions for case folding and unfolding.
#
# Typical use: <tt>CaseFolding.load(path).display(io, mapping_data)</tt>.
class CaseFolding
  # Formatting helpers for the generated C tables.  Exposed both as module
  # functions (Util.hex_seq) and, through +include+, as private instance
  # methods of CaseFolding.
  module Util
    module_function

    # Formats a list of integers as comma-separated, zero-padded hex literals.
    def hex_seq(v)
      v.map {|i| "0x%04x" % i}.join(", ")
    end

    # Prints one table row per entry of +data+, in sorted key order.
    #
    # dest::         object responding to +print+
    # type::         table name, forwarded to mapping_data.flags
    # mapping_data:: object responding to flags(from, type, to)
    # data::         Hash (or list of pairs) mapping a code point, or an
    #                array of code points, to an array of code points
    #
    # Returns the sorted list of [key, value] pairs that was printed.
    def print_table_1(dest, type, mapping_data, data)
      sorted = data.sort
      sorted.each do |k, v|
        # Multi-code-point keys are printed as a brace-enclosed sequence.
        sk = (Array === k && k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
        ck = cv = ''
        if @debug
          # In debug mode, show the actual characters next to the hex values.
          ck = ' /* ' + Array(k).pack("U*") + ' */'
          cv = ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */'
        end
        dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n")
      end
      sorted
    end

    # Prints the C array <tt>#{type}_Table</tt> holding every sub-table of
    # +data+ back to back, preceded by one #define per sub-table that views
    # its slice of the array.
    #
    # data:: Hash of sub-table name => table (see print_table_1)
    #
    # Returns all printed [key, value] pairs, in output order.
    def print_table(dest, type, mapping_data, data)
      dest.print("static const #{type}_Type #{type}_Table[] = {\n")
      offset = 0
      rows = data.each_with_object([]) do |(n, d), acc|
        dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{offset}))\n")
        offset += d.size
        acc.concat(print_table_1(dest, type, mapping_data, d))
      end
      dest.print("};\n\n")
      rows
    end
  end

  include Util

  attr_reader :fold, :fold_locale, :unfold, :unfold_locale

  # Parses +filename+ and fills the folding tables.
  #
  # After the call:
  # fold::          code point => array of folded code points
  # unfold::        three hashes indexed by (folded length - 1), each mapping
  #                 a folded sequence to the list of code points folding to it
  # fold_locale::   entries moved out of +fold+ for code points that also
  #                 have a status-T (Turkic) line
  # unfold_locale:: the matching entries moved out of +unfold+
  #
  # Also resets debug mode.  Returns self.
  def load(filename)
    pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/

    @fold = fold = {}
    @unfold = unfold = [{}, {}, {}]
    @debug = false
    turkic = []

    # File.foreach (not IO.foreach) so that a name starting with "|" is
    # never interpreted as a command to run.
    File.foreach(filename) do |line|
      next unless res = pattern.match(line)
      ch_from = res[1].to_i(16)

      if res[2] == 'T'
        # Turkic case folding
        turkic << ch_from
        next
      end

      # store folding data
      ch_to = res[3..5].take_while {|hex| hex}.map {|hex| hex.to_i(16)}
      fold[ch_from] = ch_to

      # store unfolding data
      (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
    end

    # move locale dependent data to (un)fold_locale
    @fold_locale = fold_locale = {}
    @unfold_locale = unfold_locale = [{}, {}]
    turkic.each do |ch_from|
      key = fold[ch_from]
      # A T line without a C/F folding (or listed twice) has nothing to move.
      next unless key
      i = key.length - 1
      unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
      fold_locale[ch_from] = fold.delete(ch_from)
    end
    self
  end

  # Returns a C expression testing +code+ against MIN/MAX_CODE_VALUE.
  def range_check(code)
    "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE"
  end

  # Builds the C source of the hash and lookup functions for table +key+.
  #
  # Keys are fed to the external +gperf+ command, each code point encoded as
  # three 7-bit chunks; the generated source is then rewritten so that the
  # functions take code points instead of strings and return a pointer into
  # <tt>#{key}_Table</tt>.
  #
  # key::  table name (e.g. "CaseFold_11")
  # type:: C type the lookup function points to
  # data:: list of [key, value] pairs as returned by print_table
  #
  # Returns the rewritten C source.  Raises RuntimeError if gperf exits with
  # a failure status.
  def lookup_hash(key, type, data)
    hash = "onigenc_unicode_#{key}_hash"
    lookup = "onigenc_unicode_#{key}_lookup"
    arity = Array(data[0][0]).size
    key_positions = [*1..(arity*3)].join(",")
    gperf = %W[gperf -7 -k#{key_positions} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n]
    argname = arity > 1 ? "codes" : "code"
    argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}"
    n = 7
    m = (1 << n) - 1
    min, max = data.map {|c, *|c}.flatten.minmax
    src = IO.popen(gperf, "r+") {|f|
      f << "short\n%%\n"
      data.each_with_index {|(k, _), i|
        k = Array(k)
        # three n-bit chunks per code point, most significant first
        ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("")
        f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i
      }
      f << "%%\n"
      f.close_write
      f.read
    }
    # Do not go on to emit empty or partial C source if gperf failed.
    raise "gperf failed for #{key} (#{$?})" unless $?.success?

    # Rewrite the hash function: new argument list, and byte accesses
    # replaced by bits_of/bits_at calls.
    src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
      name = $1
      body = $2
      body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)")
      "#{name}(#{argdecl})\n{\n#{body}}"
    }
    # Rewrite the lookup function.
    src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
      name = $1
      body = $2
      body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
      body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})")
      # wordlist entries become plain table indexes (-1 for empty slots)
      body.gsub!(/\{"",-1}/, "-1")
      body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
      # the length test is replaced by a code point range test
      body.sub!(/(\s+if\s)\(len\b.*\)/) do
        "#$1(" <<
        (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) <<
        ")"
      end
      v = nil
      body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
        pre = $1
        indent = $2
        s = $3
        s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
        # name of the local variable holding the wordlist entry
        v = $1
        s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))")
        "#{pre}{#{s}\n#{indent}}"
      }
      body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
      "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
    }
    src
  end

  # Writes the whole generated file to +dest+: header comment, the folding
  # and unfolding tables with their lookup functions, then the title-case
  # table supplied by +mapping_data+.
  #
  # dest::         object responding to +print+
  # mapping_data:: object responding to +flags+ and +titlecase_output+
  def display(dest, mapping_data)
    # print the header
    dest.print("/* DO NOT EDIT THIS FILE. */\n")
    dest.print("/* Generated by enc/unicode/case-folding.rb */\n\n")

    # print folding data

    # CaseFold + CaseFold_Locale
    name = "CaseFold_11"
    data = print_table(dest, name, mapping_data, {"CaseFold"=>fold, "CaseFold_Locale"=>fold_locale})
    dest.print lookup_hash(name, "CodePointList3", data)

    # print unfolding data

    # CaseUnfold_11 + CaseUnfold_11_Locale
    name = "CaseUnfold_11"
    data = print_table(dest, name, mapping_data, {name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]})
    dest.print lookup_hash(name, "CodePointList3", data)

    # CaseUnfold_12 + CaseUnfold_12_Locale
    name = "CaseUnfold_12"
    data = print_table(dest, name, mapping_data, {name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]})
    dest.print lookup_hash(name, "CodePointList2", data)

    # CaseUnfold_13
    name = "CaseUnfold_13"
    data = print_table(dest, name, mapping_data, {name=>unfold[2]})
    dest.print lookup_hash(name, "CodePointList2", data)

    # TitleCase
    dest.print mapping_data.titlecase_output
  end

  # Turns on debug mode: table rows get comments showing the characters.
  def debug!
    @debug = true
  end

  # Shortcut for <tt>new.load(*args)</tt>.
  def self.load(*args)
    new.load(*args)
  end
end
|
|
|
|
|
2016-02-07 08:10:20 -05:00
|
|
|
# Case mappings of a single code point.  Each mapping is kept as the string
# it was given as; a mapping given as the empty string is left unset, so its
# reader returns nil.
class MapItem
  attr_accessor :upper, :lower, :title

  # code::  identifier of the code point the mappings belong to
  # upper:: upper-case mapping, or '' for none
  # lower:: lower-case mapping, or '' for none
  # title:: title-case mapping, or '' for none
  def initialize(code, upper, lower, title)
    @code = code
    {upper: upper, lower: lower, title: title}.each do |kind, mapping|
      # An empty mapping leaves the instance variable undefined.
      next if mapping == ''
      instance_variable_set("@#{kind}", mapping)
    end
  end
end
|
|
|
|
|
2016-02-06 21:44:14 -05:00
|
|
|
# Case mapping data read from UnicodeData.txt and SpecialCasing.txt, used to
# annotate the folding tables with flags and to produce the TitleCase table.
class CaseMapping
  # Reads UnicodeData.txt and SpecialCasing.txt from +mapping_directory+.
  #
  # Only code points with at least one upper/lower/title mapping get a
  # MapItem.  Unconditional SpecialCasing.txt entries then overwrite (or
  # create) the item's mappings; entries with a condition are ignored.
  def initialize(mapping_directory)
    @mappings = {}
    @titlecase = []

    unicode_data = File.expand_path('UnicodeData.txt', mapping_directory)
    File.foreach(unicode_data, encoding: Encoding::ASCII_8BIT) do |line|
      next if line =~ /^</
      # limit -1 keeps trailing empty fields, so the three mapping fields
      # (index 12, 13, 14) are '' rather than missing when absent.
      fields = line.chomp.split(';', -1)
      code = fields[0]
      upper, lower, title = fields[12], fields[13], fields[14]
      next if [upper, lower, title].all? {|mapping| mapping.nil? || mapping.empty?}
      @mappings[code] = MapItem.new(code, upper, lower, title)
    end

    special_casing = File.expand_path('SpecialCasing.txt', mapping_directory)
    File.foreach(special_casing, encoding: Encoding::ASCII_8BIT) do |line|
      # drop the trailing comment, if any
      line = line.chomp.split(/ *#/).first
      next if not line or line == ''
      code, lower, title, upper, conditions = line.split(/ *; */)
      next if conditions
      # The code point may have no simple mapping in UnicodeData.txt.
      item = (@mappings[code] ||= MapItem.new(code, '', '', ''))
      item.lower = lower
      item.title = title
      item.upper = upper
    end
  end

  # Returns the flag suffix (e.g. "|F|D") for one table row.
  #
  # from:: code point, or array of code points, of the row's key
  # type:: one of CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13
  # to::   array of code points the row maps to
  #
  # Side effect: when the source item's upper and title mappings differ, the
  # item is appended to the title-case list and its index is emitted as
  # "|T(index)".
  def flags(from, type, to)
    # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13
    flags = ""
    flags += '|F' if type=='CaseFold_11'
    from = Array(from).map {|i| "%04X" % i}.join(" ")
    to = Array(to).map {|i| "%04X" % i}.join(" ")
    # for CaseUnfold_11 only the first target is compared
    to = to.split(/ /).first if type=='CaseUnfold_11'
    item = @mappings[from]
    if item
      flags += '|U' if to==item.upper
      flags += '|D' if to==item.lower
      unless item.upper == item.title
        flags += "|T(#{@titlecase.length})"
        @titlecase << item
      end
    end
    flags
  end

  # Returns the C source of the TitleCase table: one row per item collected
  # by #flags, in collection order.
  def titlecase_output
    rows = @titlecase.map do |item|
      chars = item.title.split(/ /)
      " {#{chars.length}, {" + chars.map {|c| "0x"+c }.join(', ') + "}},\n"
    end
    "CodePointList3 TitleCase[] = {\n" + rows.join + "};\n"
  end

  # Shortcut for <tt>new(*args)</tt>.
  def self.load(*args)
    new(*args)
  end
end
|
|
|
|
|
|
|
|
# Stand-in for CaseMapping used when no mapping data is loaded: every row
# gets no flags and the title-case output is empty.
class CaseMappingDummy
  # Same signature as CaseMapping#flags; always returns an empty string.
  def flags(from, type, to)
    ''
  end

  # Same signature as CaseMapping#titlecase_output; always returns an empty
  # string.
  def titlecase_output
    ""
  end
end
|
|
|
|
|
2014-05-30 19:55:00 -04:00
|
|
|
# Command line driver: parse the options, load the data files and write the
# generated C source to the output file, or to STDOUT.
if $0 == __FILE__
  require 'optparse'
  dest = nil
  mapping_directory = nil
  mapping_data = nil
  debug = false
  ARGV.options do |opt|
    opt.banner << " [INPUT]"
    opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output|
      # "-" keeps the default of writing to STDOUT
      dest = (output unless output == '-')
    }
    opt.on('--mapping-data-directory=DIRECTORY', '-m', 'data DIRECTORY of mapping files') { |directory|
      mapping_directory = directory
    }
    opt.on('--debug', '-d') {
      debug = true
    }
    opt.parse!
    abort(opt.to_s) if ARGV.size > 1
  end

  if mapping_directory
    # A usage error must end with a failure status, hence abort, not exit.
    abort "Either specify directory or individual file, but not both." if ARGV[0]
    filename = File.expand_path('CaseFolding.txt', mapping_directory)
    mapping_data = CaseMapping.load(mapping_directory)
  end
  filename ||= ARGV[0] || 'CaseFolding.txt'
  mapping_data ||= CaseMappingDummy.new

  data = CaseFolding.load(filename)
  data.debug! if debug

  if dest
    # File.open rather than Kernel#open: the name comes from the command
    # line and must always be treated as a plain file path.
    File.open(dest, "wb") do |f|
      data.display(f, mapping_data)
    end
  else
    data.display(STDOUT, mapping_data)
  end
end
|