1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

case-folding.rb: perfect hash for case folding

* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
  lookup case folding table.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46269 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
nobu 2014-05-30 23:57:45 +00:00
parent 88eae35862
commit c39e659263
4 changed files with 1575 additions and 27 deletions

View file

@ -13,21 +13,23 @@ class CaseFolding
end
def print_table_1(dest, data)
for k, v in data.sort
for k, v in data = data.sort
sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
dest.print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
end
data
end
def print_table(dest, type, data)
dest.print("static const #{type}_Type #{type}_Table[] = {\n")
i = 0
data.each do |n, d|
ret = data.inject([]) do |a, (n, d)|
dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
i += d.size
print_table_1(dest, d)
a.concat(print_table_1(dest, d))
end
dest.print("};\n\n")
ret
end
end
@ -76,6 +78,55 @@ class CaseFolding
self
end
def lookup_hash(key, type, data)
hash = "onigenc_unicode_#{key}_hash"
lookup = "onigenc_unicode_#{key}_lookup"
gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
argname = "code"
argdecl = "const OnigCodePoint #{argname}"
n = 7
m = (1 << n) - 1
min, max = data.map {|c, *|c}.minmax
src = IO.popen(gperf, "r+") {|f|
f << "short\n%%\n"
data.each_with_index {|(k, _), i|
ks = [(k >> n*2) & m, (k >> n) & m, (k) & m].map {|c| "\\x%.2x" % c}.join("")
f.printf "\"%s\", ::::/*0x%.4x*/ %d\n", ks, k, i
}
f << "%%\n"
f.close_write
f.read
}
src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
"#{name}(#{argdecl})\n{\n#{body}}"
}
src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
body.gsub!(/\{"",-1}/, "-1")
body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
v = nil
body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
pre = $1
indent = $2
s = $3
s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
v = $1
s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
"#{pre}{#{s}\n#{indent}}"
}
body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
"static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
}
src
end
def display(dest)
# print the header
dest.print("/* DO NOT EDIT THIS FILE. */\n")
@ -85,7 +136,8 @@ class CaseFolding
# CaseFold + CaseFold_Locale
name = "CaseFold_11"
print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
data = print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
dest.print lookup_hash(name, "CodePointList3", data)
# print unfolding data
@ -102,8 +154,6 @@ class CaseFolding
print_table(dest, name, name=>unfold[2])
# table sizes
fold_table_size = fold.size + fold_locale.size
dest.printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
unfold1_table_size = unfold[0].size + unfold_locale[0].size
dest.printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
unfold2_table_size = unfold[1].size + unfold_locale[1].size

File diff suppressed because it is too large Load diff