1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/tool/transcode-tblgen.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1119 lines
30 KiB
Ruby
Raw Normal View History

# frozen_string_literal: true
require 'optparse'
require 'erb'
require 'fileutils'
require 'pp'
# Compatibility shim: defines Array#product only on interpreters that lack it.
class Array
  unless [].respond_to? :product
    # Returns every combination made of one element of the receiver followed
    # by one element of each array in +args+, in receiver-major order.
    def product(*args)
      return map { |elem| [elem] } if args.empty?

      inject([]) do |combos, head|
        tails = args.first.product(*args[1..-1])
        combos.concat(tails.map { |tail| [head, *tail] })
      end
    end
  end
end
# Compatibility shim: defines String#start_with? only on interpreters that lack it.
class String
  unless "".respond_to? :start_with?
    # True when the receiver begins with at least one of +prefixes+.
    def start_with?(*prefixes)
      prefixes.any? do |prefix|
        prefix.length <= length && prefix == self[0, prefix.length]
      end
    end
  end
end
# Number of words one BYTE_LOOKUP entry occupies in the word array
# (see generate_lookup_node, which stores an offsets reference and an infos reference).
NUM_ELEM_BYTELOOKUP = 2

# Byte => C string-literal escape. Starts with the readable escapes, then
# control bytes and bytes >= 0x7f are filled in as three-digit octal escapes.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}
(0x00..0x1f).each do |ch|
  # ||= keeps the readable '\n' escape registered above.
  C_ESC[[ch].pack("C")] ||= format("\\%03o", ch)
end
(0x7f..0xff).each do |ch|
  C_ESC[[ch].pack("C")] = format("\\%03o", ch)
end

# Matches any single byte that has an entry in C_ESC.
C_ESC_PAT = Regexp.union(*C_ESC.keys)
# Renders +str+ as a double-quoted C string literal, escaping every byte
# that has an entry in C_ESC.
def c_esc(str)
  escaped = str.gsub(C_ESC_PAT) { |byte| C_ESC[byte] }
  '"' + escaped + '"'
end
# Regexp fragment matching exactly two hexadecimal digits (one byte);
# interpolated into the pattern-parsing and action-formatting regexps below.
HEX2 = /(?:[0-9A-Fa-f]{2})/
# Accumulates the text of one C array definition together with its
# element count.
class ArrayCode
  # type:: C element type of the array (e.g. "unsigned char")
  # name:: C identifier of the array
  def initialize(type, name)
    @type, @name = type, name
    @len = 0
    @content = ''.dup
  end

  # Number of array elements appended so far.
  def length
    @len
  end

  # Appends the source text +str+, which accounts for +num+ elements.
  # Returns the new element count.
  def insert_at_last(num, str)
    @content << str
    @len += num
  end

  # The complete C definition: "static const TYPE\nNAME[LEN] = {\n...};\n".
  def to_s
    "static const #{@type}\n#{@name}[#{@len}] = {\n#{@content}};\n"
  end
end
# Leaf of the lookup tree: wraps the action value chosen for a byte sequence.
# Two Actions are equal (and hash alike) when their values are equal.
class Action
  attr_reader :value

  def initialize(value)
    @value = value
  end

  def hash
    @value.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    @value == other.value
  end
  alias == eql?
end
# Inner edge of the lookup tree: the byte range byte_min..byte_max leads to
# +child_tree+. The hash is computed once at construction and is also used
# as a cheap first check in eql?.
class Branch
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def initialize(byte_min, byte_max, child_tree)
    @byte_min, @byte_max, @child_tree = byte_min, byte_max, child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    return false unless @hash == other.hash
    @byte_min == other.byte_min &&
      @byte_max == other.byte_max &&
      @child_tree == other.child_tree
  end
  alias == eql?
end
class ActionMap
# Converts a mapping (pattern => action) into a flat list of "rects":
# [min_hex, max_hex, action] triples, where min_hex/max_hex are upper-case
# hex strings of equal length giving the per-byte lower and upper bounds.
#
# Accepted patterns:
# * "(empset)"  - contributes nothing
# * "(empstr)"  - the empty byte sequence
# * plain hex   - a single byte sequence
# * whitespace-separated words built from hex bytes and {..} byte sets,
#   e.g. "e0{a0-bf}{80-bf}"; each set is split into maximal contiguous ranges
#   and the words expand to the product of those ranges.
#
# Raises ArgumentError for anything else.
def self.parse_to_rects(mapping)
  rects = []
  mapping.each do |pat, action|
    pat = pat.to_s
    case pat
    when /\A\s*\(empset\)\s*\z/
      next
    when /\A\s*\(empstr\)\s*\z/
      rects << ['', '', action]
    when /\A\s*(#{HEX2}+)\s*\z/o
      hex = $1.upcase
      rects << [hex, hex, action]
    when /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o
      pat.upcase.scan(/\S+/) do |word|
        # One entry per byte position; each entry lists that byte's [lo, hi] ranges.
        ranges_list = []
        word.scan(/#{HEX2}|\{([^\}]*)\}/o) do
          literal, set_body = $&, $1
          alternatives = []
          ranges_list << alternatives
          if !set_body
            alternatives << [literal, literal]
          else
            members = {}
            set_body.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) do |first, last|
              first.to_i(16).upto((last || first).to_i(16)) { |byte| members[byte] = true }
            end
            # Collapse the member bytes into maximal runs of consecutive values.
            members.keys.sort.slice_when { |a, b| b != a + 1 }.each do |run|
              alternatives << ["%02X" % run.first, "%02X" % run.last]
            end
          end
        end
        head, *tail = ranges_list
        head.product(*tail).each do |combo|
          lower = combo.map { |lo, _hi| lo }.join
          upper = combo.map { |_lo, hi| hi }.join
          rects << [lower, upper, action]
        end
      end
    else
      raise ArgumentError, "invalid pattern: #{pat.inspect}"
    end
  end
  rects
end
# Reduces a list of candidate actions to the single action they agree on.
# Duplicates are ignored; if several distinct actions remain, :nomap0 (the
# low-priority no-mapping marker) is discarded before giving up.
# Raises ArgumentError when no unique action is left.
def self.unambiguous_action(actions0)
  candidates = actions0.uniq
  return candidates[0] if candidates.length == 1

  candidates.delete(:nomap0)
  return candidates[0] if candidates.length == 1

  raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
end
# Expands +rects+ into a lookup tree, requiring that every byte sequence
# resolves to exactly one action (see unambiguous_action).
def self.build_tree(rects)
  expand(rects) do |_prefix, actions|
    unambiguous_action(actions)
  end
end
# Builds an ActionMap from a pattern => action mapping.
def self.parse(mapping)
  new(build_tree(parse_to_rects(mapping)))
end
# Merges two or more rect lists into one ActionMap. For every expanded byte
# sequence the given block is called with the prefix and, per input list,
# the array of actions that list contributes; the block's result becomes
# the action stored in the tree.
# Raises ArgumentError when fewer than two lists are given.
def self.merge_rects(*rects_list)
  raise ArgumentError, "not enough arguments" if rects_list.length < 2

  # Tag every action with the index of the list it came from.
  tagged = []
  rects_list.each_with_index do |rects, source|
    rects.each { |min, max, action| tagged << [min, max, [source, action]] }
  end

  tree = expand(tagged) do |prefix, actions|
    per_source = Array.new(rects_list.length) { [] }
    actions.each { |source, action| per_source[source] << action }
    yield(prefix, *per_source)
  end
  self.new(tree)
end
# Like merge_rects, but takes pattern => action mappings and parses each
# of them into rects first.
def self.merge(*mappings, &block)
  rects_list = mappings.map { |mapping| parse_to_rects(mapping) }
  merge_rects(*rects_list, &block)
end
# Two-map variant of merge. Each rect's action is replaced by its index in
# a shared action table; indices below +boundary+ belong to +map1+, the rest
# to +map2+. The block receives (prefix, actions_from_map1, actions_from_map2)
# and returns the action to store.
def self.merge2(map1, map2, &block)
  rects1 = parse_to_rects(map1)
  rects2 = parse_to_rects(map2)
  actions = []
  all_rects = (rects1 + rects2).each do |rect|
    actions << rect[2]
    rect[2] = actions.length - 1
  end
  boundary = rects1.length

  tree = expand(all_rects) do |prefix, indexes|
    from1, from2 = indexes.partition { |i| i < boundary }
    yield(prefix, from1.map { |i| actions[i] }, from2.map { |i| actions[i] })
  end
  self.new(tree)
end
# Entry point of the tree expansion. Rects whose min equals max
# ("singletons") are kept apart from real regions: they are sorted by byte
# sequence and stored reversed in @singleton_rects so that expand_rec can
# consume them in ascending order with Array#pop. The block is passed
# through to expand_rec.
def self.expand(rects, &block)
  #numsing = numreg = 0
  #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
  #puts "#{numsing} singleton mappings and #{numreg} region mappings."
  singletons, regions = rects.partition { |min, max, _action| min == max }
  @singleton_rects = singletons.sort_by { |min, _max, _action| min }.reverse
  tree = expand_rec("", regions, &block)
  @singleton_rects = nil
  tree
end
# Scratch hash shared by expand_rec (as a set of actions) and
# each_firstbyte_range (as a boundary index). Each user empties it again
# (h.clear / index_from.clear) before it can be needed by the other.
TMPHASH = {}
# Recursively builds the subtree for the input that follows +prefix+.
#
# prefix::       textual form of the bytes consumed so far (hex digits, or
#                "{XX-YY}" for a byte range); used to find the pending
#                singletons and in error messages
# region_rects:: region rects with +prefix+'s bytes already stripped off
# block::        called as block.call(prefix, actions) at each leaf; its
#                result is wrapped in an Action
#
# Returns an Action for a leaf, an Array of Branch otherwise, or the empty
# +region_rects+ array itself when nothing is mapped under +prefix+.
# Raises ArgumentError when one pattern ends where another continues.
def self.expand_rec(prefix, region_rects, &block)
# Nothing left: no regions, and the next pending singleton is not under this prefix.
# (The assignment to s_rect here is reused by the condition on the next line.)
return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
# Leaf test: either the pending singleton is exactly as long as the prefix,
# or the first region has no bytes left.
if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
h = TMPHASH
# Consume every pending singleton under this prefix; all must end here.
while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
min, _, action = @singleton_rects.pop
raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
h[action] = true
end
# Likewise every region must be fully consumed at this point.
for min, _, action in region_rects
raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
h[action] = true
end
# h.keys is the list of distinct actions collected above.
tree = Action.new(block.call(prefix, h.keys))
h.clear
else
tree = []
each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
if byte_min == byte_max
prefix2 = prefix + "%02X" % byte_min
else
prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
end
child_tree = expand_rec(prefix2, r_rects2, &block)
tree << Branch.new(byte_min, byte_max, child_tree)
}
end
return tree
end
# Splits the next input byte into ranges that share the same set of
# continuations and yields (byte_min, byte_max, rects) for each range, where
# +rects+ are the region rects covering that range with their first byte
# removed. Pending singletons under +prefix+ are interleaved in ascending
# byte order and always yielded as one-byte ranges.
#
# Raises ArgumentError if a region rect has no byte left at this position.
def self.each_firstbyte_range(prefix, region_rects)
# index_from first acts as the set of range boundaries, later as boundary => index.
index_from = TMPHASH
region_ary = []
region_rects.each {|min, max, action|
raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
# Two hex digits = one byte; the remainder stays in the rect handed down.
min_firstbyte = min[0,2].to_i(16)
min_rest = min[2..-1]
max_firstbyte = max[0,2].to_i(16)
max_rest = max[2..-1]
region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
# A range starts at min and the next one starts just past max.
index_from[min_firstbyte] = true
index_from[max_firstbyte+1] = true
}
byte_from = Array.new(index_from.size)
bytes = index_from.keys
# Descending order, so that Array#pop below walks the boundaries ascending.
bytes.sort!
bytes.reverse!
bytes.each_with_index {|byte, i|
index_from[byte] = i
byte_from[i] = byte
}
# region_rects_ary[i] collects the rects active from boundary byte_from[i]
# up to the next boundary.
region_rects_ary = Array.new(index_from.size) { [] }
region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
region_rects_ary[i] << rest_elt
}
}
# Released before yielding: the yielded block may reach code that uses TMPHASH again.
index_from.clear
r_rects = region_rects_ary.pop
region_byte = byte_from.pop
prev_r_start = region_byte
prev_r_rects = []
# Merge walk over region boundaries and pending singletons under this prefix.
while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
singleton_byte = seq[prefix.length, 2].to_i(16)
min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
# Emit the part of the current region range that lies before the next event.
if prev_r_start < min_byte && !prev_r_rects.empty?
yield prev_r_start, min_byte-1, prev_r_rects
end
if region_byte < singleton_byte
prev_r_start = region_byte
prev_r_rects = r_rects
r_rects = region_rects_ary.pop
region_byte = byte_from.pop
elsif region_byte > singleton_byte
# The singleton falls inside the current range: give it a one-byte range of its own.
yield singleton_byte, singleton_byte, prev_r_rects
prev_r_start = singleton_byte+1
else # region_byte == singleton_byte
prev_r_start = region_byte+1
prev_r_rects = r_rects
r_rects = region_rects_ary.pop
region_byte = byte_from.pop
yield singleton_byte, singleton_byte, prev_r_rects
end
end
# No singleton left under this prefix: flush the remaining region ranges.
while r_rects
if prev_r_start < region_byte && !prev_r_rects.empty?
yield prev_r_start, region_byte-1, prev_r_rects
end
prev_r_start = region_byte
prev_r_rects = r_rects
r_rects = region_rects_ary.pop
region_byte = byte_from.pop
end
# No region left: the remaining singletons under this prefix get empty region lists.
# NOTE(review): this loop never pops @singleton_rects itself; it appears to rely
# on the yielded block consuming the entry (expand_rec pops matching singletons).
while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
singleton_byte = seq[prefix.length, 2].to_i(16)
yield singleton_byte, singleton_byte, []
end
end
# Wraps an already expanded lookup tree: an Action (leaf), or an Array of
# Branch objects as produced by ActionMap.expand.
def initialize(tree)
@tree = tree
end
def inspect
"\#<#{self.class}:" +
@tree.inspect +
">"
end
# Depth of +tree+ in bytes: 0 for an Action leaf, otherwise one more than
# the deepest child reachable through its branches.
def max_input_length_rec(tree)
  return 0 if Action === tree

  child_depths = tree.map { |branch| max_input_length_rec(branch.child_tree) }
  child_depths.max + 1
end
# Length in bytes of the longest input sequence this map can match
# (the depth of @tree, see max_input_length_rec).
def max_input_length
max_input_length_rec(@tree)
end
# The action value when the whole tree is a single Action leaf (i.e. the
# map matches the empty byte sequence); nil otherwise.
def empty_action
  @tree.kind_of?(Action) ? @tree.value : nil
end
# Memo tables used by generate_lookup_node so that identical offset tables
# and identical info tables are emitted only once.
OffsetsMemo = {}
InfosMemo = {}

# Formats the slice offsets[min..max] as C array source text: a "min, max,"
# header line followed by rows of up to 16 right-aligned values, each row
# split into two groups of eight.
def format_offsets(min, max, offsets)
  window = offsets[min..max]
  code = "%d, %d,\n" % [min, max]
  window.each_slice(16) do |row|
    code << " "
    code << row[0, 8].map { |off| "%3d," % off.to_s }.join('')
    if row.length > 8
      code << " "
      code << row[8, 8].map { |off| "%3d," % off.to_s }.join('')
    end
    code << "\n"
  end
  code
end
# C identifiers already handed out by str_name.
UsedName = {}
# Hex byte string => identifier chosen for it (consulted by gen_str).
StrMemo = {}

# Picks a fresh C identifier for the byte string +bytes+ (given as hex).
# Preference order: "str1_" + the identifier-safe characters of the raw
# bytes, then "str1_" + the hex digits, then "str1s_" + the current length
# of the byte array. The choice is recorded in StrMemo and UsedName.
def str_name(bytes)
  fallback_index = @bytes_code.length
  readable = [bytes].pack("H*").gsub(/[^A-Za-z0-9_]/, '')
  candidates = []
  candidates << "str1_" + readable unless readable.empty?
  candidates << "str1_" + bytes
  name = candidates.find { |candidate| !UsedName[candidate] } || "str1s_#{fallback_index}"
  StrMemo[bytes] = name
  UsedName[name] = true
  name
end
# Returns the C macro name that refers to the output byte string +bytes+
# (hex digits). On first use the string is appended to the byte array as a
# length marker followed by its bytes, together with a #define for the name.
def gen_str(bytes)
  known = StrMemo[bytes]
  return known if known

  byte_count = bytes.length/2
  offset = @bytes_code.length
  name = str_name(bytes)
  source = "\#define #{name} makeSTR1(#{offset})\n" +
    " makeSTR1LEN(#{byte_count})," + bytes.gsub(/../, ' 0x\&,') + "\n\n"
  # One element for the length marker plus one per byte.
  @bytes_code.insert_at_last(1 + byte_count, source)
  name
end
# Translates one table entry into the C expression emitted for it.
#
# info:: a Symbol (:nomap, :nomap0, :undef, :invalid, :func_ii, :func_si,
#        :func_io, :func_so), a hex string naming 1 to 259 output bytes,
#        a "funsio(N)" string, or a "/*BYTE_LOOKUP*/name" reference to
#        another lookup node.
#
# Returns the C source text. Raises RuntimeError for anything else.
#
# The order of the +when+ clauses matters: the specific four-byte forms
# (g4, o4) are tried before the generic "four or more bytes" string form.
def generate_info(info)
  case info
  when :nomap, :nomap0
    # :nomap0 is low priority. it never collides.
    "NOMAP"
  when :undef
    "UNDEF"
  when :invalid
    "INVALID"
  when :func_ii
    "FUNii"
  when :func_si
    "FUNsi"
  when :func_io
    "FUNio"
  when :func_so
    "FUNso"
  when /\A(#{HEX2})\z/o
    "o1(0x#$1)"
  when /\A(#{HEX2})(#{HEX2})\z/o
    "o2(0x#$1,0x#$2)"
  when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
    "o3(0x#$1,0x#$2,0x#$3)"
  when /funsio\((\d+)\)/
    "funsio(#{$1})"
  when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
    "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
  when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
    "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
  when /\A(#{HEX2}){4,259}\z/o
    # Longer outputs are stored out of line in the byte array.
    gen_str(info.upcase)
  when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
    # Everything after the marker is the name of the referenced node.
    $'.to_s
  else
    raise "unexpected action: #{info.inspect}"
  end
end
# Formats a list of table entries as rows of right-aligned C expressions:
# four per row when the longest expression has at most 16 characters,
# two per row otherwise.
def format_infos(infos)
  cells = infos.map { |info| generate_info(info) }
  width = cells.map { |cell| cell.length }.max
  per_row = width <= 16 ? 4 : 2
  code = "".dup
  cells.each_slice(per_row) do |row|
    code << " "
    row.each { |cell| code << sprintf(" %#{width}s,", cell) }
    code << "\n"
  end
  code
end
# Emits the C data for one lookup node.
#
# name::  C macro name to define for the node
# table:: array indexed by byte value holding that byte's action; nil entries
#         are treated as :invalid
#
# The node consists of an offsets table in the byte array (byte => index into
# the infos table, restricted to the min..max span of bytes that are not
# :invalid), an infos table in the word array (the distinct actions), and a
# NUM_ELEM_BYTELOOKUP-word entry referring to both. Offsets and infos tables
# identical to ones emitted earlier are reused through OffsetsMemo/InfosMemo.
def generate_lookup_node(name, table)
bytes_code = @bytes_code
words_code = @words_code
offsets = []
infos = []
infomap = {}
min = max = nil
table.each_with_index {|action, byte|
action ||= :invalid
# Track the span of bytes that have a real (non-invalid) action.
if action != :invalid
min = byte if !min
max = byte
end
# Give every distinct action one slot in infos; offsets maps byte => slot.
unless o = infomap[action]
infomap[action] = o = infos.length
infos[o] = action
end
offsets[byte] = o
}
infomap.clear
# Every byte is invalid: fall back to a one-byte span starting at 0.
if !min
min = max = 0
end
offsets_key = [min, max, offsets[min..max]]
if n = OffsetsMemo[offsets_key]
offsets_name = n
else
offsets_name = "#{name}_offsets"
OffsetsMemo[offsets_key] = offsets_name
size = bytes_code.length
# Element count: the min and max header bytes plus one offset per byte in the span.
bytes_code.insert_at_last(2+max-min+1,
"\#define #{offsets_name} #{size}\n" +
format_offsets(min,max,offsets) + "\n")
end
if n = InfosMemo[infos]
infos_name = n
else
infos_name = "#{name}_infos"
InfosMemo[infos] = infos_name
size = words_code.length
words_code.insert_at_last(infos.length,
"\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
format_infos(infos) + "\n")
end
# The node itself: references to its offsets table and its infos table.
size = words_code.length
words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
"\#define #{name} WORDINDEX2INFO(#{size})\n" +
<<"End" + "\n")
#{offsets_name},
#{infos_name},
End
end
# Tree => node name already generated for it, so that identical subtrees
# share a single lookup node.
PreMemo = {}
# Suffix for automatically named nodes ("fun_a", "fun_b", ...). It is advanced
# in place with String#succ!, so it must not be a frozen literal: this file
# enables frozen_string_literal, hence the explicit dup.
NextName = "a".dup

# Generates the lookup node for @tree, first generating the nodes of all
# child trees, and returns the node's C name.
#
# name_hint:: preferred name for the node; children get names derived from
#             it and the byte (range) leading to them. Without a hint an
#             automatic "fun_<suffix>" name is used.
#
# When an equal tree has been generated before, the name chosen then is
# returned and nothing is emitted.
def generate_node(name_hint=nil)
  if n = PreMemo[@tree]
    return n
  end
  table = Array.new(0x100, :invalid)
  @tree.each {|branch|
    byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
    rest = ActionMap.new(child_tree)
    if a = rest.empty_action
      # The child is a leaf: store its action directly.
      table.fill(a, byte_min..byte_max)
    else
      name_hint2 = nil
      if name_hint
        name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
      end
      # The child is a subtree: store a reference to its own lookup node.
      v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
      table.fill(v, byte_min..byte_max)
    end
  }
  if !name_hint
    name_hint = "fun_" + NextName
    NextName.succ!
  end
  PreMemo[@tree] = name_hint
  generate_lookup_node(name_hint, table)
  name_hint
end
# Generates the lookup node for this map into the given byte and word
# arrays and returns its C name. The arrays are only held in instance
# variables for the duration of the call.
def gennode(bytes_code, words_code, name_hint=nil)
  @bytes_code, @words_code = bytes_code, words_code
  node_name = generate_node(name_hint)
  @bytes_code = @words_code = nil
  node_name
end
end
# Converts a Citrus (csid, index) pair of the "mskanji" encoding scheme to
# the multibyte code, returned as a lower-case hex string.
#
# csid 0 returns the index unchanged and csid 1 adds 0x80. csid 2 and 3 take
# a two-byte row/column index and fold it into lead/trail bytes, with a lead
# byte base that depends on csid and row.
# Raises RuntimeError ("invalid byte sequence") for rows or columns outside
# the accepted ranges.
def citrus_mskanji_cstomb(csid, index)
  code =
    case csid
    when 0
      index
    when 1
      index + 0x80
    when 2, 3
      row = index >> 8
      raise "invalid byte sequence" if row < 0x21
      lead_base =
        if csid != 3
          raise "invalid byte sequence" if row > 0x97
          row < 0x5F ? 0x81 : 0xC1
        elsif row <= 0x2F
          (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
        elsif row >= 0x4D && row <= 0x7E
          0xCE
        else
          raise "invalid byte sequence"
        end
      col = index & 0xFF
      raise "invalid byte sequence" if col < 0x21 || col > 0x7E
      row -= 0x21
      col -= 0x21
      if row.even?
        col += 0x40
        col += 1 if col >= 0x7F # skip 0x7F in the trail byte
      else
        col += 0x9F
      end
      ((row / 2 + lead_base) << 8) | col
    end
  code.to_s(16)
end
# Converts a Citrus (csid, index) pair of the "euc" encoding scheme to the
# multibyte code as a lower-case hex string, by OR-ing in the marker bits
# that belong to the character set id.
def citrus_euc_cstomb(csid, index)
  code =
    case csid
    when 0x0000 then index
    when 0x8080 then index | 0x8080
    when 0x0080 then index | 0x8E80
    when 0x8000 then index | 0x8F8080
    end
  code.to_s(16)
end
# Converts a Citrus (csid, index) pair of the "stateless_iso" encoding
# scheme to a lower-case hex string: the index with 0x8080 set, and csid
# placed above the low 16 bits.
def citrus_stateless_iso_cstomb(csid, index)
  code = index | 0x8080 | (csid << 16)
  code.to_s(16)
end
# Dispatches to the converter of the encoding scheme +ces+ ('mskanji',
# 'euc' or 'stateless_iso'). Returns nil for any other scheme.
def citrus_cstomb(ces, csid, index)
  case ces
  when 'mskanji' then citrus_mskanji_cstomb(csid, index)
  when 'euc' then citrus_euc_cstomb(csid, index)
  when 'stateless_iso' then citrus_stateless_iso_cstomb(csid, index)
  end
end
# Sub-directories of the map source tree; a map lives in the first one whose
# name is a prefix of the character set name.
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/

# Loads one or more Citrus mapping source files and returns the mapping as
# an array of pairs.
#
# ces::     encoding scheme name passed on to citrus_cstomb
# csid::    character set id passed on to citrus_cstomb
# mapsrcs:: comma-separated map names such as "JISX0208:1990/UCS" or
#           "UCS/JISX0208:1990"; a name starting with "UCS" maps from
#           Unicode, any other name maps to Unicode
#
# The file read is "$srcdir/<SUBDIR>/<name with ':' replaced by '@' and the
# last '/' replaced by '%'>.src". Only the lines between BEGIN_MAP and
# END_MAP are used.
#
# Returns [[code point, hex string], ...] for maps from Unicode and
# [[hex string, code point], ...] for maps to Unicode.
# Raises RuntimeError for a map line in an unknown notation.
def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      # Skip the "UCS..." part and the separator that follows it.
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find{|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    # The plane number goes above the low 16 bits of the code point.
    plane <<= 16
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    path[path.rindex('/')] = '%'
    STDOUT.puts 'load mapsrc %s' % path if VERBOSE_MODE > 1
    # File.open rather than Kernel#open: the argument is always a file name
    # and must never be interpreted as a command.
    File.open(path, 'rb') do |f|
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        end
      end
    end
  end
  return table
end
# Reads the UCM file "$srcdir/ucm/<path>" and returns [to_ucs, from_ucs]:
#
# to_ucs::   [[byte string as hex, Unicode], ...] taken from lines whose
#            fallback indicator is 0 or 3
# from_ucs:: [[Unicode, byte string as hex], ...] taken from lines whose
#            fallback indicator is 0 or 1
#
# "Unicode" is an Integer code point for a single <Uxxxx> entry and the
# UTF-8 encoding as a hex string for a sequence of several <Uxxxx> entries.
# Identity mappings of code points below 128 are skipped; lines that match
# neither form are ignored.
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc = bs = fb = nil
    if m = /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/.match(line)
      uc = m[1].hex
      bs = m[2].delete('x\\')
      fb = m[3].to_i
      next if uc < 128 && uc == bs.hex
    elsif m = /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/.match(line)
      # The captures are read from the MatchData, not from $1..$3: the scan
      # below replaces the last-match globals with its own result.
      uc = m[1].scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = m[2].delete('x\\')
      fb = m[3].to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end
# Returns a copy of +map+ as an array of [key, value] pairs in which every
# Integer (a Unicode code point) is replaced by the upper-case hex string of
# its UTF-8 encoding. Other keys and values are passed through unchanged.
def encode_utf8(map)
  utf8_hex = lambda do |code|
    # integer means UTF-8 encoded sequence.
    Integer === code ? [code].pack("U").unpack("H*")[0].upcase : code
  end
  map.map { |k, v| [utf8_hex[k], utf8_hex[v]] }
end
# Sentinel default meaning "look the valid byte pattern up in ValidEncoding
# by source encoding name" (as opposed to nil/false: do not check at all).
UnspecifiedValidEncoding = Object.new

# Compiles +map+ into lookup nodes in the generated byte and word arrays.
#
# name::           name hint for the root node
# from::           source encoding name, used to look up the valid byte
#                  pattern when +valid_encoding+ is UnspecifiedValidEncoding
# map::            [from_code, to_code] pairs; the first mapping for a given
#                  source code wins
# valid_encoding:: byte pattern of valid input, or nil/false to skip the check
#
# With a valid byte pattern, valid input without a mapping becomes :undef,
# and a mapping for input outside the pattern raises RuntimeError.
#
# Returns [name of the root node, maximum input length in bytes].
def transcode_compile_tree(name, from, map, valid_encoding)
  first_wins = {}
  encode_utf8(map).each do |k, v|
    first_wins[k] ||= v # use first mapping
  end
  if valid_encoding.equal? UnspecifiedValidEncoding
    valid_encoding = ValidEncoding.fetch(from)
  end
  am =
    if valid_encoding
      ActionMap.merge2(first_wins, {valid_encoding => :undef}) do |prefix, as1, as2|
        mapped = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
        valid = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
        raise "invalid mapping: #{prefix}" if !valid
        mapped || valid
      end
    else
      ActionMap.parse(first_wins)
    end
  first_wins.clear
  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  return defined_name, max_input
end
# Names of the rb_transcoder structures generated so far.
TRANSCODERS = []
# C source of the rb_transcoder definitions generated so far.
TRANSCODE_GENERATED_TRANSCODER_CODE = ''.dup

# Generates only the lookup tables for a conversion from +from+ to +to+,
# without defining or registering a transcoder.
#
# Returns [map, tree_name, real_tree_name, max_input], where tree_name is
# derived from the encoding names and real_tree_name is the name
# transcode_compile_tree actually reports for the root node.
def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE > 1
    message =
      if from.empty? || to.empty?
        "converter for #{from.empty? ? to : from}"
      else
        "converter from #{from} to #{to}"
      end
    STDOUT.puts message
  end
  # Every character that cannot appear in a C identifier becomes "_".
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  tree_name =
    if from == "UTF-8"
      "to_#{id_to}"
    elsif to == "UTF-8"
      "from_#{id_from}"
    else
      "from_#{id_from}_to_#{id_to}"
    end
  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  return map, tree_name, real_tree_name, max_input
end
#
# call-seq:
# transcode_tblgen(from_name, to_name, map [, valid_encoding_check [, ascii_compatibility]]) -> ''
#
# Returns an empty string just in case the result is used somewhere.
# Stores the actual product for later output with transcode_generated_code and
# transcode_register_code.
#
# The first argument is a string that will be used for the source (from) encoding.
# The second argument is a string that will be used for the target (to) encoding.
#
# The third argument is the actual data, a map represented as an array of two-element
# arrays. Each element of the array stands for one character being converted. The
# first element of each subarray is the code of the character in the source encoding,
# the second element of each subarray is the code of the character in the target encoding.
#
# Each code (i.e. byte sequence) is represented as a string of hexadecimal characters
# of even length. Codes can also be represented as integers (usually in the form 0x...),
# in which case they are interpreted as Unicode codepoints encoded in UTF-8. So as
# an example, 0x677E is the same as "E69DBE" (but somewhat easier to produce and check).
#
# In addition, the following symbols can also be used instead of actual codes in the
# second element of a subarray:
# :nomap (no mapping, just copy input to output), :nomap0 (same as :nomap, but low priority),
# :undef (input code undefined in the destination encoding),
# :invalid (input code is an invalid byte sequence in the source encoding),
# :func_ii, :func_si, :func_io, :func_so (conversion by function with specific call
# convention).
#
# The fourth argument specifies the overall structure of the encoding. For examples,
# see ValidEncoding below. This is used to cross-check the data in the third argument
# and to automatically add :undef and :invalid mappings where necessary.
#
# The fifth argument gives the ascii-compatibility of the transcoding. See
# rb_transcoder_asciicompat_type_t in transcode_data.h for details. In most
# cases, this argument can be left out.
#
# The max_output field of the generated transcoder is the length in bytes of the
# longest target code in the map: half the number of hex digits for a string,
# the length of the UTF-8 encoding for an integer, and 1 for a symbol.
#
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding,
                     ascii_compatibility='asciicompat_converter')
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  max_output = map.map {|k,v|
    case v
    when String
      v.length/2
    when Integer
      # An integer target is emitted as the UTF-8 encoding of that code point.
      [v].pack("U").bytesize
    else
      1
    end
  }.max
  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
#{c_esc from}, #{c_esc to}, #{real_tree_name},
TRANSCODE_TABLE_INFO,
#{input_unit_length}, /* input_unit_length */
#{max_input}, /* max_input */
#{max_output}, /* max_output */
#{ascii_compatibility}, /* asciicompat_type */
0, 0, 0, /* state_size, state_init, state_fini */
0, 0, 0, 0,
0, 0, 0
};
End
  TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
  ''
end
# Generates the lookup nodes of the ActionMap +am+ into the generated byte
# and word arrays. Returns an empty string so that the call can be embedded
# in a template without producing output.
def transcode_generate_node(am, name_hint=nil)
  if VERBOSE_MODE > 1
    STDOUT.puts "converter for #{name_hint}"
  end
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end
# Returns all C source generated so far: the byte array, the word array,
# the TRANSCODE_TABLE_INFO macro describing both arrays, and the
# rb_transcoder definitions.
def transcode_generated_code
  bytes = TRANSCODE_GENERATED_BYTES_CODE
  words = TRANSCODE_GENERATED_WORDS_CODE
  table_info = "\#define TRANSCODE_TABLE_INFO " \
    "#{OUTPUT_PREFIX}byte_array, #{bytes.length}, " \
    "#{OUTPUT_PREFIX}word_array, #{words.length}, " \
    "((int)sizeof(unsigned int))\n"
  bytes.to_s + words.to_s + table_info + TRANSCODE_GENERATED_TRANSCODER_CODE
end
# Returns the C statements that register every generated transcoder,
# one rb_register_transcoder call per line.
def transcode_register_code
  TRANSCODERS.each_with_object(''.dup) do |transcoder_name, code|
    code << " rb_register_transcoder(&#{transcoder_name});\n"
  end
end
# Input unit length in bytes per source encoding name; encodings that are
# not listed use 1.
UnitLength = Hash.new(1).update(
  'UTF-16BE' => 2,
  'UTF-16LE' => 2,
  'UTF-32BE' => 4,
  'UTF-32LE' => 4
)
# Valid byte sequences per encoding name (or generic label such as '1byte'),
# written in the pattern syntax understood by ActionMap.parse_to_rects:
# whitespace-separated alternatives made of hex bytes and {..} byte sets.
# transcode_compile_tree uses these to cross-check the mapping tables;
# set_valid_byte_pattern adds further entries.
ValidEncoding = {
'1byte' => '{00-ff}',
'2byte' => '{00-ff}{00-ff}',
'4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
'US-ASCII' => '{00-7f}',
'UTF-8' => '{00-7f}
{c2-df}{80-bf}
e0{a0-bf}{80-bf}
{e1-ec}{80-bf}{80-bf}
ed{80-9f}{80-bf}
{ee-ef}{80-bf}{80-bf}
f0{90-bf}{80-bf}{80-bf}
{f1-f3}{80-bf}{80-bf}{80-bf}
f4{80-8f}{80-bf}{80-bf}',
'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
{d8-db}{00-ff}{dc-df}{00-ff}',
'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
{00-ff}{d8-db}{00-ff}{dc-df}',
'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
00{01-10}{00-ff}{00-ff}',
'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
{00-ff}{00-ff}{01-10}00',
'EUC-JP' => '{00-7f}
{a1-fe}{a1-fe}
8e{a1-fe}
8f{a1-fe}{a1-fe}',
'CP51932' => '{00-7f}
{a1-fe}{a1-fe}
8e{a1-fe}',
'EUC-JIS-2004' => '{00-7f}
{a1-fe}{a1-fe}
8e{a1-fe}
8f{a1-fe}{a1-fe}',
'Shift_JIS' => '{00-7f}
{81-9f,e0-fc}{40-7e,80-fc}
{a1-df}',
'EUC-KR' => '{00-7f}
{a1-fe}{a1-fe}',
'CP949' => '{00-7f}
{81-fe}{41-5a,61-7a,81-fe}',
'Big5' => '{00-7f}
{81-fe}{40-7e,a1-fe}',
'EUC-TW' => '{00-7f}
{a1-fe}{a1-fe}
8e{a1-b0}{a1-fe}{a1-fe}',
'GBK' => '{00-80}
{81-fe}{40-7e,80-fe}',
'GB18030' => '{00-7f}
{81-fe}{40-7e,80-fe}
{81-fe}{30-39}{81-fe}{30-39}',
}
# Returns the valid byte pattern registered for the encoding name +enc+.
# Raises KeyError (from Hash#fetch) when none is registered.
def ValidEncoding(enc)
ValidEncoding.fetch(enc)
end
# Registers the valid byte pattern of +encoding+.
#
# pattern_or_label:: either a name already present in ValidEncoding, whose
#                    pattern is then shared, or a literal pattern
#
# Registering the same pattern again is allowed; trying to replace an
# existing pattern with a different one raises ArgumentError.
# Returns the registered pattern.
def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern = ValidEncoding[pattern_or_label] || pattern_or_label
  current = ValidEncoding[encoding]
  if current && current != pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{current} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end
# The following patterns may be used in different places, so keep them here
# for the moment. Each encoding shares the pattern of an existing
# ValidEncoding entry.
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'
# Builds the signature text recorded in a generated file for one of its
# sources: the dumped file name, the content length and String#sum of the
# content. It is compared on later runs to detect changed sources.
def make_signature(filename, src)
  format("src=%s, len=%s, checksum=%s", filename.dump, src.length, src.sum)
end
# Command line entry point:
#
#   transcode-tblgen.rb [--verbose] [--force] [--output=FILE] TEMPLATE
#
# Evaluates the ERB template TEMPLATE (which calls the transcode_* helpers
# defined above) and writes the result to FILE, or to standard output when no
# output file is given. The output starts with a block of signature comments
# for this script, the template and the libraries the template loaded; when
# the signatures in an existing output file still match, the file is only
# touched and generation is skipped.
if __FILE__ == $0
start_time = Time.now
output_filename = nil
verbose_mode = 0
force_mode = false
op = OptionParser.new
op.def_option("--help", "show help message") { puts op; exit 0 }
op.def_option("--verbose", "verbose mode, twice for more verbose") { verbose_mode += 1 }
op.def_option("--force", "force table generation") { force_mode = true }
op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
op.parse!
VERBOSE_MODE = verbose_mode
OUTPUT_FILENAME = output_filename
# Prefix for the generated C array names: the leading identifier characters
# of the output file name, without leading underscores and ending in exactly
# one underscore.
OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "".dup
OUTPUT_PREFIX.sub!(/\A_+/, '')
OUTPUT_PREFIX.sub!(/_*\z/, '_')
TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")
# The remaining argument is the template; its directory becomes the source
# directory for map files and is added to the load path.
arg = ARGV.shift
$srcdir = File.dirname(arg)
$:.unshift $srcdir unless $:.include? $srcdir
src = File.read(arg)
src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
this_script = File.read(__FILE__)
this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding
base_signature = "/* autogenerated. */\n".dup
base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
# Up-to-date check: rebuild the expected signature block from the sources
# named in the existing output file and compare it with what the file contains.
if !force_mode && output_filename && File.readable?(output_filename)
# gets("") reads in paragraph mode, i.e. the leading signature block up to the first blank line.
old_signature = File.open(output_filename) {|f| f.gets("").chomp }
chk_signature = base_signature.dup
old_signature.each_line {|line|
if %r{/\* src="([0-9a-z_.-]+)",} =~ line
name = $1
# This script and the template are already part of base_signature.
next if name == File.basename(arg) || name == File.basename(__FILE__)
path = File.join($srcdir, name)
if File.readable? path
chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
end
end
}
if old_signature == chk_signature
# Nothing changed: just refresh the timestamps of the output file.
now = Time.now
File.utime(now, now, output_filename)
STDOUT.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE > 0
exit
end
end
if VERBOSE_MODE > 0
if output_filename
STDOUT.puts "generating #{output_filename} ..."
end
end
# Snapshot of the loaded features, used below to find what the template required.
libs1 = $".dup
if ERB.instance_method(:initialize).parameters.assoc(:key) # Ruby 2.6+
erb = ERB.new(src, trim_mode: '%')
else
erb = ERB.new(src, nil, '%')
end
erb.filename = arg
erb_result = erb.result(binding)
libs2 = $".dup
libs = libs2 - libs1
# Add a signature for every newly loaded library that lives in the source directory.
lib_sigs = ''.dup
libs.each {|lib|
lib = File.basename(lib)
path = File.join($srcdir, lib)
if File.readable? path
lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
end
}
result = ''.dup
result << base_signature
result << lib_sigs
result << "\n"
result << erb_result
result << "\n"
if output_filename
# Write to a temporary name first and rename it over the output file.
new_filename = output_filename + ".new"
FileUtils.mkdir_p(File.dirname(output_filename))
File.open(new_filename, "wb") {|f| f << result }
File.rename(new_filename, output_filename)
tms = Process.times
elapsed = Time.now - start_time
STDOUT.puts "done. (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE > 1
else
print result
end
end