mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
45741918e1
The reason why this was commented out was because of gperf 3.0 vs 3.1 differences (see [Feature #13883]). Five years passed, I am pretty confident that we can drop support of old versions here. Ditto for uniname2ctype_p(), onig_jis_property(), and zonetab().
574 lines
16 KiB
Ruby
Executable file
574 lines
16 KiB
Ruby
Executable file
#!/usr/bin/env ruby
|
|
|
|
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
|
|
# property names and POSIX character classes
|
|
#
|
|
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
|
|
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
|
|
# DerivedAge.txt and Blocks.txt from unicode.org.
|
|
# (http://unicode.org/Public/UNIDATA/) And run following command.
|
|
# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
|
|
# You can get source file for gperf. After this, simply make ruby.
|
|
|
|
if ARGV[0] == "--header"
|
|
header = true
|
|
ARGV.shift
|
|
end
|
|
unless ARGV.size == 2
|
|
abort "Usage: #{$0} data_directory emoji_data_directory"
|
|
end
|
|
|
|
pat = /(?:\A|\/)([.\d]+)\z/
|
|
$versions = {
|
|
:Unicode => ARGV[0][pat, 1],
|
|
:Emoji => ARGV[1][pat, 1],
|
|
}
|
|
|
|
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct]
|
|
|
|
def pair_codepoints(codepoints)
|
|
|
|
# We have a sorted Array of codepoints that we wish to partition into
|
|
# ranges such that the start- and endpoints form an inclusive set of
|
|
# codepoints with property _property_. Note: It is intended that some ranges
|
|
# will begin with the value with which they end, e.g. 0x0020 -> 0x0020
|
|
|
|
codepoints.sort!
|
|
last_cp = codepoints.first
|
|
pairs = [[last_cp, nil]]
|
|
codepoints[1..-1].each do |codepoint|
|
|
next if last_cp == codepoint
|
|
|
|
# If the current codepoint does not follow directly on from the last
|
|
# codepoint, the last codepoint represents the end of the current range,
|
|
# and the current codepoint represents the start of the next range.
|
|
if last_cp.next != codepoint
|
|
pairs[-1][-1] = last_cp
|
|
pairs << [codepoint, nil]
|
|
end
|
|
last_cp = codepoint
|
|
end
|
|
|
|
# The final pair has as its endpoint the last codepoint for this property
|
|
pairs[-1][-1] = codepoints.last
|
|
pairs
|
|
end
|
|
|
|
def parse_unicode_data(file)
|
|
last_cp = 0
|
|
data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
|
|
'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
|
|
beg_cp = nil
|
|
IO.foreach(file) do |line|
|
|
fields = line.split(';')
|
|
cp = fields[0].to_i(16)
|
|
|
|
case fields[1]
|
|
when /\A<(.*),\s*First>\z/
|
|
beg_cp = cp
|
|
next
|
|
when /\A<(.*),\s*Last>\z/
|
|
cps = (beg_cp..cp).to_a
|
|
else
|
|
beg_cp = cp
|
|
cps = [cp]
|
|
end
|
|
|
|
# The Cn category represents unassigned characters. These are not listed in
|
|
# UnicodeData.txt so we must derive them by looking for 'holes' in the range
|
|
# of listed codepoints. We increment the last codepoint seen and compare it
|
|
# with the current codepoint. If the current codepoint is less than
|
|
# last_cp.next we have found a hole, so we add the missing codepoint to the
|
|
# Cn category.
|
|
data['Cn'].concat((last_cp.next...beg_cp).to_a)
|
|
|
|
# Assigned - Defined in unicode.c; interpreted as every character in the
|
|
# Unicode range minus the unassigned characters
|
|
data['Assigned'].concat(cps)
|
|
|
|
# The third field denotes the 'General' category, e.g. Lu
|
|
(data[fields[2]] ||= []).concat(cps)
|
|
|
|
# The 'Major' category is the first letter of the 'General' category, e.g.
|
|
# 'Lu' -> 'L'
|
|
(data[fields[2][0,1]] ||= []).concat(cps)
|
|
last_cp = cp
|
|
end
|
|
|
|
# The last Cn codepoint should be 0x10ffff. If it's not, append the missing
|
|
# codepoints to Cn and C
|
|
cn_remainder = (last_cp.next..0x10ffff).to_a
|
|
data['Cn'] += cn_remainder
|
|
data['C'] += data['Cn']
|
|
|
|
# Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
|
|
data['LC'] = data['Ll'] + data['Lt'] + data['Lu']
|
|
|
|
# Define General Category properties
|
|
gcps = data.keys.sort - POSIX_NAMES
|
|
|
|
# Returns General Category Property names and the data
|
|
[gcps, data]
|
|
end
|
|
|
|
def define_posix_props(data)
|
|
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
|
|
#
|
|
|
|
data['Alpha'] = data['Alphabetic']
|
|
data['Upper'] = data['Uppercase']
|
|
data['Lower'] = data['Lowercase']
|
|
data['Punct'] = data['Punctuation']
|
|
data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e]
|
|
data['Digit'] = data['Decimal_Number']
|
|
data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
|
|
(0x0061..0x0066).to_a
|
|
data['Alnum'] = data['Alpha'] + data['Digit']
|
|
data['Space'] = data['White_Space']
|
|
data['Blank'] = data['Space_Separator'] + [0x0009]
|
|
data['Cntrl'] = data['Cc']
|
|
data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
|
|
data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
|
|
data['Surrogate'] - data['Unassigned']
|
|
data['Print'] = data['Graph'] + data['Space_Separator']
|
|
end
|
|
|
|
def parse_scripts(data, categories)
|
|
files = [
|
|
{:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
|
|
{:fn => 'Scripts.txt', :title => 'Script'},
|
|
{:fn => 'PropList.txt', :title => 'Binary Property'},
|
|
{:fn => 'emoji/emoji-data.txt', :title => 'Emoji'}
|
|
]
|
|
current = nil
|
|
cps = []
|
|
names = {}
|
|
files.each do |file|
|
|
data_foreach(file[:fn]) do |line|
|
|
if /^# Total (?:code points|elements): / =~ line
|
|
data[current] = cps
|
|
categories[current] = file[:title]
|
|
(names[file[:title]] ||= []) << current
|
|
cps = []
|
|
elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
|
|
current = $3
|
|
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
|
|
end
|
|
end
|
|
end
|
|
# All code points not explicitly listed for Script
|
|
# have the value Unknown (Zzzz).
|
|
data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
|
|
categories['Unknown'] = 'Script'
|
|
names.values.flatten << 'Unknown'
|
|
end
|
|
|
|
def parse_aliases(data)
|
|
kv = {}
|
|
data_foreach('PropertyAliases.txt') do |line|
|
|
next unless /^(\w+)\s*; (\w+)/ =~ line
|
|
data[$1] = data[$2]
|
|
kv[normalize_propname($1)] = normalize_propname($2)
|
|
end
|
|
data_foreach('PropertyValueAliases.txt') do |line|
|
|
next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
|
|
if $1 == 'gc'
|
|
data[$3] = data[$2]
|
|
data[$4] = data[$2]
|
|
kv[normalize_propname($3)] = normalize_propname($2)
|
|
kv[normalize_propname($4)] = normalize_propname($2) if $4
|
|
else
|
|
data[$2] = data[$3]
|
|
data[$4] = data[$3]
|
|
kv[normalize_propname($2)] = normalize_propname($3)
|
|
kv[normalize_propname($4)] = normalize_propname($3) if $4
|
|
end
|
|
end
|
|
kv
|
|
end
|
|
|
|
# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
|
|
# never involves any additions to the character repertoire." Versions
|
|
# in DerivedAge.txt should always be /\d+\.\d+/
|
|
def parse_age(data)
|
|
current = nil
|
|
last_constname = nil
|
|
cps = []
|
|
ages = []
|
|
data_foreach('DerivedAge.txt') do |line|
|
|
if /^# Total code points: / =~ line
|
|
constname = constantize_agename(current)
|
|
# each version matches all previous versions
|
|
cps.concat(data[last_constname]) if last_constname
|
|
data[constname] = cps
|
|
make_const(constname, cps, "Derived Age #{current}")
|
|
ages << current
|
|
last_constname = constname
|
|
cps = []
|
|
elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
|
|
current = $3
|
|
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
|
|
end
|
|
end
|
|
ages
|
|
end
|
|
|
|
def parse_GraphemeBreakProperty(data)
|
|
current = nil
|
|
cps = []
|
|
ages = []
|
|
data_foreach('auxiliary/GraphemeBreakProperty.txt') do |line|
|
|
if /^# Total code points: / =~ line
|
|
constname = constantize_Grapheme_Cluster_Break(current)
|
|
data[constname] = cps
|
|
make_const(constname, cps, "Grapheme_Cluster_Break=#{current}")
|
|
ages << current
|
|
cps = []
|
|
elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
|
|
current = $3
|
|
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
|
|
end
|
|
end
|
|
ages
|
|
end
|
|
|
|
def parse_block(data)
|
|
cps = []
|
|
blocks = []
|
|
data_foreach('Blocks.txt') do |line|
|
|
if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
|
|
cps = ($1.to_i(16)..$2.to_i(16)).to_a
|
|
constname = constantize_blockname($3)
|
|
data[constname] = cps
|
|
make_const(constname, cps, "Block")
|
|
blocks << constname
|
|
end
|
|
end
|
|
|
|
# All code points not belonging to any of the named blocks
|
|
# have the value No_Block.
|
|
no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
|
|
constname = constantize_blockname("No_Block")
|
|
make_const(constname, no_block, "Block")
|
|
blocks << constname
|
|
end
|
|
|
|
# shim for Ruby 1.8
|
|
unless {}.respond_to?(:key)
|
|
class Hash
|
|
alias key index
|
|
end
|
|
end
|
|
|
|
$const_cache = {}
|
|
# make_const(property, pairs, name): Prints a 'static const' structure for a
|
|
# given property, group of paired codepoints, and a human-friendly name for
|
|
# the group
|
|
def make_const(prop, data, name)
|
|
if name.empty?
|
|
puts "\n/* '#{prop}' */"
|
|
else
|
|
puts "\n/* '#{prop}': #{name} */"
|
|
end
|
|
if origprop = $const_cache.key(data)
|
|
puts "#define CR_#{prop} CR_#{origprop}"
|
|
else
|
|
$const_cache[prop] = data
|
|
pairs = pair_codepoints(data)
|
|
puts "static const OnigCodePoint CR_#{prop}[] = {"
|
|
# The first element of the constant is the number of pairs of codepoints
|
|
puts "\t#{pairs.size},"
|
|
pairs.each do |pair|
|
|
pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
|
|
puts "\t#{pair.first}, #{pair.last},"
|
|
end
|
|
puts "}; /* CR_#{prop} */"
|
|
end
|
|
end
|
|
|
|
def normalize_propname(name)
|
|
name = name.downcase
|
|
name.delete!('- _')
|
|
name
|
|
end
|
|
|
|
def constantize_agename(name)
|
|
"Age_#{name.sub(/\./, '_')}"
|
|
end
|
|
|
|
def constantize_Grapheme_Cluster_Break(name)
|
|
"Grapheme_Cluster_Break_#{name}"
|
|
end
|
|
|
|
def constantize_blockname(name)
|
|
"In_#{name.gsub(/\W/, '_')}"
|
|
end
|
|
|
|
def get_file(name)
|
|
File.join(ARGV[name.start_with?("emoji-[stz]") ? 1 : 0], name)
|
|
end
|
|
|
|
def data_foreach(name, &block)
|
|
fn = get_file(name)
|
|
warn "Reading #{name}"
|
|
File.open(fn, 'rb') do |f|
|
|
if /^emoji/ =~ name
|
|
line = f.gets("")
|
|
# Headers till Emoji 13 or 15
|
|
version = line[/^# #{Regexp.quote(File.basename(name))}.*(?:^# Version:|Emoji Version) ([\d.]+)/m, 1]
|
|
type = :Emoji
|
|
else
|
|
# Headers since Emoji 14 or other Unicode data
|
|
line = f.gets("\n")
|
|
type = :Unicode
|
|
end
|
|
version ||= line[/^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/, 1]
|
|
unless version
|
|
raise ArgumentError, <<-ERROR
|
|
#{name}: no #{type} version
|
|
#{line.gsub(/^/, '> ')}
|
|
ERROR
|
|
end
|
|
if !(v = $versions[type])
|
|
$versions[type] = version
|
|
elsif v != version and "#{v}.0" != version
|
|
raise ArgumentError, <<-ERROR
|
|
#{name}: #{type} version mismatch: #{version} to #{v}
|
|
#{line.gsub(/^/, '> ')}
|
|
ERROR
|
|
end
|
|
f.each(&block)
|
|
end
|
|
end
|
|
|
|
# Write Data
|
|
class Unifdef
|
|
attr_accessor :output, :top, :stack, :stdout, :kwdonly
|
|
def initialize(out)
|
|
@top = @output = []
|
|
@stack = []
|
|
$stdout, @stdout = self, out
|
|
end
|
|
def restore
|
|
$stdout = @stdout
|
|
end
|
|
def ifdef(sym)
|
|
if @kwdonly
|
|
@stdout.puts "#ifdef #{sym}"
|
|
else
|
|
@stack << @top
|
|
@top << tmp = [sym]
|
|
@top = tmp
|
|
end
|
|
if block_given?
|
|
begin
|
|
return yield
|
|
ensure
|
|
endif(sym)
|
|
end
|
|
end
|
|
end
|
|
def endif(sym)
|
|
if @kwdonly
|
|
@stdout.puts "#endif /* #{sym} */"
|
|
else
|
|
unless sym == @top[0]
|
|
restore
|
|
raise ArgumentError, "#{sym} unmatch to #{@top[0]}"
|
|
end
|
|
@top = @stack.pop
|
|
end
|
|
end
|
|
def show(dest, *syms)
|
|
_show(dest, @output, syms)
|
|
end
|
|
def _show(dest, ary, syms)
|
|
if Symbol === (sym = ary[0])
|
|
unless syms.include?(sym)
|
|
return
|
|
end
|
|
end
|
|
ary.each do |e|
|
|
case e
|
|
when Array
|
|
_show(dest, e, syms)
|
|
when String
|
|
dest.print e
|
|
end
|
|
end
|
|
end
|
|
def write(str)
|
|
if @kwdonly
|
|
@stdout.write(str)
|
|
else
|
|
@top << str
|
|
end
|
|
self
|
|
end
|
|
alias << write
|
|
end
|
|
|
|
output = Unifdef.new($stdout)
|
|
output.kwdonly = !header
|
|
|
|
puts '%{'
|
|
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
|
|
categories = {}
|
|
props.concat parse_scripts(data, categories)
|
|
aliases = parse_aliases(data)
|
|
ages = blocks = graphemeBreaks = nil
|
|
define_posix_props(data)
|
|
POSIX_NAMES.each do |name|
|
|
if name == 'XPosixPunct'
|
|
make_const(name, data[name], "[[:Punct:]]")
|
|
elsif name == 'Punct'
|
|
make_const(name, data[name], "")
|
|
else
|
|
make_const(name, data[name], "[[:#{name}:]]")
|
|
end
|
|
end
|
|
output.ifdef :USE_UNICODE_PROPERTIES do
|
|
props.each do |name|
|
|
category = categories[name] ||
|
|
case name.size
|
|
when 1 then 'Major Category'
|
|
when 2 then 'General Category'
|
|
else '-'
|
|
end
|
|
make_const(name, data[name], category)
|
|
end
|
|
output.ifdef :USE_UNICODE_AGE_PROPERTIES do
|
|
ages = parse_age(data)
|
|
end
|
|
graphemeBreaks = parse_GraphemeBreakProperty(data)
|
|
blocks = parse_block(data)
|
|
end
|
|
puts(<<'__HEREDOC')
|
|
|
|
static const OnigCodePoint* const CodeRanges[] = {
|
|
__HEREDOC
|
|
POSIX_NAMES.each{|name|puts" CR_#{name},"}
|
|
output.ifdef :USE_UNICODE_PROPERTIES do
|
|
props.each{|name| puts" CR_#{name},"}
|
|
output.ifdef :USE_UNICODE_AGE_PROPERTIES do
|
|
ages.each{|name| puts" CR_#{constantize_agename(name)},"}
|
|
end
|
|
graphemeBreaks.each{|name| puts" CR_#{constantize_Grapheme_Cluster_Break(name)},"}
|
|
blocks.each{|name|puts" CR_#{name},"}
|
|
end
|
|
|
|
puts(<<'__HEREDOC')
|
|
};
|
|
struct uniname2ctype_struct {
|
|
short name;
|
|
unsigned short ctype;
|
|
};
|
|
#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
|
|
|
|
static const struct uniname2ctype_struct *uniname2ctype_p(register const char *str, register size_t len);
|
|
%}
|
|
struct uniname2ctype_struct;
|
|
%%
|
|
__HEREDOC
|
|
|
|
i = -1
|
|
name_to_index = {}
|
|
POSIX_NAMES.each do |name|
|
|
i += 1
|
|
next if name == 'NEWLINE'
|
|
name = normalize_propname(name)
|
|
name_to_index[name] = i
|
|
puts"%-40s %3d" % [name + ',', i]
|
|
end
|
|
output.ifdef :USE_UNICODE_PROPERTIES do
|
|
props.each do |name|
|
|
i += 1
|
|
name = normalize_propname(name)
|
|
name_to_index[name] = i
|
|
puts "%-40s %3d" % [name + ',', i]
|
|
end
|
|
aliases.each_pair do |k, v|
|
|
next if name_to_index[k]
|
|
next unless v = name_to_index[v]
|
|
puts "%-40s %3d" % [k + ',', v]
|
|
end
|
|
output.ifdef :USE_UNICODE_AGE_PROPERTIES do
|
|
ages.each do |name|
|
|
i += 1
|
|
name = "age=#{name}"
|
|
name_to_index[name] = i
|
|
puts "%-40s %3d" % [name + ',', i]
|
|
end
|
|
end
|
|
graphemeBreaks.each do |name|
|
|
i += 1
|
|
name = "graphemeclusterbreak=#{name.delete('_').downcase}"
|
|
name_to_index[name] = i
|
|
puts "%-40s %3d" % [name + ',', i]
|
|
end
|
|
blocks.each do |name|
|
|
i += 1
|
|
name = normalize_propname(name)
|
|
name_to_index[name] = i
|
|
puts "%-40s %3d" % [name + ',', i]
|
|
end
|
|
end
|
|
puts(<<'__HEREDOC')
|
|
%%
|
|
static int
|
|
uniname2ctype(const UChar *name, unsigned int len)
|
|
{
|
|
const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
|
|
if (p) return p->ctype;
|
|
return -1;
|
|
}
|
|
__HEREDOC
|
|
$versions.each do |type, ver|
|
|
name = type == :Unicode ? "ONIG_UNICODE_VERSION" : "ONIG_UNICODE_EMOJI_VERSION"
|
|
versions = ver.scan(/\d+/)
|
|
print("#if defined #{name}_STRING && !( \\\n")
|
|
versions.zip(%w[MAJOR MINOR TEENY]) do |v, n|
|
|
print(" #{name}_#{n} == #{v} && \\\n")
|
|
end
|
|
print(" 1)\n")
|
|
print("# error #{name}_STRING mismatch\n")
|
|
print("#endif\n")
|
|
print("#define #{name}_STRING #{ver.dump}\n")
|
|
versions.zip(%w[MAJOR MINOR TEENY]) do |v, n|
|
|
print("#define #{name}_#{n} #{v}\n")
|
|
end
|
|
end
|
|
|
|
output.restore
|
|
|
|
if header
|
|
require 'tempfile'
|
|
|
|
NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p]
|
|
|
|
fds = []
|
|
syms = %i[USE_UNICODE_PROPERTIES USE_UNICODE_AGE_PROPERTIES]
|
|
begin
|
|
fds << (tmp = Tempfile.new(%w"name2ctype .h"))
|
|
IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)}
|
|
end while syms.pop
|
|
fds.each(&:close)
|
|
ff = nil
|
|
IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age|
|
|
IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f|
|
|
ansi = false
|
|
f.each {|line|
|
|
if /ANSI-C code produced by gperf/ =~ line
|
|
ansi = true
|
|
end
|
|
line.sub!(/\/\*ANSI\*\//, '1') if ansi
|
|
line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/,
|
|
'uniname2ctype_offset(\1), ')
|
|
if ff = (!ff ? /^(uniname2ctype_hash) /=~line : /^\}/!~line) # no line can match both, exclusive flip-flop
|
|
line.sub!(/^( *(?:register\s+)?(.*\S)\s+hval\s*=\s*)(?=len;)/, '\1(\2)')
|
|
end
|
|
puts line
|
|
}
|
|
}
|
|
}
|
|
end
|