2008-09-21 11:21:30 -04:00
|
|
|
#!/usr/bin/env ruby
|
|
|
|
|
|
|
|
begin
|
2016-08-06 13:28:46 -04:00
|
|
|
$:.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
|
|
|
|
require "active_support"
|
2008-09-21 11:21:30 -04:00
|
|
|
rescue IOError
|
|
|
|
end
|
|
|
|
|
2016-08-06 13:28:46 -04:00
|
|
|
require "open-uri"
|
|
|
|
require "tmpdir"
|
2017-01-26 04:15:37 -05:00
|
|
|
require "fileutils"
|
2008-09-21 11:21:30 -04:00
|
|
|
|
|
|
|
module ActiveSupport
|
|
|
|
module Multibyte
|
2010-05-10 09:46:37 -04:00
|
|
|
module Unicode
|
|
|
|
# Defines UnicodeDatabase#load as a no-op.
# NOTE(review): the generator below calls other methods on this class
# (codepoints, boundary, cp1252, filename, ...) that are not defined here, so
# this presumably reopens a class loaded via `require "active_support"` and
# disables its table loading while the tables are being generated -- confirm.
class UnicodeDatabase
  def load
  end
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
class DatabaseGenerator
|
|
|
|
# Root of the Unicode Character Database files for the Unicode version named
# by UNICODE_VERSION (a constant defined outside this file).
BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/".freeze

# Maps each table type to the URL of the data file it is built from.
# The key also selects the parser: #parse feeds every line of the file to the
# method named "parse_<key>".
SOURCES = {
  codepoints: BASE_URI + "UnicodeData.txt",
  composition_exclusion: BASE_URI + "CompositionExclusions.txt",
  grapheme_break_property: BASE_URI + "auxiliary/GraphemeBreakProperty.txt",
  cp1252: "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT"
}.freeze
|
|
|
|
|
|
|
|
# Creates the generator with a fresh UnicodeDatabase instance; the parse_*
# methods fill its tables and #dump_to serializes them.
def initialize
  @ucd = Unicode::UnicodeDatabase.new
end
|
|
|
|
|
|
|
|
# Parses one record of the :codepoints source (UnicodeData.txt) and stores the
# resulting Codepoint in @ucd.codepoints, keyed by its integer code.
#
# line - a String holding one semicolon-separated record (15 fields).
#
# Only the code, canonical combining class, decomposition type/mapping and the
# simple upper/lowercase mappings are kept; the other fields are matched for
# validation but discarded.
#
# Returns the stored Codepoint.
# Raises RuntimeError ("Could not parse input.") when the line does not have
# the expected shape.
def parse_codepoints(line)
  record = /^
    ([0-9A-F]+);        # code
    ([^;]+);            # name
    ([A-Z]+);           # general category
    ([0-9]+);           # canonical combining class
    ([A-Z]+);           # bidi class
    (<([A-Z]*)>)?       # decomposition type
    ((\ ?[0-9A-F]+)*);  # decomposition mapping
    ([0-9]*);           # decimal digit
    ([0-9]*);           # digit
    ([^;]*);            # numeric
    ([YN]*);            # bidi mirrored
    ([^;]*);            # unicode 1.0 name
    ([^;]*);            # iso comment
    ([0-9A-F]*);        # simple uppercase mapping
    ([0-9A-F]*);        # simple lowercase mapping
    ([0-9A-F]*)$/ix     # simple titlecase mapping

  fields = record.match(line)
  raise "Could not parse input." unless fields

  decomposition = fields[8]
  uppercase = fields[16]
  lowercase = fields[17]

  codepoint = Codepoint.new
  codepoint.code = fields[1].hex
  # Force base 10: a bare Integer() would treat a zero-padded value as octal.
  codepoint.combining_class = Integer(fields[4], 10)
  # Group 7 is the text between "<" and ">"; nil when no type tag is present.
  codepoint.decomp_type = fields[7]
  codepoint.decomp_mapping = decomposition == "" ? nil : decomposition.split.collect(&:hex)
  codepoint.uppercase_mapping = uppercase == "" ? 0 : uppercase.hex
  codepoint.lowercase_mapping = lowercase == "" ? 0 : lowercase.hex

  @ucd.codepoints[codepoint.code] = codepoint
end
|
|
|
|
|
|
|
|
# Parses one line of the :grapheme_break_property source and appends its
# codepoint(s) to @ucd.boundary under the lowercased property name.
#
# line - a String such as "0600..0605 ; Prepend # ..." or "000D ; CR # ...".
#
# A "first..last" entry is stored as an inclusive Range of integers, a single
# code as an Integer. Lines that do not match (comments, blanks) are ignored.
#
# Returns the updated Array for that property, or nil when the line is skipped.
def parse_grapheme_break_property(line)
  entry = /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/.match(line)
  return unless entry

  codes = entry[1]
  type = entry[2].downcase.intern
  @ucd.boundary[type] ||= []

  if codes.include?("..")
    bounds = codes.split("..")
    @ucd.boundary[type] << (bounds[0].hex..bounds[1].hex)
  else
    @ucd.boundary[type] << codes.hex
  end
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Parses one line of the :composition_exclusion source. When the line begins
# with a run of hex digits, that codepoint is appended to
# @ucd.composition_exclusion; any other line (e.g. one starting with "#") is
# ignored.
#
# line - a String holding one line of the file.
#
# Returns the exclusion list after the append, or nil when the line is skipped.
def parse_composition_exclusion(line)
  entry = /^([0-9A-F]+)/i.match(line)
  @ucd.composition_exclusion << entry[1].hex if entry
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Parses one line of the :cp1252 source. A line that starts with two hex
# values separated by a single whitespace character (e.g. "0x80\t0x20AC")
# adds an entry to @ucd.cp1252 mapping the first value to the second. Lines
# without a second value, and comment lines, are ignored.
#
# line - a String holding one line of the file.
#
# Returns the mapped value, or nil when the line is skipped.
def parse_cp1252(line)
  pair = /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i.match(line)
  @ucd.cp1252[pair[1].hex] = pair[2].hex if pair
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Fills @ucd.composition_map from the parsed codepoints.
#
# A codepoint is recorded as composition_map[first][second] = code when all of
# the following hold (checked in this order, stopping at the first failure):
#   * the entry is not nil and its combining class is 0,
#   * it has no decomposition type tag and a decomposition mapping of exactly
#     two codepoints,
#   * the first codepoint of that mapping has combining class 0,
#   * its code is not listed in @ucd.composition_exclusion.
#
# Returns whatever @ucd.codepoints.each returns.
def create_composition_map
  @ucd.codepoints.each do |_, cp|
    next if cp.nil?
    next unless cp.combining_class == 0
    next unless cp.decomp_type.nil?

    mapping = cp.decomp_mapping
    next if mapping.nil?
    next unless mapping.length == 2

    first, second = mapping
    next unless @ucd.codepoints[first].combining_class == 0
    next if @ucd.composition_exclusion.include?(cp.code)

    @ucd.composition_map[first] ||= {}
    @ucd.composition_map[first][second] = cp.code
  end
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Replaces the :lf and :cr entries of @ucd.boundary with their first element,
# so those two keys hold a single value instead of a one-element list. All
# other keys are left untouched.
#
# Returns whatever @ucd.boundary.each returns.
def normalize_boundary_map
  @ucd.boundary.each do |type, members|
    next unless [:lf, :cr].include?(type)

    # Re-assigning an existing key is safe while iterating.
    @ucd.boundary[type] = members[0]
  end
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Downloads (when not already cached) and parses every file listed in SOURCES,
# then derives the composition map and normalizes the boundary map.
#
# Each file is cached as <tmpdir>/<UNICODE_VERSION>/<basename of the URL> and
# every line of it is passed to the method named "parse_<type>".
#
# The download is written to "<cache file>.part" and renamed only after it
# completes, so an interrupted or failed download is never mistaken for a
# valid cached file on a later run.
def parse
  SOURCES.each do |type, url|
    basename = url.split("/").last
    filename = File.join(Dir.tmpdir, UNICODE_VERSION, basename)

    unless File.exist?(filename)
      $stderr.puts "Downloading #{basename}"
      FileUtils.mkdir_p(File.dirname(filename))

      partial = "#{filename}.part"
      # URI.open (from open-uri) is required here: plain Kernel#open no longer
      # fetches URLs on Ruby 3.0 and later.
      URI.open(url) do |source|
        File.open(partial, "wb") do |target|
          source.each_line { |line| target.write line }
        end
      end
      File.rename(partial, filename)
    end

    handler = "parse_#{type}".intern
    File.open(filename) do |file|
      file.each_line { |line| send handler, line }
    end
  end

  create_composition_map
  normalize_boundary_map
end
|
|
|
|
|
2010-05-10 09:46:37 -04:00
|
|
|
# Serializes the generated tables to +filename+ in binary mode.
#
# The file contains a single Marshal dump of an Array holding, in this order:
# codepoints, composition_exclusion, composition_map, boundary, cp1252.
#
# filename - path of the file to (over)write.
#
# Returns the number of bytes written.
def dump_to(filename)
  tables = [
    @ucd.codepoints,
    @ucd.composition_exclusion,
    @ucd.composition_map,
    @ucd.boundary,
    @ucd.cp1252
  ]

  File.open(filename, "wb") { |f| f.write Marshal.dump(tables) }
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
# Script entry point: runs only when this file is executed directly.
# Builds all tables, writes them to the path reported by
# UnicodeDatabase.filename and prints that path followed by the file size.
if __FILE__ == $PROGRAM_NAME
  filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
  generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
  generator.parse

  # `print` (no newline) so the size lands on the same output line.
  print "Writing to: #{filename}"
  generator.dump_to filename
  puts " (#{File.size(filename)} bytes)"
end
|