1
0
Fork 0
mirror of https://github.com/rails/rails.git synced 2022-11-09 12:12:34 -05:00
rails--rails/activesupport/bin/generate_tables
Fumiaki MATSUSHIMA bdcfdef214 Update Unicode Version to 9.0.0
9.0.0 was released on June 21, 2016

http://blog.unicode.org/2016/06/announcing-unicode-standard-version-90.html

http://www.unicode.org/versions/Unicode9.0.0/

There are some changes about grapheme cluster in Unicode 9.0.0:

http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules

------------

I noticed that `unpack_graphemes` returns [Other] when the argument is Other ÷ Prepend
(it must be [Other, Prepend]).
But in [Unicode 8.0.0's Prepend has no characters](http://www.unicode.org/reports/tr29/tr29-27.html#Prepend)
so we don't have to backport following patch:

```diff
should_break =
+ if pos == eoc
+   true
```
2017-01-28 16:57:36 +09:00

140 lines
4.8 KiB
Ruby
Executable file

#!/usr/bin/env ruby
begin
$:.unshift(File.expand_path(File.dirname(__FILE__) + "/../lib"))
require "active_support"
rescue IOError
end
require "open-uri"
require "tmpdir"
require "fileutils"
module ActiveSupport
module Multibyte
module Unicode
class UnicodeDatabase
def load; end
end
class DatabaseGenerator
BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
SOURCES = {
codepoints: BASE_URI + "UnicodeData.txt",
composition_exclusion: BASE_URI + "CompositionExclusions.txt",
grapheme_break_property: BASE_URI + "auxiliary/GraphemeBreakProperty.txt",
cp1252: "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT"
}
def initialize
@ucd = Unicode::UnicodeDatabase.new
end
def parse_codepoints(line)
codepoint = Codepoint.new
raise "Could not parse input." unless line =~ /^
([0-9A-F]+); # code
([^;]+); # name
([A-Z]+); # general category
([0-9]+); # canonical combining class
([A-Z]+); # bidi class
(<([A-Z]*)>)? # decomposition type
((\ ?[0-9A-F]+)*); # decomposition mapping
([0-9]*); # decimal digit
([0-9]*); # digit
([^;]*); # numeric
([YN]*); # bidi mirrored
([^;]*); # unicode 1.0 name
([^;]*); # iso comment
([0-9A-F]*); # simple uppercase mapping
([0-9A-F]*); # simple lowercase mapping
([0-9A-F]*)$/ix # simple titlecase mapping
codepoint.code = $1.hex
codepoint.combining_class = Integer($4)
codepoint.decomp_type = $7
codepoint.decomp_mapping = ($8 == "") ? nil : $8.split.collect(&:hex)
codepoint.uppercase_mapping = ($16 == "") ? 0 : $16.hex
codepoint.lowercase_mapping = ($17 == "") ? 0 : $17.hex
@ucd.codepoints[codepoint.code] = codepoint
end
def parse_grapheme_break_property(line)
if line =~ /^([0-9A-F.]+)\s*;\s*([\w]+)\s*#/
type = $2.downcase.intern
@ucd.boundary[type] ||= []
if $1.include? ".."
parts = $1.split ".."
@ucd.boundary[type] << (parts[0].hex..parts[1].hex)
else
@ucd.boundary[type] << $1.hex
end
end
end
def parse_composition_exclusion(line)
if line =~ /^([0-9A-F]+)/i
@ucd.composition_exclusion << $1.hex
end
end
def parse_cp1252(line)
if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
@ucd.cp1252[$1.hex] = $2.hex
end
end
def create_composition_map
@ucd.codepoints.each do |_, cp|
if !cp.nil? && cp.combining_class == 0 && cp.decomp_type.nil? && !cp.decomp_mapping.nil? && cp.decomp_mapping.length == 2 && @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 && !@ucd.composition_exclusion.include?(cp.code)
@ucd.composition_map[cp.decomp_mapping[0]] ||= {}
@ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
end
end
end
def normalize_boundary_map
@ucd.boundary.each do |k, v|
if [:lf, :cr].include? k
@ucd.boundary[k] = v[0]
end
end
end
def parse
SOURCES.each do |type, url|
filename = File.join(Dir.tmpdir, UNICODE_VERSION, "#{url.split('/').last}")
unless File.exist?(filename)
$stderr.puts "Downloading #{url.split('/').last}"
FileUtils.mkdir_p(File.dirname(filename))
File.open(filename, "wb") do |target|
open(url) do |source|
source.each_line { |line| target.write line }
end
end
end
File.open(filename) do |file|
file.each_line { |line| send "parse_#{type}".intern, line }
end
end
create_composition_map
normalize_boundary_map
end
def dump_to(filename)
File.open(filename, "wb") do |f|
f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
end
end
end
end
end
end
if __FILE__ == $0
filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
generator.parse
print "Writing to: #{filename}"
generator.dump_to filename
puts " (#{File.size(filename)} bytes)"
end