1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

update to Unicode 11.0.0 (main step, not complete yet)

- common.mk: Change Unicode version to 11.0.0, and Emoji version to 11.0
- test/ruby/enc/test_emoji_breaks.rb: update hard-coded Emoji version
- enc/unicode/11.0.0, enc/unicode/11.0.0/casefold.h, enc/unicode/name2ctype.h:
  Add generated files. Files for Unicode 10.0.0 will be removed once we are
  sure 11.0.0 works.
- lib/unicode_normalize/tables.rb: Updated table.
- regparse.c: Almost completely reimplement grapheme cluster detection in
  function node_extended_grapheme_cluster().


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66213 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
duerst 2018-12-05 08:10:24 +00:00
parent 78be4478d1
commit 66a6073859
6 changed files with 47704 additions and 407 deletions

View file

@ -15,8 +15,8 @@ mflags = $(MFLAGS)
gnumake_recursive =
enable_shared = $(ENABLE_SHARED:no=)
UNICODE_VERSION = 10.0.0
UNICODE_EMOJI_VERSION = 5.0
UNICODE_VERSION = 11.0.0
UNICODE_EMOJI_VERSION = 11.0
### set the following environment variable or uncomment the line if
### the Unicode data files should be updated completely on every update ('make up',...).

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -7,28 +7,29 @@ module UnicodeNormalize # :nodoc:
accents = "" \
"[\u0300-\u034E\u0350-\u036F\u0483-\u0487\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7" \
"\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711" \
"\u0730-\u074A\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D4-\u08E1" \
"\u08E3-\u08FF\u093C\u094D\u0951-\u0954\u09BC\u09BE\u09CD\u09D7" \
"\u0A3C\u0A4D\u0ABC\u0ACD\u0B3C\u0B3E\u0B4D\u0B56\u0B57" \
"\u0BBE\u0BCD\u0BD7\u0C4D\u0C55\u0C56\u0CBC\u0CC2\u0CCD" \
"\u0CD5\u0CD6\u0D3B\u0D3C\u0D3E\u0D4D\u0D57\u0DCA\u0DCF\u0DDF" \
"\u0E38-\u0E3A\u0E48-\u0E4B\u0EB8\u0EB9\u0EC8-\u0ECB\u0F18\u0F19\u0F35\u0F37\u0F39" \
"\u0F71\u0F72\u0F74\u0F7A-\u0F7D\u0F80\u0F82-\u0F84\u0F86\u0F87\u0FC6\u102E" \
"\u1037\u1039\u103A\u108D\u135D-\u135F\u1714\u1734\u17D2\u17DD" \
"\u18A9\u1939-\u193B\u1A17\u1A18\u1A60\u1A75-\u1A7C\u1A7F\u1AB0-\u1ABD\u1B34\u1B35" \
"\u1B44\u1B6B-\u1B73\u1BAA\u1BAB\u1BE6\u1BF2\u1BF3\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE0" \
"\u1CE2-\u1CE8\u1CED\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF9\u1DFB-\u1DFF\u20D0-\u20DC\u20E1" \
"\u20E5-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F\uA674-\uA67D" \
"\uA69E\uA69F\uA6F0\uA6F1\uA806\uA8C4\uA8E0-\uA8F1\uA92B-\uA92D\uA953\uA9B3" \
"\uA9C0\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAF6\uABED" \
"\uFB1E\uFE20-\uFE2F\u{101FD}\u{102E0}\u{10376}-\u{1037A}\u{10A0D}\u{10A0F}\u{10A38}-\u{10A3A}" \
"\u{10A3F}\u{10AE5}\u{10AE6}\u{11046}\u{1107F}\u{110B9}\u{110BA}\u{11100}-\u{11102}\u{11127}\u{11133}\u{11134}" \
"\u{11173}\u{111C0}\u{111CA}\u{11235}\u{11236}\u{112E9}\u{112EA}\u{1133C}\u{1133E}\u{1134D}" \
"\u{11357}\u{11366}-\u{1136C}\u{11370}-\u{11374}\u{11442}\u{11446}\u{114B0}\u{114BA}\u{114BD}" \
"\u{114C2}\u{114C3}\u{115AF}\u{115BF}\u{115C0}\u{1163F}\u{116B6}\u{116B7}\u{1172B}\u{11A34}\u{11A47}" \
"\u{11A99}\u{11C3F}\u{11D42}\u{11D44}\u{11D45}\u{16AF0}-\u{16AF4}\u{16B30}-\u{16B36}\u{1BC9E}\u{1D165}-\u{1D169}" \
"\u{1D16D}-\u{1D172}\u{1D17B}-\u{1D182}\u{1D185}-\u{1D18B}\u{1D1AA}-\u{1D1AD}\u{1D242}-\u{1D244}\u{1E000}-\u{1E006}\u{1E008}-\u{1E018}\u{1E01B}-\u{1E021}" \
"\u{1E023}\u{1E024}\u{1E026}-\u{1E02A}\u{1E8D0}-\u{1E8D6}\u{1E944}-\u{1E94A}" \
"\u0730-\u074A\u07EB-\u07F3\u07FD\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B" \
"\u08D3-\u08E1\u08E3-\u08FF\u093C\u094D\u0951-\u0954\u09BC\u09BE\u09CD" \
"\u09D7\u09FE\u0A3C\u0A4D\u0ABC\u0ACD\u0B3C\u0B3E" \
"\u0B4D\u0B56\u0B57\u0BBE\u0BCD\u0BD7\u0C4D\u0C55\u0C56\u0CBC" \
"\u0CC2\u0CCD\u0CD5\u0CD6\u0D3B\u0D3C\u0D3E\u0D4D\u0D57\u0DCA" \
"\u0DCF\u0DDF\u0E38-\u0E3A\u0E48-\u0E4B\u0EB8\u0EB9\u0EC8-\u0ECB\u0F18\u0F19\u0F35" \
"\u0F37\u0F39\u0F71\u0F72\u0F74\u0F7A-\u0F7D\u0F80\u0F82-\u0F84\u0F86\u0F87" \
"\u0FC6\u102E\u1037\u1039\u103A\u108D\u135D-\u135F\u1714\u1734" \
"\u17D2\u17DD\u18A9\u1939-\u193B\u1A17\u1A18\u1A60\u1A75-\u1A7C\u1A7F" \
"\u1AB0-\u1ABD\u1B34\u1B35\u1B44\u1B6B-\u1B73\u1BAA\u1BAB\u1BE6\u1BF2\u1BF3\u1C37" \
"\u1CD0-\u1CD2\u1CD4-\u1CE0\u1CE2-\u1CE8\u1CED\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF9\u1DFB-\u1DFF" \
"\u20D0-\u20DC\u20E1\u20E5-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A" \
"\uA66F\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA806\uA8C4\uA8E0-\uA8F1\uA92B-\uA92D" \
"\uA953\uA9B3\uA9C0\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1" \
"\uAAF6\uABED\uFB1E\uFE20-\uFE2F\u{101FD}\u{102E0}\u{10376}-\u{1037A}\u{10A0D}" \
"\u{10A0F}\u{10A38}-\u{10A3A}\u{10A3F}\u{10AE5}\u{10AE6}\u{10D24}-\u{10D27}\u{10F46}-\u{10F50}\u{11046}\u{1107F}" \
"\u{110B9}\u{110BA}\u{11100}-\u{11102}\u{11127}\u{11133}\u{11134}\u{11173}\u{111C0}\u{111CA}\u{11235}\u{11236}" \
"\u{112E9}\u{112EA}\u{1133B}\u{1133C}\u{1133E}\u{1134D}\u{11357}\u{11366}-\u{1136C}\u{11370}-\u{11374}\u{11442}" \
"\u{11446}\u{1145E}\u{114B0}\u{114BA}\u{114BD}\u{114C2}\u{114C3}\u{115AF}\u{115BF}\u{115C0}" \
"\u{1163F}\u{116B6}\u{116B7}\u{1172B}\u{11839}\u{1183A}\u{11A34}\u{11A47}\u{11A99}\u{11C3F}" \
"\u{11D42}\u{11D44}\u{11D45}\u{11D97}\u{16AF0}-\u{16AF4}\u{16B30}-\u{16B36}\u{1BC9E}\u{1D165}-\u{1D169}\u{1D16D}-\u{1D172}" \
"\u{1D17B}-\u{1D182}\u{1D185}-\u{1D18B}\u{1D1AA}-\u{1D1AD}\u{1D242}-\u{1D244}\u{1E000}-\u{1E006}\u{1E008}-\u{1E018}\u{1E01B}-\u{1E021}\u{1E023}\u{1E024}" \
"\u{1E026}-\u{1E02A}\u{1E8D0}-\u{1E8D6}\u{1E944}-\u{1E94A}" \
"]"
ACCENTS = accents
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
@ -247,78 +248,81 @@ module UnicodeNormalize # :nodoc:
"\u073C"=>220, "\u073D"=>230, "\u073E"=>220, "\u073F"=>230, "\u0740"=>230, "\u0741"=>230, "\u0742"=>220, "\u0743"=>230,
"\u0744"=>220, "\u0745"=>230, "\u0746"=>220, "\u0747"=>230, "\u0748"=>220, "\u0749"=>230, "\u074A"=>230, "\u07EB"=>230,
"\u07EC"=>230, "\u07ED"=>230, "\u07EE"=>230, "\u07EF"=>230, "\u07F0"=>230, "\u07F1"=>230, "\u07F2"=>220, "\u07F3"=>230,
"\u0816"=>230, "\u0817"=>230, "\u0818"=>230, "\u0819"=>230, "\u081B"=>230, "\u081C"=>230, "\u081D"=>230, "\u081E"=>230,
"\u081F"=>230, "\u0820"=>230, "\u0821"=>230, "\u0822"=>230, "\u0823"=>230, "\u0825"=>230, "\u0826"=>230, "\u0827"=>230,
"\u0829"=>230, "\u082A"=>230, "\u082B"=>230, "\u082C"=>230, "\u082D"=>230, "\u0859"=>220, "\u085A"=>220, "\u085B"=>220,
"\u08D4"=>230, "\u08D5"=>230, "\u08D6"=>230, "\u08D7"=>230, "\u08D8"=>230, "\u08D9"=>230, "\u08DA"=>230, "\u08DB"=>230,
"\u08DC"=>230, "\u08DD"=>230, "\u08DE"=>230, "\u08DF"=>230, "\u08E0"=>230, "\u08E1"=>230, "\u08E3"=>220, "\u08E4"=>230,
"\u08E5"=>230, "\u08E6"=>220, "\u08E7"=>230, "\u08E8"=>230, "\u08E9"=>220, "\u08EA"=>230, "\u08EB"=>230, "\u08EC"=>230,
"\u08ED"=>220, "\u08EE"=>220, "\u08EF"=>220, "\u08F0"=>27, "\u08F1"=>28, "\u08F2"=>29, "\u08F3"=>230, "\u08F4"=>230,
"\u08F5"=>230, "\u08F6"=>220, "\u08F7"=>230, "\u08F8"=>230, "\u08F9"=>220, "\u08FA"=>220, "\u08FB"=>230, "\u08FC"=>230,
"\u08FD"=>230, "\u08FE"=>230, "\u08FF"=>230, "\u093C"=>7, "\u094D"=>9, "\u0951"=>230, "\u0952"=>220, "\u0953"=>230,
"\u0954"=>230, "\u09BC"=>7, "\u09CD"=>9, "\u0A3C"=>7, "\u0A4D"=>9, "\u0ABC"=>7, "\u0ACD"=>9, "\u0B3C"=>7,
"\u0B4D"=>9, "\u0BCD"=>9, "\u0C4D"=>9, "\u0C55"=>84, "\u0C56"=>91, "\u0CBC"=>7, "\u0CCD"=>9, "\u0D3B"=>9,
"\u0D3C"=>9, "\u0D4D"=>9, "\u0DCA"=>9, "\u0E38"=>103, "\u0E39"=>103, "\u0E3A"=>9, "\u0E48"=>107, "\u0E49"=>107,
"\u0E4A"=>107, "\u0E4B"=>107, "\u0EB8"=>118, "\u0EB9"=>118, "\u0EC8"=>122, "\u0EC9"=>122, "\u0ECA"=>122, "\u0ECB"=>122,
"\u0F18"=>220, "\u0F19"=>220, "\u0F35"=>220, "\u0F37"=>220, "\u0F39"=>216, "\u0F71"=>129, "\u0F72"=>130, "\u0F74"=>132,
"\u0F7A"=>130, "\u0F7B"=>130, "\u0F7C"=>130, "\u0F7D"=>130, "\u0F80"=>130, "\u0F82"=>230, "\u0F83"=>230, "\u0F84"=>9,
"\u0F86"=>230, "\u0F87"=>230, "\u0FC6"=>220, "\u1037"=>7, "\u1039"=>9, "\u103A"=>9, "\u108D"=>220, "\u135D"=>230,
"\u135E"=>230, "\u135F"=>230, "\u1714"=>9, "\u1734"=>9, "\u17D2"=>9, "\u17DD"=>230, "\u18A9"=>228, "\u1939"=>222,
"\u193A"=>230, "\u193B"=>220, "\u1A17"=>230, "\u1A18"=>220, "\u1A60"=>9, "\u1A75"=>230, "\u1A76"=>230, "\u1A77"=>230,
"\u1A78"=>230, "\u1A79"=>230, "\u1A7A"=>230, "\u1A7B"=>230, "\u1A7C"=>230, "\u1A7F"=>220, "\u1AB0"=>230, "\u1AB1"=>230,
"\u1AB2"=>230, "\u1AB3"=>230, "\u1AB4"=>230, "\u1AB5"=>220, "\u1AB6"=>220, "\u1AB7"=>220, "\u1AB8"=>220, "\u1AB9"=>220,
"\u1ABA"=>220, "\u1ABB"=>230, "\u1ABC"=>230, "\u1ABD"=>220, "\u1B34"=>7, "\u1B44"=>9, "\u1B6B"=>230, "\u1B6C"=>220,
"\u1B6D"=>230, "\u1B6E"=>230, "\u1B6F"=>230, "\u1B70"=>230, "\u1B71"=>230, "\u1B72"=>230, "\u1B73"=>230, "\u1BAA"=>9,
"\u1BAB"=>9, "\u1BE6"=>7, "\u1BF2"=>9, "\u1BF3"=>9, "\u1C37"=>7, "\u1CD0"=>230, "\u1CD1"=>230, "\u1CD2"=>230,
"\u1CD4"=>1, "\u1CD5"=>220, "\u1CD6"=>220, "\u1CD7"=>220, "\u1CD8"=>220, "\u1CD9"=>220, "\u1CDA"=>230, "\u1CDB"=>230,
"\u1CDC"=>220, "\u1CDD"=>220, "\u1CDE"=>220, "\u1CDF"=>220, "\u1CE0"=>230, "\u1CE2"=>1, "\u1CE3"=>1, "\u1CE4"=>1,
"\u1CE5"=>1, "\u1CE6"=>1, "\u1CE7"=>1, "\u1CE8"=>1, "\u1CED"=>220, "\u1CF4"=>230, "\u1CF8"=>230, "\u1CF9"=>230,
"\u1DC0"=>230, "\u1DC1"=>230, "\u1DC2"=>220, "\u1DC3"=>230, "\u1DC4"=>230, "\u1DC5"=>230, "\u1DC6"=>230, "\u1DC7"=>230,
"\u1DC8"=>230, "\u1DC9"=>230, "\u1DCA"=>220, "\u1DCB"=>230, "\u1DCC"=>230, "\u1DCD"=>234, "\u1DCE"=>214, "\u1DCF"=>220,
"\u1DD0"=>202, "\u1DD1"=>230, "\u1DD2"=>230, "\u1DD3"=>230, "\u1DD4"=>230, "\u1DD5"=>230, "\u1DD6"=>230, "\u1DD7"=>230,
"\u1DD8"=>230, "\u1DD9"=>230, "\u1DDA"=>230, "\u1DDB"=>230, "\u1DDC"=>230, "\u1DDD"=>230, "\u1DDE"=>230, "\u1DDF"=>230,
"\u1DE0"=>230, "\u1DE1"=>230, "\u1DE2"=>230, "\u1DE3"=>230, "\u1DE4"=>230, "\u1DE5"=>230, "\u1DE6"=>230, "\u1DE7"=>230,
"\u1DE8"=>230, "\u1DE9"=>230, "\u1DEA"=>230, "\u1DEB"=>230, "\u1DEC"=>230, "\u1DED"=>230, "\u1DEE"=>230, "\u1DEF"=>230,
"\u1DF0"=>230, "\u1DF1"=>230, "\u1DF2"=>230, "\u1DF3"=>230, "\u1DF4"=>230, "\u1DF5"=>230, "\u1DF6"=>232, "\u1DF7"=>228,
"\u1DF8"=>228, "\u1DF9"=>220, "\u1DFB"=>230, "\u1DFC"=>233, "\u1DFD"=>220, "\u1DFE"=>230, "\u1DFF"=>220, "\u20D0"=>230,
"\u20D1"=>230, "\u20D2"=>1, "\u20D3"=>1, "\u20D4"=>230, "\u20D5"=>230, "\u20D6"=>230, "\u20D7"=>230, "\u20D8"=>1,
"\u20D9"=>1, "\u20DA"=>1, "\u20DB"=>230, "\u20DC"=>230, "\u20E1"=>230, "\u20E5"=>1, "\u20E6"=>1, "\u20E7"=>230,
"\u20E8"=>220, "\u20E9"=>230, "\u20EA"=>1, "\u20EB"=>1, "\u20EC"=>220, "\u20ED"=>220, "\u20EE"=>220, "\u20EF"=>220,
"\u20F0"=>230, "\u2CEF"=>230, "\u2CF0"=>230, "\u2CF1"=>230, "\u2D7F"=>9, "\u2DE0"=>230, "\u2DE1"=>230, "\u2DE2"=>230,
"\u2DE3"=>230, "\u2DE4"=>230, "\u2DE5"=>230, "\u2DE6"=>230, "\u2DE7"=>230, "\u2DE8"=>230, "\u2DE9"=>230, "\u2DEA"=>230,
"\u2DEB"=>230, "\u2DEC"=>230, "\u2DED"=>230, "\u2DEE"=>230, "\u2DEF"=>230, "\u2DF0"=>230, "\u2DF1"=>230, "\u2DF2"=>230,
"\u2DF3"=>230, "\u2DF4"=>230, "\u2DF5"=>230, "\u2DF6"=>230, "\u2DF7"=>230, "\u2DF8"=>230, "\u2DF9"=>230, "\u2DFA"=>230,
"\u2DFB"=>230, "\u2DFC"=>230, "\u2DFD"=>230, "\u2DFE"=>230, "\u2DFF"=>230, "\u302A"=>218, "\u302B"=>228, "\u302C"=>232,
"\u302D"=>222, "\u302E"=>224, "\u302F"=>224, "\u3099"=>8, "\u309A"=>8, "\uA66F"=>230, "\uA674"=>230, "\uA675"=>230,
"\uA676"=>230, "\uA677"=>230, "\uA678"=>230, "\uA679"=>230, "\uA67A"=>230, "\uA67B"=>230, "\uA67C"=>230, "\uA67D"=>230,
"\uA69E"=>230, "\uA69F"=>230, "\uA6F0"=>230, "\uA6F1"=>230, "\uA806"=>9, "\uA8C4"=>9, "\uA8E0"=>230, "\uA8E1"=>230,
"\uA8E2"=>230, "\uA8E3"=>230, "\uA8E4"=>230, "\uA8E5"=>230, "\uA8E6"=>230, "\uA8E7"=>230, "\uA8E8"=>230, "\uA8E9"=>230,
"\uA8EA"=>230, "\uA8EB"=>230, "\uA8EC"=>230, "\uA8ED"=>230, "\uA8EE"=>230, "\uA8EF"=>230, "\uA8F0"=>230, "\uA8F1"=>230,
"\uA92B"=>220, "\uA92C"=>220, "\uA92D"=>220, "\uA953"=>9, "\uA9B3"=>7, "\uA9C0"=>9, "\uAAB0"=>230, "\uAAB2"=>230,
"\uAAB3"=>230, "\uAAB4"=>220, "\uAAB7"=>230, "\uAAB8"=>230, "\uAABE"=>230, "\uAABF"=>230, "\uAAC1"=>230, "\uAAF6"=>9,
"\uABED"=>9, "\uFB1E"=>26, "\uFE20"=>230, "\uFE21"=>230, "\uFE22"=>230, "\uFE23"=>230, "\uFE24"=>230, "\uFE25"=>230,
"\uFE26"=>230, "\uFE27"=>220, "\uFE28"=>220, "\uFE29"=>220, "\uFE2A"=>220, "\uFE2B"=>220, "\uFE2C"=>220, "\uFE2D"=>220,
"\uFE2E"=>230, "\uFE2F"=>230, "\u{101FD}"=>220, "\u{102E0}"=>220, "\u{10376}"=>230, "\u{10377}"=>230, "\u{10378}"=>230, "\u{10379}"=>230,
"\u{1037A}"=>230, "\u{10A0D}"=>220, "\u{10A0F}"=>230, "\u{10A38}"=>230, "\u{10A39}"=>1, "\u{10A3A}"=>220, "\u{10A3F}"=>9, "\u{10AE5}"=>230,
"\u{10AE6}"=>220, "\u{11046}"=>9, "\u{1107F}"=>9, "\u{110B9}"=>9, "\u{110BA}"=>7, "\u{11100}"=>230, "\u{11101}"=>230, "\u{11102}"=>230,
"\u{11133}"=>9, "\u{11134}"=>9, "\u{11173}"=>7, "\u{111C0}"=>9, "\u{111CA}"=>7, "\u{11235}"=>9, "\u{11236}"=>7, "\u{112E9}"=>7,
"\u{112EA}"=>9, "\u{1133C}"=>7, "\u{1134D}"=>9, "\u{11366}"=>230, "\u{11367}"=>230, "\u{11368}"=>230, "\u{11369}"=>230, "\u{1136A}"=>230,
"\u{1136B}"=>230, "\u{1136C}"=>230, "\u{11370}"=>230, "\u{11371}"=>230, "\u{11372}"=>230, "\u{11373}"=>230, "\u{11374}"=>230, "\u{11442}"=>9,
"\u{11446}"=>7, "\u{114C2}"=>9, "\u{114C3}"=>7, "\u{115BF}"=>9, "\u{115C0}"=>7, "\u{1163F}"=>9, "\u{116B6}"=>9, "\u{116B7}"=>7,
"\u{1172B}"=>9, "\u{11A34}"=>9, "\u{11A47}"=>9, "\u{11A99}"=>9, "\u{11C3F}"=>9, "\u{11D42}"=>7, "\u{11D44}"=>9, "\u{11D45}"=>9,
"\u{16AF0}"=>1, "\u{16AF1}"=>1, "\u{16AF2}"=>1, "\u{16AF3}"=>1, "\u{16AF4}"=>1, "\u{16B30}"=>230, "\u{16B31}"=>230, "\u{16B32}"=>230,
"\u{16B33}"=>230, "\u{16B34}"=>230, "\u{16B35}"=>230, "\u{16B36}"=>230, "\u{1BC9E}"=>1, "\u{1D165}"=>216, "\u{1D166}"=>216, "\u{1D167}"=>1,
"\u{1D168}"=>1, "\u{1D169}"=>1, "\u{1D16D}"=>226, "\u{1D16E}"=>216, "\u{1D16F}"=>216, "\u{1D170}"=>216, "\u{1D171}"=>216, "\u{1D172}"=>216,
"\u{1D17B}"=>220, "\u{1D17C}"=>220, "\u{1D17D}"=>220, "\u{1D17E}"=>220, "\u{1D17F}"=>220, "\u{1D180}"=>220, "\u{1D181}"=>220, "\u{1D182}"=>220,
"\u{1D185}"=>230, "\u{1D186}"=>230, "\u{1D187}"=>230, "\u{1D188}"=>230, "\u{1D189}"=>230, "\u{1D18A}"=>220, "\u{1D18B}"=>220, "\u{1D1AA}"=>230,
"\u{1D1AB}"=>230, "\u{1D1AC}"=>230, "\u{1D1AD}"=>230, "\u{1D242}"=>230, "\u{1D243}"=>230, "\u{1D244}"=>230, "\u{1E000}"=>230, "\u{1E001}"=>230,
"\u{1E002}"=>230, "\u{1E003}"=>230, "\u{1E004}"=>230, "\u{1E005}"=>230, "\u{1E006}"=>230, "\u{1E008}"=>230, "\u{1E009}"=>230, "\u{1E00A}"=>230,
"\u{1E00B}"=>230, "\u{1E00C}"=>230, "\u{1E00D}"=>230, "\u{1E00E}"=>230, "\u{1E00F}"=>230, "\u{1E010}"=>230, "\u{1E011}"=>230, "\u{1E012}"=>230,
"\u{1E013}"=>230, "\u{1E014}"=>230, "\u{1E015}"=>230, "\u{1E016}"=>230, "\u{1E017}"=>230, "\u{1E018}"=>230, "\u{1E01B}"=>230, "\u{1E01C}"=>230,
"\u{1E01D}"=>230, "\u{1E01E}"=>230, "\u{1E01F}"=>230, "\u{1E020}"=>230, "\u{1E021}"=>230, "\u{1E023}"=>230, "\u{1E024}"=>230, "\u{1E026}"=>230,
"\u{1E027}"=>230, "\u{1E028}"=>230, "\u{1E029}"=>230, "\u{1E02A}"=>230, "\u{1E8D0}"=>220, "\u{1E8D1}"=>220, "\u{1E8D2}"=>220, "\u{1E8D3}"=>220,
"\u{1E8D4}"=>220, "\u{1E8D5}"=>220, "\u{1E8D6}"=>220, "\u{1E944}"=>230, "\u{1E945}"=>230, "\u{1E946}"=>230, "\u{1E947}"=>230, "\u{1E948}"=>230,
"\u{1E949}"=>230, "\u{1E94A}"=>7,
"\u07FD"=>220, "\u0816"=>230, "\u0817"=>230, "\u0818"=>230, "\u0819"=>230, "\u081B"=>230, "\u081C"=>230, "\u081D"=>230,
"\u081E"=>230, "\u081F"=>230, "\u0820"=>230, "\u0821"=>230, "\u0822"=>230, "\u0823"=>230, "\u0825"=>230, "\u0826"=>230,
"\u0827"=>230, "\u0829"=>230, "\u082A"=>230, "\u082B"=>230, "\u082C"=>230, "\u082D"=>230, "\u0859"=>220, "\u085A"=>220,
"\u085B"=>220, "\u08D3"=>220, "\u08D4"=>230, "\u08D5"=>230, "\u08D6"=>230, "\u08D7"=>230, "\u08D8"=>230, "\u08D9"=>230,
"\u08DA"=>230, "\u08DB"=>230, "\u08DC"=>230, "\u08DD"=>230, "\u08DE"=>230, "\u08DF"=>230, "\u08E0"=>230, "\u08E1"=>230,
"\u08E3"=>220, "\u08E4"=>230, "\u08E5"=>230, "\u08E6"=>220, "\u08E7"=>230, "\u08E8"=>230, "\u08E9"=>220, "\u08EA"=>230,
"\u08EB"=>230, "\u08EC"=>230, "\u08ED"=>220, "\u08EE"=>220, "\u08EF"=>220, "\u08F0"=>27, "\u08F1"=>28, "\u08F2"=>29,
"\u08F3"=>230, "\u08F4"=>230, "\u08F5"=>230, "\u08F6"=>220, "\u08F7"=>230, "\u08F8"=>230, "\u08F9"=>220, "\u08FA"=>220,
"\u08FB"=>230, "\u08FC"=>230, "\u08FD"=>230, "\u08FE"=>230, "\u08FF"=>230, "\u093C"=>7, "\u094D"=>9, "\u0951"=>230,
"\u0952"=>220, "\u0953"=>230, "\u0954"=>230, "\u09BC"=>7, "\u09CD"=>9, "\u09FE"=>230, "\u0A3C"=>7, "\u0A4D"=>9,
"\u0ABC"=>7, "\u0ACD"=>9, "\u0B3C"=>7, "\u0B4D"=>9, "\u0BCD"=>9, "\u0C4D"=>9, "\u0C55"=>84, "\u0C56"=>91,
"\u0CBC"=>7, "\u0CCD"=>9, "\u0D3B"=>9, "\u0D3C"=>9, "\u0D4D"=>9, "\u0DCA"=>9, "\u0E38"=>103, "\u0E39"=>103,
"\u0E3A"=>9, "\u0E48"=>107, "\u0E49"=>107, "\u0E4A"=>107, "\u0E4B"=>107, "\u0EB8"=>118, "\u0EB9"=>118, "\u0EC8"=>122,
"\u0EC9"=>122, "\u0ECA"=>122, "\u0ECB"=>122, "\u0F18"=>220, "\u0F19"=>220, "\u0F35"=>220, "\u0F37"=>220, "\u0F39"=>216,
"\u0F71"=>129, "\u0F72"=>130, "\u0F74"=>132, "\u0F7A"=>130, "\u0F7B"=>130, "\u0F7C"=>130, "\u0F7D"=>130, "\u0F80"=>130,
"\u0F82"=>230, "\u0F83"=>230, "\u0F84"=>9, "\u0F86"=>230, "\u0F87"=>230, "\u0FC6"=>220, "\u1037"=>7, "\u1039"=>9,
"\u103A"=>9, "\u108D"=>220, "\u135D"=>230, "\u135E"=>230, "\u135F"=>230, "\u1714"=>9, "\u1734"=>9, "\u17D2"=>9,
"\u17DD"=>230, "\u18A9"=>228, "\u1939"=>222, "\u193A"=>230, "\u193B"=>220, "\u1A17"=>230, "\u1A18"=>220, "\u1A60"=>9,
"\u1A75"=>230, "\u1A76"=>230, "\u1A77"=>230, "\u1A78"=>230, "\u1A79"=>230, "\u1A7A"=>230, "\u1A7B"=>230, "\u1A7C"=>230,
"\u1A7F"=>220, "\u1AB0"=>230, "\u1AB1"=>230, "\u1AB2"=>230, "\u1AB3"=>230, "\u1AB4"=>230, "\u1AB5"=>220, "\u1AB6"=>220,
"\u1AB7"=>220, "\u1AB8"=>220, "\u1AB9"=>220, "\u1ABA"=>220, "\u1ABB"=>230, "\u1ABC"=>230, "\u1ABD"=>220, "\u1B34"=>7,
"\u1B44"=>9, "\u1B6B"=>230, "\u1B6C"=>220, "\u1B6D"=>230, "\u1B6E"=>230, "\u1B6F"=>230, "\u1B70"=>230, "\u1B71"=>230,
"\u1B72"=>230, "\u1B73"=>230, "\u1BAA"=>9, "\u1BAB"=>9, "\u1BE6"=>7, "\u1BF2"=>9, "\u1BF3"=>9, "\u1C37"=>7,
"\u1CD0"=>230, "\u1CD1"=>230, "\u1CD2"=>230, "\u1CD4"=>1, "\u1CD5"=>220, "\u1CD6"=>220, "\u1CD7"=>220, "\u1CD8"=>220,
"\u1CD9"=>220, "\u1CDA"=>230, "\u1CDB"=>230, "\u1CDC"=>220, "\u1CDD"=>220, "\u1CDE"=>220, "\u1CDF"=>220, "\u1CE0"=>230,
"\u1CE2"=>1, "\u1CE3"=>1, "\u1CE4"=>1, "\u1CE5"=>1, "\u1CE6"=>1, "\u1CE7"=>1, "\u1CE8"=>1, "\u1CED"=>220,
"\u1CF4"=>230, "\u1CF8"=>230, "\u1CF9"=>230, "\u1DC0"=>230, "\u1DC1"=>230, "\u1DC2"=>220, "\u1DC3"=>230, "\u1DC4"=>230,
"\u1DC5"=>230, "\u1DC6"=>230, "\u1DC7"=>230, "\u1DC8"=>230, "\u1DC9"=>230, "\u1DCA"=>220, "\u1DCB"=>230, "\u1DCC"=>230,
"\u1DCD"=>234, "\u1DCE"=>214, "\u1DCF"=>220, "\u1DD0"=>202, "\u1DD1"=>230, "\u1DD2"=>230, "\u1DD3"=>230, "\u1DD4"=>230,
"\u1DD5"=>230, "\u1DD6"=>230, "\u1DD7"=>230, "\u1DD8"=>230, "\u1DD9"=>230, "\u1DDA"=>230, "\u1DDB"=>230, "\u1DDC"=>230,
"\u1DDD"=>230, "\u1DDE"=>230, "\u1DDF"=>230, "\u1DE0"=>230, "\u1DE1"=>230, "\u1DE2"=>230, "\u1DE3"=>230, "\u1DE4"=>230,
"\u1DE5"=>230, "\u1DE6"=>230, "\u1DE7"=>230, "\u1DE8"=>230, "\u1DE9"=>230, "\u1DEA"=>230, "\u1DEB"=>230, "\u1DEC"=>230,
"\u1DED"=>230, "\u1DEE"=>230, "\u1DEF"=>230, "\u1DF0"=>230, "\u1DF1"=>230, "\u1DF2"=>230, "\u1DF3"=>230, "\u1DF4"=>230,
"\u1DF5"=>230, "\u1DF6"=>232, "\u1DF7"=>228, "\u1DF8"=>228, "\u1DF9"=>220, "\u1DFB"=>230, "\u1DFC"=>233, "\u1DFD"=>220,
"\u1DFE"=>230, "\u1DFF"=>220, "\u20D0"=>230, "\u20D1"=>230, "\u20D2"=>1, "\u20D3"=>1, "\u20D4"=>230, "\u20D5"=>230,
"\u20D6"=>230, "\u20D7"=>230, "\u20D8"=>1, "\u20D9"=>1, "\u20DA"=>1, "\u20DB"=>230, "\u20DC"=>230, "\u20E1"=>230,
"\u20E5"=>1, "\u20E6"=>1, "\u20E7"=>230, "\u20E8"=>220, "\u20E9"=>230, "\u20EA"=>1, "\u20EB"=>1, "\u20EC"=>220,
"\u20ED"=>220, "\u20EE"=>220, "\u20EF"=>220, "\u20F0"=>230, "\u2CEF"=>230, "\u2CF0"=>230, "\u2CF1"=>230, "\u2D7F"=>9,
"\u2DE0"=>230, "\u2DE1"=>230, "\u2DE2"=>230, "\u2DE3"=>230, "\u2DE4"=>230, "\u2DE5"=>230, "\u2DE6"=>230, "\u2DE7"=>230,
"\u2DE8"=>230, "\u2DE9"=>230, "\u2DEA"=>230, "\u2DEB"=>230, "\u2DEC"=>230, "\u2DED"=>230, "\u2DEE"=>230, "\u2DEF"=>230,
"\u2DF0"=>230, "\u2DF1"=>230, "\u2DF2"=>230, "\u2DF3"=>230, "\u2DF4"=>230, "\u2DF5"=>230, "\u2DF6"=>230, "\u2DF7"=>230,
"\u2DF8"=>230, "\u2DF9"=>230, "\u2DFA"=>230, "\u2DFB"=>230, "\u2DFC"=>230, "\u2DFD"=>230, "\u2DFE"=>230, "\u2DFF"=>230,
"\u302A"=>218, "\u302B"=>228, "\u302C"=>232, "\u302D"=>222, "\u302E"=>224, "\u302F"=>224, "\u3099"=>8, "\u309A"=>8,
"\uA66F"=>230, "\uA674"=>230, "\uA675"=>230, "\uA676"=>230, "\uA677"=>230, "\uA678"=>230, "\uA679"=>230, "\uA67A"=>230,
"\uA67B"=>230, "\uA67C"=>230, "\uA67D"=>230, "\uA69E"=>230, "\uA69F"=>230, "\uA6F0"=>230, "\uA6F1"=>230, "\uA806"=>9,
"\uA8C4"=>9, "\uA8E0"=>230, "\uA8E1"=>230, "\uA8E2"=>230, "\uA8E3"=>230, "\uA8E4"=>230, "\uA8E5"=>230, "\uA8E6"=>230,
"\uA8E7"=>230, "\uA8E8"=>230, "\uA8E9"=>230, "\uA8EA"=>230, "\uA8EB"=>230, "\uA8EC"=>230, "\uA8ED"=>230, "\uA8EE"=>230,
"\uA8EF"=>230, "\uA8F0"=>230, "\uA8F1"=>230, "\uA92B"=>220, "\uA92C"=>220, "\uA92D"=>220, "\uA953"=>9, "\uA9B3"=>7,
"\uA9C0"=>9, "\uAAB0"=>230, "\uAAB2"=>230, "\uAAB3"=>230, "\uAAB4"=>220, "\uAAB7"=>230, "\uAAB8"=>230, "\uAABE"=>230,
"\uAABF"=>230, "\uAAC1"=>230, "\uAAF6"=>9, "\uABED"=>9, "\uFB1E"=>26, "\uFE20"=>230, "\uFE21"=>230, "\uFE22"=>230,
"\uFE23"=>230, "\uFE24"=>230, "\uFE25"=>230, "\uFE26"=>230, "\uFE27"=>220, "\uFE28"=>220, "\uFE29"=>220, "\uFE2A"=>220,
"\uFE2B"=>220, "\uFE2C"=>220, "\uFE2D"=>220, "\uFE2E"=>230, "\uFE2F"=>230, "\u{101FD}"=>220, "\u{102E0}"=>220, "\u{10376}"=>230,
"\u{10377}"=>230, "\u{10378}"=>230, "\u{10379}"=>230, "\u{1037A}"=>230, "\u{10A0D}"=>220, "\u{10A0F}"=>230, "\u{10A38}"=>230, "\u{10A39}"=>1,
"\u{10A3A}"=>220, "\u{10A3F}"=>9, "\u{10AE5}"=>230, "\u{10AE6}"=>220, "\u{10D24}"=>230, "\u{10D25}"=>230, "\u{10D26}"=>230, "\u{10D27}"=>230,
"\u{10F46}"=>220, "\u{10F47}"=>220, "\u{10F48}"=>230, "\u{10F49}"=>230, "\u{10F4A}"=>230, "\u{10F4B}"=>220, "\u{10F4C}"=>230, "\u{10F4D}"=>220,
"\u{10F4E}"=>220, "\u{10F4F}"=>220, "\u{10F50}"=>220, "\u{11046}"=>9, "\u{1107F}"=>9, "\u{110B9}"=>9, "\u{110BA}"=>7, "\u{11100}"=>230,
"\u{11101}"=>230, "\u{11102}"=>230, "\u{11133}"=>9, "\u{11134}"=>9, "\u{11173}"=>7, "\u{111C0}"=>9, "\u{111CA}"=>7, "\u{11235}"=>9,
"\u{11236}"=>7, "\u{112E9}"=>7, "\u{112EA}"=>9, "\u{1133B}"=>7, "\u{1133C}"=>7, "\u{1134D}"=>9, "\u{11366}"=>230, "\u{11367}"=>230,
"\u{11368}"=>230, "\u{11369}"=>230, "\u{1136A}"=>230, "\u{1136B}"=>230, "\u{1136C}"=>230, "\u{11370}"=>230, "\u{11371}"=>230, "\u{11372}"=>230,
"\u{11373}"=>230, "\u{11374}"=>230, "\u{11442}"=>9, "\u{11446}"=>7, "\u{1145E}"=>230, "\u{114C2}"=>9, "\u{114C3}"=>7, "\u{115BF}"=>9,
"\u{115C0}"=>7, "\u{1163F}"=>9, "\u{116B6}"=>9, "\u{116B7}"=>7, "\u{1172B}"=>9, "\u{11839}"=>9, "\u{1183A}"=>7, "\u{11A34}"=>9,
"\u{11A47}"=>9, "\u{11A99}"=>9, "\u{11C3F}"=>9, "\u{11D42}"=>7, "\u{11D44}"=>9, "\u{11D45}"=>9, "\u{11D97}"=>9, "\u{16AF0}"=>1,
"\u{16AF1}"=>1, "\u{16AF2}"=>1, "\u{16AF3}"=>1, "\u{16AF4}"=>1, "\u{16B30}"=>230, "\u{16B31}"=>230, "\u{16B32}"=>230, "\u{16B33}"=>230,
"\u{16B34}"=>230, "\u{16B35}"=>230, "\u{16B36}"=>230, "\u{1BC9E}"=>1, "\u{1D165}"=>216, "\u{1D166}"=>216, "\u{1D167}"=>1, "\u{1D168}"=>1,
"\u{1D169}"=>1, "\u{1D16D}"=>226, "\u{1D16E}"=>216, "\u{1D16F}"=>216, "\u{1D170}"=>216, "\u{1D171}"=>216, "\u{1D172}"=>216, "\u{1D17B}"=>220,
"\u{1D17C}"=>220, "\u{1D17D}"=>220, "\u{1D17E}"=>220, "\u{1D17F}"=>220, "\u{1D180}"=>220, "\u{1D181}"=>220, "\u{1D182}"=>220, "\u{1D185}"=>230,
"\u{1D186}"=>230, "\u{1D187}"=>230, "\u{1D188}"=>230, "\u{1D189}"=>230, "\u{1D18A}"=>220, "\u{1D18B}"=>220, "\u{1D1AA}"=>230, "\u{1D1AB}"=>230,
"\u{1D1AC}"=>230, "\u{1D1AD}"=>230, "\u{1D242}"=>230, "\u{1D243}"=>230, "\u{1D244}"=>230, "\u{1E000}"=>230, "\u{1E001}"=>230, "\u{1E002}"=>230,
"\u{1E003}"=>230, "\u{1E004}"=>230, "\u{1E005}"=>230, "\u{1E006}"=>230, "\u{1E008}"=>230, "\u{1E009}"=>230, "\u{1E00A}"=>230, "\u{1E00B}"=>230,
"\u{1E00C}"=>230, "\u{1E00D}"=>230, "\u{1E00E}"=>230, "\u{1E00F}"=>230, "\u{1E010}"=>230, "\u{1E011}"=>230, "\u{1E012}"=>230, "\u{1E013}"=>230,
"\u{1E014}"=>230, "\u{1E015}"=>230, "\u{1E016}"=>230, "\u{1E017}"=>230, "\u{1E018}"=>230, "\u{1E01B}"=>230, "\u{1E01C}"=>230, "\u{1E01D}"=>230,
"\u{1E01E}"=>230, "\u{1E01F}"=>230, "\u{1E020}"=>230, "\u{1E021}"=>230, "\u{1E023}"=>230, "\u{1E024}"=>230, "\u{1E026}"=>230, "\u{1E027}"=>230,
"\u{1E028}"=>230, "\u{1E029}"=>230, "\u{1E02A}"=>230, "\u{1E8D0}"=>220, "\u{1E8D1}"=>220, "\u{1E8D2}"=>220, "\u{1E8D3}"=>220, "\u{1E8D4}"=>220,
"\u{1E8D5}"=>220, "\u{1E8D6}"=>220, "\u{1E944}"=>230, "\u{1E945}"=>230, "\u{1E946}"=>230, "\u{1E947}"=>230, "\u{1E948}"=>230, "\u{1E949}"=>230,
"\u{1E94A}"=>7,
}
class_table.default = 0
CLASS_TABLE = class_table.freeze

View file

@ -5831,6 +5831,7 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
Node* list2 = NULL;
Node* alt = NULL;
Node* alt2 = NULL;
Node* top_alt = NULL;
BBuf *pbuf1 = NULL;
int r = 0;
int num1;
@ -5845,9 +5846,9 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
if (ONIGENC_IS_UNICODE(env->enc)) {
/* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
CClassNode* cc;
OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80;
Node **seq = node_array; /* seq[5] */
Node **alts = node_array+5; /* alts[4] */
/* OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80; */
/* Node **seq = node_array; * seq[5] */
/* Node **alts = node_array+5; * alts[4] */
for (i=0; i<NODE_ARRAY_SIZE; i++)
node_array[i] = NULL_NODE;
@ -5857,320 +5858,183 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
* order the various expressions appear in the grammar)
* in the old-style parts. It is forwards in the new-style
* parts (in blocks ending with create_sequence_node()). */
/* Unicode 10.0.0 */
/* CRLF
* | Prepend*
* ( RI-sequence | Hangul-Syllable | !Control )
* ( Grapheme_Extend | SpacingMark )*
* | . */
/* Unicode 10.0.0 */
/* ( Grapheme_Extend | SpacingMark )* */
R_ERR(create_property_node(&np1, env, "Grapheme_Cluster_Break=Extend"));
cc = NCCLASS(np1);
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
R_ERR(quantify_node(&np1, 0, REPEAT_INFINITE));
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list = tmp;
np1 = NULL;
/* Unicode 10.0.0 */
/* ( RI-sequence | Hangul-Syllable | !Control ) */
/* !Control */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
if (! (ONIGENC_MBC_MINLEN(env->enc) > 1)) {
BITSET_CLEAR_BIT(cc->bs, 0x0a);
BITSET_CLEAR_BIT(cc->bs, 0x0d);
}
tmp = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* Unicode 10.0.0 */
/* Hangul-Syllable
* := L* V+ T*
* | L* LV V* T*
* | L* LVT T*
* | L+
* | T+ */
/* Unicode 11.0.0 */
/* Hangul-Syllable
* := L* (V+ | LV V* | LVT) T*
* | L+
* | T+ */
/* these are equivalent, so we leave things as is for the moment */
/* T+ */
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=T", '+'));
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* L+ */
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=L", '+'));
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* L* LVT T* */
/* Unicode 11.0.0
* CRLF (this is added last because it is common with non-Unicode encodings)
* | [Control CR LF]
* | precore* core postcore*
* | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */
{
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LVT"));
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
Node *alts[4];
R_ERR(create_sequence_node(&list2, seq));
}
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* L* LV V* T* */
{
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LV"));
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=V", '*'));
R_ERR(quantify_property_node(seq+3, env, "Grapheme_Cluster_Break=T", '*'));
R_ERR(create_sequence_node(&list2, seq));
}
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* L* V+ T* */
{
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=V", '+'));
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
R_ERR(create_sequence_node(&list2, seq));
}
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* end of Hangul-Syllable */
/* Unicode 10.0.0 */
/* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
* (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
/* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
{
/* Unicode 10.0.0 */
/* Emoji variation sequence
* http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
*/
/* Emoji U+FE0F */
{
seq[0] = node_new_cclass();
if (IS_NULL(seq[0])) goto err;
cc = NCCLASS(seq[0]);
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_Emoji));
r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf); /* VARIATION SELECTOR-16 */
if (r < 0) goto err;
seq[1] = node_new_str_raw(buf, buf + r);
if (IS_NULL(seq[1])) goto err;
R_ERR(quantify_node(seq+1, 0, 1));
R_ERR(create_sequence_node(alts+0, seq));
/* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */
alts[0] = node_new_cclass();
if (IS_NULL(alts[0])) goto err;
cc = NCCLASS(alts[0]);
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* LF */
R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* CR */
}
else {
BITSET_SET_BIT(cc->bs, 0x0a);
BITSET_SET_BIT(cc->bs, 0x0d);
}
/* Unicode 10.0.0 */
/* Glue_After_Zwj */
/* precore* core postcore* */
{
seq[0] = node_new_cclass();
if (IS_NULL(seq[0])) goto err;
cc = NCCLASS(seq[0]);
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_GAZ));
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
Node *seq[4];
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
/* precore*; precore := Prepend */
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
/* core := hangul-syllable
* | ri-sequence
* | xpicto-sequence
* | [^Control CR LF] */
{
Node *core_alts[7];
/* hangul-syllable :=
* L* (V+ | LV V* | LVT) T*
* | L+
* | T+ */
/* hangul-syllable is an alternative (would be called H_alt)
* inside an alternative, but we flatten it into core_alts */
/* L* (V+ | LV V* | LVT) T* */
{
Node *H_seq[4];
R_ERR(quantify_property_node(H_seq+0, env, "Grapheme_Cluster_Break=L", '*'));
/* V+ | LV V* | LVT */
{
Node *H_alt2[4];
R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
/* LV V* */
{
Node *H_seq2[3];
R_ERR(create_property_node(H_seq2+0, env, "Grapheme_Cluster_Break=LV"));
R_ERR(quantify_property_node(H_seq2+1, env, "Grapheme_Cluster_Break=V", '*'));
H_seq2[2] = NULL_NODE;
R_ERR(create_sequence_node(H_alt2+1, H_seq2));
}
R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
H_alt2[3] = NULL_NODE;
R_ERR(create_alternate_node(H_seq+1, H_alt2));
}
R_ERR(quantify_property_node(H_seq+2, env, "Grapheme_Cluster_Break=T", '*'));
H_seq[3] = NULL_NODE;
R_ERR(create_sequence_node(core_alts+0, H_seq));
}
/* end of L* (V+ | LV V* | LVT) T*, result is in core_alts[0] */
/* L+ */
R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
/* T+ */
R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
/* end of hangul-syllable */
/* ri-sequence := RI RI */
R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
/* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
{
Node *XP_seq[3];
R_ERR(create_property_node(XP_seq+0, env, "Extended_Pictographic"));
/* (Extend* ZWJ \p{Extended_Pictographic})* */
{
Node *Ex_seq[4];
R_ERR(quantify_property_node(Ex_seq+0, env, "Grapheme_Cluster_Break=Extend", '*'));
/* ZWJ (ZERO WIDTH JOINER) */
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
if (r < 0) goto err;
Ex_seq[1] = node_new_str_raw(buf, buf + r);
if (IS_NULL(Ex_seq[1])) goto err;
R_ERR(create_property_node(Ex_seq+2, env, "Extended_Pictographic"));
Ex_seq[3] = NULL_NODE;
R_ERR(create_sequence_node(XP_seq+1, Ex_seq));
}
R_ERR(quantify_node(XP_seq+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
/* end of (Extend* ZWJ \p{Extended_Pictographic})* */
XP_seq[2] = NULL_NODE;
R_ERR(create_sequence_node(core_alts+4, XP_seq));
}
/* end of xpicto-sequence, result is in core_alts[4] */
/* [^Control CR LF] */
core_alts[5] = node_new_cclass();
if (IS_NULL(core_alts[5])) goto err;
cc = NCCLASS(core_alts[5]);
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
BBuf *inverted_buf = NULL;
/* Start with a positive buffer and invert at the end,
* because otherwise adding single-character ranges works the wrong way. */
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* LF */
R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* CR */
R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
}
else {
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
BITSET_CLEAR_BIT(cc->bs, 0x0a);
BITSET_CLEAR_BIT(cc->bs, 0x0d);
}
/* end of [^Control CR LF], result in core_alts[5] */
core_alts[6] = NULL_NODE;
R_ERR(create_alternate_node(seq+1, core_alts));
}
/* end of core := hangul-syllable | ri-sequence | xpicto-sequence | [^Control CR LF],
* result is in seq[1] */
/* postcore*; postcore := [Extend ZWJ SpacingMark] */
R_ERR(create_property_node(seq+2, env, "Grapheme_Cluster_Break=Extend"));
cc = NCCLASS(seq[2]);
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
R_ERR(quantify_node(seq+2, 0, REPEAT_INFINITE));
seq[3] = NULL_NODE;
R_ERR(create_sequence_node(alts+1, seq));
}
/* end of (precore* core postcore*), result is in alts[1] */
/* E_Base_GAZ Extend* E_Modifier? */
{
R_ERR(create_property_node(seq+0, env, "Grapheme_Cluster_Break=E_Base_GAZ"));
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
/* Not in spec, but added to catch invalid stuff,
* because this is spec for String#grapheme_clusters. */
np1 = node_new_anychar();
if (IS_NULL(np1)) goto err;
R_ERR(create_sequence_node(alts+2, seq));
}
option = env->option;
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
tmp = node_new_option(option);
if (IS_NULL(tmp)) goto err;
NENCLOSE(tmp)->target = np1;
alts[2] = tmp;
R_ERR(create_alternate_node(&alt2, alts));
}
tmp = node_new_list(alt2, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
alt2 = NULL;
/* ZWJ */
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
if (r < 0) goto err;
np1 = node_new_str_raw(buf, buf + r);
if (IS_NULL(np1)) goto err;
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
R_ERR(quantify_node(&list2, 0, REPEAT_INFINITE));
np1 = list2;
list2 = NULL;
tmp = node_new_list(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* E_Modifier? */
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* Extend* */
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Extend", '*'));
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
/* (E_Base | EBG) */
np1 = node_new_cclass();
if (IS_NULL(np1)) goto err;
cc = NCCLASS(np1);
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_E_Base));
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base", 0, env));
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
tmp = node_new_list(np1, list2);
if (IS_NULL(tmp)) goto err;
list2 = tmp;
np1 = NULL;
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* Unicode 10.0.0 */
/* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
* has such examples.
* http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
*/
/* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
{
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
if (r < 0) goto err;
seq[0] = node_new_str_raw(buf, buf + r);
if (IS_NULL(seq[0])) goto err;
seq[1] = node_new_cclass();
if (IS_NULL(seq[1])) goto err;
cc = NCCLASS(seq[1]);
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
R_ERR(create_sequence_node(&list2, seq));
} /* End of ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
/* Unicode 10.0.0/11.0.0 */
/* this is Regional_Indicator+ in the Unicode 10.0.0 regular expression,
* but the segmentation rules and Unicode 11.0.0 use Regional_Indicator{2}, so no need to fix */
/* RI-Sequence := Regional_Indicator{2} */
R_ERR(quantify_property_node(&np1, env, "Regional_Indicator", '2'));
tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
tmp = node_new_list(alt, list);
if (IS_NULL(tmp)) goto err;
list = tmp;
alt = NULL;
/* Prepend* */
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Prepend", '*'));
tmp = node_new_list(np1, list);
if (IS_NULL(tmp)) goto err;
list = tmp;
np1 = NULL;
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
np1 = node_new_anychar();
if (IS_NULL(np1)) goto err;
option = env->option;
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
tmp = node_new_option(option);
if (IS_NULL(tmp)) goto err;
NENCLOSE(tmp)->target = np1;
np1 = tmp;
tmp = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(tmp)) goto err;
alt = tmp;
np1 = NULL;
/* Prepend+ ZWJ* */
{
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '+'));
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* does this belong to Prepend?? */
if (r < 0) goto err;
seq[1] = node_new_str_raw(buf, buf + r);
if (IS_NULL(seq[1])) goto err;
R_ERR(quantify_node(seq+1, 0, 1));
R_ERR(create_sequence_node(&list2, seq));
}
tmp = onig_node_new_alt(list2, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list2 = NULL;
tmp = onig_node_new_alt(list, alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
list = NULL;
alts[3] = NULL_NODE;
R_ERR(create_alternate_node(&top_alt, alts));
} /* end of (CRLF | Control | precore* core postcore*) (without CRLF!), result is in top_alt */
}
else
#endif /* USE_UNICODE_PROPERTIES */
@ -6186,11 +6050,12 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
NENCLOSE(tmp)->target = np1;
np1 = tmp;
alt = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(alt)) goto err;
top_alt = onig_node_new_alt(np1, NULL_NODE);
if (IS_NULL(top_alt)) goto err;
np1 = NULL;
}
/* add in CRLF to complete (CRLF | Control | precore* core postcore*) */
/* \x0D\x0A */
r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
if (r < 0) goto err;
@ -6200,15 +6065,15 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
np1 = node_new_str_raw(buf, buf + num1 + r);
if (IS_NULL(np1)) goto err;
tmp = onig_node_new_alt(np1, alt);
tmp = onig_node_new_alt(np1, top_alt);
if (IS_NULL(tmp)) goto err;
alt = tmp;
top_alt = tmp;
np1 = NULL;
/* (?>\x0D\x0A|...) */
tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(tmp)) goto err;
NENCLOSE(tmp)->target = alt;
NENCLOSE(tmp)->target = top_alt;
np1 = tmp;
#ifdef USE_UNICODE_PROPERTIES

View file

@ -30,7 +30,7 @@ end
class TestEmojiBreaks < Test::Unit::TestCase
EMOJI_DATA_FILES = %w[emoji-sequences emoji-test emoji-variation-sequences emoji-zwj-sequences]
EMOJI_VERSION = '5.0' # hard-coded, should be replaced by
EMOJI_VERSION = '11.0' # hard-coded, should be replaced by
# RbConfig::CONFIG['UNICODE_EMOJI_VERSION'] or so, see feature #15341
EMOJI_DATA_PATH = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__)
@ -100,7 +100,7 @@ TestEmojiBreaks.data_files_available? and class TestEmojiBreaks
def test_mixed_emoji
srand 0
length = all_tests.length
step = 503 # use a prime number
step = 503 # use a prime number
all_tests.each do |test1|
start = rand step
start.step(by: step, to: length-1) do |t2|