From 81515b2381dcd325ca57c0272a551bba4f112afd Mon Sep 17 00:00:00 2001 From: duerst Date: Sat, 6 Feb 2016 05:37:29 +0000 Subject: [PATCH] * enc/unicode.c, test/ruby/enc/test_case_mapping.rb: Implemented :fold option for String#downcase by using case folding data from regular expression engine, and added a few simple tests. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53747 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 7 +++++++ enc/unicode.c | 31 +++++++++++++++++++++++++----- test/ruby/enc/test_case_mapping.rb | 6 ++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index 61df2a6c4b..313e452a1f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Sat Feb 6 14:37:16 2016 Martin Duerst + + * enc/unicode.c, test/ruby/enc/test_case_mapping.rb: Implemented :fold + option for String#downcase by using case folding data from + regular expression engine, and added a few simple tests. + (with Kimihito Matsui) + Fri Feb 5 20:08:59 2016 Martin Duerst * test/ruby/enc/test_case_mapping.rb: added tests for :ascii option. 
diff --git a/enc/unicode.c b/enc/unicode.c index 3f41ea3844..cb9b0a94b4 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -628,40 +628,61 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, if (code<='z') { /* ASCII comes first */ if (code>='a' && code<='z') { if (flags&ONIGENC_CASE_UPCASE) { + MODIFIED; if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i -> I WITH DOT ABOVE */ code = 0x0130; else code += 'A'-'a'; - MODIFIED; } } else if (code>='A' && code<='Z') { if (flags&ONIGENC_CASE_DOWNCASE) { + MODIFIED; if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I -> DOTLESS i */ code = 0x0131; else code += 'a'-'A'; - MODIFIED; } } } else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */ + const CodePointList3 *folded; + if (code==0x0130) { if (flags&ONIGENC_CASE_DOWNCASE) { + MODIFIED; if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) code = 0x0069; /* I WITH DOT ABOVE -> i */ else { /* make dot above explicit */ to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to); code = 0x0307; /* dot above */ } - MODIFIED; } } - /* the following case can be removed once we rely on data, + /* the following special case for DOTLESS i -> I + * can be removed once we rely on data, * because the mapping is always the same */ - else if (code==0x0131 && (flags&ONIGENC_CASE_UPCASE)) { /* DOTLESS i -> I */ + else if (code==0x0131 && (flags&ONIGENC_CASE_UPCASE)) { code = 0x0049; MODIFIED; } + else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { + if (flags&ONIGENC_CASE_FOLD) { + const OnigCodePoint *next = folded->code; + int count = OnigCodePointCount(folded->n); + MODIFIED; + if (count==1) + code = *next; + else if (count==2) { + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } + else { /* count == 3 */ + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } + } + } } to += ONIGENC_CODE_TO_MBC(enc, code, to); /* switch from titlecase to 
lowercase for capitalize */ diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb index 3ac647717d..2c712e448d 100644 --- a/test/ruby/enc/test_case_mapping.rb +++ b/test/ruby/enc/test_case_mapping.rb @@ -59,6 +59,12 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase check_swapcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢĲŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢĲŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii end + def test_fold_option + check_downcase_properties 'ss', 'ß', :fold + check_downcase_properties 'fifl', 'ﬁﬂ', :fold + check_downcase_properties 'σ', 'ς', :fold + end + def test_turcic check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :turkic check_upcase_properties 'YUKİHİRO MATSUMOTO (MATZ)', 'Yukihiro Matsumoto (matz)', :turkic