1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* enc/unicode.c, test/ruby/enc/test_case_mapping.rb: Implemented :fold

option for String#downcase by using case folding data from
  regular expression engine, and added a few simple tests.
  (with Kimihito Matsui)


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53747 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
duerst 2016-02-06 05:37:29 +00:00
parent 6ab70ad72a
commit 81515b2381
3 changed files with 39 additions and 5 deletions

View file

@ -1,3 +1,10 @@
Sat Feb 6 14:37:16 2016 Martin Duerst <duerst@it.aoyama.ac.jp>
* enc/unicode.c, test/ruby/enc/test_case_mapping.rb: Implemented :fold
option for String#downcase by using case folding data from
regular expression engine, and added a few simple tests.
(with Kimihito Matsui)
Fri Feb 5 20:08:59 2016 Martin Duerst <duerst@it.aoyama.ac.jp>
* test/ruby/enc/test_case_mapping.rb: added tests for :ascii option.

View file

@ -628,40 +628,61 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP,
if (code<='z') { /* ASCII comes first */
if (code>='a' && code<='z') {
if (flags&ONIGENC_CASE_UPCASE) {
MODIFIED;
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i -> I WITH DOT ABOVE */
code = 0x0130;
else
code += 'A'-'a';
MODIFIED;
}
}
else if (code>='A' && code<='Z') {
if (flags&ONIGENC_CASE_DOWNCASE) {
MODIFIED;
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I -> DOTLESS i */
code = 0x0131;
else
code += 'a'-'A';
MODIFIED;
}
}
}
else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */
const CodePointList3 *folded;
if (code==0x0130) {
if (flags&ONIGENC_CASE_DOWNCASE) {
MODIFIED;
if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)
code = 0x0069; /* I WITH DOT ABOVE -> i */
else { /* make dot above explicit */
to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to);
code = 0x0307; /* dot above */
}
MODIFIED;
}
}
/* the following case can be removed once we rely on data,
/* the following special case for DOTLESS i -> I
* can be removed once we rely on data,
* because the mapping is always the same */
else if (code==0x0131 && (flags&ONIGENC_CASE_UPCASE)) { /* DOTLESS i -> I */
else if (code==0x0131 && (flags&ONIGENC_CASE_UPCASE)) {
code = 0x0049; MODIFIED;
}
else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) {
if (flags&ONIGENC_CASE_FOLD) {
const OnigCodePoint *next = folded->code;
int count = OnigCodePointCount(folded->n);
MODIFIED;
if (count==1)
code = *next;
else if (count==2) {
to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
code = *next;
}
else { /* count == 3 */
to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
to += ONIGENC_CODE_TO_MBC(enc, *next++, to);
code = *next;
}
}
}
}
to += ONIGENC_CODE_TO_MBC(enc, code, to);
/* switch from titlecase to lowercase for capitalize */

View file

@ -59,6 +59,12 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase
check_swapcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii
end
def test_fold_option
  # Exercises the :fold option of String#downcase (full Unicode case folding).
  # Arguments follow the order used by the other tests in this class:
  # expected result first, then the input string, then the option flag(s).

  # LATIN SMALL LETTER SHARP S folds to the two-letter sequence "ss".
  check_downcase_properties 'ss', 'ß', :fold
  # The ligatures U+FB01 (fi) and U+FB02 (fl) fold to their plain ASCII
  # letter sequences. They are written as \u escapes so the input cannot be
  # silently flattened to ASCII "fifl" by compatibility normalization, which
  # would turn this check into a no-op.
  check_downcase_properties 'fifl', "\uFB01\uFB02", :fold
  # GREEK SMALL LETTER FINAL SIGMA folds to the regular small sigma.
  check_downcase_properties 'σ', 'ς', :fold
end
def test_turcic
check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :turkic
check_upcase_properties 'YUKİHİRO MATSUMOTO (MATZ)', 'Yukihiro Matsumoto (matz)', :turkic