From be897c2507a9d7710f218ccf377e6ea67d6d47bf Mon Sep 17 00:00:00 2001 From: duerst Date: Sat, 16 Jan 2016 01:24:03 +0000 Subject: [PATCH] * string.c, enc/unicode.c: New code path as a preparation for Unicode-wide case mapping. The code path is currently guarded by the :lithuanian option to avoid accidental problems in daily use. * test/ruby/enc/test_case_mapping.rb: Test for above. * string.c: function 'check_case_options': fixed logical errors git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 10 +++- enc/unicode.c | 29 ++++++++++ string.c | 88 +++++++++++++++++++++++++++--- test/ruby/enc/test_case_mapping.rb | 11 ++++ 4 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 test/ruby/enc/test_case_mapping.rb diff --git a/ChangeLog b/ChangeLog index 5b448df4e0..e7e74556f9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Sat Jan 16 10:23:23 2016 Martin Duerst + + * string.c, enc/unicode.c: New code path as a preparation for Unicode-wide + case mapping. The code path is currently guarded by the :lithuanian + option to avoid accidental problems in daily use. + * test/ruby/enc/test_case_mapping.rb: Test for above. + * string.c: function 'check_case_options': fixed logical errors + Fri Jan 15 20:20:20 2016 Naohisa Goto * regint.h (PLATFORM_UNALIGNED_WORD_ACCESS): The value of @@ -8,7 +16,7 @@ Fri Jan 15 20:20:20 2016 Naohisa Goto Fri Jan 15 16:12:10 2016 Nobuyoshi Nakada - * parse.y (string1): reset heredoc indent fore each string leteral + * parse.y (string1): reset heredoc indent for each string literal so that concatenated string would not be dedented. [ruby-core:72857] [Bug #11990] diff --git a/enc/unicode.c b/enc/unicode.c index 9c0b326d0b..2f45f2f88c 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -603,3 +603,32 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, return n; } + +/* length in bytes for three characters in UTF-32; e.g. 
needed for ffi (U+FB03) */ +#define CASE_MAPPING_SLACK 12 +/* The following declaration should be moved to an include file rather than + be duplicated here (and in string.c), but we'll wait for this because we + want this to become a primitive anyway. */ +extern int +onigenc_unicode_case_map(OnigCaseFoldType* flags, + const OnigUChar** pp, const OnigUChar* end, + OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) +{ + OnigCodePoint code; + OnigUChar *to_start = to; + to_end -= CASE_MAPPING_SLACK; + + /* hopelessly preliminary implementation, just dealing with ASCII, + * and just for downcase */ + while (*pp<end && to<=to_end) { + code = ONIGENC_MBC_TO_CODE(enc, *pp, end); + *pp += enclen(enc, *pp, end); + if (code>='A' && code<='Z') { + code += 'a'-'A'; + *flags |= ONIGENC_CASE_MODIFIED; + } + to += ONIGENC_CODE_TO_MBC(enc, code, to); + } + return to-to_start; +} diff --git a/string.c b/string.c index e4b323db3f..895ec37b47 100644 --- a/string.c +++ b/string.c @@ -5600,19 +5600,19 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) if (argc>2) rb_raise(rb_eArgError, "too many options"); if (argv[0]==sym_turkic) { - flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI; + flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI; if (argc==2) { if (argv[1]==sym_lithuanian) - flags &= ONIGENC_CASE_FOLD_LITHUANIAN; + flags |= ONIGENC_CASE_FOLD_LITHUANIAN; else rb_raise(rb_eArgError, "invalid second option"); } } else if (argv[0]==sym_lithuanian) { - flags &= ONIGENC_CASE_FOLD_LITHUANIAN; + flags |= ONIGENC_CASE_FOLD_LITHUANIAN; if (argc==2) { if (argv[1]==sym_turkic) - flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI; + flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI; else rb_raise(rb_eArgError, "invalid second option"); } @@ -5620,10 +5620,10 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) else if (argc>1) rb_raise(rb_eArgError, "too many options"); else if (argv[0]==sym_ascii) - flags &= ONIGENC_CASE_ASCII_ONLY; + flags |= ONIGENC_CASE_ASCII_ONLY; else if (argv[0]==sym_fold) { if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == 
ONIGENC_CASE_DOWNCASE) - flags &= ONIGENC_CASE_FOLD; + flags |= ONIGENC_CASE_FOLD; else rb_raise(rb_eArgError, "option :fold only allowed for downcasing"); } @@ -5632,6 +5632,75 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags) return flags; } +/* The following declaration should be moved to an include file rather than + be duplicated here (and in enc/unicode.c), but we'll wait for this because + we want this to become a primitive anyway. */ +extern int +onigenc_unicode_case_map(OnigCaseFoldType* flag, + const OnigUChar** pp, + const OnigUChar* end, + OnigUChar* to, + OnigUChar* to_end, + const struct OnigEncodingTypeST* enc); + +/* 16 should be long enough to absorb any kind of single character length increase */ +#define CASE_MAPPING_ADDITIONAL_LENGTH 20 + +struct mapping_buffer; +typedef struct mapping_buffer { + size_t capa; + size_t used; + struct mapping_buffer *next; + OnigUChar space[0]; +} mapping_buffer; + +static VALUE +rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) +{ + VALUE target; + + OnigUChar *source_current, *source_end; + int target_length = 0; + mapping_buffer pre_buffer, /* only next pointer used */ + *current_buffer = &pre_buffer; + int buffer_count = 0; + + if (RSTRING_LEN(source) == 0) return rb_str_dup(source); + + source_current = (OnigUChar*)RSTRING_PTR(source); + source_end = (OnigUChar*)RSTRING_END(source); + + while (source_current < source_end) { + /* increase multiplier using buffer count to converge quickly */ + int capa = (source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH; + current_buffer->next = (mapping_buffer*)ALLOC_N(char, sizeof(mapping_buffer)+capa); + current_buffer = current_buffer->next; + current_buffer->next = NULL; + current_buffer->capa = capa; + target_length += current_buffer->used + = onigenc_unicode_case_map(flags, + (const OnigUChar**)&source_current, source_end, + current_buffer->space, + current_buffer->space+current_buffer->capa, + enc); + } 
+ + if (buffer_count==1) + target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length); + else { + char *target_current = RSTRING_PTR(target = rb_str_new_with_class(source, 0, target_length)); + for (current_buffer=pre_buffer.next; current_buffer; current_buffer=current_buffer->next) + memcpy(target_current, current_buffer->space, current_buffer->used); + } + + /* TODO: check about string terminator character */ + OBJ_INFECT_RAW(target, source); + str_enc_copy(target, source); + /*ENC_CODERANGE_SET(mapped, cr);*/ + + return target; +} + /* * call-seq: * str.upcase! -> str or nil @@ -5716,7 +5785,6 @@ rb_str_upcase(int argc, VALUE *argv, VALUE str) return str; } - /* * call-seq: * str.downcase! -> str or nil @@ -5739,7 +5807,11 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str) enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); s = RSTRING_PTR(str); send = RSTRING_END(str); - if (single_byte_optimizable(str)) { + if (/*enc==rb_utf8_encoding() &&*/ flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); + modify = ONIGENC_CASE_MODIFIED & flags; + } + else if (single_byte_optimizable(str)) { while (s < send) { unsigned int c = *(unsigned char*)s; diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb new file mode 100644 index 0000000000..529e86fbaa --- /dev/null +++ b/test/ruby/enc/test_case_mapping.rb @@ -0,0 +1,11 @@ +# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +# preliminary tests, using :lithuanian as a guard +# to test new implementation strategy +class TestCaseMappingPreliminary < Test::Unit::TestCase + def test_case_mapping_preliminary + assert_equal "yukihiro matsumoto (matz)", "Yukihiro MATSUMOTO (MATZ)".downcase(:lithuanian) + end +end