mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
* string.c, enc/unicode.c: New code path as a preparation for Unicode-wide
case mapping. The code path is currently guarded by the :lithuanian option to avoid accidental problems in daily use. * test/ruby/enc/test_case_mapping.rb: Test for above. * string.c: function 'check_case_options': fixed logical errors git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
2fd11c760c
commit
be897c2507
4 changed files with 129 additions and 9 deletions
10
ChangeLog
10
ChangeLog
|
@ -1,3 +1,11 @@
|
||||||
|
Sat Jan 16 10:23:23 2016 Martin Duerst <duerst@it.aoyama.ac.jp>
|
||||||
|
|
||||||
|
* string.c, enc/unicode.c: New code path as a preparation for Unicode-wide
|
||||||
|
case mapping. The code path is currently guarded by the :lithuanian
|
||||||
|
option to avoid accidental problems in daily use.
|
||||||
|
* test/ruby/enc/test_case_mapping.rb: Test for above.
|
||||||
|
* string.c: function 'check_case_options': fixed logical errors
|
||||||
|
|
||||||
Fri Jan 15 20:20:20 2016 Naohisa Goto <ngotogenome@gmail.com>
|
Fri Jan 15 20:20:20 2016 Naohisa Goto <ngotogenome@gmail.com>
|
||||||
|
|
||||||
* regint.h (PLATFORM_UNALIGNED_WORD_ACCESS): The value of
|
* regint.h (PLATFORM_UNALIGNED_WORD_ACCESS): The value of
|
||||||
|
@ -8,7 +16,7 @@ Fri Jan 15 20:20:20 2016 Naohisa Goto <ngotogenome@gmail.com>
|
||||||
|
|
||||||
Fri Jan 15 16:12:10 2016 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
Fri Jan 15 16:12:10 2016 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||||
|
|
||||||
* parse.y (string1): reset heredoc indent fore each string leteral
|
* parse.y (string1): reset heredoc indent for each string literal
|
||||||
so that concatenated string would not be dedented.
|
so that concatenated string would not be dedented.
|
||||||
[ruby-core:72857] [Bug #11990]
|
[ruby-core:72857] [Bug #11990]
|
||||||
|
|
||||||
|
|
|
@ -603,3 +603,32 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
|
||||||
|
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */
|
||||||
|
#define CASE_MAPPING_SLACK 12
|
||||||
|
/* The following declaration should be moved to an include file rather than
|
||||||
|
be duplicated here (and in string.c), but we'll wait for this because we
|
||||||
|
want this to become a primitive anyway. */
|
||||||
|
extern int
|
||||||
|
onigenc_unicode_case_map(OnigCaseFoldType* flags,
|
||||||
|
const OnigUChar** pp, const OnigUChar* end,
|
||||||
|
OnigUChar* to, OnigUChar* to_end,
|
||||||
|
const struct OnigEncodingTypeST* enc)
|
||||||
|
{
|
||||||
|
OnigCodePoint code;
|
||||||
|
OnigUChar *to_start = to;
|
||||||
|
to_end -= CASE_MAPPING_SLACK;
|
||||||
|
|
||||||
|
/* hopelessly preliminary implementation, just dealing with ASCII,
|
||||||
|
* and just for downcase */
|
||||||
|
while (*pp<end && to<=to_end) {
|
||||||
|
code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
|
||||||
|
*pp += enclen(enc, *pp, end);
|
||||||
|
if (code>='A' && code<='Z') {
|
||||||
|
code += 'a'-'A';
|
||||||
|
*flags |= ONIGENC_CASE_MODIFIED;
|
||||||
|
}
|
||||||
|
to += ONIGENC_CODE_TO_MBC(enc, code, to);
|
||||||
|
}
|
||||||
|
return to-to_start;
|
||||||
|
}
|
||||||
|
|
88
string.c
88
string.c
|
@ -5600,19 +5600,19 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
|
||||||
if (argc>2)
|
if (argc>2)
|
||||||
rb_raise(rb_eArgError, "too many options");
|
rb_raise(rb_eArgError, "too many options");
|
||||||
if (argv[0]==sym_turkic) {
|
if (argv[0]==sym_turkic) {
|
||||||
flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI;
|
flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
|
||||||
if (argc==2) {
|
if (argc==2) {
|
||||||
if (argv[1]==sym_lithuanian)
|
if (argv[1]==sym_lithuanian)
|
||||||
flags &= ONIGENC_CASE_FOLD_LITHUANIAN;
|
flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
|
||||||
else
|
else
|
||||||
rb_raise(rb_eArgError, "invalid second option");
|
rb_raise(rb_eArgError, "invalid second option");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (argv[0]==sym_lithuanian) {
|
else if (argv[0]==sym_lithuanian) {
|
||||||
flags &= ONIGENC_CASE_FOLD_LITHUANIAN;
|
flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
|
||||||
if (argc==2) {
|
if (argc==2) {
|
||||||
if (argv[1]==sym_turkic)
|
if (argv[1]==sym_turkic)
|
||||||
flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI;
|
flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
|
||||||
else
|
else
|
||||||
rb_raise(rb_eArgError, "invalid second option");
|
rb_raise(rb_eArgError, "invalid second option");
|
||||||
}
|
}
|
||||||
|
@ -5620,10 +5620,10 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
|
||||||
else if (argc>1)
|
else if (argc>1)
|
||||||
rb_raise(rb_eArgError, "too many options");
|
rb_raise(rb_eArgError, "too many options");
|
||||||
else if (argv[0]==sym_ascii)
|
else if (argv[0]==sym_ascii)
|
||||||
flags &= ONIGENC_CASE_ASCII_ONLY;
|
flags |= ONIGENC_CASE_ASCII_ONLY;
|
||||||
else if (argv[0]==sym_fold) {
|
else if (argv[0]==sym_fold) {
|
||||||
if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
|
if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
|
||||||
flags &= ONIGENC_CASE_FOLD;
|
flags |= ONIGENC_CASE_FOLD;
|
||||||
else
|
else
|
||||||
rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
|
rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
|
||||||
}
|
}
|
||||||
|
@ -5632,6 +5632,75 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The following declaration should be moved to an include file rather than
|
||||||
|
be duplicated here (and in enc/unicode.c), but we'll wait for this because
|
||||||
|
we want this to become a primitive anyway. */
|
||||||
|
extern int
|
||||||
|
onigenc_unicode_case_map(OnigCaseFoldType* flag,
|
||||||
|
const OnigUChar** pp,
|
||||||
|
const OnigUChar* end,
|
||||||
|
OnigUChar* to,
|
||||||
|
OnigUChar* to_end,
|
||||||
|
const struct OnigEncodingTypeST* enc);
|
||||||
|
|
||||||
|
/* 20 should be long enough to absorb any kind of single character length increase */
|
||||||
|
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
|
||||||
|
|
||||||
|
struct mapping_buffer;
|
||||||
|
typedef struct mapping_buffer {
|
||||||
|
size_t capa;
|
||||||
|
size_t used;
|
||||||
|
struct mapping_buffer *next;
|
||||||
|
OnigUChar space[0];
|
||||||
|
} mapping_buffer;
|
||||||
|
|
||||||
|
static VALUE
|
||||||
|
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
|
||||||
|
{
|
||||||
|
VALUE target;
|
||||||
|
|
||||||
|
OnigUChar *source_current, *source_end;
|
||||||
|
int target_length = 0;
|
||||||
|
mapping_buffer pre_buffer, /* only next pointer used */
|
||||||
|
*current_buffer = &pre_buffer;
|
||||||
|
int buffer_count = 0;
|
||||||
|
|
||||||
|
if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
|
||||||
|
|
||||||
|
source_current = (OnigUChar*)RSTRING_PTR(source);
|
||||||
|
source_end = (OnigUChar*)RSTRING_END(source);
|
||||||
|
|
||||||
|
while (source_current < source_end) {
|
||||||
|
/* increase multiplier using buffer count to converge quickly */
|
||||||
|
int capa = (source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
|
||||||
|
current_buffer->next = (mapping_buffer*)ALLOC_N(char, sizeof(mapping_buffer)+capa);
|
||||||
|
current_buffer = current_buffer->next;
|
||||||
|
current_buffer->next = NULL;
|
||||||
|
current_buffer->capa = capa;
|
||||||
|
target_length += current_buffer->used
|
||||||
|
= onigenc_unicode_case_map(flags,
|
||||||
|
(const OnigUChar**)&source_current, source_end,
|
||||||
|
current_buffer->space,
|
||||||
|
current_buffer->space+current_buffer->capa,
|
||||||
|
enc);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (buffer_count==1)
|
||||||
|
target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
|
||||||
|
else {
|
||||||
|
char *target_current = RSTRING_PTR(target = rb_str_new_with_class(source, 0, target_length));
|
||||||
|
for (current_buffer=pre_buffer.next; current_buffer; current_buffer=current_buffer->next)
|
||||||
|
memcpy(target_current, current_buffer->space, current_buffer->used);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* TODO: check about string terminator character */
|
||||||
|
OBJ_INFECT_RAW(target, source);
|
||||||
|
str_enc_copy(target, source);
|
||||||
|
/*ENC_CODERANGE_SET(mapped, cr);*/
|
||||||
|
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* call-seq:
|
* call-seq:
|
||||||
* str.upcase! -> str or nil
|
* str.upcase! -> str or nil
|
||||||
|
@ -5716,7 +5785,6 @@ rb_str_upcase(int argc, VALUE *argv, VALUE str)
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* call-seq:
|
* call-seq:
|
||||||
* str.downcase! -> str or nil
|
* str.downcase! -> str or nil
|
||||||
|
@ -5739,7 +5807,11 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
|
||||||
enc = STR_ENC_GET(str);
|
enc = STR_ENC_GET(str);
|
||||||
rb_str_check_dummy_enc(enc);
|
rb_str_check_dummy_enc(enc);
|
||||||
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
s = RSTRING_PTR(str); send = RSTRING_END(str);
|
||||||
if (single_byte_optimizable(str)) {
|
if (/*enc==rb_utf8_encoding() &&*/ flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
|
||||||
|
str_shared_replace(str, rb_str_casemap(str, &flags, enc));
|
||||||
|
modify = ONIGENC_CASE_MODIFIED & flags;
|
||||||
|
}
|
||||||
|
else if (single_byte_optimizable(str)) {
|
||||||
while (s < send) {
|
while (s < send) {
|
||||||
unsigned int c = *(unsigned char*)s;
|
unsigned int c = *(unsigned char*)s;
|
||||||
|
|
||||||
|
|
11
test/ruby/enc/test_case_mapping.rb
Normal file
11
test/ruby/enc/test_case_mapping.rb
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
||||||
|
|
||||||
|
require "test/unit"
|
||||||
|
|
||||||
|
# preliminary tests, using :lithuanian as a guard
|
||||||
|
# to test new implementation strategy
|
||||||
|
class TestCaseMappingPreliminary < Test::Unit::TestCase
  # Downcasing a mixed-case ASCII string with the :lithuanian option
  # (the temporary guard for the new code path) must give the all-lowercase
  # string.
  def test_case_mapping_preliminary
    source = "Yukihiro MATSUMOTO (MATZ)"
    expected = "yukihiro matsumoto (matz)"
    assert_equal(expected, source.downcase(:lithuanian))
  end
end
|
Loading…
Reference in a new issue