mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
String#each_grapheme_cluster and String#grapheme_clusters
added to enumerate grapheme clusters [Feature #13780] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@59698 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
54217d68b3
commit
df49fc659e
3 changed files with 155 additions and 0 deletions
2
NEWS
2
NEWS
|
@ -94,6 +94,8 @@ with all sufficient information, see the ChangeLog file or Redmine
|
|||
* String#delete_prefix! is added to remove prefix destructively [Feature #12694]
|
||||
* String#delete_suffix is added to remove suffix [Feature #13665]
|
||||
* String#delete_suffix! is added to remove suffix destructively [Feature #13665]
|
||||
* String#each_grapheme_cluster and String#grapheme_clusters is added to
|
||||
enumerate grapheme clusters [Feature #13780]
|
||||
|
||||
* Thread
|
||||
|
||||
|
|
113
string.c
113
string.c
|
@ -8081,6 +8081,117 @@ rb_str_codepoints(VALUE str)
|
|||
return rb_str_enumerate_codepoints(str, 1);
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_str_enumerate_grapheme_clusters(VALUE str, int wantarray)
|
||||
{
|
||||
regex_t *reg_grapheme_cluster = NULL;
|
||||
static regex_t *reg_grapheme_cluster_utf8 = NULL;
|
||||
int encidx = ENCODING_GET(str);
|
||||
rb_encoding *enc = rb_enc_from_index(encidx);
|
||||
int unicode_p = rb_enc_unicode_p(enc);
|
||||
const char *ptr, *end;
|
||||
VALUE ary;
|
||||
|
||||
if (!unicode_p) {
|
||||
return rb_str_enumerate_codepoints(str, wantarray);
|
||||
}
|
||||
|
||||
/* synchronize */
|
||||
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
|
||||
reg_grapheme_cluster = reg_grapheme_cluster_utf8;
|
||||
}
|
||||
if (!reg_grapheme_cluster) {
|
||||
const OnigUChar source[] = "\\X";
|
||||
int r = onig_new(®_grapheme_cluster, source, source + sizeof(source) - 1,
|
||||
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL);
|
||||
if (r) {
|
||||
rb_bug("cannot compile grapheme cluster regexp");
|
||||
}
|
||||
if (encidx == rb_utf8_encindex()) {
|
||||
reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
|
||||
}
|
||||
}
|
||||
|
||||
ptr = RSTRING_PTR(str);
|
||||
end = RSTRING_END(str);
|
||||
|
||||
if (rb_block_given_p()) {
|
||||
if (wantarray) {
|
||||
#if STRING_ENUMERATORS_WANTARRAY
|
||||
rb_warn("given block not used");
|
||||
ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
|
||||
#else
|
||||
rb_warning("passing a block to String#grapheme_clusters is deprecated");
|
||||
wantarray = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (wantarray)
|
||||
ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
|
||||
else
|
||||
return SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
|
||||
}
|
||||
|
||||
while (ptr < end) {
|
||||
VALUE grapheme_cluster;
|
||||
OnigPosition len = onig_match(reg_grapheme_cluster,
|
||||
(const OnigUChar *)ptr, (const OnigUChar *)end,
|
||||
(const OnigUChar *)ptr, NULL, 0);
|
||||
if (len == 0) break;
|
||||
if (len < 0) {
|
||||
break;
|
||||
}
|
||||
grapheme_cluster = rb_enc_str_new(ptr, len, enc);
|
||||
if (wantarray)
|
||||
rb_ary_push(ary, grapheme_cluster);
|
||||
else
|
||||
rb_yield(grapheme_cluster);
|
||||
ptr += len;
|
||||
}
|
||||
if (wantarray)
|
||||
return ary;
|
||||
else
|
||||
return str;
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* str.each_grapheme_cluster {|cstr| block } -> str
|
||||
* str.each_grapheme_cluster -> an_enumerator
|
||||
*
|
||||
* Passes each grapheme cluster in <i>str</i> to the given block, or returns
|
||||
* an enumerator if no block is given.
|
||||
* Unlike String#each_char, this enumerates by grapheme clusters defined by
|
||||
* Unicode Standard Annex #29 http://unicode.org/reports/tr29/
|
||||
*
|
||||
* "a\u0300".each_chars.to_a.size #=> 2
|
||||
* "a\u0300".each_grapheme_cluster.to_a.size #=> 1
|
||||
*
|
||||
*/
|
||||
|
||||
static VALUE
|
||||
rb_str_each_grapheme_cluster(VALUE str)
|
||||
{
|
||||
return rb_str_enumerate_grapheme_clusters(str, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* str.grapheme_clusters -> an_array
|
||||
*
|
||||
* Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
|
||||
* for <code>str.each_grapheme_cluster.to_a</code>.
|
||||
*
|
||||
* If a block is given, which is a deprecated form, works the same as
|
||||
* <code>each_grapheme_cluster</code>.
|
||||
*/
|
||||
|
||||
static VALUE
|
||||
rb_str_grapheme_clusters(VALUE str)
|
||||
{
|
||||
return rb_str_enumerate_grapheme_clusters(str, 1);
|
||||
}
|
||||
|
||||
static long
|
||||
chopped_length(VALUE str)
|
||||
|
@ -10492,6 +10603,7 @@ Init_String(void)
|
|||
rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
|
||||
rb_define_method(rb_cString, "chars", rb_str_chars, 0);
|
||||
rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
|
||||
rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
|
||||
rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
|
||||
rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
|
||||
rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
|
||||
|
@ -10547,6 +10659,7 @@ Init_String(void)
|
|||
rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
|
||||
rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
|
||||
rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
|
||||
rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
|
||||
|
||||
rb_define_method(rb_cString, "sum", rb_str_sum, -1);
|
||||
|
||||
|
|
|
@ -885,6 +885,46 @@ CODE
|
|||
end
|
||||
end
|
||||
|
||||
def test_each_grapheme_cluster
|
||||
[
|
||||
"\u{20 200d}",
|
||||
"\u{600 600}",
|
||||
"\u{600 20}",
|
||||
"\u{261d 1F3FB}",
|
||||
"\u{1f600}",
|
||||
"\u{20 308}",
|
||||
"\u{1F477 1F3FF 200D 2640 FE0F}",
|
||||
"\u{1F468 200D 1F393}",
|
||||
"\u{1F46F 200D 2642 FE0F}",
|
||||
"\u{1f469 200d 2764 fe0f 200d 1f469}",
|
||||
].each do |g|
|
||||
assert_equal [g], g.each_grapheme_cluster.to_a
|
||||
end
|
||||
|
||||
assert_equal ["\u000A", "\u0308"], "\u{a 308}".each_grapheme_cluster.to_a
|
||||
assert_equal ["\u000D", "\u0308"], "\u{d 308}".each_grapheme_cluster.to_a
|
||||
end
|
||||
|
||||
def test_grapheme_clusters
|
||||
[
|
||||
"\u{20 200d}",
|
||||
"\u{600 600}",
|
||||
"\u{600 20}",
|
||||
"\u{261d 1F3FB}",
|
||||
"\u{1f600}",
|
||||
"\u{20 308}",
|
||||
"\u{1F477 1F3FF 200D 2640 FE0F}",
|
||||
"\u{1F468 200D 1F393}",
|
||||
"\u{1F46F 200D 2642 FE0F}",
|
||||
"\u{1f469 200d 2764 fe0f 200d 1f469}",
|
||||
].each do |g|
|
||||
assert_equal [g], g.grapheme_clusters
|
||||
end
|
||||
|
||||
assert_equal ["\u000A", "\u0308"], "\u{a 308}".grapheme_clusters
|
||||
assert_equal ["\u000D", "\u0308"], "\u{d 308}".grapheme_clusters
|
||||
end
|
||||
|
||||
def test_each_line
|
||||
save = $/
|
||||
$/ = "\n"
|
||||
|
|
Loading…
Reference in a new issue