mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Fixed String#grapheme_clusters with wide encodings
* string.c (get_reg_grapheme_cluster): make regexp from properly encoded sources for wide-char encodings. [Bug #15965] * regparse.c (node_extended_grapheme_cluster): suppress false duplicated range warning for the time being.
This commit is contained in:
parent
8aecc90974
commit
2f6cc15cdb
3 changed files with 40 additions and 6 deletions
|
@ -5961,6 +5961,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
|
||||
BBuf *inverted_buf = NULL;
|
||||
|
||||
/* TODO: fix false warning */
|
||||
const int dup_not_warned = env->warnings_flag | ~ONIG_SYN_WARN_CC_DUP;
|
||||
env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
|
||||
|
||||
/* Start with a positive buffer and invert at the end.
|
||||
* Otherwise, adding single-character ranges work the wrong way. */
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
|
||||
|
@ -5968,6 +5972,8 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
|
||||
R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
|
||||
cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
|
||||
|
||||
env->warnings_flag &= dup_not_warned; /* TODO: fix false warning */
|
||||
}
|
||||
else {
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
|
||||
|
|
25
string.c
25
string.c
|
@ -8593,9 +8593,30 @@ get_reg_grapheme_cluster(rb_encoding *enc)
|
|||
reg_grapheme_cluster = reg_grapheme_cluster_utf8;
|
||||
}
|
||||
if (!reg_grapheme_cluster) {
|
||||
const OnigUChar source[] = "\\X";
|
||||
const OnigUChar source_ascii[] = "\\X";
|
||||
OnigErrorInfo einfo;
|
||||
int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1,
|
||||
const OnigUChar *source = source_ascii;
|
||||
size_t source_len = sizeof(source_ascii) - 1;
|
||||
switch (encidx) {
|
||||
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
|
||||
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
|
||||
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
|
||||
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
|
||||
#define CASE_UTF(e) \
|
||||
case ENCINDEX_UTF_##e: { \
|
||||
static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
|
||||
source = source_UTF_##e; \
|
||||
source_len = sizeof(source_UTF_##e); \
|
||||
break; \
|
||||
}
|
||||
CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
|
||||
#undef CASE_UTF
|
||||
#undef CHARS_16BE
|
||||
#undef CHARS_16LE
|
||||
#undef CHARS_32BE
|
||||
#undef CHARS_32LE
|
||||
}
|
||||
int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
|
||||
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
|
||||
if (r) {
|
||||
UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
|
||||
|
|
|
@ -1036,13 +1036,20 @@ CODE
|
|||
"\u{1F468 200D 1F393}",
|
||||
"\u{1F46F 200D 2642 FE0F}",
|
||||
"\u{1f469 200d 2764 fe0f 200d 1f469}",
|
||||
].each do |g|
|
||||
].product([Encoding::UTF_8, *WIDE_ENCODINGS]) do |g, enc|
|
||||
g = g.encode(enc)
|
||||
assert_equal [g], g.grapheme_clusters
|
||||
assert_predicate g.dup.taint.grapheme_clusters[0], :tainted?
|
||||
assert_predicate g.taint.grapheme_clusters[0], :tainted?
|
||||
end
|
||||
|
||||
assert_equal ["\u000A", "\u0324"], "\u{a 324}".grapheme_clusters
|
||||
assert_equal ["\u000D", "\u0324"], "\u{d 324}".grapheme_clusters
|
||||
[
|
||||
"\u{a 324}",
|
||||
"\u{d 324}",
|
||||
"abc",
|
||||
].product([Encoding::UTF_8, *WIDE_ENCODINGS]) do |g, enc|
|
||||
g = g.encode(enc)
|
||||
assert_equal g.chars, g.grapheme_clusters
|
||||
end
|
||||
assert_equal ["a", "b", "c"], "abc".b.grapheme_clusters
|
||||
|
||||
if ENUMERATOR_WANTARRAY
|
||||
|
|
Loading…
Add table
Reference in a new issue