mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Ignore invalid escapes in regexp comments
Invalid escapes are handled at multiple levels. The first level is in parse.y, so skip invalid unicode escape checks for regexps in parse.y. Make rb_reg_preprocess and unescape_nonascii accept the regexp options. In unescape_nonascii, if the regexp is an extended regexp, when "#" is encountered, ignore all characters until the end of line or end of regexp. Unfortunately, in extended regexps, you can use "#" as a non-comment character inside a character class, so also parse "[" and "]" specially for extended regexps, and only skip comments if "#" is not inside a character class. Handle nested character classes as well. This issue doesn't just affect extended regexps, it also affects "(?#" comments inside all regexps. So for those comments, scan until trailing ")" and ignore content inside. I'm not sure if there are other corner cases not handled. A better fix would be to redesign the regexp parser so that it unescapes during parsing instead of before parsing, so you already know the current parsing state.
Fixes [Bug #18294]
Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
This commit is contained in:
parent
c85d1cda86
commit
ec3542229b
Notes:
git
2022-06-07 05:50:26 +09:00
Merged: https://github.com/ruby/ruby/pull/5721 Merged-By: jeremyevans <code@jeremyevans.net>
3 changed files with 131 additions and 19 deletions
26
parse.y
26
parse.y
|
@ -6803,17 +6803,21 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
|
|||
int codepoint = scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen);
|
||||
literal_flush(p, p->lex.pcur);
|
||||
p->lex.pcur += numlen;
|
||||
if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) {
|
||||
yyerror0("invalid Unicode escape");
|
||||
return wide && numlen > 0;
|
||||
}
|
||||
if (codepoint > 0x10ffff) {
|
||||
yyerror0("invalid Unicode codepoint (too large)");
|
||||
return wide;
|
||||
}
|
||||
if ((codepoint & 0xfffff800) == 0xd800) {
|
||||
yyerror0("invalid Unicode codepoint");
|
||||
return wide;
|
||||
if (p->lex.strterm == NULL ||
|
||||
(p->lex.strterm->flags & STRTERM_HEREDOC) ||
|
||||
(p->lex.strterm->u.literal.u1.func != str_regexp)) {
|
||||
if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) {
|
||||
yyerror0("invalid Unicode escape");
|
||||
return wide && numlen > 0;
|
||||
}
|
||||
if (codepoint > 0x10ffff) {
|
||||
yyerror0("invalid Unicode codepoint (too large)");
|
||||
return wide;
|
||||
}
|
||||
if ((codepoint & 0xfffff800) == 0xd800) {
|
||||
yyerror0("invalid Unicode codepoint");
|
||||
return wide;
|
||||
}
|
||||
}
|
||||
if (regexp_literal) {
|
||||
tokcopy(p, (int)numlen);
|
||||
|
|
71
re.c
71
re.c
|
@ -1527,7 +1527,7 @@ rb_reg_fixed_encoding_p(VALUE re)
|
|||
|
||||
static VALUE
|
||||
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err);
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err, int options);
|
||||
|
||||
NORETURN(static void reg_enc_error(VALUE re, VALUE str));
|
||||
|
||||
|
@ -1608,7 +1608,7 @@ rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err)
|
|||
|
||||
unescaped = rb_reg_preprocess(
|
||||
pattern, pattern + RREGEXP_SRC_LEN(re), enc,
|
||||
&fixed_enc, err);
|
||||
&fixed_enc, err, 0);
|
||||
|
||||
if (NIL_P(unescaped)) {
|
||||
rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
|
||||
|
@ -2718,10 +2718,11 @@ unescape_unicode_bmp(const char **pp, const char *end,
|
|||
static int
|
||||
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
|
||||
VALUE buf, rb_encoding **encp, int *has_property,
|
||||
onig_errmsg_buffer err)
|
||||
onig_errmsg_buffer err, int options)
|
||||
{
|
||||
unsigned char c;
|
||||
char smallbuf[2];
|
||||
int in_char_class = 0;
|
||||
|
||||
while (p < end) {
|
||||
int chlen = rb_enc_precise_mbclen(p, end, enc);
|
||||
|
@ -2833,6 +2834,60 @@ escape_asis:
|
|||
}
|
||||
break;
|
||||
|
||||
case '#':
|
||||
if ((options & ONIG_OPTION_EXTEND) && !in_char_class) {
|
||||
/* consume and ignore comment in extended regexp */
|
||||
while ((p < end) && ((c = *p++) != '\n'));
|
||||
break;
|
||||
}
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
break;
|
||||
case '[':
|
||||
in_char_class++;
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
break;
|
||||
case ']':
|
||||
if (in_char_class) {
|
||||
in_char_class--;
|
||||
}
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
break;
|
||||
case '(':
|
||||
if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') {
|
||||
/* (?# is comment inside any regexp, and content inside should be ignored */
|
||||
const char *orig_p = p;
|
||||
int cont = 1;
|
||||
|
||||
while (cont && (p < end)) {
|
||||
switch (c = *p++) {
|
||||
default:
|
||||
if (!(c & 0x80)) break;
|
||||
--p;
|
||||
/* fallthrough */
|
||||
case '\\':
|
||||
chlen = rb_enc_precise_mbclen(p, end, enc);
|
||||
if (!MBCLEN_CHARFOUND_P(chlen)) {
|
||||
goto invalid_multibyte;
|
||||
}
|
||||
p += MBCLEN_CHARFOUND_LEN(chlen);
|
||||
break;
|
||||
case ')':
|
||||
cont = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cont) {
|
||||
/* unterminated (?#, rewind so it is syntax error */
|
||||
p = orig_p;
|
||||
c = '(';
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
}
|
||||
}
|
||||
else {
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
rb_str_buf_cat(buf, (char *)&c, 1);
|
||||
break;
|
||||
|
@ -2844,7 +2899,7 @@ escape_asis:
|
|||
|
||||
static VALUE
|
||||
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err)
|
||||
rb_encoding **fixed_enc, onig_errmsg_buffer err, int options)
|
||||
{
|
||||
VALUE buf;
|
||||
int has_property = 0;
|
||||
|
@ -2858,7 +2913,7 @@ rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
|
|||
rb_enc_associate(buf, enc);
|
||||
}
|
||||
|
||||
if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
|
||||
if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err, options) != 0)
|
||||
return Qnil;
|
||||
|
||||
if (has_property && !*fixed_enc) {
|
||||
|
@ -2886,7 +2941,7 @@ rb_reg_check_preprocess(VALUE str)
|
|||
end = p + RSTRING_LEN(str);
|
||||
enc = rb_enc_get(str);
|
||||
|
||||
buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
|
||||
buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err, 0);
|
||||
RB_GC_GUARD(str);
|
||||
|
||||
if (NIL_P(buf)) {
|
||||
|
@ -2928,7 +2983,7 @@ rb_reg_preprocess_dregexp(VALUE ary, int options)
|
|||
p = RSTRING_PTR(str);
|
||||
end = p + RSTRING_LEN(str);
|
||||
|
||||
buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
|
||||
buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err, options);
|
||||
|
||||
if (NIL_P(buf))
|
||||
rb_raise(rb_eArgError, "%s", err);
|
||||
|
@ -2975,7 +3030,7 @@ rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
|
|||
return -1;
|
||||
}
|
||||
|
||||
unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
|
||||
unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err, options);
|
||||
if (NIL_P(unescaped))
|
||||
return -1;
|
||||
|
||||
|
|
|
@ -91,6 +91,59 @@ class TestRegexp < Test::Unit::TestCase
|
|||
assert_warn('', '[ruby-core:82328] [Bug #13798]') {re.to_s}
|
||||
end
|
||||
|
||||
def test_extended_comment_invalid_escape_bug_18294
|
||||
assert_separately([], <<-RUBY)
|
||||
re = / C:\\\\[a-z]{5} # e.g. C:\\users /x
|
||||
assert_match(re, 'C:\\users')
|
||||
assert_not_match(re, 'C:\\user')
|
||||
|
||||
re = /
|
||||
foo # \\M-ca
|
||||
bar
|
||||
/x
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /
|
||||
f[#o]o # \\M-ca
|
||||
bar
|
||||
/x
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /
|
||||
f[[:alnum:]#]o # \\M-ca
|
||||
bar
|
||||
/x
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /
|
||||
f(?# \\M-ca)oo # \\M-ca
|
||||
bar
|
||||
/x
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /f(?# \\M-ca)oobar/
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /[-(?# fca)]oobar/
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
|
||||
re = /f(?# ca\0\\M-ca)oobar/
|
||||
assert_match(re, 'foobar')
|
||||
assert_not_match(re, 'foobaz')
|
||||
RUBY
|
||||
|
||||
assert_raise(SyntaxError) {eval "/\\users/x"}
|
||||
assert_raise(SyntaxError) {eval "/[\\users]/x"}
|
||||
assert_raise(SyntaxError) {eval "/(?<\\users)/x"}
|
||||
assert_raise(SyntaxError) {eval "/# \\users/"}
|
||||
end
|
||||
|
||||
def test_union
|
||||
assert_equal :ok, begin
|
||||
Regexp.union(
|
||||
|
|
Loading…
Add table
Reference in a new issue