Multiple codepoints are not allowed in a single character literal

Character literals containing multiple codepoints have been unintentionally accepted since Ruby 2.5.
2022-11-09 12:17:21 -05:00 · 2019-07-05 22:18:08 +09:00 · 2019-07-05 22:18:08 +09:00 · d746a41e85
commit d746a41e85
parent 0a2f598d23
2 changed files with 21 additions and 5 deletions
--- a/parse.y
+++ b/parse.y
@ -6246,24 +6246,28 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
 /* return value is for ?\u3042 */
 static void
 tokadd_utf8(struct parser_params *p, rb_encoding **encp,
-	    int string_literal, int symbol_literal, int regexp_literal)
+	    int term, int symbol_literal, int regexp_literal)
 {
    /*
-     * If string_literal is true, then we allow multiple codepoints
+     * If `term` is not -1, then we allow multiple codepoints in \u{}
-     * in \u{}, and add the codepoints to the current token.
+     * up to `term` byte, otherwise we're parsing a character literal.
-     * Otherwise we're parsing a character literal and return a single
+     * And then add the codepoints to the current token.
     * codepoint without adding it
     */
    static const char multiple_codepoints[] = "Multiple codepoints at single character literal";
    const int open_brace = '{', close_brace = '}';
    if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
    if (peek(p, open_brace)) {  /* handle \u{...} form */
 	const char *second = NULL;
 	int c, last = nextc(p);
 	if (p->lex.pcur >= p->lex.pend) goto unterminated;
 	while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
 	while (c != close_brace) {
 	    if (c == term) goto unterminated;
 	    if (second == multiple_codepoints)
 		second = p->lex.pcur;
 	    if (regexp_literal) tokadd(p, last);
 	    if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
 		break;
@ -6272,6 +6276,8 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 		if (++p->lex.pcur >= p->lex.pend) goto unterminated;
 		last = c;
 	    }
 	    if (term == -1 && !second)
 		second = multiple_codepoints;
 	}
 	if (c != close_brace) {
@ -6280,6 +6286,15 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 	    yyerror0("unterminated Unicode escape");
 	    return;
 	}
 	if (second && second != multiple_codepoints) {
 	    const char *pcur = p->lex.pcur;
 	    p->lex.pcur = second;
 	    dispatch_scan_event(p, tSTRING_CONTENT);
 	    token_flush(p);
 	    p->lex.pcur = pcur;
 	    yyerror0(multiple_codepoints);
 	    token_flush(p);
 	}
 	if (regexp_literal) tokadd(p, close_brace);
 	nextc(p);
--- a/test/ruby/test_parse.rb
+++ b/test/ruby/test_parse.rb
@ -577,6 +577,7 @@ class TestParse < Test::Unit::TestCase
    assert_equal("\u{1234}", eval("?\u{1234}"))
    assert_equal("\u{1234}", eval('?\u{1234}'))
    assert_equal("\u{1234}", eval('?\u1234'))
    assert_syntax_error('?\u{41 42}', 'Multiple codepoints at single character literal')
    e = assert_syntax_error('"#{?\u123}"', 'invalid Unicode escape')
    assert_not_match(/end-of-input/, e.message)