1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

parse.y: utf-8 codepoints

* parse.y (parser_tokadd_utf8): skip spaces in the current line,
  without advancing to the next line, to get rid of a dangling pointer.
  [ruby-core:82029] [Bug #13742]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@59344 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
nobu 2017-07-16 13:39:18 +00:00
parent f01f98f022
commit 80d502b881
2 changed files with 41 additions and 23 deletions

61
parse.y
View file

@ -5652,6 +5652,7 @@ parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
{ {
size_t numlen; size_t numlen;
int codepoint = scan_hex(lex_p, wide ? 6 : 4, &numlen); int codepoint = scan_hex(lex_p, wide ? 6 : 4, &numlen);
lex_p += numlen;
if (wide ? (numlen == 0) : (numlen < 4)) { if (wide ? (numlen == 0) : (numlen < 4)) {
yyerror("invalid Unicode escape"); yyerror("invalid Unicode escape");
return FALSE; return FALSE;
@ -5664,12 +5665,20 @@ parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
yyerror("invalid Unicode codepoint"); yyerror("invalid Unicode codepoint");
return FALSE; return FALSE;
} }
lex_p += numlen;
if (regexp_literal) { if (regexp_literal) {
tokcopy((int)numlen); tokcopy((int)numlen);
} }
else if (codepoint >= 0x80) { else if (codepoint >= 0x80) {
*encp = rb_utf8_encoding(); rb_encoding *utf8 = rb_utf8_encoding();
if (*encp && utf8 != *encp) {
static const char mixed_utf8[] = "UTF-8 mixed within %s source";
size_t len = sizeof(mixed_utf8) - 2 + strlen(rb_enc_name(*encp));
char *mesg = alloca(len);
snprintf(mesg, len, mixed_utf8, rb_enc_name(*encp));
yyerror(mesg);
return TRUE;
}
*encp = utf8;
tokaddmbc(codepoint, *encp); tokaddmbc(codepoint, *encp);
} }
else { else {
@ -5696,19 +5705,23 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
if (peek(open_brace)) { /* handle \u{...} form */ if (peek(open_brace)) { /* handle \u{...} form */
int c, last = nextc(); int c, last = nextc();
do c = nextc(); while (ISSPACE(c)); if (lex_p >= lex_pend) goto unterminated;
pushback(c); while (ISSPACE(c = *lex_p) && ++lex_p < lex_pend);
while (!string_literal || c != close_brace) { while (!string_literal || c != close_brace) {
if (regexp_literal) tokadd(last); if (regexp_literal) tokadd(last);
if (!parser_tokadd_codepoint(parser, encp, regexp_literal, TRUE)) { if (!parser_tokadd_codepoint(parser, encp, regexp_literal, TRUE)) {
return 0; return 0;
} }
while (ISSPACE(c = nextc())) last = c; while (ISSPACE(c = *lex_p)) {
pushback(c); if (++lex_p >= lex_pend) goto unterminated;
last = c;
}
if (!string_literal) break; if (!string_literal) break;
} }
if (c != close_brace) { if (c != close_brace) {
unterminated:
parser->tokp = lex_p;
yyerror("unterminated Unicode escape"); yyerror("unterminated Unicode escape");
return 0; return 0;
} }
@ -5999,8 +6012,7 @@ parser_tokadd_string(struct parser_params *parser,
rb_encoding **encp) rb_encoding **encp)
{ {
int c; int c;
int has_nonascii = 0; rb_encoding *enc = 0;
rb_encoding *enc = *encp;
char *errbuf = 0; char *errbuf = 0;
static const char mixed_msg[] = "%s mixed within %s source"; static const char mixed_msg[] = "%s mixed within %s source";
@ -6044,9 +6056,8 @@ parser_tokadd_string(struct parser_params *parser,
} }
} }
else if (c == '\\') { else if (c == '\\') {
const char *beg = lex_p - 1;
#ifndef RIPPER #ifndef RIPPER
parser->tokp = beg; parser->tokp = lex_p - 1;
#endif #endif
c = nextc(); c = nextc();
switch (c) { switch (c) {
@ -6065,11 +6076,10 @@ parser_tokadd_string(struct parser_params *parser,
tokadd('\\'); tokadd('\\');
break; break;
} }
parser_tokadd_utf8(parser, &enc, 1, if (!parser_tokadd_utf8(parser, &enc, term,
func & STR_FUNC_SYMBOL, func & STR_FUNC_SYMBOL,
func & STR_FUNC_REGEXP); func & STR_FUNC_REGEXP)) {
if (has_nonascii && enc != *encp) { return -1;
mixed_escape(beg, enc, *encp);
} }
continue; continue;
@ -6087,8 +6097,8 @@ parser_tokadd_string(struct parser_params *parser,
pushback(c); pushback(c);
if ((c = tokadd_escape(&enc)) < 0) if ((c = tokadd_escape(&enc)) < 0)
return -1; return -1;
if (has_nonascii && enc != *encp) { if (enc && enc != *encp) {
mixed_escape(beg, enc, *encp); mixed_escape(parser->tokp+2, enc, *encp);
} }
continue; continue;
} }
@ -6109,8 +6119,10 @@ parser_tokadd_string(struct parser_params *parser,
} }
else if (!parser_isascii()) { else if (!parser_isascii()) {
non_ascii: non_ascii:
has_nonascii = 1; if (!enc) {
if (enc != *encp) { enc = *encp;
}
else if (enc != *encp) {
mixed_error(enc, *encp); mixed_error(enc, *encp);
continue; continue;
} }
@ -6122,15 +6134,17 @@ parser_tokadd_string(struct parser_params *parser,
break; break;
} }
if (c & 0x80) { if (c & 0x80) {
has_nonascii = 1; if (!enc) {
if (enc != *encp) { enc = *encp;
}
else if (enc != *encp) {
mixed_error(enc, *encp); mixed_error(enc, *encp);
continue; continue;
} }
} }
tokadd(c); tokadd(c);
} }
*encp = enc; if (enc) *encp = enc;
return c; return c;
} }
@ -7460,7 +7474,8 @@ parse_qmark(struct parser_params *parser, int space_seen)
else if (c == '\\') { else if (c == '\\') {
if (peek('u')) { if (peek('u')) {
nextc(); nextc();
if (!parser_tokadd_utf8(parser, &enc, 0, 0, 0)) enc = rb_utf8_encoding();
if (!parser_tokadd_utf8(parser, &enc, -1, 0, 0))
return 0; return 0;
} }
else if (!lex_eol_p() && !(c = *lex_p, ISASCII(c))) { else if (!lex_eol_p() && !(c = *lex_p, ISASCII(c))) {

View file

@ -498,6 +498,9 @@ class TestParse < Test::Unit::TestCase
e = assert_syntax_error('"\C1"', /escape character syntax/) e = assert_syntax_error('"\C1"', /escape character syntax/)
assert_equal(' ^~~', e.message.lines.last, mesg) assert_equal(' ^~~', e.message.lines.last, mesg)
src = '"\xD0\u{90'"\n""000000000000000000000000"
assert_syntax_error(src, /:#{__LINE__}: unterminated/o)
assert_equal("\x81", eval('"\C-\M-a"')) assert_equal("\x81", eval('"\C-\M-a"'))
assert_equal("\177", eval('"\c?"')) assert_equal("\177", eval('"\c?"'))
end end