1
0
Fork 0
mirror of https://github.com/jashkenas/coffeescript.git synced 2022-11-09 12:23:24 -05:00

Fix #4248: Unicode code point escapes (#4498)

This commit is contained in:
Julian Rosse 2017-04-20 01:03:06 -05:00 committed by Simon Lydell
parent bfce05438b
commit 96b6c5f65a
5 changed files with 241 additions and 24 deletions

View file

@ -1,6 +1,6 @@
// Generated by CoffeeScript 1.12.5
(function() {
var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError,
var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, UNICODE_CODE_POINT_ESCAPE, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError,
indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; },
slice = [].slice;
@ -282,7 +282,9 @@
delimiter: delimiter
}, (function(_this) {
return function(value, i) {
value = _this.formatString(value);
value = _this.formatString(value, {
delimiter: quote
});
if (indentRegex) {
value = value.replace(indentRegex, '\n');
}
@ -300,7 +302,9 @@
delimiter: delimiter
}, (function(_this) {
return function(value, i) {
value = _this.formatString(value);
value = _this.formatString(value, {
delimiter: quote
});
value = value.replace(SIMPLE_STRING_OMIT, function(match, offset) {
if ((i === 0 && offset === 0) || (i === $ && offset + match.length === value.length)) {
return '';
@ -365,6 +369,9 @@
isRegex: true,
offsetInChunk: 1
});
body = this.formatRegex(body, {
delimiter: '/'
});
index = regex.length;
ref2 = this.tokens, prev = ref2[ref2.length - 1];
if (prev) {
@ -745,7 +752,7 @@
tokensToPush = value;
break;
case 'NEOSTRING':
converted = fn(token[1], i);
converted = fn.call(this, token[1], i);
if (converted.length === 0) {
if (i === 0) {
firstEmptyStringIndex = this.tokens.length;
@ -870,16 +877,56 @@
return LINE_CONTINUER.test(this.chunk) || ((ref2 = this.tag()) === '\\' || ref2 === '.' || ref2 === '?.' || ref2 === '?::' || ref2 === 'UNARY' || ref2 === 'MATH' || ref2 === 'UNARY_MATH' || ref2 === '+' || ref2 === '-' || ref2 === '**' || ref2 === 'SHIFT' || ref2 === 'RELATION' || ref2 === 'COMPARE' || ref2 === '&' || ref2 === '^' || ref2 === '|' || ref2 === '&&' || ref2 === '||' || ref2 === 'BIN?' || ref2 === 'THROW' || ref2 === 'EXTENDS');
};
Lexer.prototype.formatString = function(str) {
return str.replace(STRING_OMIT, '$1');
Lexer.prototype.formatString = function(str, options) {
return this.replaceUnicodeCodePointEscapes(str.replace(STRING_OMIT, '$1'), options);
};
Lexer.prototype.formatHeregex = function(str) {
return str.replace(HEREGEX_OMIT, '$1$2');
return this.formatRegex(str.replace(HEREGEX_OMIT, '$1$2'), {
delimiter: '///'
});
};
// Format a regex body: the only processing applied here is rewriting
// `\u{...}` escapes via `replaceUnicodeCodePointEscapes`. `options` is passed
// through untouched.
Lexer.prototype.formatRegex = function(str, options) {
  var formatted = this.replaceUnicodeCodePointEscapes(str, options);
  return formatted;
};
// Turn a numeric code point into `\uXXXX` escape text.
// Values below 0x10000 produce a single escape; anything else is split into
// a high/low surrogate pair and produces two escapes.
Lexer.prototype.unicodeCodePointToUnicodeEscapes = function(codePoint) {
  // Render one value as `\u` followed by its hex digits, zero-padded to four.
  function escapeUnit(unit) {
    var hex = unit.toString(16);
    var padding = repeat('0', 4 - hex.length);
    return "\\u" + padding + hex;
  }

  var fitsInOneUnit = codePoint < 0x10000;
  if (!fitsInOneUnit) {
    // Surrogate pair: offset from 0x10000, upper part on 0xD800, lower on 0xDC00.
    var astral = codePoint - 0x10000;
    var highSurrogate = Math.floor(astral / 0x400) + 0xD800;
    var lowSurrogate = astral % 0x400 + 0xDC00;
    return escapeUnit(highSurrogate) + escapeUnit(lowSurrogate);
  }
  return escapeUnit(codePoint);
};
// Rewrite every `\u{...}` escape found in `str` by UNICODE_CODE_POINT_ESCAPE
// into the text returned by `unicodeCodePointToUnicodeEscapes`.
// Reports an error through `this.error` when the code point is above 0x10ffff;
// `options.delimiter` is read only to compute that error's offset.
Lexer.prototype.replaceUnicodeCodePointEscapes = function(str, options) {
  var lexer = this;

  function rewriteEscape(match, escapedBackslash, codePointHex, offset) {
    var codePoint;
    // An escaped backslash is matched only so that it is consumed and put
    // back unchanged; it must not start an escape.
    if (escapedBackslash) {
      return escapedBackslash;
    }
    codePoint = parseInt(codePointHex, 16);
    if (codePoint > 0x10ffff) {
      lexer.error("unicode code point escapes greater than \\u{10ffff} are not allowed", {
        // Shift past the opening delimiter, which is not part of `str`.
        offset: offset + options.delimiter.length,
        // Hex digits plus the four characters of `\u{` and `}`.
        length: codePointHex.length + 4
      });
    }
    return lexer.unicodeCodePointToUnicodeEscapes(codePoint);
  }

  return str.replace(UNICODE_CODE_POINT_ESCAPE, rewriteEscape);
};
Lexer.prototype.validateEscapes = function(str, options) {
var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode;
var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode, unicodeCodePoint;
if (options == null) {
options = {};
}
@ -888,9 +935,9 @@
if (!match) {
return;
}
match[0], before = match[1], octal = match[2], hex = match[3], unicode = match[4];
match[0], before = match[1], octal = match[2], hex = match[3], unicodeCodePoint = match[4], unicode = match[5];
message = octal ? "octal escape sequences are not allowed" : "invalid escape sequence";
invalidEscape = "\\" + (octal || hex || unicode);
invalidEscape = "\\" + (octal || hex || unicodeCodePoint || unicode);
return this.error(message + " " + invalidEscape, {
offset: ((ref2 = options.offsetInChunk) != null ? ref2 : 0) + match.index + before.length,
length: invalidEscape.length
@ -1062,7 +1109,7 @@
REGEX_FLAGS = /^\w*/;
VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/;
VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/;
HEREGEX = /^(?:[^\\\/#]|\\[\s\S]|\/(?!\/\/)|\#(?!\{))*/;
@ -1076,9 +1123,11 @@
LINE_CONTINUER = /^\s*(?:,|\??\.(?![.\d])|::)/;
STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/;
STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/;
REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/;
REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/;
// Global pattern with two alternatives: group 1 captures an escaped
// backslash (`\\`), group 2 captures the hex digits inside a `\u{...}` escape.
UNICODE_CODE_POINT_ESCAPE = /(\\\\)|\\u\{([\da-fA-F]+)\}/g;
LEADING_BLANK_LINE = /^[^\n\S]*\n/;

View file

@ -261,14 +261,14 @@ exports.Lexer = class Lexer
indent = attempt if indent is null or 0 < attempt.length < indent.length
indentRegex = /// \n#{indent} ///g if indent
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace indentRegex, '\n' if indentRegex
value = value.replace LEADING_BLANK_LINE, '' if i is 0
value = value.replace TRAILING_BLANK_LINE, '' if i is $
value
else
@mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
value = @formatString value
value = @formatString value, delimiter: quote
value = value.replace SIMPLE_STRING_OMIT, (match, offset) ->
if (i is 0 and offset is 0) or
(i is $ and offset + match.length is value.length)
@ -318,6 +318,7 @@ exports.Lexer = class Lexer
when match = REGEX.exec @chunk
[regex, body, closed] = match
@validateEscapes body, isRegex: yes, offsetInChunk: 1
body = @formatRegex body, delimiter: '/'
index = regex.length
[..., prev] = @tokens
if prev
@ -632,7 +633,7 @@ exports.Lexer = class Lexer
tokensToPush = value
when 'NEOSTRING'
# Convert 'NEOSTRING' into 'STRING'.
converted = fn token[1], i
converted = fn.call this, token[1], i
# Optimize out empty strings. We ensure that the tokens stream always
# starts with a string token, though, to make sure that the result
# really is a string.
@ -762,11 +763,37 @@ exports.Lexer = class Lexer
'**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
'BIN?', 'THROW', 'EXTENDS']
formatString: (str) ->
str.replace STRING_OMIT, '$1'
formatString: (str, options) ->
@replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options
formatHeregex: (str) ->
str.replace HEREGEX_OMIT, '$1$2'
@formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'
# Formats a regex body. The only processing applied here is rewriting
# `\u{...}` escapes; `options` is handed through unchanged.
formatRegex: (str, options) ->
  @replaceUnicodeCodePointEscapes(str, options)
# Converts a numeric code point into `\uXXXX` escape text: one escape for
# values below 0x10000, otherwise two escapes forming a surrogate pair.
unicodeCodePointToUnicodeEscapes: (codePoint) ->
  # Renders one value as `\u` plus its hex digits, zero-padded to four.
  toUnicodeEscape = (val) ->
    hex = val.toString 16
    padding = repeat '0', 4 - hex.length
    "\\u#{padding}#{hex}"

  if codePoint < 0x10000
    toUnicodeEscape codePoint
  else
    # surrogate pair
    astral = codePoint - 0x10000
    high = Math.floor(astral / 0x400) + 0xD800
    low = astral % 0x400 + 0xDC00
    toUnicodeEscape(high) + toUnicodeEscape(low)
# Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes.
# Reports an error via `@error` for code points above 0x10ffff;
# `options.delimiter` is read only to compute that error's offset.
replaceUnicodeCodePointEscapes: (str, options) ->
  rewriteEscape = (match, escapedBackslash, codePointHex, offset) =>
    # An escaped backslash is matched only so it is consumed and put back
    # unchanged; it must not start an escape.
    if escapedBackslash
      return escapedBackslash
    codePoint = parseInt codePointHex, 16
    if codePoint > 0x10ffff
      @error "unicode code point escapes greater than \\u{10ffff} are not allowed",
        offset: offset + options.delimiter.length
        length: codePointHex.length + 4
    @unicodeCodePointToUnicodeEscapes codePoint

  str.replace UNICODE_CODE_POINT_ESCAPE, rewriteEscape
# Validates escapes in strings and regexes.
validateEscapes: (str, options = {}) ->
@ -777,13 +804,13 @@ exports.Lexer = class Lexer
STRING_INVALID_ESCAPE
match = invalidEscapeRegex.exec str
return unless match
[[], before, octal, hex, unicode] = match
[[], before, octal, hex, unicodeCodePoint, unicode] = match
message =
if octal
"octal escape sequences are not allowed"
else
"invalid escape sequence"
invalidEscape = "\\#{octal or hex or unicode}"
invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
@error "#{message} #{invalidEscape}",
offset: (options.offsetInChunk ? 0) + match.index + before.length
length: invalidEscape.length
@ -970,7 +997,7 @@ REGEX = /// ^
///
REGEX_FLAGS = /^\w*/
VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/
VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/
HEREGEX = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* ///
@ -994,7 +1021,8 @@ STRING_INVALID_ESCAPE = ///
\\ (
?: (0[0-7]|[1-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///
REGEX_INVALID_ESCAPE = ///
@ -1002,10 +1030,17 @@ REGEX_INVALID_ESCAPE = ///
\\ (
?: (0[0-7]) # octal escape
| (x(?![\da-fA-F]{2}).{0,2}) # hex escape
| (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
| (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
| (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
)
///
# Global pattern with two alternatives: capture 1 is an escaped backslash,
# capture 2 is the run of hex digits inside a `\u{...}` escape.
UNICODE_CODE_POINT_ESCAPE = ///
( \\\\ ) # make sure the escape isnt escaped
|
\\u\{ ( [\da-fA-F]+ ) \}
///g
LEADING_BLANK_LINE = /^[^\n\S]*\n/
TRAILING_BLANK_LINE = /\n[^\n\S]*$/

View file

@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", ->
a for b, {c} in d
^^^
'''
# Checks the error message, position and caret underline reported for
# malformed or out-of-range `\u{...}` escapes in strings, regexes and heregexes.
test "#4248: Unicode code point escapes", ->
# Non-hex digit inside the braces, in an interpolated double-quoted string.
assertErrorFormat '''
"a
#{b} \\u{G02}
c"
''', '''
[stdin]:2:8: error: invalid escape sequence \\u{G02}
#{b} \\u{G02}
^\^^^^^^
'''
# Empty braces in a regex literal.
assertErrorFormat '''
/a\\u{}b/
''', '''
[stdin]:1:3: error: invalid escape sequence \\u{}
/a\\u{}b/
^\^^^
'''
# Missing closing brace in a heregex.
assertErrorFormat '''
///a \\u{01abc///
''', '''
[stdin]:1:6: error: invalid escape sequence \\u{01abc
///a \\u{01abc///
^\^^^^^^^
'''
# Second escape in a regex literal is above 10ffff; the first one is valid.
assertErrorFormat '''
/\\u{123} \\u{110000}/
''', '''
[stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed
/\\u{123} \\u{110000}/
\ ^\^^^^^^^^^
'''
# Out-of-range escape directly after an escaped backslash, in a heregex with the u flag.
assertErrorFormat '''
///abc\\\\\\u{123456}///u
''', '''
[stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed
///abc\\\\\\u{123456}///u
\ \^\^^^^^^^^^
'''
# Out-of-range escape with leading zeros on the fourth line of an interpolated heredoc.
assertErrorFormat '''
"""
\\u{123}
a
\\u{00110000}
#{ 'b' }
"""
''', '''
[stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed
\\u{00110000}
^\^^^^^^^^^^^
'''
# Out-of-range escape following a valid one in a single-quoted string.
assertErrorFormat '''
'\\u{a}\\u{1111110000}'
''', '''
[stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed
'\\u{a}\\u{1111110000}'
\ ^\^^^^^^^^^^^^^
'''

View file

@ -6,6 +6,12 @@
# * Regexen
# * Heregexen
# Helper function: compiles `str` with the `bare` option and returns the
# generated JavaScript with surrounding whitespace removed.
toJS = (str) ->
  compiled = CoffeeScript.compile str, bare: yes
  compiled.replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace
test "basic regular expression literals", ->
ok 'a'.match(/a/)
ok 'a'.match /a/
@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", ->
ok ///#{a}\///.test 'a\u2029'
ok ///#{a}\0
1///.test 'a\x001'
# Checks that `\u{...}` escapes in regex and heregex literals match the
# expected characters, and that compiled output contains `\uXXXX` escapes.
test "#4248: Unicode code point escapes", ->
# with the u flag
ok /a\u{1ab}c/u.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///u.test 'a\u{1ab}c'
ok /a\u{12345}c/u.test 'a\ud808\udf45c'
# and now without u flag
ok /a\u{1ab}c/.test 'a\u01abc'
ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c'
ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
ok /a\u{12345}c/.test 'a\ud808\udf45c'
# rewrite code point escapes
input = """
/\\u{bcdef}\\u{abc}/u
"""
output = """
/\\udab3\\uddef\\u0abc/u;
"""
eq toJS(input), output
# NOTE(review): the `#{ 'a' }` below sits inside a `"""` heredoc, so it looks
# like it is interpolated while building `input` rather than by the compiled
# snippet — confirm this is the intended coverage.
input = """
///#{ 'a' }\\u{bcdef}///
"""
output = """
/a\\udab3\\uddef/;
"""
eq toJS(input), output

View file

@ -7,6 +7,12 @@
# * Strings
# * Heredocs
# Helper function: compiles `str` with the `bare` option and returns the
# generated JavaScript with surrounding whitespace removed.
toJS = (str) ->
  compiled = CoffeeScript.compile str, bare: yes
  compiled.replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace
test "backslash escapes", ->
eq "\\/\\\\", /\/\\/.source
@ -400,3 +406,33 @@ test "#4314: Whitespace less than or equal to stripped indentation", ->
eq '1 2 3 4 5 end\na 0 b', """
#{1} #{2} #{3} #{4} #{5} end
a #{0} b"""
# Checks that `\u{...}` escapes in every string form evaluate to the same
# characters as the equivalent `\uXXXX` escapes, and that compiled output
# contains `\uXXXX` escapes.
test "#4248: Unicode code point escapes", ->
# Code points that fit in a single `\uXXXX` escape.
eq '\u01ab\u00cd', '\u{1ab}\u{cd}'
eq '\u01ab', '\u{000001ab}'
eq 'a\u01ab', "#{ 'a' }\u{1ab}"
eq '\u01abc', '''\u{01ab}c'''
eq '\u01abc', """\u{1ab}#{ 'c' }"""
# Code points expected as two `\uXXXX` escapes.
eq '\udab3\uddef', '\u{bcdef}'
eq '\udab3\uddef', '\u{0000bcdef}'
eq 'a\udab3\uddef', "#{ 'a' }\u{bcdef}"
eq '\udab3\uddefc', '''\u{0bcdef}c'''
eq '\udab3\uddefc', """\u{bcdef}#{ 'c' }"""
# A backslash and `u{...}` joined through interpolation stay literal text.
eq '\\u{123456}', "#{'\\'}#{'u{123456}'}"
# rewrite code point escapes
input = """
'\\u{bcdef}\\u{abc}'
"""
output = """
'\\udab3\\uddef\\u0abc';
"""
eq toJS(input), output
# NOTE(review): the `#{ 'a' }` below sits inside a `"""` heredoc, so it looks
# like it is interpolated while building `input` rather than by the compiled
# snippet — confirm this is the intended coverage.
input = """
"#{ 'a' }\\u{bcdef}"
"""
output = """
"a\\udab3\\uddef";
"""
eq toJS(input), output