diff --git a/lib/coffee-script/lexer.js b/lib/coffee-script/lexer.js index 9e1d98f9..86456857 100644 --- a/lib/coffee-script/lexer.js +++ b/lib/coffee-script/lexer.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 1.12.5 (function() { - var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError, + var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, UNICODE_CODE_POINT_ESCAPE, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError, indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }, slice = [].slice; @@ -282,7 +282,9 @@ delimiter: delimiter }, (function(_this) { return function(value, i) { - value = _this.formatString(value); + value = _this.formatString(value, { + delimiter: quote + }); if (indentRegex) { value = value.replace(indentRegex, '\n'); } @@ -300,7 +302,9 @@ delimiter: delimiter }, (function(_this) { return function(value, i) { - value = _this.formatString(value); + value = _this.formatString(value, { + delimiter: quote + }); value = value.replace(SIMPLE_STRING_OMIT, function(match, offset) { if ((i === 0 && offset === 0) || (i === $ && offset + match.length === value.length)) { return ''; @@ -365,6 +369,9 @@ isRegex: true, offsetInChunk: 1 }); + body = this.formatRegex(body, { + delimiter: '/' + }); index = regex.length; ref2 = this.tokens, prev = ref2[ref2.length - 1]; if (prev) { @@ -745,7 +752,7 @@ tokensToPush = value; break; case 'NEOSTRING': - converted = fn(token[1], i); + converted = fn.call(this, token[1], i); if (converted.length === 0) { if (i === 0) { firstEmptyStringIndex = this.tokens.length; @@ -870,16 +877,56 @@ return LINE_CONTINUER.test(this.chunk) || ((ref2 = this.tag()) === '\\' || ref2 === '.' || ref2 === '?.' || ref2 === '?::' || ref2 === 'UNARY' || ref2 === 'MATH' || ref2 === 'UNARY_MATH' || ref2 === '+' || ref2 === '-' || ref2 === '**' || ref2 === 'SHIFT' || ref2 === 'RELATION' || ref2 === 'COMPARE' || ref2 === '&' || ref2 === '^' || ref2 === '|' || ref2 === '&&' || ref2 === '||' || ref2 === 'BIN?' || ref2 === 'THROW' || ref2 === 'EXTENDS'); }; - Lexer.prototype.formatString = function(str) { - return str.replace(STRING_OMIT, '$1'); + Lexer.prototype.formatString = function(str, options) { + return this.replaceUnicodeCodePointEscapes(str.replace(STRING_OMIT, '$1'), options); }; Lexer.prototype.formatHeregex = function(str) { - return str.replace(HEREGEX_OMIT, '$1$2'); + return this.formatRegex(str.replace(HEREGEX_OMIT, '$1$2'), { + delimiter: '///' + }); + }; + + Lexer.prototype.formatRegex = function(str, options) { + return this.replaceUnicodeCodePointEscapes(str, options); + }; + + Lexer.prototype.unicodeCodePointToUnicodeEscapes = function(codePoint) { + var high, low, toUnicodeEscape; + toUnicodeEscape = function(val) { + var str; + str = val.toString(16); + return "\\u" + (repeat('0', 4 - str.length)) + str; + }; + if (codePoint < 0x10000) { + return toUnicodeEscape(codePoint); + } + high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800; + low = (codePoint - 0x10000) % 0x400 + 0xDC00; + return "" + (toUnicodeEscape(high)) + (toUnicodeEscape(low)); + }; + + Lexer.prototype.replaceUnicodeCodePointEscapes = function(str, options) { + return str.replace(UNICODE_CODE_POINT_ESCAPE, (function(_this) { + return function(match, escapedBackslash, codePointHex, offset) { + var codePointDecimal; + if (escapedBackslash) { + return escapedBackslash; + } + codePointDecimal = parseInt(codePointHex, 16); + if (codePointDecimal > 0x10ffff) { + _this.error("unicode code point escapes greater than \\u{10ffff} are not allowed", { + offset: offset + options.delimiter.length, + length: codePointHex.length + 4 + }); + } + return _this.unicodeCodePointToUnicodeEscapes(codePointDecimal); + }; + })(this)); }; Lexer.prototype.validateEscapes = function(str, options) { - var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode; + var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode, unicodeCodePoint; if (options == null) { options = {}; } @@ -888,9 +935,9 @@ if (!match) { return; } - match[0], before = match[1], octal = match[2], hex = match[3], unicode = match[4]; + match[0], before = match[1], octal = match[2], hex = match[3], unicodeCodePoint = match[4], unicode = match[5]; message = octal ? "octal escape sequences are not allowed" : "invalid escape sequence"; - invalidEscape = "\\" + (octal || hex || unicode); + invalidEscape = "\\" + (octal || hex || unicodeCodePoint || unicode); return this.error(message + " " + invalidEscape, { offset: ((ref2 = options.offsetInChunk) != null ? ref2 : 0) + match.index + before.length, length: invalidEscape.length @@ -1062,7 +1109,7 @@ REGEX_FLAGS = /^\w*/; - VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/; + VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/; HEREGEX = /^(?:[^\\\/#]|\\[\s\S]|\/(?!\/\/)|\#(?!\{))*/; @@ -1076,9 +1123,11 @@ LINE_CONTINUER = /^\s*(?:,|\??\.(?![.\d])|::)/; - STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/; + STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/; - REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/; + REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/; + + UNICODE_CODE_POINT_ESCAPE = /(\\\\)|\\u\{([\da-fA-F]+)\}/g; LEADING_BLANK_LINE = /^[^\n\S]*\n/; diff --git a/src/lexer.coffee b/src/lexer.coffee index 868a2aad..669e8a4b 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -261,14 +261,14 @@ exports.Lexer = class Lexer indent = attempt if indent is null or 0 < attempt.length < indent.length indentRegex = /// \n#{indent} ///g if indent @mergeInterpolationTokens tokens, {delimiter}, (value, i) => - value = @formatString value + value = @formatString value, delimiter: quote value = value.replace indentRegex, '\n' if indentRegex value = value.replace LEADING_BLANK_LINE, '' if i is 0 value = value.replace TRAILING_BLANK_LINE, '' if i is $ value else @mergeInterpolationTokens tokens, {delimiter}, (value, i) => - value = @formatString value + value = @formatString value, delimiter: quote value = value.replace SIMPLE_STRING_OMIT, (match, offset) -> if (i is 0 and offset is 0) or (i is $ and offset + match.length is value.length) @@ -318,6 +318,7 @@ exports.Lexer = class Lexer when match = REGEX.exec @chunk [regex, body, closed] = match @validateEscapes body, isRegex: yes, offsetInChunk: 1 + body = @formatRegex body, delimiter: '/' index = regex.length [..., prev] = @tokens if prev @@ -632,7 +633,7 @@ exports.Lexer = class Lexer tokensToPush = value when 'NEOSTRING' # Convert 'NEOSTRING' into 'STRING'. - converted = fn token[1], i + converted = fn.call this, token[1], i # Optimize out empty strings. We ensure that the tokens stream always # starts with a string token, though, to make sure that the result # really is a string. @@ -762,11 +763,37 @@ exports.Lexer = class Lexer '**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||', 'BIN?', 'THROW', 'EXTENDS'] - formatString: (str) -> - str.replace STRING_OMIT, '$1' + formatString: (str, options) -> + @replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options formatHeregex: (str) -> - str.replace HEREGEX_OMIT, '$1$2' + @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///' + + formatRegex: (str, options) -> + @replaceUnicodeCodePointEscapes str, options + + unicodeCodePointToUnicodeEscapes: (codePoint) -> + toUnicodeEscape = (val) -> + str = val.toString 16 + "\\u#{repeat '0', 4 - str.length}#{str}" + return toUnicodeEscape(codePoint) if codePoint < 0x10000 + # surrogate pair + high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800 + low = (codePoint - 0x10000) % 0x400 + 0xDC00 + "#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}" + + # Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes + replaceUnicodeCodePointEscapes: (str, options) -> + str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) => + return escapedBackslash if escapedBackslash + + codePointDecimal = parseInt codePointHex, 16 + if codePointDecimal > 0x10ffff + @error "unicode code point escapes greater than \\u{10ffff} are not allowed", + offset: offset + options.delimiter.length + length: codePointHex.length + 4 + + @unicodeCodePointToUnicodeEscapes codePointDecimal # Validates escapes in strings and regexes. validateEscapes: (str, options = {}) -> @@ -777,13 +804,13 @@ exports.Lexer = class Lexer STRING_INVALID_ESCAPE match = invalidEscapeRegex.exec str return unless match - [[], before, octal, hex, unicode] = match + [[], before, octal, hex, unicodeCodePoint, unicode] = match message = if octal "octal escape sequences are not allowed" else "invalid escape sequence" - invalidEscape = "\\#{octal or hex or unicode}" + invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}" @error "#{message} #{invalidEscape}", offset: (options.offsetInChunk ? 0) + match.index + before.length length: invalidEscape.length @@ -970,7 +997,7 @@ REGEX = /// ^ /// REGEX_FLAGS = /^\w*/ -VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/ +VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/ HEREGEX = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* /// @@ -994,7 +1021,8 @@ STRING_INVALID_ESCAPE = /// \\ ( ?: (0[0-7]|[1-7]) # octal escape | (x(?![\da-fA-F]{2}).{0,2}) # hex escape - | (u(?![\da-fA-F]{4}).{0,4}) # unicode escape + | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape + | (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape ) /// REGEX_INVALID_ESCAPE = /// @@ -1002,10 +1030,17 @@ REGEX_INVALID_ESCAPE = /// \\ ( ?: (0[0-7]) # octal escape | (x(?![\da-fA-F]{2}).{0,2}) # hex escape - | (u(?![\da-fA-F]{4}).{0,4}) # unicode escape + | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape + | (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape ) /// +UNICODE_CODE_POINT_ESCAPE = /// + ( \\\\ ) # make sure the escape isn’t escaped + | + \\u\{ ( [\da-fA-F]+ ) \} +///g + LEADING_BLANK_LINE = /^[^\n\S]*\n/ TRAILING_BLANK_LINE = /\n[^\n\S]*$/ diff --git a/test/error_messages.coffee b/test/error_messages.coffee index 3e33db5e..d09acb02 100644 --- a/test/error_messages.coffee +++ b/test/error_messages.coffee @@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", -> a for b, {c} in d ^^^ ''' + +test "#4248: Unicode code point escapes", -> + assertErrorFormat ''' + "a + #{b} \\u{G02} + c" + ''', ''' + [stdin]:2:8: error: invalid escape sequence \\u{G02} + #{b} \\u{G02} + ^\^^^^^^ + ''' + assertErrorFormat ''' + /a\\u{}b/ + ''', ''' + [stdin]:1:3: error: invalid escape sequence \\u{} + /a\\u{}b/ + ^\^^^ + ''' + assertErrorFormat ''' + ///a \\u{01abc/// + ''', ''' + [stdin]:1:6: error: invalid escape sequence \\u{01abc + ///a \\u{01abc/// + ^\^^^^^^^ + ''' + + assertErrorFormat ''' + /\\u{123} \\u{110000}/ + ''', ''' + [stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed + /\\u{123} \\u{110000}/ + \ ^\^^^^^^^^^ + ''' + + assertErrorFormat ''' + ///abc\\\\\\u{123456}///u + ''', ''' + [stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed + ///abc\\\\\\u{123456}///u + \ \^\^^^^^^^^^ + ''' + + assertErrorFormat ''' + """ + \\u{123} + a + \\u{00110000} + #{ 'b' } + """ + ''', ''' + [stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed + \\u{00110000} + ^\^^^^^^^^^^^ + ''' + + assertErrorFormat ''' + '\\u{a}\\u{1111110000}' + ''', ''' + [stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed + '\\u{a}\\u{1111110000}' + \ ^\^^^^^^^^^^^^^ + ''' diff --git a/test/regexps.coffee b/test/regexps.coffee index beb3711d..623ae41d 100644 --- a/test/regexps.coffee +++ b/test/regexps.coffee @@ -6,6 +6,12 @@ # * Regexen # * Heregexen +# Helper function +toJS = (str) -> + CoffeeScript.compile str, bare: yes + .replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace + + test "basic regular expression literals", -> ok 'a'.match(/a/) ok 'a'.match /a/ @@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", -> ok ///#{a}\
///.test 'a\u2029' ok ///#{a}\0 1///.test 'a\x001' + +test "#4248: Unicode code point escapes", -> + ok /a\u{1ab}c/u.test 'a\u01abc' + ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c' + ok ///a\u{000001ab}c///u.test 'a\u{1ab}c' + ok /a\u{12345}c/u.test 'a\ud808\udf45c' + + # and now without u flag + ok /a\u{1ab}c/.test 'a\u01abc' + ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c' + ok ///a\u{000001ab}c///.test 'a\u{1ab}c' + ok /a\u{12345}c/.test 'a\ud808\udf45c' + + # rewrite code point escapes + input = """ + /\\u{bcdef}\\u{abc}/u + """ + output = """ + /\\udab3\\uddef\\u0abc/u; + """ + eq toJS(input), output + + input = """ + ///#{ 'a' }\\u{bcdef}/// + """ + output = """ + /a\\udab3\\uddef/; + """ + eq toJS(input), output diff --git a/test/strings.coffee b/test/strings.coffee index 0f1975e2..1cd17efa 100644 --- a/test/strings.coffee +++ b/test/strings.coffee @@ -7,6 +7,12 @@ # * Strings # * Heredocs +# Helper function +toJS = (str) -> + CoffeeScript.compile str, bare: yes + .replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace + + test "backslash escapes", -> eq "\\/\\\\", /\/\\/.source @@ -400,3 +406,33 @@ test "#4314: Whitespace less than or equal to stripped indentation", -> eq '1 2 3 4 5 end\na 0 b', """ #{1} #{2} #{3} #{4} #{5} end a #{0} b""" + +test "#4248: Unicode code point escapes", -> + eq '\u01ab\u00cd', '\u{1ab}\u{cd}' + eq '\u01ab', '\u{000001ab}' + eq 'a\u01ab', "#{ 'a' }\u{1ab}" + eq '\u01abc', '''\u{01ab}c''' + eq '\u01abc', """\u{1ab}#{ 'c' }""" + eq '\udab3\uddef', '\u{bcdef}' + eq '\udab3\uddef', '\u{0000bcdef}' + eq 'a\udab3\uddef', "#{ 'a' }\u{bcdef}" + eq '\udab3\uddefc', '''\u{0bcdef}c''' + eq '\udab3\uddefc', """\u{bcdef}#{ 'c' }""" + eq '\\u{123456}', "#{'\\'}#{'u{123456}'}" + + # rewrite code point escapes + input = """ + '\\u{bcdef}\\u{abc}' + """ + output = """ + '\\udab3\\uddef\\u0abc'; + """ + eq toJS(input), output + + input = """ + "#{ 'a' }\\u{bcdef}" + """ + output = """ + "a\\udab3\\uddef"; + """ + eq toJS(input), output