Fix #4248: Unicode code point escapes (#4498)

2022-11-09 12:23:24 -05:00 · 2017-04-20 01:03:06 -05:00 · 2017-04-20 01:03:06 -05:00 · 96b6c5f65a
commit 96b6c5f65a
parent bfce05438b
5 changed files with 241 additions and 24 deletions
--- a/lib/coffee-script/lexer.js
+++ b/lib/coffee-script/lexer.js
@ -1,6 +1,6 @@
 // Generated by CoffeeScript 1.12.5
 (function() {
-  var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError,
+  var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, UNICODE_CODE_POINT_ESCAPE, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, ref, ref1, repeat, starts, throwSyntaxError,
    indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; },
    slice = [].slice;

@ -282,7 +282,9 @@
          delimiter: delimiter
        }, (function(_this) {
          return function(value, i) {
-            value = _this.formatString(value);
+            value = _this.formatString(value, {
+              delimiter: quote
+            });
            if (indentRegex) {
              value = value.replace(indentRegex, '\n');
            }
@ -300,7 +302,9 @@
          delimiter: delimiter
        }, (function(_this) {
          return function(value, i) {
-            value = _this.formatString(value);
+            value = _this.formatString(value, {
+              delimiter: quote
+            });
            value = value.replace(SIMPLE_STRING_OMIT, function(match, offset) {
              if ((i === 0 && offset === 0) || (i === $ && offset + match.length === value.length)) {
                return '';
@ -365,6 +369,9 @@
            isRegex: true,
            offsetInChunk: 1
          });
+          body = this.formatRegex(body, {
+            delimiter: '/'
+          });
          index = regex.length;
          ref2 = this.tokens, prev = ref2[ref2.length - 1];
          if (prev) {
@ -745,7 +752,7 @@
            tokensToPush = value;
            break;
          case 'NEOSTRING':
-            converted = fn(token[1], i);
+            converted = fn.call(this, token[1], i);
            if (converted.length === 0) {
              if (i === 0) {
                firstEmptyStringIndex = this.tokens.length;
@ -870,16 +877,56 @@
      return LINE_CONTINUER.test(this.chunk) || ((ref2 = this.tag()) === '\\' || ref2 === '.' || ref2 === '?.' || ref2 === '?::' || ref2 === 'UNARY' || ref2 === 'MATH' || ref2 === 'UNARY_MATH' || ref2 === '+' || ref2 === '-' || ref2 === '**' || ref2 === 'SHIFT' || ref2 === 'RELATION' || ref2 === 'COMPARE' || ref2 === '&' || ref2 === '^' || ref2 === '|' || ref2 === '&&' || ref2 === '||' || ref2 === 'BIN?' || ref2 === 'THROW' || ref2 === 'EXTENDS');
    };

-    Lexer.prototype.formatString = function(str) {
-      return str.replace(STRING_OMIT, '$1');
+    Lexer.prototype.formatString = function(str, options) { // options is passed through unchanged; options.delimiter is read when an out-of-range escape is reported
+      return this.replaceUnicodeCodePointEscapes(str.replace(STRING_OMIT, '$1'), options); // apply STRING_OMIT first, then rewrite \u{...} escapes; NOTE(review): match offsets therefore refer to the already-stripped string
    };

    Lexer.prototype.formatHeregex = function(str) {
-      return str.replace(HEREGEX_OMIT, '$1$2');
+      return this.formatRegex(str.replace(HEREGEX_OMIT, '$1$2'), { // apply HEREGEX_OMIT, then handle the result like any other regex body
+        delimiter: '///' // its length is added to the offset reported for an out-of-range escape
+      });
+    };
+
+    Lexer.prototype.formatRegex = function(str, options) { // the only formatting applied to a regex body is the \u{...} rewrite
+      return this.replaceUnicodeCodePointEscapes(str, options);
+    };
+
+    Lexer.prototype.unicodeCodePointToUnicodeEscapes = function(codePoint) { // numeric code point -> text of one "\uXXXX" escape, or two for values of 0x10000 and above
+      var high, low, toUnicodeEscape;
+      toUnicodeEscape = function(val) { // "\u" followed by val in hex; repeat('0', n) supplies the padding up to four digits
+        var str;
+        str = val.toString(16);
+        return "\\u" + (repeat('0', 4 - str.length)) + str;
+      };
+      if (codePoint < 0x10000) { // small enough for a single four-digit escape
+        return toUnicodeEscape(codePoint);
+      }
+      high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800; // surrogate pair, first half: upper bits of the offset above 0x10000
+      low = (codePoint - 0x10000) % 0x400 + 0xDC00; // second half: lower 10 bits of that offset
+      return "" + (toUnicodeEscape(high)) + (toUnicodeEscape(low));
+    };
+
+    Lexer.prototype.replaceUnicodeCodePointEscapes = function(str, options) { // rewrite every \u{...} in str as \uXXXX or a \uXXXX\uXXXX pair
+      return str.replace(UNICODE_CODE_POINT_ESCAPE, (function(_this) {
+        return function(match, escapedBackslash, codePointHex, offset) {
+          var codePointDecimal;
+          if (escapedBackslash) { // the pattern matched an escaped backslash: return it unchanged so a following u{...} is not read as an escape
+            return escapedBackslash;
+          }
+          codePointDecimal = parseInt(codePointHex, 16);
+          if (codePointDecimal > 0x10ffff) {
+            _this.error("unicode code point escapes greater than \\u{10ffff} are not allowed", {
+              offset: offset + options.delimiter.length, // offset is the match position within str; presumably the delimiter length accounts for the opening delimiter. options must be defined on this path
+              length: codePointHex.length + 4 // the hex digits plus the four characters of "\u{" and "}"
+            });
+          }
+          return _this.unicodeCodePointToUnicodeEscapes(codePointDecimal);
+        };
+      })(this));
    };

    Lexer.prototype.validateEscapes = function(str, options) {
-      var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode;
+      var before, hex, invalidEscape, invalidEscapeRegex, match, message, octal, ref2, unicode, unicodeCodePoint;
      if (options == null) {
        options = {};
      }
@ -888,9 +935,9 @@
      if (!match) {
        return;
      }
-      match[0], before = match[1], octal = match[2], hex = match[3], unicode = match[4];
+      match[0], before = match[1], octal = match[2], hex = match[3], unicodeCodePoint = match[4], unicode = match[5];
      message = octal ? "octal escape sequences are not allowed" : "invalid escape sequence";
-      invalidEscape = "\\" + (octal || hex || unicode);
+      invalidEscape = "\\" + (octal || hex || unicodeCodePoint || unicode);
      return this.error(message + " " + invalidEscape, {
        offset: ((ref2 = options.offsetInChunk) != null ? ref2 : 0) + match.index + before.length,
        length: invalidEscape.length
@ -1062,7 +1109,7 @@

  REGEX_FLAGS = /^\w*/;

-  VALID_FLAGS = /^(?!.*(.).*\1)[imgy]*$/;
+  VALID_FLAGS = /^(?!.*(.).*\1)[imguy]*$/; // any combination of i, m, g, u, y; the lookahead rejects a flag that appears twice

  HEREGEX = /^(?:[^\\\/#]|\\[\s\S]|\/(?!\/\/)|\#(?!\{))*/;

@ -1076,9 +1123,11 @@

  LINE_CONTINUER = /^\s*(?:,|\??\.(?![.\d])|::)/;

-  STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/;
+  STRING_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7]|[1-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/; // groups: 1 text before the escape, 2 octal, 3 bad \x, 4 bad \u{...}, 5 bad \uXXXX

-  REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u(?![\da-fA-F]{4}).{0,4}))/;
+  REGEX_INVALID_ESCAPE = /((?:^|[^\\])(?:\\\\)*)\\(?:(0[0-7])|(x(?![\da-fA-F]{2}).{0,2})|(u\{(?![\da-fA-F]{1,}\})[^}]*\}?)|(u(?!\{|[\da-fA-F]{4}).{0,4}))/; // same group order as STRING_INVALID_ESCAPE; the octal alternative here is only 0[0-7]
+
+  UNICODE_CODE_POINT_ESCAPE = /(\\\\)|\\u\{([\da-fA-F]+)\}/g; // group 1: an escaped backslash; group 2: the hex digits of a \u{...} escape

  LEADING_BLANK_LINE = /^[^\n\S]*\n/;

--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@ -261,14 +261,14 @@ exports.Lexer = class Lexer
        indent = attempt if indent is null or 0 < attempt.length < indent.length
      indentRegex = /// \n#{indent} ///g if indent
      @mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
-        value = @formatString value
+        value = @formatString value, delimiter: quote
        value = value.replace indentRegex, '\n' if indentRegex
        value = value.replace LEADING_BLANK_LINE,  '' if i is 0
        value = value.replace TRAILING_BLANK_LINE, '' if i is $
        value
    else
      @mergeInterpolationTokens tokens, {delimiter}, (value, i) =>
-        value = @formatString value
+        value = @formatString value, delimiter: quote
        value = value.replace SIMPLE_STRING_OMIT, (match, offset) ->
          if (i is 0 and offset is 0) or
             (i is $ and offset + match.length is value.length)
@ -318,6 +318,7 @@ exports.Lexer = class Lexer
      when match = REGEX.exec @chunk
        [regex, body, closed] = match
        @validateEscapes body, isRegex: yes, offsetInChunk: 1
+        body = @formatRegex body, delimiter: '/'
        index = regex.length
        [..., prev] = @tokens
        if prev
@ -632,7 +633,7 @@ exports.Lexer = class Lexer
          tokensToPush = value
        when 'NEOSTRING'
          # Convert 'NEOSTRING' into 'STRING'.
-          converted = fn token[1], i
+          converted = fn.call this, token[1], i
          # Optimize out empty strings. We ensure that the tokens stream always
          # starts with a string token, though, to make sure that the result
          # really is a string.
@ -762,11 +763,37 @@ exports.Lexer = class Lexer
               '**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
               'BIN?', 'THROW', 'EXTENDS']

-  formatString: (str) ->
-    str.replace STRING_OMIT, '$1'
+  formatString: (str, options) -> # options is passed through unchanged; options.delimiter is read when an out-of-range escape is reported
+    @replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options # apply STRING_OMIT first, then rewrite code point escapes; NOTE(review): match offsets therefore refer to the already-stripped string

  formatHeregex: (str) ->
-    str.replace HEREGEX_OMIT, '$1$2'
+    @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///' # apply HEREGEX_OMIT, then handle the result like any other regex body; the delimiter length feeds the reported error offset
+
+  formatRegex: (str, options) -> # the only formatting applied to a regex body is the code point escape rewrite
+    @replaceUnicodeCodePointEscapes str, options
+
+  unicodeCodePointToUnicodeEscapes: (codePoint) -> # numeric code point -> text of one four-digit unicode escape, or two for values of 0x10000 and above
+    toUnicodeEscape = (val) -> # backslash-u followed by val in hex; repeat supplies the zero padding up to four digits
+      str = val.toString 16
+      "\\u#{repeat '0', 4 - str.length}#{str}"
+    return toUnicodeEscape(codePoint) if codePoint < 0x10000 # small enough for a single escape
+    # Otherwise split the value into a surrogate pair and emit one escape per half.
+    high = Math.floor((codePoint - 0x10000) / 0x400) + 0xD800 # first half: upper bits of the offset above 0x10000
+    low = (codePoint - 0x10000) % 0x400 + 0xDC00 # second half: lower 10 bits of that offset
+    "#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"
+
+  # Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes; values above 0x10ffff are reported through @error.
+  replaceUnicodeCodePointEscapes: (str, options) ->
+    str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
+      return escapedBackslash if escapedBackslash # an escaped backslash is returned unchanged so a following u{...} is not read as an escape
+
+      codePointDecimal = parseInt codePointHex, 16
+      if codePointDecimal > 0x10ffff
+        @error "unicode code point escapes greater than \\u{10ffff} are not allowed",
+          offset: offset + options.delimiter.length # offset is the match position within str; presumably the delimiter length accounts for the opening delimiter
+          length: codePointHex.length + 4 # the hex digits plus the four characters around them (backslash, u and both braces)
+
+      @unicodeCodePointToUnicodeEscapes codePointDecimal

  # Validates escapes in strings and regexes.
  validateEscapes: (str, options = {}) ->
@ -777,13 +804,13 @@ exports.Lexer = class Lexer
        STRING_INVALID_ESCAPE
    match = invalidEscapeRegex.exec str
    return unless match
-    [[], before, octal, hex, unicode] = match
+    [[], before, octal, hex, unicodeCodePoint, unicode] = match
    message =
      if octal
        "octal escape sequences are not allowed"
      else
        "invalid escape sequence"
-    invalidEscape = "\\#{octal or hex or unicode}"
+    invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
    @error "#{message} #{invalidEscape}",
      offset: (options.offsetInChunk ? 0) + match.index + before.length
      length: invalidEscape.length
@ -970,7 +997,7 @@ REGEX = /// ^
 ///

 REGEX_FLAGS  = /^\w*/
-VALID_FLAGS  = /^(?!.*(.).*\1)[imgy]*$/
+VALID_FLAGS  = /^(?!.*(.).*\1)[imguy]*$/ # any combination of i, m, g, u, y; the lookahead rejects a flag that appears twice

 HEREGEX      = /// ^(?: [^\\/#] | \\[\s\S] | /(?!//) | \#(?!\{) )* ///

@ -994,7 +1021,8 @@ STRING_INVALID_ESCAPE = ///
  \\ (
     ?: (0[0-7]|[1-7])             # octal escape
      | (x(?![\da-fA-F]{2}).{0,2}) # hex escape
-      | (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
+      | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
+      | (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
  )
 ///
 REGEX_INVALID_ESCAPE = ///
@ -1002,10 +1030,17 @@ REGEX_INVALID_ESCAPE = ///
  \\ (
     ?: (0[0-7])                   # octal escape
      | (x(?![\da-fA-F]{2}).{0,2}) # hex escape
-      | (u(?![\da-fA-F]{4}).{0,4}) # unicode escape
+      | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
+      | (u(?!\{|[\da-fA-F]{4}).{0,4}) # unicode escape
  )
 ///

+UNICODE_CODE_POINT_ESCAPE = ///
+  ( \\\\ )        # make sure the escape isn’t escaped
+  |
+  \\u\{ ( [\da-fA-F]+ ) \}
+///g # group 1: an escaped backslash; group 2: the hex digits between the braces; g so that replace visits every match
+
 LEADING_BLANK_LINE  = /^[^\n\S]*\n/
 TRAILING_BLANK_LINE = /\n[^\n\S]*$/

--- a/test/error_messages.coffee
+++ b/test/error_messages.coffee
@ -1257,3 +1257,65 @@ test "can't use pattern matches for loop indices", ->
    a for b, {c} in d
             ^^^
  '''
+
+test "#4248: Unicode code point escapes", -> # checks the message and caret position reported for malformed and out-of-range code point escapes in strings, regexes and heregexes
+  assertErrorFormat '''
+    "a
+      #{b} \\u{G02}
+     c"
+  ''', '''
+    [stdin]:2:8: error: invalid escape sequence \\u{G02}
+      #{b} \\u{G02}
+           ^\^^^^^^
+  '''
+  assertErrorFormat '''
+    /a\\u{}b/
+  ''', '''
+    [stdin]:1:3: error: invalid escape sequence \\u{}
+    /a\\u{}b/
+      ^\^^^
+  '''
+  assertErrorFormat '''
+    ///a \\u{01abc///
+  ''', '''
+    [stdin]:1:6: error: invalid escape sequence \\u{01abc
+    ///a \\u{01abc///
+         ^\^^^^^^^
+  '''
+
+  assertErrorFormat '''
+    /\\u{123} \\u{110000}/
+  ''', '''
+    [stdin]:1:10: error: unicode code point escapes greater than \\u{10ffff} are not allowed
+    /\\u{123} \\u{110000}/
+      \       ^\^^^^^^^^^
+  '''
+
+  assertErrorFormat '''
+    ///abc\\\\\\u{123456}///u
+  ''', '''
+    [stdin]:1:9: error: unicode code point escapes greater than \\u{10ffff} are not allowed
+    ///abc\\\\\\u{123456}///u
+           \ \^\^^^^^^^^^
+  '''
+
+  assertErrorFormat '''
+    """
+      \\u{123}
+      a
+        \\u{00110000}
+      #{ 'b' }
+    """
+  ''', '''
+    [stdin]:4:5: error: unicode code point escapes greater than \\u{10ffff} are not allowed
+        \\u{00110000}
+        ^\^^^^^^^^^^^
+  '''
+
+  assertErrorFormat '''
+    '\\u{a}\\u{1111110000}'
+  ''', '''
+    [stdin]:1:7: error: unicode code point escapes greater than \\u{10ffff} are not allowed
+    '\\u{a}\\u{1111110000}'
+      \    ^\^^^^^^^^^^^^^
+  '''
--- a/test/regexps.coffee
+++ b/test/regexps.coffee
@ -6,6 +6,12 @@
 # * Regexen
 # * Heregexen

+# Helper function: compile a CoffeeScript snippet with the bare option and return the resulting JavaScript text
+toJS = (str) ->
+  CoffeeScript.compile str, bare: yes
+  .replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace
+
+
 test "basic regular expression literals", ->
  ok 'a'.match(/a/)
  ok 'a'.match /a/
@ -286,3 +292,32 @@ test "#3795: Escape otherwise invalid characters", ->
  ok ///#{a}\ ///.test 'a\u2029'
  ok ///#{a}\0
      1///.test 'a\x001'
+
+test "#4248: Unicode code point escapes", -> # code point escapes in regex and heregex literals, with and without the u flag
+  ok /a\u{1ab}c/u.test 'a\u01abc'
+  ok ///#{ 'a' }\u{000001ab}c///u.test 'a\u{1ab}c'
+  ok ///a\u{000001ab}c///u.test 'a\u{1ab}c'
+  ok /a\u{12345}c/u.test 'a\ud808\udf45c'
+
+  # the same four patterns without the u flag
+  ok /a\u{1ab}c/.test 'a\u01abc'
+  ok ///#{ 'a' }\u{000001ab}c///.test 'a\u{1ab}c'
+  ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
+  ok /a\u{12345}c/.test 'a\ud808\udf45c'
+
+  # the compiled JavaScript contains four-digit escapes in place of the code point escapes
+  input = """
+    /\\u{bcdef}\\u{abc}/u
+    """
+  output = """
+    /\\udab3\\uddef\\u0abc/u;
+  """
+  eq toJS(input), output
+
+  input = """
+    ///#{ 'a' }\\u{bcdef}///
+    """
+  output = """
+    /a\\udab3\\uddef/;
+  """
+  eq toJS(input), output
--- a/test/strings.coffee
+++ b/test/strings.coffee
@ -7,6 +7,12 @@
 # * Strings
 # * Heredocs

+# Helper function: compile a CoffeeScript snippet with the bare option and return the resulting JavaScript text
+toJS = (str) ->
+  CoffeeScript.compile str, bare: yes
+  .replace /^\s+|\s+$/g, '' # Trim leading/trailing whitespace
+
+
 test "backslash escapes", ->
  eq "\\/\\\\", /\/\\/.source

@ -400,3 +406,33 @@ test "#4314: Whitespace less than or equal to stripped indentation", ->
  eq '1 2  3   4    5     end\na 0     b', """
    #{1} #{2}  #{3}   #{4}    #{5}     end
    a #{0}     b"""
+
+test "#4248: Unicode code point escapes", -> # code point escapes in single-quoted, double-quoted and heredoc strings
+  eq '\u01ab\u00cd', '\u{1ab}\u{cd}'
+  eq '\u01ab', '\u{000001ab}' # leading zeros inside the braces are accepted
+  eq 'a\u01ab', "#{ 'a' }\u{1ab}"
+  eq '\u01abc', '''\u{01ab}c'''
+  eq '\u01abc', """\u{1ab}#{ 'c' }"""
+  eq '\udab3\uddef', '\u{bcdef}' # a value above 0xffff equals the corresponding surrogate pair
+  eq '\udab3\uddef', '\u{0000bcdef}'
+  eq 'a\udab3\uddef', "#{ 'a' }\u{bcdef}"
+  eq '\udab3\uddefc', '''\u{0bcdef}c'''
+  eq '\udab3\uddefc', """\u{bcdef}#{ 'c' }"""
+  eq '\\u{123456}', "#{'\\'}#{'u{123456}'}" # a backslash and the rest joined by interpolation are not treated as one escape
+
+  # the compiled JavaScript contains four-digit escapes in place of the code point escapes
+  input = """
+    '\\u{bcdef}\\u{abc}'
+    """
+  output = """
+    '\\udab3\\uddef\\u0abc';
+  """
+  eq toJS(input), output
+
+  input = """
+    "#{ 'a' }\\u{bcdef}"
+    """
+  output = """
+    "a\\udab3\\uddef";
+  """
+  eq toJS(input), output