Keep unicode code point escapes as is when possible (#4520)

2022-11-09 12:23:24 -05:00 · 2017-04-25 12:15:08 -05:00 · 2017-04-25 12:15:08 -05:00 · 7ef5cb4a1f
commit 7ef5cb4a1f
parent 07ae1edb44
4 changed files with 37 additions and 23 deletions
--- a/lib/coffeescript/lexer.js
+++ b/lib/coffeescript/lexer.js
@@ -1,11 +1,11 @@
 // Generated by CoffeeScript 2.0.0-beta1
 (function() {
-  var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, UNICODE_CODE_POINT_ESCAPE, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, repeat, starts, throwSyntaxError,
+  var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, HERE_JSTOKEN, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, Lexer, MATH, MULTI_DENT, NOT_REGEX, NUMBER, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, REGEX_INVALID_ESCAPE, RELATION, RESERVED, Rewriter, SHIFT, SIMPLE_STRING_OMIT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_INVALID_ESCAPE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, UNICODE_CODE_POINT_ESCAPE, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, isForFrom, isUnassignable, key, locationDataToString, merge, repeat, starts, throwSyntaxError,
    indexOf = [].indexOf;

  ({Rewriter, INVERSES} = require('./rewriter'));

-  ({count, starts, compact, repeat, invertLiterate, locationDataToString, throwSyntaxError} = require('./helpers'));
+  ({count, starts, compact, repeat, invertLiterate, merge, locationDataToString, throwSyntaxError} = require('./helpers'));

  exports.Lexer = Lexer = class Lexer {
    tokenize(code, opts = {}) {
@@ -366,9 +366,6 @@
            isRegex: true,
            offsetInChunk: 1
          });
-          body = this.formatRegex(body, {
-            delimiter: '/'
-          });
          index = regex.length;
          prev = this.prev();
          if (prev) {
@@ -398,8 +395,13 @@
          });
          break;
        case !(regex || tokens.length === 1):
-          if (body == null) {
-            body = this.formatHeregex(tokens[0][1]);
+          if (body) {
+            body = this.formatRegex(body, {
+              flags,
+              delimiter: '/'
+            });
+          } else {
+            body = this.formatHeregex(tokens[0][1], {flags});
          }
          this.token('REGEX', `${this.makeDelimitedLiteral(body, {
            delimiter: '/'
@@ -412,7 +414,9 @@
          this.mergeInterpolationTokens(tokens, {
            delimiter: '"',
            double: true
-          }, this.formatHeregex);
+          }, (str) => {
+            return this.formatHeregex(str, {flags});
+          });
          if (flags) {
            this.token(',', ',', index - 1, 0);
            this.token('STRING', '"' + flags + '"', index - 1, flags.length);
@@ -893,10 +897,10 @@
      return this.replaceUnicodeCodePointEscapes(str.replace(STRING_OMIT, '$1'), options);
    }

-    formatHeregex(str) {
-      return this.formatRegex(str.replace(HEREGEX_OMIT, '$1$2'), {
+    formatHeregex(str, options) {
+      return this.formatRegex(str.replace(HEREGEX_OMIT, '$1$2'), merge(options, {
        delimiter: '///'
-      });
+      }));
    }

    formatRegex(str, options) {
@@ -919,6 +923,8 @@
    }

    replaceUnicodeCodePointEscapes(str, options) {
+      var shouldReplace;
+      shouldReplace = (options.flags != null) && indexOf.call(options.flags, 'u') < 0;
      return str.replace(UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) => {
        var codePointDecimal;
        if (escapedBackslash) {
@@ -931,6 +937,9 @@
            length: codePointHex.length + 4
          });
        }
+        if (!shouldReplace) {
+          return match;
+        }
        return this.unicodeCodePointToUnicodeEscapes(codePointDecimal);
      });
    }
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -12,7 +12,7 @@
 {Rewriter, INVERSES} = require './rewriter'

 # Import the helpers we need.
-{count, starts, compact, repeat, invertLiterate,
+{count, starts, compact, repeat, invertLiterate, merge,
 locationDataToString,  throwSyntaxError} = require './helpers'

 # The Lexer Class
@@ -330,7 +330,6 @@ exports.Lexer = class Lexer
      when match = REGEX.exec @chunk
        [regex, body, closed] = match
        @validateEscapes body, isRegex: yes, offsetInChunk: 1
-        body = @formatRegex body, delimiter: '/'
        index = regex.length
        prev = @prev()
        if prev
@@ -349,13 +348,17 @@ exports.Lexer = class Lexer
      when not VALID_FLAGS.test flags
        @error "invalid regular expression flags #{flags}", offset: index, length: flags.length
      when regex or tokens.length is 1
-        body ?= @formatHeregex tokens[0][1]
+        if body
+          body = @formatRegex body, { flags, delimiter: '/' }
+        else
+          body = @formatHeregex tokens[0][1], { flags }
        @token 'REGEX', "#{@makeDelimitedLiteral body, delimiter: '/'}#{flags}", 0, end, origin
      else
        @token 'REGEX_START', '(', 0, 0, origin
        @token 'IDENTIFIER', 'RegExp', 0, 0
        @token 'CALL_START', '(', 0, 0
-        @mergeInterpolationTokens tokens, {delimiter: '"', double: yes}, @formatHeregex
+        @mergeInterpolationTokens tokens, {delimiter: '"', double: yes}, (str) =>
+          @formatHeregex str, { flags }
        if flags
          @token ',', ',', index - 1, 0
          @token 'STRING', '"' + flags + '"', index - 1, flags.length
@@ -792,8 +795,8 @@ exports.Lexer = class Lexer
  formatString: (str, options) ->
    @replaceUnicodeCodePointEscapes str.replace(STRING_OMIT, '$1'), options

-  formatHeregex: (str) ->
-    @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), delimiter: '///'
+  formatHeregex: (str, options) ->
+    @formatRegex str.replace(HEREGEX_OMIT, '$1$2'), merge(options, delimiter: '///')

  formatRegex: (str, options) ->
    @replaceUnicodeCodePointEscapes str, options
@@ -808,8 +811,9 @@ exports.Lexer = class Lexer
    low = (codePoint - 0x10000) % 0x400 + 0xDC00
    "#{toUnicodeEscape(high)}#{toUnicodeEscape(low)}"

-  # Replace \u{...} with \uxxxx[\uxxxx] in strings and regexes
+  # Replace `\u{...}` with `\uxxxx[\uxxxx]` in regexes without the `u` flag; strings and `u`-flagged regexes keep the escape as is
  replaceUnicodeCodePointEscapes: (str, options) ->
+    shouldReplace = options.flags? and 'u' not in options.flags
    str.replace UNICODE_CODE_POINT_ESCAPE, (match, escapedBackslash, codePointHex, offset) =>
      return escapedBackslash if escapedBackslash

@@ -818,6 +822,7 @@ exports.Lexer = class Lexer
        @error "unicode code point escapes greater than \\u{10ffff} are not allowed",
          offset: offset + options.delimiter.length
          length: codePointHex.length + 4
+      return match unless shouldReplace

      @unicodeCodePointToUnicodeEscapes codePointDecimal

--- a/test/regexps.coffee
+++ b/test/regexps.coffee
@@ -305,12 +305,12 @@ test "#4248: Unicode code point escapes", ->
  ok ///a\u{000001ab}c///.test 'a\u{1ab}c'
  ok /a\u{12345}c/.test 'a\ud808\udf45c'

-  # rewrite code point escapes
+  # rewrite code point escapes unless u flag is set
  input = """
    /\\u{bcdef}\\u{abc}/u
    """
  output = """
-    /\\udab3\\uddef\\u0abc/u;
+    /\\u{bcdef}\\u{abc}/u;
  """
  eq toJS(input), output

--- a/test/strings.coffee
+++ b/test/strings.coffee
@@ -420,12 +420,12 @@ test "#4248: Unicode code point escapes", ->
  eq '\udab3\uddefc', """\u{bcdef}#{ 'c' }"""
  eq '\\u{123456}', "#{'\\'}#{'u{123456}'}"

-  # rewrite code point escapes
+  # don't rewrite code point escapes
  input = """
    '\\u{bcdef}\\u{abc}'
    """
  output = """
-    '\\udab3\\uddef\\u0abc';
+    '\\u{bcdef}\\u{abc}';
  """
  eq toJS(input), output

@@ -433,6 +433,6 @@ test "#4248: Unicode code point escapes", ->
    "#{ 'a' }\\u{bcdef}"
    """
  output = """
-    "a\\udab3\\uddef";
+    "a\\u{bcdef}";
  """
  eq toJS(input), output