Regular expression interpolations; fixed bug in string interpolations when all tokens were identifiers.

This commit is contained in:
Stan Angeloff 2010-03-08 20:05:02 +02:00
parent 830d1fb42b
commit 81af8f296e
6 changed files with 146 additions and 57 deletions

View File

@ -1,5 +1,5 @@
(function(){
var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include, starts;
var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include, starts;
// The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
// matches against the beginning of the source code. When a match is found,
// a token is produced, we consume the match, and start again. Tokens are in the
@ -129,18 +129,18 @@
// Matches strings, including multi-line strings. Ensures that quotation marks
// are balanced within the string's contents, and within nested interpolations.
Lexer.prototype.string_token = function string_token() {
var string;
var merge, string, supress;
if (!(starts(this.chunk, '"') || starts(this.chunk, "'"))) {
return false;
}
string = this.balanced_token(['"', '"'], ['${', '}']);
string = this.balanced_token((supress = false), ['"', '"'], ['${', '}']);
if (!(string)) {
string = this.balanced_token(["'", "'"]);
string = this.balanced_token((supress = false), ["'", "'"]);
}
if (!(string)) {
return false;
}
this.interpolate_string(string.replace(STRING_NEWLINES, " \\\n"));
this.interpolate_string(string.replace(STRING_NEWLINES, " \\\n"), (merge = true));
this.line += count(string, "\n");
this.i += string.length;
return true;
@ -160,11 +160,11 @@
};
// Matches JavaScript interpolated directly into the source via backticks.
Lexer.prototype.js_token = function js_token() {
var script;
var script, supress;
if (!(starts(this.chunk, '`'))) {
return false;
}
if (!((script = this.balanced_token(['`', '`'])))) {
if (!((script = this.balanced_token((supress = false), ['`', '`'])))) {
return false;
}
this.token('JS', script.replace(JS_CLEANER, ''));
@ -175,23 +175,57 @@
// to distinguish from division, so we borrow some basic heuristics from
// JavaScript and Ruby.
Lexer.prototype.regex_token = function regex_token() {
var regex;
if (!((regex = this.match(REGEX, 1)))) {
var _a, _b, _c, _d, _e, each, flags, i, index, interp_tokens, merge, regex, str, supress;
if (!((regex = this.balanced_token((supress = true), ['/', '/'])))) {
return false;
}
if (regex.length < 3 || regex.match(/^\/\s+|\n/)) {
return false;
}
if (include(NOT_REGEX, this.tag())) {
return false;
}
this.token('REGEX', regex);
flags = ['i', 'm', 'g', 'y'];
while (((index = flags.indexOf(this.chunk.substr(regex.length, 1)))) >= 0) {
regex += flags[index];
flags.splice(index, 1);
}
if (((0 < (_e = regex.indexOf('${'))) && (_e < regex.indexOf('}'))) || regex.match(/[^\\]\$[a-zA-Z_@]/)) {
_a = regex.substring(1).split('/');
str = _a[0];
flags = _a[1];
str = str.replace(/\\[^\$]/g, function(escaped) {
return '\\' + escaped;
});
this.tokens = this.tokens.concat([['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]);
interp_tokens = this.interpolate_string("\"" + str + "\"", (merge = false));
_b = interp_tokens;
for (i = 0, _c = _b.length; i < _c; i++) {
each = _b[i];
if ((_d = each[0]) === 'TOKENS') {
this.tokens = this.tokens.concat(each[1]);
} else if (_d === 'STRING') {
this.token(each[0], each[1].substring(0, 1) + each[1].substring(1, each[1].length - 1).replace(/"/g, '\\"') + each[1].substring(0, 1));
} else {
this.token(each[0], each[1]);
}
if (i < interp_tokens.length - 1) {
this.token('+', '+');
}
}
this.tokens = this.tokens.concat([[',', ','], ['STRING', "'" + flags + "'"], [')', ')'], [')', ')']]);
} else {
this.token('REGEX', regex);
}
this.i += regex.length;
return true;
};
// Matches a token in which the passed delimiter pairs must be correctly
// balanced (ie. strings, JS literals).
Lexer.prototype.balanced_token = function balanced_token() {
Lexer.prototype.balanced_token = function balanced_token(supress) {
var delimited;
delimited = Array.prototype.slice.call(arguments, 0);
return this.balanced_string.apply(this, [this.chunk].concat(delimited));
delimited = Array.prototype.slice.call(arguments, 1);
return this.balanced_string.apply(this, [this.chunk].concat([supress]).concat(delimited));
};
// Matches and consumes comments. We pass through comments into JavaScript,
// so they're treated as real tokens, like any other part of the language.
@ -395,9 +429,9 @@
// a series of delimiters, all of which must be nested correctly within the
// contents of the string. This method allows us to have strings within
// interpolations within strings etc...
Lexer.prototype.balanced_string = function balanced_string(str) {
Lexer.prototype.balanced_string = function balanced_string(str, supress) {
var _a, _b, _c, _d, close, delimited, i, levels, open, pair;
delimited = Array.prototype.slice.call(arguments, 1);
delimited = Array.prototype.slice.call(arguments, 2);
levels = [];
i = 0;
while (i < str.length) {
@ -429,7 +463,10 @@
i += 1;
}
if (levels.length) {
throw new Error("SyntaxError: Unterminated " + (levels.pop()[0]) + " starting on line " + (this.line + 1));
if (!(supress)) {
throw new Error("SyntaxError: Unterminated " + (levels.pop()[0]) + " starting on line " + (this.line + 1));
}
return false;
}
if (i === 0) {
return false;
@ -444,8 +481,8 @@
// If it encounters an interpolation, this method will recursively create a
// new Lexer, tokenize the interpolated contents, and merge them into the
// token stream.
Lexer.prototype.interpolate_string = function interpolate_string(str) {
var _a, _b, _c, _d, _e, each, expr, group, i, inner, interp, lexer, match, nested, pi, quote, tokens;
Lexer.prototype.interpolate_string = function interpolate_string(str, merge) {
var _a, _b, _c, _d, _e, _f, _g, each, expr, group, has_string, i, inner, interp, lexer, match, nested, pi, quote, supress, tokens;
if (str.length < 3 || !starts(str, '"')) {
return this.token('STRING', str);
} else {
@ -466,14 +503,14 @@
interp = "this." + (interp.substring(1));
}
if (pi < i) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
tokens.push(['IDENTIFIER', interp]);
i += group.length - 1;
pi = i + 1;
} else if (((expr = this.balanced_string(str.substring(i), ['${', '}'])))) {
} else if (((expr = this.balanced_string(str.substring(i), (supress = false), ['${', '}'])))) {
if (pi < i) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
inner = expr.substring(2, expr.length - 1);
if (inner.length) {
@ -484,7 +521,7 @@
nested.pop();
tokens.push(['TOKENS', nested]);
} else {
tokens.push(['STRING', quote + quote]);
tokens.push(['STRING', '' + quote + quote]);
}
i += expr.length - 1;
pi = i + 1;
@ -492,19 +529,27 @@
i += 1;
}
if (pi < i && pi < str.length - 1) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
_c = []; _d = tokens;
for (i = 0, _e = _d.length; i < _e; i++) {
each = _d[i];
_c.push((function() {
_c = tokens;
for (_d = 0, _e = _c.length; _d < _e; _d++) {
each = _c[_d];
each[0] === 'STRING' ? ((has_string = true)) : null;
}
if (!has_string) {
tokens.unshift(['STRING', "''"]);
}
if (((typeof merge !== "undefined" && merge !== null) ? merge : true)) {
_f = tokens;
for (i = 0, _g = _f.length; i < _g; i++) {
each = _f[i];
each[0] === 'TOKENS' ? (this.tokens = this.tokens.concat(each[1])) : this.token(each[0], each[1]);
if (i < tokens.length - 1) {
return this.token('+', '+');
this.token('+', '+');
}
}).call(this));
}
}
return _c;
return tokens;
}
};
// Helpers
@ -568,7 +613,7 @@
// be used as identifiers or properties.
JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED);
// Token matching regexes.
IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/;
IDENTIFIER = /^([a-zA-Z\$_](\w|\$)*)/;
NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i;
HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/;
INTERPOLATION = /^\$([a-zA-Z_@]\w*(\.\w+)*)/;
@ -576,7 +621,6 @@
WHITESPACE = /^([ \t]+)/;
COMMENT = /^(((\n?[ \t]*)?#[^\n]*)+)/;
CODE = /^((-|=)>)/;
REGEX = /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/;
MULTI_DENT = /^((\n([ \t]*))+)(\.)?/;
LAST_DENTS = /\n([ \t]*)/g;
LAST_DENT = /\n([ \t]*)/;

View File

@ -289,7 +289,7 @@ idt += TAB
var end, idt;
idt = this.is_statement() ? this.idt() : '';
end = this.is_statement() ? ';' : '';
return idt + this.value + end;
return '' + idt + this.value + end;
};
LiteralNode.prototype.toString = function toString(idt) {
return " \"" + this.value + "\"";
@ -762,7 +762,7 @@ idt += TAB
props = props.empty() ? '' : props.compile(o) + '\n';
extension = extension ? this.idt() + extension.compile(o) + ';\n' : '';
returns = ret ? '\n' + this.idt() + 'return ' + this.variable.compile(o) + ';' : '';
return construct + extension + props + returns;
return '' + construct + extension + props + returns;
};
return ClassNode;
}).call(this);
@ -1540,7 +1540,7 @@ idt += TAB
indent: this.idt(),
chain_child: true
})) : " else {\n" + (Expressions.wrap([this.else_body]).compile(o)) + "\n" + this.tab + "}";
return if_part + else_part;
return '' + if_part + else_part;
};
// Compile the IfNode as a ternary operator.
IfNode.prototype.compile_ternary = function compile_ternary(o) {
@ -1559,7 +1559,7 @@ idt += TAB
// with Git.
TRAILING_WHITESPACE = /\s+$/gm;
// Keep this identifier regex in sync with the Lexer.
IDENTIFIER = /^[a-zA-Z$_](\w|\$)*$/;
IDENTIFIER = /^[a-zA-Z\$_](\w|\$)*$/;
// Utility Functions
// -----------------
// Merge objects, returning a fresh copy with attributes from both sides.

View File

@ -96,10 +96,10 @@ exports.Lexer: class Lexer
# are balanced within the string's contents, and within nested interpolations.
string_token: ->
return false unless starts(@chunk, '"') or starts(@chunk, "'")
string: @balanced_token ['"', '"'], ['${', '}']
string: @balanced_token ["'", "'"] unless string
string: @balanced_token supress: false, ['"', '"'], ['${', '}']
string: @balanced_token supress: false, ["'", "'"] unless string
return false unless string
@interpolate_string string.replace STRING_NEWLINES, " \\\n"
@interpolate_string string.replace(STRING_NEWLINES, " \\\n"), merge: true
@line += count string, "\n"
@i += string.length
true
@ -117,7 +117,7 @@ exports.Lexer: class Lexer
# Matches JavaScript interpolated directly into the source via backticks.
js_token: ->
return false unless starts @chunk, '`'
return false unless script: @balanced_token ['`', '`']
return false unless script: @balanced_token supress: false, ['`', '`']
@token 'JS', script.replace(JS_CLEANER, '')
@i += script.length
true
@ -126,16 +126,34 @@ exports.Lexer: class Lexer
# to distinguish from division, so we borrow some basic heuristics from
# JavaScript and Ruby.
regex_token: ->
return false unless regex: @match REGEX, 1
return false unless regex: @balanced_token supress: true, ['/', '/']
return false if regex.length < 3 or regex.match /^\/\s+|\n/
return false if include NOT_REGEX, @tag()
@token 'REGEX', regex
flags: ['i', 'm', 'g', 'y']
while (index: flags.indexOf @chunk.substr regex.length, 1) >= 0
regex += flags[index]
flags.splice index, 1
if (0 < regex.indexOf('${') < regex.indexOf('}')) or regex.match /[^\\]\$[a-zA-Z_@]/
[str, flags]: regex.substring(1).split('/')
str: str.replace /\\[^\$]/g, (escaped) -> '\\' + escaped
@tokens: @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
interp_tokens: @interpolate_string "\"$str\"", merge: false
for each, i in interp_tokens
switch each[0]
when 'TOKENS' then @tokens: @tokens.concat each[1]
when 'STRING' then @token each[0], each[1].substring(0, 1) + each[1].substring(1, each[1].length - 1).replace(/"/g, '\\"') + each[1].substring(0, 1)
else @token each[0], each[1]
@token '+', '+' if i < interp_tokens.length - 1
@tokens: @tokens.concat [[',', ','], ['STRING', "'$flags'"], [')', ')'], [')', ')']]
else
@token 'REGEX', regex
@i += regex.length
true
# Matches a token in which the passed delimiter pairs must be correctly
# balanced (ie. strings, JS literals).
balanced_token: (delimited...) ->
@balanced_string @chunk, delimited...
balanced_token: (supress, delimited...) ->
@balanced_string @chunk, supress, delimited...
# Matches and consumes comments. We pass through comments into JavaScript,
# so they're treated as real tokens, like any other part of the language.
@ -297,7 +315,7 @@ exports.Lexer: class Lexer
# a series of delimiters, all of which must be nested correctly within the
# contents of the string. This method allows us to have strings within
# interpolations within strings etc...
balanced_string: (str, delimited...) ->
balanced_string: (str, supress, delimited...) ->
levels: []
i: 0
while i < str.length
@ -317,7 +335,9 @@ exports.Lexer: class Lexer
break
break unless levels.length
i += 1
throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" if levels.length
if levels.length
throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" unless supress
return false
return false if i is 0
return str.substring(0, i)
@ -331,7 +351,7 @@ exports.Lexer: class Lexer
# If it encounters an interpolation, this method will recursively create a
# new Lexer, tokenize the interpolated contents, and merge them into the
# token stream.
interpolate_string: (str) ->
interpolate_string: (str, merge) ->
if str.length < 3 or not starts str, '"'
@token 'STRING', str
else
@ -349,7 +369,7 @@ exports.Lexer: class Lexer
tokens.push ['IDENTIFIER', interp]
i += group.length - 1
pi: i + 1
else if (expr: @balanced_string str.substring(i), ['${', '}'])
else if (expr: @balanced_string str.substring(i), supress: false, ['${', '}'])
tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
inner: expr.substring(2, expr.length - 1)
if inner.length
@ -362,12 +382,16 @@ exports.Lexer: class Lexer
pi: i + 1
i += 1
tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i and pi < str.length - 1
for each, i in tokens
if each[0] is 'TOKENS'
@tokens: @tokens.concat each[1]
else
@token each[0], each[1]
@token '+', '+' if i < tokens.length - 1
(has_string: yes) for each in tokens when each[0] is 'STRING'
tokens.unshift ['STRING', "''"] if not has_string
if (merge ? true)
for each, i in tokens
if each[0] is 'TOKENS'
@tokens: @tokens.concat each[1]
else
@token each[0], each[1]
@token '+', '+' if i < tokens.length - 1
tokens
# Helpers
# -------
@ -440,7 +464,7 @@ RESERVED: [
JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED
# Token matching regexes.
IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/
IDENTIFIER : /^([a-zA-Z\$_](\w|\$)*)/
NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
INTERPOLATION : /^\$([a-zA-Z_@]\w*(\.\w+)*)/
@ -448,7 +472,6 @@ OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/
WHITESPACE : /^([ \t]+)/
COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/
CODE : /^((-|=)>)/
REGEX : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/
MULTI_DENT : /^((\n([ \t]*))+)(\.)?/
LAST_DENTS : /\n([ \t]*)/g
LAST_DENT : /\n([ \t]*)/

View File

@ -1192,7 +1192,7 @@ TAB: ' '
TRAILING_WHITESPACE: /\s+$/gm
# Keep this identifier regex in sync with the Lexer.
IDENTIFIER: /^[a-zA-Z$_](\w|\$)*$/
IDENTIFIER: /^[a-zA-Z\$_](\w|\$)*$/
# Utility Functions
# -----------------

View File

@ -0,0 +1,17 @@
name: 'Bob'
ok not not '"Bob"'.match(/^"${name}"$/i)
ok '"Bobby"'.match(/^"${name}"$/i) is null
ok not not 'Bob'.match(/^$name$/)
ok 'Bobby'.match(/^$name/)
ok 'Bobby'.match(/${"${"${"$name"}"}"}/imgy)
ok '$a$b$c'.match(/\$A\$B\$C/i)
a: 1
b: 2
c: 3
ok '123'.match(/$a$b$c/i)

View File

@ -61,3 +61,8 @@ ok "Where is ${"the nested ${obj["name"]}"}?" is 'Where is the nested Joe?'
ok "Hello ${world ? "$hello"}" is 'Hello World'
ok "Hello ${"${"${obj["name"]}" + '!'}"}" is 'Hello Joe!'
a: 1
b: 2
c: 3
ok "$a$b$c" is '123'