Hewing closer to JS' syntactic resynchronization for regexp lexing.

2010-11-09 22:39:15 -05:00 · 2010-11-09 22:39:15 -05:00 · 841463da8e
parent 71db1fc142
commit 841463da8e
3 changed files with 19 additions and 8 deletions
--- a/lib/lexer.js
+++ b/lib/lexer.js
@@ -1,5 +1,5 @@
 (function() {
-  var ASSIGNED, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HEREDOC, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDEXABLE, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_SPACES, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RELATION, RESERVED, Rewriter, SHIFT, SIMPLESTR, TRAILING_SPACES, UNARY, WHITESPACE, compact, count, last, op, starts, _ref;
+  var ASSIGNED, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HEREDOC, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDEXABLE, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_SPACES, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NOT_SPACED_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RELATION, RESERVED, Rewriter, SHIFT, SIMPLESTR, TRAILING_SPACES, UNARY, WHITESPACE, compact, count, last, op, starts, _ref;
  var __indexOf = Array.prototype.indexOf || function(item) {
    for (var i = 0, l = this.length; i < l; i++) {
      if (this[i] === item) return i;
@@ -200,14 +200,15 @@
      return script.length;
    };
    Lexer.prototype.regexToken = function() {
-      var match, regex, _ref;
+      var match, prev, regex, _ref;
      if (this.chunk.charAt(0) !== '/') {
        return 0;
      }
      if (match = HEREGEX.exec(this.chunk)) {
        return this.heregexToken(match);
      }
-      if (_ref = this.tag(), __indexOf.call(NOT_REGEX, _ref) >= 0) {
+      prev = last(this.tokens);
+      if (prev && (_ref = prev[0], __indexOf.call((prev.spaced ? NOT_REGEX : NOT_SPACED_REGEX), _ref) >= 0)) {
        return 0;
      }
      if (!(match = REGEX.exec(this.chunk))) {
@@ -641,6 +642,7 @@
  RELATION = ['IN', 'OF', 'INSTANCEOF'];
  BOOL = ['TRUE', 'FALSE', 'NULL', 'UNDEFINED'];
  NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', '++', '--', ']'];
+  NOT_SPACED_REGEX = NOT_REGEX.concat(')', '}', 'THIS');
  CALLABLE = ['IDENTIFIER', 'STRING', 'REGEX', ')', ']', '}', '?', '::', '@', 'THIS', 'SUPER'];
  INDEXABLE = CALLABLE.concat('NUMBER', 'BOOL');
  LINE_BREAK = ['INDENT', 'OUTDENT', 'TERMINATOR'];
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -195,7 +195,8 @@ exports.Lexer = class Lexer
  regexToken: ->
    return 0 if @chunk.charAt(0) isnt '/'
    return @heregexToken match if match = HEREGEX.exec @chunk
-    return 0 if @tag() in NOT_REGEX
+    prev = last @tokens
+    return 0 if prev and (prev[0] in (if prev.spaced then NOT_REGEX else NOT_SPACED_REGEX))
    return 0 unless match = REGEX.exec @chunk
    [regex] = match
    @token 'REGEX', if regex is '//' then '/(?:)/' else regex
@@ -644,6 +645,10 @@ BOOL = ['TRUE', 'FALSE', 'NULL', 'UNDEFINED']
 # Our list is shorter, due to sans-parentheses method calls.
 NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', '++', '--', ']']

+# If the previous token is not spaced, there are more preceding tokens that
+# force a division parse:
+NOT_SPACED_REGEX = NOT_REGEX.concat ')', '}', 'THIS'
+
 # Tokens which could legitimately be invoked or indexed. An opening
 # parenthesis or bracket following these tokens will be recorded as the start
 # of a function invocation or indexing operation.
--- a/test/test_regexps.coffee
+++ b/test/test_regexps.coffee
@@ -37,9 +37,13 @@ eq '\\\\#{}\\\\\\\"', ///
 eq ///  /// + '', '/(?:)/'


-#584: Unescaped slashes in character classes.
-ok /:\/[/]goog/.test 'http://google.com'
-
-
 #764: Should be indexable.
 eq /0/['source'], ///#{0}///['source']
+
+
+# If the token before `/` is not followed by whitespace, the lexer should be stricter and parse `/` as division.
+i = 5
+eq (1000)/200/i, 1
+
+#584: Unescaped slashes in character classes.
+ok /:\/[/]goog/.test 'http://google.com'