fixing the regex lexer to make it less aggressive when we know it can't possibly be a regex

2022-11-09 12:23:24 -05:00 · 2010-01-01 09:49:18 -05:00 · 2010-01-01 09:49:18 -05:00 · ad5b5fa458
commit ad5b5fa458
parent 41056ca2bd
3 changed files with 19 additions and 3 deletions
--- a/lib/coffee_script/CoffeeScript.tmbundle/Syntaxes/CoffeeScript.tmLanguage
+++ b/lib/coffee_script/CoffeeScript.tmbundle/Syntaxes/CoffeeScript.tmLanguage
@ -240,7 +240,7 @@
 		</dict>
 		<dict>
 			<key>match</key>
-			<string>!|\$|%|&amp;|\*|\-\-|\-|\+\+|\+|~|===|==|=|!=|!==|&lt;=|&gt;=|&lt;&lt;=|&gt;&gt;=|&gt;&gt;&gt;=|&lt;&gt;|&lt;|&gt;|!|&amp;&amp;|\?|\|\||\:|\*=|(?&lt;!\()/=|%=|\+=|\-=|&amp;=|\^=|\b(in|instanceof|new|delete|typeof|and|or|is|isnt|not)\b</string>
+			<string>!|\$|%|&amp;|\*|\/|\-\-|\-|\+\+|\+|~|===|==|=|!=|!==|&lt;=|&gt;=|&lt;&lt;=|&gt;&gt;=|&gt;&gt;&gt;=|&lt;&gt;|&lt;|&gt;|!|&amp;&amp;|\?|\|\||\:|\*=|(?&lt;!\()/=|%=|\+=|\-=|&amp;=|\^=|\b(in|instanceof|new|delete|typeof|and|or|is|isnt|not)\b</string>
 			<key>name</key>
 			<string>keyword.operator.coffee</string>
 		</dict>
--- a/lib/coffee_script/lexer.rb
+++ b/lib/coffee_script/lexer.rb
@ -37,6 +37,10 @@ module CoffeeScript
    COMMENT_CLEANER = /(^\s*#|\n\s*$)/
    NO_NEWLINE = /\A([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)\Z/

+    # Tokens which a regular expression will never immediately follow.
+    # See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
+    NOT_REGEX  = [:IDENTIFIER, :NUMBER, :STRING]
+
    # Scan by attempting to match tokens one character at a time. Slow and steady.
    def tokenize(code)
      @code = code.chomp  # Cleanup code by remove extra line breaks
@ -107,6 +111,7 @@ module CoffeeScript
    # Matches regular expression literals.
    def regex_token
      return false unless regex = @chunk[REGEX, 1]
+      return false if NOT_REGEX.include?(last_tag)
      token(:REGEX, regex)
      @i += regex.length
    end
@ -183,11 +188,16 @@ module CoffeeScript
      @tokens << [tag, Value.new(value, @line)]
    end

-    # Peek at the previous token.
+    # Peek at the previous token's value.
    def last_value
      @tokens.last && @tokens.last[1]
    end

+    # Peek at the previous token's tag.
+    def last_tag
+      @tokens.last && @tokens.last[0]
+    end
+
    # A source of ambiguity in our grammar was parameter lists in function
    # definitions (as opposed to argument lists in function calls). Tag
    # parameter identifiers in order to avoid this. Also, parameter lists can
--- a/test/fixtures/execution/test_literals.coffee
+++ b/test/fixtures/execution/test_literals.coffee
@ -1,3 +1,9 @@
 a: [(x => x), (x => x * x)]

-print(a.length is 2)
+print(a.length is 2)
+
+
+regex: /match/i
+words: "I think there is a match in here."
+
+print(!!words.match(regex))