From 8fd6258a4622593cbc77a23230c00a6af85647f5 Mon Sep 17 00:00:00 2001 From: Simon Lydell Date: Sat, 10 Jan 2015 01:48:00 +0100 Subject: [PATCH] Fix #3410, #3182: Allow regex to start with space or = MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A regex may not follow a specific set of tokens. These were already known before in the `NOT_REGEX` and `NOT_SPACED_REGEX` arrays. (However, I've refactored them to be more correct and to add a few missing tokens). In all other cases (except after a spaced callable) a slash is the start of a regex, and may now start with a space or an equals sign. It’s really that simple! A slash after a spaced callable is the only ambiguous case. We cannot know if that's division or function application with a regex as the argument. The spacing determines which is which: Space on both sides: - `a / b/i` -> `a / b / i` - `a /= b/i` -> `a /= b / i` No spaces: - `a/b/i` -> `a / b / i` - `a/=b/i` -> `a /= b / i` Space on the right side: - `a/ b/i` -> `a / b / i` - `a/= b/i` -> `a /= b / i` Space on the left side: - `a /b/i` -> `a(/b/i)` - `a /=b/i` -> `a(/=b/i)` The last case used to compile to `a /= b / i`, but that has been changed to be consistent with the `/` operator. The last case really looks like a regex, so it should be parsed as one. Moreover, you may now also space the `/` and `/=` operators with other whitespace characters than a space (such as tabs and non-breaking spaces) for consistency. Lastly, unclosed regexes are now reported as such, instead of generating some other confusing error message. It should perhaps also be noted that apart from escaping (such as `a /\ b/`) you may now also use parentheses to disambiguate division and regex: `a (/ b/)`. See https://github.com/jashkenas/coffeescript/issues/3182#issuecomment-26688427. 
--- lib/coffee-script/lexer.js | 31 +++--- src/lexer.coffee | 37 +++---- test/error_messages.coffee | 26 +++++ test/regexps.coffee | 195 ++++++++++++++++++++++++++++++++++++- 4 files changed, 255 insertions(+), 34 deletions(-) diff --git a/lib/coffee-script/lexer.js b/lib/coffee-script/lexer.js index 9e4eb109..e1f7fcc7 100644 --- a/lib/coffee-script/lexer.js +++ b/lib/coffee-script/lexer.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 1.8.0 (function() { - var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NOT_SPACED_REGEX, NUMBER, OCTAL_ESCAPE, OPERATOR, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, RELATION, RESERVED, Rewriter, SHIFT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, key, last, locationDataToString, repeat, starts, throwSyntaxError, _ref, _ref1, + var BOM, BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HERECOMMENT_ILLEGAL, HEREDOC_DOUBLE, HEREDOC_INDENT, HEREDOC_SINGLE, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDENTABLE_CLOSERS, INDEXABLE, INVERSES, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LEADING_BLANK_LINE, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NUMBER, OCTAL_ESCAPE, OPERATOR, POSSIBLY_DIVISION, REGEX, REGEX_FLAGS, REGEX_ILLEGAL, RELATION, RESERVED, Rewriter, SHIFT, STRICT_PROSCRIBED, STRING_DOUBLE, STRING_OMIT, STRING_SINGLE, STRING_START, TRAILING_BLANK_LINE, TRAILING_SPACES, UNARY, UNARY_MATH, VALID_FLAGS, WHITESPACE, compact, count, invertLiterate, key, last, 
locationDataToString, repeat, starts, throwSyntaxError, _ref, _ref1, __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; }; _ref = require('./rewriter'), Rewriter = _ref.Rewriter, INVERSES = _ref.INVERSES; @@ -287,7 +287,7 @@ }; Lexer.prototype.regexToken = function() { - var end, flags, index, match, prev, re, regex, tokens, _ref2, _ref3; + var closed, end, flags, index, match, prev, re, regex, tokens, _ref2, _ref3, _ref4; switch (false) { case !(match = REGEX_ILLEGAL.exec(this.chunk)): this.error("regular expressions cannot begin with " + match[2], match.index + match[1].length); @@ -296,11 +296,20 @@ _ref2 = this.matchWithInterpolations(this.chunk.slice(3), HEREGEX, '///', 3), tokens = _ref2.tokens, index = _ref2.index; break; case !(match = REGEX.exec(this.chunk)): - regex = match[0]; + regex = match[0], closed = match[1]; index = regex.length; prev = last(this.tokens); - if (prev && (_ref3 = prev[0], __indexOf.call((prev.spaced ? 
NOT_REGEX : NOT_SPACED_REGEX), _ref3) >= 0)) { - return 0; + if (prev) { + if (prev.spaced && (_ref3 = prev[0], __indexOf.call(CALLABLE, _ref3) >= 0)) { + if (!closed || POSSIBLY_DIVISION.test(regex)) { + return 0; + } + } else if (_ref4 = prev[0], __indexOf.call(NOT_REGEX, _ref4) >= 0) { + return 0; + } + } + if (!closed) { + this.error('missing / (unclosed regex)'); } break; default: @@ -845,7 +854,7 @@ HEREDOC_INDENT = /\n+([^\n\S]*)(?=\S)/g; - REGEX = /^\/(?![\s=])(?:[^[\/\n\\]|\\.|\[(?:\\.|[^\]\n\\])*])+\//; + REGEX = /^\/(?!\/)(?:[^[\/\n\\]|\\.|\[(?:\\.|[^\]\n\\])*])*(\/)?/; REGEX_FLAGS = /^\w*/; @@ -857,6 +866,8 @@ REGEX_ILLEGAL = /^(\/|\/{3}\s*)(\*)/; + POSSIBLY_DIVISION = /^\/=?\s/; + MULTILINER = /\n/g; HERECOMMENT_ILLEGAL = /\*\//; @@ -889,13 +900,11 @@ BOOL = ['TRUE', 'FALSE']; - NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', 'NULL', 'UNDEFINED', '++', '--']; + CALLABLE = ['IDENTIFIER', ')', ']', '?', '@', 'THIS', 'SUPER']; - NOT_SPACED_REGEX = NOT_REGEX.concat(')', '}', 'THIS', 'IDENTIFIER', 'STRING', ']'); + INDEXABLE = CALLABLE.concat(['NUMBER', 'STRING', 'REGEX', 'BOOL', 'NULL', 'UNDEFINED', '}', '::']); - CALLABLE = ['IDENTIFIER', 'STRING', 'REGEX', ')', ']', '}', '?', '::', '@', 'THIS', 'SUPER']; - - INDEXABLE = CALLABLE.concat('NUMBER', 'BOOL', 'NULL', 'UNDEFINED'); + NOT_REGEX = INDEXABLE.concat(['++', '--']); LINE_BREAK = ['INDENT', 'OUTDENT', 'TERMINATOR']; diff --git a/src/lexer.coffee b/src/lexer.coffee index 11cea8e9..e712e67a 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -258,10 +258,15 @@ exports.Lexer = class Lexer when @chunk[...3] is '///' {tokens, index} = @matchWithInterpolations @chunk[3..], HEREGEX, '///', 3 when match = REGEX.exec @chunk - [regex] = match + [regex, closed] = match index = regex.length prev = last @tokens - return 0 if prev and (prev[0] in (if prev.spaced then NOT_REGEX else NOT_SPACED_REGEX)) + if prev + if prev.spaced and prev[0] in CALLABLE + return 0 if not closed or POSSIBLY_DIVISION.test regex + else if 
prev[0] in NOT_REGEX + return 0 + @error 'missing / (unclosed regex)' unless closed else return 0 @@ -776,13 +781,13 @@ HEREDOC_INDENT = /\n+([^\n\S]*)(?=\S)/g # Regex-matching-regexes. REGEX = /// ^ - / (?! [\s=] ) ( # disallow leading whitespace or equals sign + / (?!/) ( ?: [^ [ / \n \\ ] # every other thing | \\. # anything (but newlines) escaped | \[ # character class (?: \\. | [^ \] \n \\ ] )* ] - )+ / + )* (/)? /// REGEX_FLAGS = /^\w*/ @@ -798,6 +803,8 @@ HEREGEX_OMIT = /// REGEX_ILLEGAL = /// ^ ( / | /{3}\s*) (\*) /// +POSSIBLY_DIVISION = /// ^ /=?\s /// + # Other regexes. MULTILINER = /\n/g @@ -841,23 +848,17 @@ RELATION = ['IN', 'OF', 'INSTANCEOF'] # Boolean tokens. BOOL = ['TRUE', 'FALSE'] -# Tokens which a regular expression will never immediately follow, but which -# a division operator might. -# -# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions -# -# Our list is shorter, due to sans-parentheses method calls. -NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', 'NULL', 'UNDEFINED', '++', '--'] - -# If the previous token is not spaced, there are more preceding tokens that -# force a division parse: -NOT_SPACED_REGEX = NOT_REGEX.concat ')', '}', 'THIS', 'IDENTIFIER', 'STRING', ']' - # Tokens which could legitimately be invoked or indexed. An opening # parentheses or bracket following these tokens will be recorded as the start # of a function invocation or indexing operation. -CALLABLE = ['IDENTIFIER', 'STRING', 'REGEX', ')', ']', '}', '?', '::', '@', 'THIS', 'SUPER'] -INDEXABLE = CALLABLE.concat 'NUMBER', 'BOOL', 'NULL', 'UNDEFINED' +CALLABLE = ['IDENTIFIER', ')', ']', '?', '@', 'THIS', 'SUPER'] +INDEXABLE = CALLABLE.concat ['NUMBER', 'STRING', 'REGEX', 'BOOL', 'NULL', 'UNDEFINED', '}', '::'] + +# Tokens which a regular expression will never immediately follow (except spaced +# CALLABLEs in some cases), but which a division operator can. 
+# +# See: http://www-archive.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions +NOT_REGEX = INDEXABLE.concat ['++', '--'] # Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN` # occurs at the start of a line. We disambiguate these from trailing whens to diff --git a/test/error_messages.coffee b/test/error_messages.coffee index 9f6c764a..2aa9e7f1 100644 --- a/test/error_messages.coffee +++ b/test/error_messages.coffee @@ -405,3 +405,29 @@ test "missing `)`, `}`, `]`", -> foo#{ bar "#{1}" ^ ''' + +test "unclosed regexes", -> + assertErrorFormat ''' + / + ''', ''' + [stdin]:1:1: error: missing / (unclosed regex) + / + ^ + ''' + assertErrorFormat ''' + # Note the double escaping; this would be `/a\/` real code. + /a\\/ + ''', ''' + [stdin]:2:1: error: missing / (unclosed regex) + /a\\/ + ^ + ''' + assertErrorFormat ''' + /// ^ + a #{""" ""#{if /[/].test "|" then 1 else 0}"" """} + /// + ''', ''' + [stdin]:2:18: error: missing / (unclosed regex) + a #{""" ""#{if /[/].test "|" then 1 else 0}"" """} + ^ + ''' diff --git a/test/regexps.coffee b/test/regexps.coffee index 0aa4cbec..c8f7f020 100644 --- a/test/regexps.coffee +++ b/test/regexps.coffee @@ -13,16 +13,34 @@ test "basic regular expression literals", -> ok 'a'.match /a/g test "division is not confused for a regular expression", -> + # Any spacing around the slash is allowed when it cannot be a regex. eq 2, 4 / 2 / 1 + eq 2, 4/2/1 + eq 2, 4/ 2 / 1 + eq 2, 4 /2 / 1 + eq 2, 4 / 2/ 1 + eq 2, 4 / 2 /1 + eq 2, 4 /2/ 1 - a = 4 + a = (regex) -> regex.test 'a b c' + a.valueOf = -> 4 b = 2 g = 1 - eq 2, a / b/g - a = 10 - b = a /= 4 / 2 - eq a, 5 + eq 2, a / b/g + eq 2, a/ b/g + eq 2, a / b/ g + eq 2, a / b/g # Tabs. + eq 2, a / b/g # Non-breaking spaces. + eq true, a /b/g + # Use parentheses to disambiguate. + eq true, a(/ b/g) + eq true, a(/ b/) + eq true, a (/ b/) + # Escape to disambiguate. 
+ eq true, a /\ b/g + eq false, a /\ b/g + eq true, a /\ b/ obj = method: -> 2 two = 2 @@ -32,6 +50,173 @@ test "division is not confused for a regular expression", -> eq 2, (4)/2/i eq 1, i/i/i + a = '' + a += ' ' until / /.test a + eq a, ' ' + + a = if /=/.test '=' then yes else no + eq a, yes + + a = if !/=/.test '=' then yes else no + eq a, no + + #3182: + match = 'foo=bar'.match /=/ + eq match[0], '=' + + #3410: + ok ' '.match(/ /)[0] is ' ' + + +test "division vs regex after a callable token", -> + b = 2 + g = 1 + r = (r) -> r.test 'b' + + a = 4 + eq 2, a / b/g + eq 2, a/b/g + eq 2, a/ b/g + eq true, r /b/g + eq 2, (1 + 3) / b/g + eq 2, (1 + 3)/b/g + eq 2, (1 + 3)/ b/g + eq true, (r) /b/g + eq 2, [4][0] / b/g + eq 2, [4][0]/b/g + eq 2, [4][0]/ b/g + eq true, [r][0] /b/g + eq 0.5, 4? / b/g + eq 0.5, 4?/b/g + eq 0.5, 4?/ b/g + eq true, r? /b/g + (-> + eq 2, @ / b/g + eq 2, @/b/g + eq 2, @/ b/g + ).call 4 + (-> + eq true, @ /b/g + ).call r + (-> + eq 2, this / b/g + eq 2, this/b/g + eq 2, this/ b/g + ).call 4 + (-> + eq true, this /b/g + ).call r + class A + p: (regex) -> if regex then r regex else 4 + class B extends A + p: -> + eq 2, super / b/g + eq 2, super/b/g + eq 2, super/ b/g + eq true, super /b/g + new B().p() + +test "always division and never regex after some tokens", -> + b = 2 + g = 1 + + eq 2, 4 / b/g + eq 2, 4/b/g + eq 2, 4/ b/g + eq 2, 4 /b/g + eq 2, "4" / b/g + eq 2, "4"/b/g + eq 2, "4"/ b/g + eq 2, "4" /b/g + ok isNaN /a/ / b/g + ok isNaN /a/i / b/g + ok isNaN /a//b/g + ok isNaN /a/i/b/g + ok isNaN /a// b/g + ok isNaN /a/i/ b/g + ok isNaN /a/ /b/g + ok isNaN /a/i /b/g + eq 0.5, true / b/g + eq 0.5, true/b/g + eq 0.5, true/ b/g + eq 0.5, true /b/g + eq 0, false / b/g + eq 0, false/b/g + eq 0, false/ b/g + eq 0, false /b/g + eq 0, null / b/g + eq 0, null/b/g + eq 0, null/ b/g + eq 0, null /b/g + ok isNaN undefined / b/g + ok isNaN undefined/b/g + ok isNaN undefined/ b/g + ok isNaN undefined /b/g + ok isNaN {a: 4} / b/g + ok isNaN {a: 4}/b/g + ok 
isNaN {a: 4}/ b/g + ok isNaN {a: 4} /b/g + o = prototype: 4 + eq 2, o:: / b/g + eq 2, o::/b/g + eq 2, o::/ b/g + eq 2, o:: /b/g + i = 4 + eq 2.0, i++ / b/g + eq 2.5, i++/b/g + eq 3.0, i++/ b/g + eq 3.5, i++ /b/g + eq 4.0, i-- / b/g + eq 3.5, i--/b/g + eq 3.0, i--/ b/g + eq 2.5, i-- /b/g + +test "compound division vs regex", -> + c = 4 + i = 2 + + a = 10 + b = a /= c / i + eq a, 5 + + a = 10 + b = a /= c /i + eq a, 5 + + a = 10 + b = a /= c /i # Tabs. + eq a, 5 + + a = 10 + b = a /= c /i # Non-breaking spaces. + eq a, 5 + + a = 10 + b = a/= c /i + eq a, 5 + + a = 10 + b = a/=c/i + eq a, 5 + + a = (regex) -> regex.test '=C ' + b = a /=c /i + eq b, true + + a = (regex) -> regex.test '= C ' + # Use parentheses to disambiguate. + b = a(/= c /i) + eq b, true + b = a(/= c /) + eq b, false + b = a (/= c /) + eq b, false + # Escape to disambiguate. + b = a /\= c /i + eq b, true + b = a /\= c / + eq b, false + test "#764: regular expressions should be indexable", -> eq /0/['source'], ///#{0}///['source']