jashkenas--coffeescript/src/lexer.coffee

690 lines
23 KiB
CoffeeScript

# The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
# matches against the beginning of the source code. When a match is found,
# a token is produced, we consume the match, and start again. Tokens are in the
# form:
#
# [tag, value, lineNumber]
#
# Which is a format that can be fed directly into [Jison](http://github.com/zaach/jison).
{Rewriter, INVERSES} = require './rewriter'
# Import the helpers we need.
{count, starts, compact, last} = require './helpers'
# The Lexer Class
# ---------------
# The Lexer class reads a stream of CoffeeScript and divvies it up into tagged
# tokens. Some potential ambiguity in the grammar has been avoided by
# pushing some extra smarts into the Lexer.
exports.Lexer = class Lexer
# **tokenize** is the Lexer's main method. Scan by attempting to match tokens
# one at a time, using a regular expression anchored at the start of the
# remaining code, or a custom recursive token-matching method
# (for interpolations). When the next token has been recorded, we move forward
# within the code past the token, and begin again.
#
# Each tokenizing method is responsible for returning the number of characters
# it has consumed.
#
# Before returning the token stream, run it through the [Rewriter](rewriter.html)
# unless explicitly asked not to.
tokenize: (code, opts = {}) ->
code = "\n#{code}" if WHITESPACE.test code
code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
@code = code # The remainder of the source code.
@line = opts.line or 0 # The current line.
@indent = 0 # The current indentation level.
@indebt = 0 # The over-indentation at the current level.
@outdebt = 0 # The under-outdentation at the current level.
@indents = [] # The stack of all current indentation levels.
@ends = [] # The stack for pairing up tokens.
@tokens = [] # Stream of parsed tokens in the form `['TYPE', value, line]`.
# At every position, run through this list of attempted matches,
# short-circuiting if any of them succeed. Their order determines precedence:
# `@literalToken` is the fallback catch-all.
i = 0
while @chunk = code.slice i
i += @identifierToken() or
@commentToken() or
@whitespaceToken() or
@lineToken() or
@heredocToken() or
@stringToken() or
@numberToken() or
@regexToken() or
@jsToken() or
@literalToken()
@closeIndentation()
@error "missing #{tag}" if tag = @ends.pop()
return @tokens if opts.rewrite is off
(new Rewriter).rewrite @tokens
# Tokenizers
# ----------
# Matches identifying literals: variables, keywords, method names, etc.
# Check to ensure that JavaScript reserved words aren't being used as
# identifiers. Because CoffeeScript reserves a handful of keywords that are
# allowed in JavaScript, we're careful not to tag them as keywords when
# referenced as property names here, so you can still do `jQuery.is()` even
# though `is` means `===` otherwise.
identifierToken: ->
return 0 unless match = IDENTIFIER.exec @chunk
[input, id, colon] = match
if id is 'own' and @tag() is 'FOR'
@token 'OWN', id
return id.length
forcedIdentifier = colon or
(prev = last @tokens) and (prev[0] in ['.', '?.', '::'] or
not prev.spaced and prev[0] is '@')
tag = 'IDENTIFIER'
if not forcedIdentifier and (id in JS_KEYWORDS or id in COFFEE_KEYWORDS)
tag = id.toUpperCase()
if tag is 'WHEN' and @tag() in LINE_BREAK
tag = 'LEADING_WHEN'
else if tag is 'FOR'
@seenFor = yes
else if tag is 'UNLESS'
tag = 'IF'
else if tag in UNARY
tag = 'UNARY'
else if tag in RELATION
if tag isnt 'INSTANCEOF' and @seenFor
tag = 'FOR' + tag
@seenFor = no
else
tag = 'RELATION'
if @value() is '!'
@tokens.pop()
id = '!' + id
if id in ['eval', 'arguments'].concat JS_FORBIDDEN
if forcedIdentifier
tag = 'IDENTIFIER'
id = new String id
id.reserved = yes
else if id in RESERVED
@error "reserved word \"#{word}\""
unless forcedIdentifier
id = COFFEE_ALIAS_MAP[id] if id in COFFEE_ALIASES
tag = switch id
when '!' then 'UNARY'
when '==', '!=' then 'COMPARE'
when '&&', '||' then 'LOGIC'
when 'true', 'false', 'null', 'undefined' then 'BOOL'
when 'break', 'continue', 'debugger' then 'STATEMENT'
else tag
@token tag, id
@token ':', ':' if colon
input.length
# Matches numbers, including decimals, hex, and exponential notation.
# Be careful not to interfere with ranges-in-progress.
numberToken: ->
return 0 unless match = NUMBER.exec @chunk
number = match[0]
lexedLength = number.length
if binaryLiteral = /0b([01]+)/.exec number
number = (parseInt binaryLiteral[1], 2).toString()
@token 'NUMBER', number
lexedLength
# Matches strings, including multi-line strings. Ensures that quotation marks
# are balanced within the string's contents, and within nested interpolations.
stringToken: ->
switch @chunk.charAt 0
when "'"
return 0 unless match = SIMPLESTR.exec @chunk
@token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
when '"'
return 0 unless string = @balancedString @chunk, '"'
if 0 < string.indexOf '#{', 1
@interpolateString string.slice 1, -1
else
@token 'STRING', @escapeLines string
else
return 0
@line += count string, '\n'
string.length
# Matches heredocs, adjusting indentation to the correct level, as heredocs
# preserve whitespace, but ignore indentation to the left.
heredocToken: ->
return 0 unless match = HEREDOC.exec @chunk
heredoc = match[0]
quote = heredoc.charAt 0
doc = @sanitizeHeredoc match[2], quote: quote, indent: null
if quote is '"' and 0 <= doc.indexOf '#{'
@interpolateString doc, heredoc: yes
else
@token 'STRING', @makeString doc, quote, yes
@line += count heredoc, '\n'
heredoc.length
# Matches and consumes comments.
commentToken: ->
return 0 unless match = @chunk.match COMMENT
[comment, here] = match
if here
@token 'HERECOMMENT', @sanitizeHeredoc here,
herecomment: true, indent: Array(@indent + 1).join(' ')
@token 'TERMINATOR', '\n'
@line += count comment, '\n'
comment.length
# Matches JavaScript interpolated directly into the source via backticks.
jsToken: ->
return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
@token 'JS', (script = match[0]).slice 1, -1
script.length
# Matches regular expression literals. Lexing regular expressions is difficult
# to distinguish from division, so we borrow some basic heuristics from
# JavaScript and Ruby.
regexToken: ->
return 0 if @chunk.charAt(0) isnt '/'
if match = HEREGEX.exec @chunk
length = @heregexToken match
@line += count match[0], '\n'
return length
prev = last @tokens
return 0 if prev and (prev[0] in (if prev.spaced then NOT_REGEX else NOT_SPACED_REGEX))
return 0 unless match = REGEX.exec @chunk
[match, regex, flags] = match
if regex[..1] is '/*' then @error 'regular expressions cannot begin with `*`'
if regex is '//' then regex = '/(?:)/'
@token 'REGEX', "#{regex}#{flags}"
match.length
# Matches multiline extended regular expressions.
heregexToken: (match) ->
[heregex, body, flags] = match
if 0 > body.indexOf '#{'
re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
if re.match /^\*/ then @error 'regular expressions cannot begin with `*`'
@token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
return heregex.length
@token 'IDENTIFIER', 'RegExp'
@tokens.push ['CALL_START', '(']
tokens = []
for [tag, value] in @interpolateString(body, regex: yes)
if tag is 'TOKENS'
tokens.push value...
else
continue unless value = value.replace HEREGEX_OMIT, ''
value = value.replace /\\/g, '\\\\'
tokens.push ['STRING', @makeString(value, '"', yes)]
tokens.push ['+', '+']
tokens.pop()
@tokens.push ['STRING', '""'], ['+', '+'] unless tokens[0]?[0] is 'STRING'
@tokens.push tokens...
@tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
@token ')', ')'
heregex.length
# Matches newlines, indents, and outdents, and determines which is which.
# If we can detect that the current line is continued onto the the next line,
# then the newline is suppressed:
#
# elements
# .each( ... )
# .map( ... )
#
# Keeps track of the level of indentation, because a single outdent token
# can close multiple indents, so we need to know how far in we happen to be.
lineToken: ->
return 0 unless match = MULTI_DENT.exec @chunk
indent = match[0]
@line += count indent, '\n'
@seenFor = no
prev = last @tokens, 1
size = indent.length - 1 - indent.lastIndexOf '\n'
noNewlines = @unfinished()
if size - @indebt is @indent
if noNewlines then @suppressNewlines() else @newlineToken()
return indent.length
if size > @indent
if noNewlines
@indebt = size - @indent
@suppressNewlines()
return indent.length
diff = size - @indent + @outdebt
@token 'INDENT', diff
@indents.push diff
@ends .push 'OUTDENT'
@outdebt = @indebt = 0
else
@indebt = 0
@outdentToken @indent - size, noNewlines
@indent = size
indent.length
# Record an outdent token or multiple tokens, if we happen to be moving back
# inwards past several recorded indents.
outdentToken: (moveOut, noNewlines) ->
while moveOut > 0
len = @indents.length - 1
if @indents[len] is undefined
moveOut = 0
else if @indents[len] is @outdebt
moveOut -= @outdebt
@outdebt = 0
else if @indents[len] < @outdebt
@outdebt -= @indents[len]
moveOut -= @indents[len]
else
dent = @indents.pop() - @outdebt
moveOut -= dent
@outdebt = 0
@pair 'OUTDENT'
@token 'OUTDENT', dent
@outdebt -= moveOut if dent
@tokens.pop() while @value() is ';'
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
this
# Matches and consumes non-meaningful whitespace. Tag the previous token
# as being "spaced", because there are some cases where it makes a difference.
whitespaceToken: ->
return 0 unless (match = WHITESPACE.exec @chunk) or
(nline = @chunk.charAt(0) is '\n')
prev = last @tokens
prev[if match then 'spaced' else 'newLine'] = true if prev
if match then match[0].length else 0
# Generate a newline token. Consecutive newlines get merged together.
newlineToken: ->
@tokens.pop() while @value() is ';'
@token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
this
# Use a `\` at a line-ending to suppress the newline.
# The slash is removed here once its job is done.
suppressNewlines: ->
@tokens.pop() if @value() is '\\'
this
# We treat all other single characters as a token. E.g.: `( ) , . !`
# Multi-character operators are also literal tokens, so that Jison can assign
# the proper order of operations. There are some symbols that we tag specially
# here. `;` and newlines are both treated as a `TERMINATOR`, we distinguish
# parentheses that indicate a method call from regular parentheses, and so on.
literalToken: ->
if match = OPERATOR.exec @chunk
[value] = match
@tagParameters() if CODE.test value
else
value = @chunk.charAt 0
tag = value
prev = last @tokens
if value is '=' and prev
if not prev[1].reserved and prev[1] in JS_FORBIDDEN
@error "reserved word \"#{@value()}\" can't be assigned"
if prev[1] in ['||', '&&']
prev[0] = 'COMPOUND_ASSIGN'
prev[1] += '='
return value.length
if value is ';'
@seenFor = no
tag = 'TERMINATOR'
else if value in MATH then tag = 'MATH'
else if value in COMPARE then tag = 'COMPARE'
else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
else if value in UNARY then tag = 'UNARY'
else if value in SHIFT then tag = 'SHIFT'
else if value in LOGIC or value is '?' and prev?.spaced then tag = 'LOGIC'
else if prev and not prev.spaced
if value is '(' and prev[0] in CALLABLE
prev[0] = 'FUNC_EXIST' if prev[0] is '?'
tag = 'CALL_START'
else if value is '[' and prev[0] in INDEXABLE
tag = 'INDEX_START'
switch prev[0]
when '?' then prev[0] = 'INDEX_SOAK'
switch value
when '(', '{', '[' then @ends.push INVERSES[value]
when ')', '}', ']' then @pair value
@token tag, value
value.length
# Token Manipulators
# ------------------
# Sanitize a heredoc or herecomment by
# erasing all external indentation on the left-hand side.
sanitizeHeredoc: (doc, options) ->
{indent, herecomment} = options
if herecomment
if HEREDOC_ILLEGAL.test doc
@error "block comment cannot contain \"*/\", starting"
return doc if doc.indexOf('\n') <= 0
else
while match = HEREDOC_INDENT.exec doc
attempt = match[1]
indent = attempt if indent is null or 0 < attempt.length < indent.length
doc = doc.replace /// \n #{indent} ///g, '\n' if indent
doc = doc.replace /^\n/, '' unless herecomment
doc
# A source of ambiguity in our grammar used to be parameter lists in function
# definitions versus argument lists in function calls. Walk backwards, tagging
# parameters specially in order to make things easier for the parser.
tagParameters: ->
return this if @tag() isnt ')'
stack = []
{tokens} = this
i = tokens.length
tokens[--i][0] = 'PARAM_END'
while tok = tokens[--i]
switch tok[0]
when ')'
stack.push tok
when '(', 'CALL_START'
if stack.length then stack.pop()
else if tok[0] is '('
tok[0] = 'PARAM_START'
return this
else return this
this
# Close up all remaining open blocks at the end of the file.
closeIndentation: ->
@outdentToken @indent
# Matches a balanced group such as a single or double-quoted string. Pass in
# a series of delimiters, all of which must be nested correctly within the
# contents of the string. This method allows us to have strings within
# interpolations within strings, ad infinitum.
balancedString: (str, end) ->
stack = [end]
for i in [1...str.length]
switch letter = str.charAt i
when '\\'
i++
continue
when end
stack.pop()
unless stack.length
return str.slice 0, i + 1
end = stack[stack.length - 1]
continue
if end is '}' and letter in ['"', "'"]
stack.push end = letter
else if end is '}' and letter is '/' and match = (HEREGEX.exec(str.slice i) or REGEX.exec(str.slice i))
i += match[0].length - 1
else if end is '}' and letter is '{'
stack.push end = '}'
else if end is '"' and prev is '#' and letter is '{'
stack.push end = '}'
prev = letter
@error "missing #{ stack.pop() }, starting"
# Expand variables and expressions inside double-quoted strings using
# Ruby-like notation for substitution of arbitrary expressions.
#
# "Hello #{name.capitalize()}."
#
# If it encounters an interpolation, this method will recursively create a
# new Lexer, tokenize the interpolated contents, and merge them into the
# token stream.
interpolateString: (str, options = {}) ->
{heredoc, regex} = options
tokens = []
pi = 0
i = -1
while letter = str.charAt i += 1
if letter is '\\'
i += 1
continue
unless letter is '#' and str.charAt(i+1) is '{' and
(expr = @balancedString str.slice(i + 1), '}')
continue
tokens.push ['NEOSTRING', str.slice(pi, i)] if pi < i
inner = expr.slice(1, -1)
if inner.length
nested = new Lexer().tokenize inner, line: @line, rewrite: off
nested.pop()
nested.shift() if nested[0]?[0] is 'TERMINATOR'
if len = nested.length
if len > 1
nested.unshift ['(', '(', @line]
nested.push [')', ')', @line]
tokens.push ['TOKENS', nested]
i += expr.length
pi = i + 1
tokens.push ['NEOSTRING', str.slice pi] if i > pi < str.length
return tokens if regex
return @token 'STRING', '""' unless tokens.length
tokens.unshift ['', ''] unless tokens[0][0] is 'NEOSTRING'
@token '(', '(' if interpolated = tokens.length > 1
for [tag, value], i in tokens
@token '+', '+' if i
if tag is 'TOKENS'
@tokens.push value...
else
@token 'STRING', @makeString value, '"', heredoc
@token ')', ')' if interpolated
tokens
# Pairs up a closing token, ensuring that all listed pairs of tokens are
# correctly balanced throughout the course of the token stream.
pair: (tag) ->
unless tag is wanted = last @ends
@error "unmatched #{tag}" unless 'OUTDENT' is wanted
# Auto-close INDENT to support syntax like this:
#
# el.click((event) ->
# el.hide())
#
@indent -= size = last @indents
@outdentToken size, true
return @pair tag
@ends.pop()
# Helpers
# -------
# Add a token to the results, taking note of the line number.
token: (tag, value) ->
@tokens.push [tag, value, @line]
# Peek at a tag in the current token stream.
tag: (index, tag) ->
(tok = last @tokens, index) and if tag then tok[0] = tag else tok[0]
# Peek at a value in the current token stream.
value: (index, val) ->
(tok = last @tokens, index) and if val then tok[1] = val else tok[1]
# Are we in the midst of an unfinished expression?
unfinished: ->
LINE_CONTINUER.test(@chunk) or
@tag() in ['\\', '.', '?.', 'UNARY', 'MATH', '+', '-', 'SHIFT', 'RELATION'
'COMPARE', 'LOGIC', 'COMPOUND_ASSIGN', 'THROW', 'EXTENDS']
# Converts newlines for string literals.
escapeLines: (str, heredoc) ->
str.replace MULTILINER, if heredoc then '\\n' else ''
# Constructs a string token by escaping quotes and newlines.
makeString: (body, quote, heredoc) ->
return quote + quote unless body
body = body.replace /\\([\s\S])/g, (match, contents) ->
if contents in ['\n', quote] then contents else match
body = body.replace /// #{quote} ///g, '\\$&'
quote + @escapeLines(body, heredoc) + quote
# Throws a syntax error on the current `@line`.
error: (message) ->
throw SyntaxError "#{message} on line #{ @line + 1}"
# Constants
# ---------
# Keywords that CoffeeScript shares in common with JavaScript.
JS_KEYWORDS = [
'true', 'false', 'null', 'this'
'new', 'delete', 'typeof', 'in', 'instanceof'
'return', 'throw', 'break', 'continue', 'debugger'
'if', 'else', 'switch', 'for', 'while', 'do', 'try', 'catch', 'finally'
'class', 'extends', 'super'
]
# CoffeeScript-only keywords.
COFFEE_KEYWORDS = ['undefined', 'then', 'unless', 'until', 'loop', 'of', 'by', 'when']
COFFEE_ALIAS_MAP =
and : '&&'
or : '||'
is : '=='
isnt : '!='
not : '!'
yes : 'true'
no : 'false'
on : 'true'
off : 'false'
COFFEE_ALIASES = (key for key of COFFEE_ALIAS_MAP)
COFFEE_KEYWORDS = COFFEE_KEYWORDS.concat COFFEE_ALIASES
# The list of keywords that are reserved by JavaScript, but not used, or are
# used by CoffeeScript internally. We throw an error when these are encountered,
# to avoid having a JavaScript error at runtime.
RESERVED = [
'case', 'default', 'function', 'var', 'void', 'with'
'const', 'let', 'enum', 'export', 'import', 'native'
'__hasProp', '__extends', '__slice', '__bind', '__indexOf'
]
# The superset of both JavaScript keywords and reserved words, none of which may
# be used as identifiers or properties.
JS_FORBIDDEN = JS_KEYWORDS.concat RESERVED
exports.RESERVED = RESERVED.concat(JS_KEYWORDS).concat(COFFEE_KEYWORDS)
# Token matching regexes.
IDENTIFIER = /// ^
( [$A-Za-z_\x7f-\uffff][$\w\x7f-\uffff]* )
( [^\n\S]* : (?!:) )? # Is this a property name?
///
NUMBER = ///
^ 0x[\da-f]+ | # hex
^ 0b[01]+ | # binary
^ \d*\.?\d+ (?:e[+-]?\d+)? # decimal
///i
HEREDOC = /// ^ ("""|''') ([\s\S]*?) (?:\n[^\n\S]*)? \1 ///
OPERATOR = /// ^ (
?: [-=]> # function
| [-+*/%<>&|^!?=]= # compound assign / compare
| >>>=? # zero-fill right shift
| ([-+:])\1 # doubles
| ([&|<>])\2=? # logic / shift
| \?\. # soak access
| \.{2,3} # range or splat
) ///
WHITESPACE = /^[^\n\S]+/
COMMENT = /^###([^#][\s\S]*?)(?:###[^\n\S]*|(?:###)?$)|^(?:\s*#(?!##[^#]).*)+/
CODE = /^[-=]>/
MULTI_DENT = /^(?:\n[^\n\S]*)+/
SIMPLESTR = /^'[^\\']*(?:\\.[^\\']*)*'/
JSTOKEN = /^`[^\\`]*(?:\\.[^\\`]*)*`/
# Regex-matching-regexes.
REGEX = /// ^
(/ (?! [\s=] ) # disallow leading whitespace or equals signs
[^ [ / \n \\ ]* # every other thing
(?:
(?: \\[\s\S] # anything escaped
| \[ # character class
[^ \] \n \\ ]*
(?: \\[\s\S] [^ \] \n \\ ]* )*
]
) [^ [ / \n \\ ]*
)*
/) ([imgy]{0,4}) (?!\w)
///
HEREGEX = /// ^ /{3} ([\s\S]+?) /{3} ([imgy]{0,4}) (?!\w) ///
HEREGEX_OMIT = /\s+(?:#.*)?/g
# Token cleaning regexes.
MULTILINER = /\n/g
HEREDOC_INDENT = /\n+([^\n\S]*)/g
HEREDOC_ILLEGAL = /\*\//
LINE_CONTINUER = /// ^ \s* (?: , | \??\.(?![.\d]) | :: ) ///
TRAILING_SPACES = /\s+$/
# Compound assignment tokens.
COMPOUND_ASSIGN = [
'-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>=', '&=', '^=', '|='
]
# Unary tokens.
UNARY = ['!', '~', 'NEW', 'TYPEOF', 'DELETE', 'DO']
# Logical tokens.
LOGIC = ['&&', '||', '&', '|', '^']
# Bit-shifting tokens.
SHIFT = ['<<', '>>', '>>>']
# Comparison tokens.
COMPARE = ['==', '!=', '<', '>', '<=', '>=']
# Mathematical tokens.
MATH = ['*', '/', '%']
# Relational tokens that are negatable with `not` prefix.
RELATION = ['IN', 'OF', 'INSTANCEOF']
# Boolean tokens.
BOOL = ['TRUE', 'FALSE', 'NULL', 'UNDEFINED']
# Tokens which a regular expression will never immediately follow, but which
# a division operator might.
#
# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
#
# Our list is shorter, due to sans-parentheses method calls.
NOT_REGEX = ['NUMBER', 'REGEX', 'BOOL', '++', '--', ']']
# If the previous token is not spaced, there are more preceding tokens that
# force a division parse:
NOT_SPACED_REGEX = NOT_REGEX.concat ')', '}', 'THIS', 'IDENTIFIER', 'STRING'
# Tokens which could legitimately be invoked or indexed. An opening
# parentheses or bracket following these tokens will be recorded as the start
# of a function invocation or indexing operation.
CALLABLE = ['IDENTIFIER', 'STRING', 'REGEX', ')', ']', '}', '?', '::', '@', 'THIS', 'SUPER']
INDEXABLE = CALLABLE.concat 'NUMBER', 'BOOL'
# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN`
# occurs at the start of a line. We disambiguate these from trailing whens to
# avoid an ambiguity in the grammar.
LINE_BREAK = ['INDENT', 'OUTDENT', 'TERMINATOR']