# The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
# matches against the beginning of the source code. When a match is found,
# a token is produced, we consume the match, and start again. Tokens are in the
# form:
#
#     [tag, value, locationData]
#
# where locationData is {first_line, first_column, last_line, last_column,
# last_line_exclusive, last_column_exclusive}, which is a format that can be
# fed directly into [Jison](https://github.com/zaach/jison). These are read by
# jison in the `parser.lexer` function defined in coffeescript.coffee.

{Rewriter, INVERSES} = require './rewriter'

# Import the helpers we need.
{count, starts, compact, repeat, invertLiterate, merge,
attachCommentsToNode, locationDataToString, throwSyntaxError,
replaceUnicodeCodePointEscapes} = require './helpers'

# The Lexer Class
# ---------------

# The Lexer class reads a stream of CoffeeScript and divvies it up into tagged
# tokens. Some potential ambiguity in the grammar has been avoided by
# pushing some extra smarts into the Lexer.
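
# For example, lexing `x = 1` produces a token stream roughly like this
# (location data abbreviated; the exact stream the parser sees also passes
# through the rewriter first):
#
#     ['IDENTIFIER', 'x',  {first_line: 0, first_column: 0, ...}]
#     ['=',          '=',  {...}]
#     ['NUMBER',     '1',  {...}]
#     ['TERMINATOR', '\n', {...}]
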
exports.Lexer = class Lexer

  # **tokenize** is the Lexer's main method. Scan by attempting to match tokens
  # one at a time, using a regular expression anchored at the start of the
  # remaining code, or a custom recursive token-matching method
  # (for interpolations). When the next token has been recorded, we move forward
  # within the code past the token, and begin again.
  #
  # Each tokenizing method is responsible for returning the number of characters
  # it has consumed.
  #
  # Before returning the token stream, run it through the [Rewriter](rewriter.html).
  tokenize: (code, opts = {}) ->
    @literate   = opts.literate # Are we lexing literate CoffeeScript?
    @indent     = 0             # The current indentation level.
    @baseIndent = 0             # The overall minimum indentation level.
    @indebt     = 0             # The over-indentation at the current level.
    @outdebt    = 0             # The under-outdentation at the current level.
    @indents    = []            # The stack of all current indentation levels.
    @indentLiteral = ''         # The indentation.
    @ends       = []            # The stack for pairing up tokens.
    @tokens     = []            # Stream of parsed tokens in the form `['TYPE', value, location data]`.
    @seenFor    = no            # Used to recognize `FORIN`, `FOROF` and `FORFROM` tokens.
    @seenImport = no            # Used to recognize `IMPORT FROM? AS?` tokens.
    @seenExport = no            # Used to recognize `EXPORT FROM? AS?` tokens.
    @importSpecifierList = no   # Used to identify when in an `IMPORT {...} FROM? ...`.
    @exportSpecifierList = no   # Used to identify when in an `EXPORT {...} FROM? ...`.
    @csxDepth = 0               # Used to optimize CSX checks, how deep in CSX we are.
    @csxObjAttribute = {}       # Used to detect if a CSX attribute is wrapped in {} (<div attr={...}>).

    @chunkLine   = opts.line or 0   # The start line for the current @chunk.
    @chunkColumn = opts.column or 0 # The start column of the current @chunk.
    @chunkOffset = opts.offset or 0 # The start offset for the current @chunk.
    code = @clean code              # The stripped, cleaned original source code.

    # At every position, run through this list of attempted matches,
    # short-circuiting if any of them succeed. Their order determines precedence:
    # `@literalToken` is the fallback catch-all.
    i = 0
    while @chunk = code[i..]
      consumed = \
           @identifierToken() or
           @commentToken()    or
           @whitespaceToken() or
           @lineToken()       or
           @stringToken()     or
           @numberToken()     or
           @csxToken()        or
           @regexToken()      or
           @jsToken()         or
           @literalToken()

      # Update position.
      [@chunkLine, @chunkColumn, @chunkOffset] = @getLineAndColumnFromChunk consumed
      i += consumed

      return {@tokens, index: i} if opts.untilBalanced and @ends.length is 0

    @closeIndentation()
    @error "missing #{end.tag}", (end.origin ? end)[2] if end = @ends.pop()
    return @tokens if opts.rewrite is off
    (new Rewriter).rewrite @tokens

  # Preprocess the code to remove leading and trailing whitespace, carriage
  # returns, etc. If we’re lexing literate CoffeeScript, strip external Markdown
  # by removing all lines that aren’t indented by at least four spaces or a tab.
  clean: (code) ->
    code = code.slice(1) if code.charCodeAt(0) is BOM
    code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
    if WHITESPACE.test code
      code = "\n#{code}"
      @chunkLine--
    code = invertLiterate code if @literate
    code
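
  # A minimal usage sketch (variable names are illustrative; `tokenize` and
  # its `rewrite` option are the entry points used by coffeescript.coffee):
  #
  #     lexer  = new Lexer
  #     tokens = lexer.tokenize 'a = 1'               # rewritten stream
  #     raw    = lexer.tokenize 'a = 1', rewrite: off # lexer output only
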
  # Tokenizers
  # ----------

  # Matches identifying literals: variables, keywords, method names, etc.
  # Check to ensure that JavaScript reserved words aren’t being used as
  # identifiers. Because CoffeeScript reserves a handful of keywords that are
  # allowed in JavaScript, we’re careful not to tag them as keywords when
  # referenced as property names here, so you can still do `jQuery.is()` even
  # though `is` means `===` otherwise.
  identifierToken: ->
    inCSXTag = @atCSXTag()
    regex = if inCSXTag then CSX_ATTRIBUTE else IDENTIFIER
    return 0 unless match = regex.exec @chunk
    [input, id, colon] = match

    # Preserve length of id for location data
    idLength = id.length
    poppedToken = undefined
    if id is 'own' and @tag() is 'FOR'
      @token 'OWN', id
      return id.length
    if id is 'from' and @tag() is 'YIELD'
      @token 'FROM', id
      return id.length
    if id is 'as' and @seenImport
      if @value() is '*'
        @tokens[@tokens.length - 1][0] = 'IMPORT_ALL'
      else if @value(yes) in COFFEE_KEYWORDS
        prev = @prev()
        [prev[0], prev[1]] = ['IDENTIFIER', @value(yes)]
      if @tag() in ['DEFAULT', 'IMPORT_ALL', 'IDENTIFIER']
        @token 'AS', id
        return id.length
    if id is 'as' and @seenExport
      if @tag() in ['IDENTIFIER', 'DEFAULT']
        @token 'AS', id
        return id.length
      if @value(yes) in COFFEE_KEYWORDS
        prev = @prev()
        [prev[0], prev[1]] = ['IDENTIFIER', @value(yes)]
        @token 'AS', id
        return id.length
    if id is 'default' and @seenExport and @tag() in ['EXPORT', 'AS']
      @token 'DEFAULT', id
      return id.length
    if id is 'do' and regExSuper = /^(\s*super)(?!\(\))/.exec @chunk[3...]
      @token 'SUPER', 'super'
      @token 'CALL_START', '('
      @token 'CALL_END', ')'
      [input, sup] = regExSuper
      return sup.length + 3

    prev = @prev()

    tag =
      if colon or prev? and
         (prev[0] in ['.', '?.', '::', '?::'] or
          not prev.spaced and prev[0] is '@')
        'PROPERTY'
      else
        'IDENTIFIER'

    tokenData = {}
    if tag is 'IDENTIFIER' and (id in JS_KEYWORDS or id in COFFEE_KEYWORDS) and
       not (@exportSpecifierList and id in COFFEE_KEYWORDS)
      tag = id.toUpperCase()
      if tag is 'WHEN' and @tag() in LINE_BREAK
        tag = 'LEADING_WHEN'
      else if tag is 'FOR'
        @seenFor = yes
      else if tag is 'UNLESS'
        tag = 'IF'
      else if tag is 'IMPORT'
        @seenImport = yes
      else if tag is 'EXPORT'
        @seenExport = yes
      else if tag in UNARY
        tag = 'UNARY'
      else if tag in RELATION
        if tag isnt 'INSTANCEOF' and @seenFor
          tag = 'FOR' + tag
          @seenFor = no
        else
          tag = 'RELATION'
          if @value() is '!'
            poppedToken = @tokens.pop()
            tokenData.invert = poppedToken.data?.original ? poppedToken[1]
    else if tag is 'IDENTIFIER' and @seenFor and id is 'from' and
       isForFrom(prev)
      tag = 'FORFROM'
      @seenFor = no
    # Throw an error on attempts to use `get` or `set` as keywords, or
    # what CoffeeScript would normally interpret as calls to functions named
    # `get` or `set`, i.e. `get({foo: function () {}})`.
    else if tag is 'PROPERTY' and prev
      if prev.spaced and prev[0] in CALLABLE and /^[gs]et$/.test(prev[1]) and
         @tokens.length > 1 and @tokens[@tokens.length - 2][0] not in ['.', '?.', '@']
        @error "'#{prev[1]}' cannot be used as a keyword, or as a function call without parentheses", prev[2]
      else if prev[0] is '.' and @tokens.length > 1 and
          (prevprev = @tokens[@tokens.length - 2])[0] is 'UNARY' and prevprev[1] is 'new'
        prevprev[0] = 'NEW_TARGET'
      else if @tokens.length > 2
        prevprev = @tokens[@tokens.length - 2]
        if prev[0] in ['@', 'THIS'] and prevprev and prevprev.spaced and
           /^[gs]et$/.test(prevprev[1]) and
           @tokens[@tokens.length - 3][0] not in ['.', '?.', '@']
          @error "'#{prevprev[1]}' cannot be used as a keyword, or as a function call without parentheses", prevprev[2]

    if tag is 'IDENTIFIER' and id in RESERVED
      @error "reserved word '#{id}'", length: id.length

    unless tag is 'PROPERTY' or @exportSpecifierList or @importSpecifierList
      if id in COFFEE_ALIASES
        alias = id
        id = COFFEE_ALIAS_MAP[id]
        tokenData.original = alias
      tag = switch id
        when '!'                 then 'UNARY'
        when '==', '!='          then 'COMPARE'
        when 'true', 'false'     then 'BOOL'
        when 'break', 'continue', \
             'debugger'          then 'STATEMENT'
        when '&&', '||'          then id
        else tag

    tagToken = @token tag, id, length: idLength, data: tokenData
    tagToken.origin = [tag, alias, tagToken[2]] if alias
    if poppedToken
      [tagToken[2].first_line, tagToken[2].first_column, tagToken[2].range[0]] =
        [poppedToken[2].first_line, poppedToken[2].first_column, poppedToken[2].range[0]]
    if colon
      colonOffset = input.lastIndexOf if inCSXTag then '=' else ':'
      colonToken = @token ':', ':', offset: colonOffset, length: colon.length
      colonToken.csxColon = yes if inCSXTag # used by rewriter
    if inCSXTag and tag is 'IDENTIFIER' and prev[0] isnt ':'
      @token ',', ',', length: 0, origin: tagToken

    input.length
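
  # A few illustrative taggings from the rules above:
  #
  #     a is b      # `is` is aliased: tagged COMPARE with value '=='
  #     jQuery.is() # `is` after `.` stays a PROPERTY, not a keyword
  #     unless x    # `unless` is tagged IF; its value stays 'unless'
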
  # Matches numbers, including decimals, hex, and exponential notation.
  # Be careful not to interfere with ranges in progress.
  numberToken: ->
    return 0 unless match = NUMBER.exec @chunk

    number = match[0]
    lexedLength = number.length

    switch
      when /^0[BOX]/.test number
        @error "radix prefix in '#{number}' must be lowercase", offset: 1
      when /^(?!0x).*E/.test number
        @error "exponential notation in '#{number}' must be indicated with a lowercase 'e'",
          offset: number.indexOf('E')
      when /^0\d*[89]/.test number
        @error "decimal literal '#{number}' must not be prefixed with '0'", length: lexedLength
      when /^0\d+/.test number
        @error "octal literal '#{number}' must be prefixed with '0o'", length: lexedLength

    parsedValue = Number number
    tokenData = {parsedValue}
    tag = if Number.isFinite(parsedValue) then 'NUMBER' else 'INFINITY'
    if tag is 'INFINITY'
      tokenData.original = number
    @token tag, number,
      length: lexedLength
      data: tokenData
    lexedLength

  # Matches strings, including multiline strings, as well as heredocs, with or without
  # interpolation.
  stringToken: ->
    [quote] = STRING_START.exec(@chunk) || []
    return 0 unless quote

    # If the preceding token is `from` and this is an import or export statement,
    # properly tag the `from`.
    prev = @prev()
    if prev and @value() is 'from' and (@seenImport or @seenExport)
      prev[0] = 'FROM'

    regex = switch quote
      when "'"   then STRING_SINGLE
      when '"'   then STRING_DOUBLE
      when "'''" then HEREDOC_SINGLE
      when '"""' then HEREDOC_DOUBLE

    {tokens, index: end} = @matchWithInterpolations regex, quote

    heredoc = quote.length is 3
    if heredoc
      # Find the smallest indentation. It will be removed from all lines later.
      indent = null
      doc = (token[1] for token, i in tokens when token[0] is 'NEOSTRING').join '#{}'
      while match = HEREDOC_INDENT.exec doc
        attempt = match[1]
        indent = attempt if indent is null or 0 < attempt.length < indent.length

    delimiter = quote.charAt(0)
    @mergeInterpolationTokens tokens, {quote, indent, endOffset: end}, (value) =>
      @validateUnicodeCodePointEscapes value, delimiter: quote

    if @atCSXTag()
      @token ',', ',', length: 0, origin: @prev

    end
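
  # For example, in the heredoc below the smallest indentation is two spaces,
  # which is what later gets stripped from every line, yielding
  # "Hello\n  world":
  #
  #     """
  #       Hello
  #         world
  #     """
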
  # Matches and consumes comments. The comments are taken out of the token
  # stream and saved for later, to be reinserted into the output after
  # everything has been parsed and the JavaScript code generated.
  commentToken: (chunk = @chunk) ->
    return 0 unless match = chunk.match COMMENT
    [comment, here] = match
    contents = null
    # Does this comment follow code on the same line?
    newLine = /^\s*\n+\s*#/.test comment
    if here
      matchIllegal = HERECOMMENT_ILLEGAL.exec comment
      if matchIllegal
        @error "block comments cannot contain #{matchIllegal[0]}",
          offset: matchIllegal.index, length: matchIllegal[0].length

      # Parse indentation or outdentation as if this block comment didn’t exist.
      chunk = chunk.replace "####{here}###", ''
      # Remove leading newlines, like `Rewriter::removeLeadingNewlines`, to
      # avoid the creation of unwanted `TERMINATOR` tokens.
      chunk = chunk.replace /^\n+/, ''
      @lineToken chunk

      # Pull out the ###-style comment’s content, and format it.
      content = here
      if '\n' in content
        content = content.replace /// \n #{repeat ' ', @indent} ///g, '\n'
      contents = [content]
    else
      # The `COMMENT` regex captures successive line comments as one token.
      # Remove any leading newlines before the first comment, but preserve
      # blank lines between line comments.
      content = comment.replace /^(\n*)/, ''
      content = content.replace /^([ |\t]*)#/gm, ''
      contents = content.split '\n'

    commentAttachments = for content, i in contents
      content: content
      here: here?
      newLine: newLine or i isnt 0 # Line comments after the first one start new lines, by definition.

    prev = @prev()
    unless prev
      # If there’s no previous token, create a placeholder token to attach
      # this comment to; and follow with a newline.
      commentAttachments[0].newLine = yes
      @lineToken @chunk[comment.length..] # Set the indent.
      placeholderToken = @makeToken 'JS', '', generated: yes
      placeholderToken.comments = commentAttachments
      @tokens.push placeholderToken
      @newlineToken 0
    else
      attachCommentsToNode commentAttachments, prev

    comment.length

  # Matches JavaScript interpolated directly into the source via backticks.
  jsToken: ->
    return 0 unless @chunk.charAt(0) is '`' and
      (match = (matchedHere = HERE_JSTOKEN.exec(@chunk)) or JSTOKEN.exec(@chunk))
    # Convert escaped backticks to backticks, and escaped backslashes
    # just before escaped backticks to backslashes
    script = match[1]
    {length} = match[0]
    @token 'JS', script, {length, data: {here: !!matchedHere}}
    length

  # Matches regular expression literals, as well as multiline extended ones.
  # Lexing regular expressions is difficult to distinguish from division, so we
  # borrow some basic heuristics from JavaScript and Ruby.
  regexToken: ->
    switch
      when match = REGEX_ILLEGAL.exec @chunk
        @error "regular expressions cannot begin with #{match[2]}",
          offset: match.index + match[1].length
      when match = @matchWithInterpolations HEREGEX, '///'
        {tokens, index} = match
        comments = @chunk[0...index].match /\s+(#(?!{).*)/g
        @commentToken comment for comment in comments if comments
      when match = REGEX.exec @chunk
        [regex, body, closed] = match
        @validateEscapes body, isRegex: yes, offsetInChunk: 1
        index = regex.length
        prev = @prev()
        if prev
          if prev.spaced and prev[0] in CALLABLE
            return 0 if not closed or POSSIBLY_DIVISION.test regex
          else if prev[0] in NOT_REGEX
            return 0
        @error 'missing / (unclosed regex)' unless closed
      else
        return 0

    [flags] = REGEX_FLAGS.exec @chunk[index..]
    end = index + flags.length
    origin = @makeToken 'REGEX', null, length: end
    switch
      when not VALID_FLAGS.test flags
        @error "invalid regular expression flags #{flags}", offset: index, length: flags.length
      when regex or tokens.length is 1
        delimiter = if body then '/' else '///'
        body ?= tokens[0][1]
        @validateUnicodeCodePointEscapes body, {delimiter}
        @token 'REGEX', "/#{body}/#{flags}", {length: end, origin, data: {delimiter}}
      else
        @token 'REGEX_START', '(', {length: 0, origin}
        @token 'IDENTIFIER', 'RegExp', length: 0
        @token 'CALL_START', '(', length: 0
        @mergeInterpolationTokens tokens, {double: yes, heregex: {flags}, endOffset: end - flags.length, quote: '///'}, (str) =>
          @validateUnicodeCodePointEscapes str, {delimiter}
        if flags
          @token ',', ',', offset: index - 1, length: 0
          @token 'STRING', '"' + flags + '"', offset: index - 1, length: flags.length
        @token ')', ')', offset: end, length: 0
        @token 'REGEX_END', ')', offset: end, length: 0

    end
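
  # A sketch of how these heuristics play out:
  #
  #     x = 6 / 2 / 3 # `/` follows a NUMBER (in NOT_REGEX): division
  #     x = /2\/3/    # `/` follows `=`: a regex literal
  #     f /pattern/   # spaced `/` after a CALLABLE, not division-like: regex
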
  # Matches newlines, indents, and outdents, and determines which is which.
  # If we can detect that the current line is continued onto the next line,
  # then the newline is suppressed:
  #
  #     elements
  #       .each( ... )
  #       .map( ... )
  #
  # Keeps track of the level of indentation, because a single outdent token
  # can close multiple indents, so we need to know how far in we happen to be.
  lineToken: (chunk = @chunk) ->
    return 0 unless match = MULTI_DENT.exec chunk
    indent = match[0]

    prev = @prev()
    backslash = prev?[0] is '\\'
    @seenFor = no unless backslash and @seenFor
    @seenImport = no unless (backslash and @seenImport) or @importSpecifierList
    @seenExport = no unless (backslash and @seenExport) or @exportSpecifierList

    size = indent.length - 1 - indent.lastIndexOf '\n'
    noNewlines = @unfinished()

    newIndentLiteral = if size > 0 then indent[-size..] else ''
    unless /^(.?)\1*$/.exec newIndentLiteral
      @error 'mixed indentation', offset: indent.length
      return indent.length

    minLiteralLength = Math.min newIndentLiteral.length, @indentLiteral.length
    if newIndentLiteral[...minLiteralLength] isnt @indentLiteral[...minLiteralLength]
      @error 'indentation mismatch', offset: indent.length
      return indent.length

    if size - @indebt is @indent
      if noNewlines then @suppressNewlines() else @newlineToken 0
      return indent.length

    if size > @indent
      if noNewlines
        @indebt = size - @indent unless backslash
        @suppressNewlines()
        return indent.length
      unless @tokens.length
        @baseIndent = @indent = size
        @indentLiteral = newIndentLiteral
        return indent.length
      diff = size - @indent + @outdebt
      @token 'INDENT', diff, offset: indent.length - size, length: size
      @indents.push diff
      @ends.push {tag: 'OUTDENT'}
      @outdebt = @indebt = 0
      @indent = size
      @indentLiteral = newIndentLiteral
    else if size < @baseIndent
      @error 'missing indentation', offset: indent.length
    else
      @indebt = 0
      @outdentToken @indent - size, noNewlines, indent.length
    indent.length

  # Record an outdent token or multiple tokens, if we happen to be moving back
  # inwards past several recorded indents. Sets new @indent value.
  outdentToken: (moveOut, noNewlines, outdentLength) ->
    decreasedIndent = @indent - moveOut
    while moveOut > 0
      lastIndent = @indents[@indents.length - 1]
      if not lastIndent
        @outdebt = moveOut = 0
      else if @outdebt and moveOut <= @outdebt
        @outdebt -= moveOut
        moveOut = 0
      else
        dent = @indents.pop() + @outdebt
        if outdentLength and @chunk[outdentLength] in INDENTABLE_CLOSERS
          decreasedIndent -= dent - moveOut
          moveOut = dent
        @outdebt = 0
        # pair might call outdentToken, so preserve decreasedIndent
        @pair 'OUTDENT'
        @token 'OUTDENT', moveOut, length: outdentLength
        moveOut -= dent
    @outdebt -= moveOut if dent
    @suppressSemicolons()

    @token 'TERMINATOR', '\n', offset: outdentLength, length: 0 unless @tag() is 'TERMINATOR' or noNewlines
    @indent = decreasedIndent
    @indentLiteral = @indentLiteral[...decreasedIndent]
    this

  # Matches and consumes non-meaningful whitespace. Tag the previous token
  # as being “spaced”, because there are some cases where it makes a difference.
  whitespaceToken: ->
    return 0 unless (match = WHITESPACE.exec @chunk) or
                    (nline = @chunk.charAt(0) is '\n')
    prev = @prev()
    prev[if match then 'spaced' else 'newLine'] = true if prev
    if match then match[0].length else 0

  # Generate a newline token. Consecutive newlines get merged together.
  newlineToken: (offset) ->
    @suppressSemicolons()
    @token 'TERMINATOR', '\n', {offset, length: 0} unless @tag() is 'TERMINATOR'
    this

  # Use a `\` at a line-ending to suppress the newline.
  # The slash is removed here once its job is done.
  suppressNewlines: ->
    prev = @prev()
    if prev[1] is '\\'
      if prev.comments and @tokens.length > 1
        # `@tokens.length` should be at least 2 (some code, then `\`).
        # If something puts a `\` after nothing, they deserve to lose any
        # comments that trail it.
        attachCommentsToNode prev.comments, @tokens[@tokens.length - 2]
      @tokens.pop()
    this
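
  # For example (before rewriting), the snippet
  #
  #     if a
  #       b
  #
  # lexes roughly as IF, IDENTIFIER, INDENT, IDENTIFIER, OUTDENT, TERMINATOR;
  # when several levels end at once, one call to `outdentToken` emits an
  # OUTDENT token for each recorded indent it closes.
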
  # CSX is like JSX but for CoffeeScript.
  csxToken: ->
    firstChar = @chunk[0]
    # Check the previous token to detect if attribute is spread.
    prevChar = if @tokens.length > 0 then @tokens[@tokens.length - 1][0] else ''
    if firstChar is '<'
      match = CSX_IDENTIFIER.exec(@chunk[1...]) or CSX_FRAGMENT_IDENTIFIER.exec(@chunk[1...])
      return 0 unless match and (
        @csxDepth > 0 or
        # Not the right hand side of an unspaced comparison (i.e. `a<b`).
        not (prev = @prev()) or
        prev.spaced or
        prev[0] not in COMPARABLE_LEFT_SIDE
      )
      [input, id] = match
      fullId = id
      if '.' in id
        [id, properties...] = id.split '.'
      else
        properties = []
      tagToken = @token 'CSX_TAG', id,
        length: id.length + 1
        data:
          openingBracketToken: @makeToken '<', '<'
          tagNameToken: @makeToken 'IDENTIFIER', id, offset: 1
      offset = id.length + 1
      for property in properties
        @token '.', '.', {offset}
        offset += 1
        @token 'PROPERTY', property, {offset}
        offset += property.length
      @token 'CALL_START', '(', generated: yes
      @token '[', '[', generated: yes
      @ends.push {tag: '/>', origin: tagToken, name: id, properties}
      @csxDepth++
      return fullId.length + 1
    else if csxTag = @atCSXTag()
      if @chunk[...2] is '/>' # Self-closing tag.
        @pair '/>'
        @token ']', ']',
          length: 2
          generated: yes
        @token 'CALL_END', ')',
          length: 2
          generated: yes
          data:
            selfClosingSlashToken: @makeToken '/', '/'
            closingBracketToken: @makeToken '>', '>', offset: 1
        @csxDepth--
        return 2
      else if firstChar is '{'
        if prevChar is ':'
          token = @token '(', '('
          @csxObjAttribute[@csxDepth] = no
        else
          token = @token '{', '{'
          @csxObjAttribute[@csxDepth] = yes
        @ends.push {tag: '}', origin: token}
        return 1
      else if firstChar is '>' # end of opening tag
        # Ignore terminators inside a tag.
        {origin: openingTagToken} = @pair '/>' # As if the current tag was self-closing.
        @token ']', ']',
          generated: yes
          data:
            closingBracketToken: @makeToken '>', '>'
        @token ',', 'JSX_COMMA', generated: yes
        {tokens, index: end} =
          @matchWithInterpolations INSIDE_CSX, '>', '</', CSX_INTERPOLATION
        @mergeInterpolationTokens tokens, {endOffset: end}, (value) =>
          @validateUnicodeCodePointEscapes value, delimiter: '>'
        match = CSX_IDENTIFIER.exec(@chunk[end...]) or CSX_FRAGMENT_IDENTIFIER.exec(@chunk[end...])
        if not match or match[1] isnt "#{csxTag.name}#{(".#{property}" for property in csxTag.properties).join ''}"
          @error "expected corresponding CSX closing tag for #{csxTag.name}",
            csxTag.origin.data.tagNameToken[2]
        [, fullTagName] = match
        afterTag = end + fullTagName.length
        if @chunk[afterTag] isnt '>'
          @error "missing closing > after tag name", offset: afterTag, length: 1
        # -2/+2 for the opening `</` and closing `>`.
        endToken = @token 'CALL_END', ')',
          offset: end - 2
          length: fullTagName.length + 3
          generated: yes
          data:
            closingTagOpeningBracketToken: @makeToken '<', '<', offset: end - 2
            closingTagSlashToken: @makeToken '/', '/', offset: end - 1
            # TODO: individual tokens for complex tag name? eg < / A . B >
            closingTagNameToken: @makeToken 'IDENTIFIER', fullTagName, offset: end
            closingTagClosingBracketToken: @makeToken '>', '>', offset: end + fullTagName.length
        # make the closing tag location data more easily accessible to the grammar
        addTokenData openingTagToken, endToken.data
        @csxDepth--
        return afterTag + 1
      else
        return 0
    else if @atCSXTag 1
      if firstChar is '}'
        @pair firstChar
        if @csxObjAttribute[@csxDepth]
          @token '}', '}'
          @csxObjAttribute[@csxDepth] = no
        else
          @token ')', ')'
        @token ',', ','
        return 1
      else
        return 0
    else
      return 0

  atCSXTag: (depth = 0) ->
    return no if @csxDepth is 0
    i = @ends.length - 1
    i-- while @ends[i]?.tag is 'OUTDENT' or depth-- > 0 # Ignore indents.
    last = @ends[i]
    last?.tag is '/>' and last
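
  # To sketch the shape of the output: `<div a="b">hi</div>` lexes roughly
  # like a call to `div`: CSX_TAG plus generated CALL_START and `[` tokens
  # open the attribute list, `>` closes it with a generated `]` and comma, the
  # children are merged as an interpolated string, and the closing tag emits
  # the final CALL_END.
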
  # We treat all other single characters as a token. E.g.: `( ) , . !`
  # Multi-character operators are also literal tokens, so that Jison can assign
  # the proper order of operations. There are some symbols that we tag specially
  # here. `;` and newlines are both treated as a `TERMINATOR`, we distinguish
  # parentheses that indicate a method call from regular parentheses, and so on.
  literalToken: ->
    if match = OPERATOR.exec @chunk
      [value] = match
      @tagParameters() if CODE.test value
    else
      value = @chunk.charAt 0
    tag = value
    prev = @prev()

    if prev and value in ['=', COMPOUND_ASSIGN...]
      skipToken = false
      if value is '=' and prev[1] in ['||', '&&'] and not prev.spaced
        prev[0] = 'COMPOUND_ASSIGN'
        prev[1] += '='
        prev.data.original += '=' if prev.data?.original
        prev = @tokens[@tokens.length - 2]
        skipToken = true
      if prev and prev[0] isnt 'PROPERTY'
        origin = prev.origin ? prev
        message = isUnassignable prev[1], origin[1]
        @error message, origin[2] if message
      return value.length if skipToken

    if value is '(' and prev?[0] is 'IMPORT'
      prev[0] = 'DYNAMIC_IMPORT'

    if value is '{' and @seenImport
      @importSpecifierList = yes
    else if @importSpecifierList and value is '}'
      @importSpecifierList = no
    else if value is '{' and prev?[0] is 'EXPORT'
      @exportSpecifierList = yes
    else if @exportSpecifierList and value is '}'
      @exportSpecifierList = no

    if value is ';'
      @error 'unexpected ;' if prev?[0] in ['=', UNFINISHED...]
      @seenFor = @seenImport = @seenExport = no
      tag = 'TERMINATOR'
    else if value is '*' and prev?[0] is 'EXPORT'
      tag = 'EXPORT_ALL'
    else if value in MATH            then tag = 'MATH'
    else if value in COMPARE         then tag = 'COMPARE'
    else if value in COMPOUND_ASSIGN then tag = 'COMPOUND_ASSIGN'
    else if value in UNARY           then tag = 'UNARY'
    else if value in UNARY_MATH      then tag = 'UNARY_MATH'
    else if value in SHIFT           then tag = 'SHIFT'
    else if value is '?' and prev?.spaced then tag = 'BIN?'
    else if prev
      if value is '(' and not prev.spaced and prev[0] in CALLABLE
        prev[0] = 'FUNC_EXIST' if prev[0] is '?'
        tag = 'CALL_START'
      else if value is '[' and ((prev[0] in INDEXABLE and not prev.spaced) or
          (prev[0] is '::')) # `.prototype` can’t be a method you can call.
        tag = 'INDEX_START'
        switch prev[0]
          when '?' then prev[0] = 'INDEX_SOAK'
    token = @makeToken tag, value
    switch value
      when '(', '{', '[' then @ends.push {tag: INVERSES[value], origin: token}
      when ')', '}', ']' then @pair value
    @tokens.push @makeToken tag, value
    value.length

  # Token Manipulators
  # ------------------

  # A source of ambiguity in our grammar used to be parameter lists in function
  # definitions versus argument lists in function calls. Walk backwards, tagging
  # parameters specially in order to make things easier for the parser.
  tagParameters: ->
    return @tagDoIife() if @tag() isnt ')'
    stack = []
    {tokens} = this
    i = tokens.length
    paramEndToken = tokens[--i]
    paramEndToken[0] = 'PARAM_END'
    while tok = tokens[--i]
      switch tok[0]
        when ')'
          stack.push tok
        when '(', 'CALL_START'
          if stack.length then stack.pop()
          else if tok[0] is '('
            tok[0] = 'PARAM_START'
            return @tagDoIife i - 1
          else
            paramEndToken[0] = 'CALL_END'
            return this
    this

  # Tag `do` followed by a function differently than `do` followed by eg an
  # identifier to allow for different grammar precedence
  tagDoIife: (tokenIndex) ->
    tok = @tokens[tokenIndex ? @tokens.length - 1]
    return this unless tok?[0] is 'DO'
    tok[0] = 'DO_IIFE'
    this

  # Close up all remaining open blocks at the end of the file.
  closeIndentation: ->
    @outdentToken @indent
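
  # For instance, upon reaching the `->` in
  #
  #     (a, b) -> a + b
  #
  # `tagParameters` walks back from the just-lexed `)`, retagging it PARAM_END
  # and the matching `(` PARAM_START, so the grammar can tell this parameter
  # list apart from an argument list in a call.
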
  # Match the contents of a delimited token and expand variables and expressions
  # inside it using Ruby-like notation for substitution of arbitrary
  # expressions.
  #
  #     "Hello #{name.capitalize()}."
  #
  # If it encounters an interpolation, this method will recursively create a new
  # Lexer and tokenize until the `{` of `#{` is balanced with a `}`.
  #
  #  - `regex` matches the contents of a token (but not `delimiter`, and not
  #    `#{` if interpolations are desired).
  #  - `delimiter` is the delimiter of the token. Examples are `'`, `"`, `'''`,
  #    `"""` and `///`.
  #  - `closingDelimiter` is different from `delimiter` only in CSX.
  #  - `interpolators` matches the start of an interpolation, for CSX it's both
  #    `{` and `<` (i.e. nested CSX tag).
  #
  # This method allows us to have strings within interpolations within strings,
  # ad infinitum.
  matchWithInterpolations: (regex, delimiter, closingDelimiter = delimiter, interpolators = /^#\{/) ->
    tokens = []
    offsetInChunk = delimiter.length
    return null unless @chunk[...offsetInChunk] is delimiter
    str = @chunk[offsetInChunk..]
    loop
      [strPart] = regex.exec str
      @validateEscapes strPart, {isRegex: delimiter.charAt(0) is '/', offsetInChunk}

      # Push a fake `'NEOSTRING'` token, which will get turned into a real string later.
      tokens.push @makeToken 'NEOSTRING', strPart, offset: offsetInChunk

      str = str[strPart.length..]
      offsetInChunk += strPart.length

      break unless match = interpolators.exec str
      [interpolator] = match

      # To remove the `#` in `#{`.
      interpolationOffset = interpolator.length - 1
      [line, column, offset] = @getLineAndColumnFromChunk offsetInChunk + interpolationOffset
      rest = str[interpolationOffset..]
      {tokens: nested, index} =
        new Lexer().tokenize rest, {line, column, offset, untilBalanced: on}
      # Account for the `#` in `#{`.
      index += interpolationOffset

      braceInterpolator = str[index - 1] is '}'
      if braceInterpolator
        # Turn the leading and trailing `{` and `}` into parentheses. Unnecessary
        # parentheses will be removed later.
        [open, ..., close] = nested
        open[0] = 'INTERPOLATION_START'
        open[1] = '('
        open[2].first_column -= interpolationOffset
        open[2].range = [
          open[2].range[0] - interpolationOffset
          open[2].range[1]
        ]
        close[0] = 'INTERPOLATION_END'
        close[1] = ')'
        close.origin = ['', 'end of interpolation', close[2]]
      # Remove leading `'TERMINATOR'` (if any).
      nested.splice 1, 1 if nested[1]?[0] is 'TERMINATOR'
      # Remove trailing `'INDENT'/'OUTDENT'` pair (if any).
      nested.splice -3, 2 if nested[nested.length - 3]?[0] is 'INDENT' and nested[nested.length - 2][0] is 'OUTDENT'

      unless braceInterpolator
        # We are not using `{` and `}`, so wrap the interpolated tokens instead.
        open = @makeToken 'INTERPOLATION_START', '(', offset: offsetInChunk, length: 0
        close = @makeToken 'INTERPOLATION_END', ')', offset: offsetInChunk + index, length: 0
        nested = [open, nested..., close]

      # Push a fake `'TOKENS'` token, which will get turned into real tokens later.
      tokens.push ['TOKENS', nested]

      str = str[index..]
      offsetInChunk += index

    unless str[...closingDelimiter.length] is closingDelimiter
      @error "missing #{closingDelimiter}", length: delimiter.length

    {tokens, index: offsetInChunk + closingDelimiter.length}
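
  # As an example, matching `"a#{b}c"` yields roughly: a 'NEOSTRING' token for
  # `a`, a fake 'TOKENS' token wrapping the tokens of `b` between generated
  # INTERPOLATION_START/INTERPOLATION_END parentheses, and a 'NEOSTRING' for
  # `c`. `mergeInterpolationTokens` below then folds these into the real
  # token stream.
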
  # Merge the array `tokens` of the fake token types `'TOKENS'` and `'NEOSTRING'`
  # (as returned by `matchWithInterpolations`) into the token stream. The value
  # of `'NEOSTRING'`s are converted using `fn` and turned into strings using
  # `options` first.
  mergeInterpolationTokens: (tokens, options, fn) ->
    {quote, indent, double, heregex, endOffset} = options

    if tokens.length > 1
      lparen = @token 'STRING_START', '(', length: quote?.length ? 0, data: {quote}

    firstIndex = @tokens.length
    $ = tokens.length - 1
    for token, i in tokens
      [tag, value] = token
      switch tag
        when 'TOKENS'
          # There are comments (and nothing else) in this interpolation.
          if value.length is 2 and (value[0].comments or value[1].comments)
            placeholderToken = @makeToken 'JS', ''
            placeholderToken.generated = yes
            # Use the same location data as the first parenthesis.
            placeholderToken[2] = value[0][2]
            for val in value when val.comments
              placeholderToken.comments ?= []
              placeholderToken.comments.push val.comments...
            value.splice 1, 0, placeholderToken
          # Push all the tokens in the fake `'TOKENS'` token. These already have
          # sane location data.
          locationToken = value[0]
          tokensToPush = value
        when 'NEOSTRING'
          # Convert `'NEOSTRING'` into `'STRING'`.
          converted = fn.call this, token[1], i
          addTokenData token, initialChunk: yes if i is 0
          addTokenData token, finalChunk: yes   if i is $
          addTokenData token, {indent, quote, double}
          addTokenData token, {heregex} if heregex
          token[0] = 'STRING'
          token[1] = '"' + converted + '"'
          if tokens.length is 1 and quote?
            token[2].first_column -= quote.length
            if token[1].substr(-2, 1) is '\n'
              token[2].last_line += 1
              token[2].last_column = quote.length - 1
            else
              token[2].last_column += quote.length
              token[2].last_column -= 1 if token[1].length is 2
            token[2].last_column_exclusive += quote.length
            token[2].range = [
              token[2].range[0] - quote.length
              token[2].range[1] + quote.length
            ]
          locationToken = token
          tokensToPush = [token]
      @tokens.push tokensToPush...

    if lparen
      [..., lastToken] = tokens
      lparen.origin = ['STRING', null,
        first_line:            lparen[2].first_line
        first_column:          lparen[2].first_column
        last_line:             lastToken[2].last_line
        last_column:           lastToken[2].last_column
        last_line_exclusive:   lastToken[2].last_line_exclusive
        last_column_exclusive: lastToken[2].last_column_exclusive
        range: [
          lparen[2].range[0]
          lastToken[2].range[1]
        ]
      ]
      lparen[2] = lparen.origin[2]
      rparen = @token 'STRING_END', ')', offset: endOffset - (quote ? '').length, length: quote?.length ? 0

  # Pairs up a closing token, ensuring that all listed pairs of tokens are
  # correctly balanced throughout the course of the token stream.
  pair: (tag) ->
    [..., prev] = @ends
    unless tag is wanted = prev?.tag
      @error "unmatched #{tag}" unless 'OUTDENT' is wanted
      # Auto-close `INDENT` to support syntax like this:
      #
      #     el.click((event) ->
      #       el.hide())
      #
      [..., lastIndent] = @indents
      @outdentToken lastIndent, true
      return @pair tag
    @ends.pop()

  # Helpers
  # -------

  # Returns the line and column number from an offset into the current chunk.
  #
  # `offset` is a number of characters into `@chunk`.
  getLineAndColumnFromChunk: (offset) ->
    if offset is 0
      return [@chunkLine, @chunkColumn, @chunkOffset]

    if offset >= @chunk.length
      string = @chunk
    else
      string = @chunk[..offset - 1]

    lineCount = count string, '\n'

    column = @chunkColumn
    if lineCount > 0
      [..., lastLine] = string.split '\n'
      column = lastLine.length
    else
      column += string.length

    [@chunkLine + lineCount, column, @chunkOffset + offset]

  makeLocationData: ({ offsetInChunk, length }) ->
    locationData = range: []
    [locationData.first_line, locationData.first_column, locationData.range[0]] =
      @getLineAndColumnFromChunk offsetInChunk

    # Use length - 1 for the final offset - we’re supplying the last_line and the last_column,
    # so if last_column == first_column, then we’re looking at a character of length 1.
    lastCharacter = if length > 0 then (length - 1) else 0
    [locationData.last_line, locationData.last_column, endOffset] =
      @getLineAndColumnFromChunk offsetInChunk + lastCharacter
    [locationData.last_line_exclusive, locationData.last_column_exclusive] =
      @getLineAndColumnFromChunk offsetInChunk + lastCharacter + (if length > 0 then 1 else 0)
    locationData.range[1] = if length > 0 then endOffset + 1 else endOffset

    locationData
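
  # As a worked example, a 3-character token at the very start of a chunk gets
  # roughly {first_line: 0, first_column: 0, last_line: 0, last_column: 2,
  # last_column_exclusive: 3, range: [0, 3]}: the inclusive fields point at
  # the token’s final character, while the exclusive fields and `range` point
  # one past it.
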
  # Same as `token`, except this just returns the token without adding it
  # to the results.
  makeToken: (tag, value, {offset: offsetInChunk = 0, length = value.length, origin, generated} = {}) ->
    token = [tag, value, @makeLocationData {offsetInChunk, length}]
    token.origin = origin if origin
    token.generated = yes if generated
    token

  # Add a token to the results.
  # `offset` is the offset into the current `@chunk` where the token starts.
  # `length` is the length of the token in the `@chunk`, after the offset. If
  # not specified, the length of `value` will be used.
  #
  # Returns the new token.
  token: (tag, value, {offset, length, origin, data, generated} = {}) ->
    token = @makeToken tag, value, {offset, length, origin, generated}
    addTokenData token, data if data
    @tokens.push token
    token

  # Peek at the last tag in the token stream.
  tag: ->
    [..., token] = @tokens
    token?[0]

  # Peek at the last value in the token stream.
  value: (useOrigin = no) ->
    [..., token] = @tokens
    if useOrigin and token?.origin?
      token.origin[1]
    else
      token?[1]

  # Get the previous token in the token stream.
  prev: ->
    @tokens[@tokens.length - 1]

  # Are we in the midst of an unfinished expression?
  unfinished: ->
    LINE_CONTINUER.test(@chunk) or @tag() in UNFINISHED

  validateUnicodeCodePointEscapes: (str, options) ->
    replaceUnicodeCodePointEscapes str, merge options, {@error}

  # Validates escapes in strings and regexes.
  validateEscapes: (str, options = {}) ->
    invalidEscapeRegex =
      if options.isRegex
        REGEX_INVALID_ESCAPE
      else
        STRING_INVALID_ESCAPE
    match = invalidEscapeRegex.exec str
    return unless match
    [[], before, octal, hex, unicodeCodePoint, unicode] = match
    message = if octal
      "octal escape sequences are not allowed"
    else
      "invalid escape sequence"
    invalidEscape = "\\#{octal or hex or unicodeCodePoint or unicode}"
    @error "#{message} #{invalidEscape}",
      offset: (options.offsetInChunk ? 0) + match.index + before.length
      length: invalidEscape.length

  suppressSemicolons: ->
    while @value() is ';'
      @tokens.pop()
      @error 'unexpected ;' if @prev()?[0] in ['=', UNFINISHED...]

  # Throws an error at either a given offset from the current chunk or at the
  # location of a token (`token[2]`).
  error: (message, options = {}) =>
    location =
      if 'first_line' of options
        options
      else
        [first_line, first_column] = @getLineAndColumnFromChunk options.offset ? 0
        {first_line, first_column, last_column: first_column + (options.length ? 1) - 1}
    throwSyntaxError message, location

# Helper functions
# ----------------

isUnassignable = (name, displayName = name) -> switch
  when name in [JS_KEYWORDS..., COFFEE_KEYWORDS...]
    "keyword '#{displayName}' can't be assigned"
  when name in STRICT_PROSCRIBED
    "'#{displayName}' can't be assigned"
  when name in RESERVED
    "reserved word '#{displayName}' can't be assigned"
  else
    false

exports.isUnassignable = isUnassignable

# `from` isn’t a CoffeeScript keyword, but it behaves like one in `import` and
# `export` statements (handled above) and in the declaration line of a `for`
# loop. Try to detect when `from` is a variable identifier and when it is this
# “sometimes” keyword.
isForFrom = (prev) ->
  if prev[0] is 'IDENTIFIER'
    # `for i from from`, `for from from iterable`
    if prev[1] is 'from'
      prev[1][0] = 'IDENTIFIER'
      yes
    # `for i from iterable`
    yes
  # `for from…`
  else if prev[0] is 'FOR'
    no
  # `for {from}…`, `for [from]…`, `for {a, from}…`, `for {a: from}…`
  else if prev[1] in ['{', '[', ',', ':']
    no
  else
    yes

addTokenData = (token, data) ->
  Object.assign (token.data ?= {}), data
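
# For example, `isUnassignable` returns an error message rather than throwing:
#
#     isUnassignable 'for'  # => "keyword 'for' can't be assigned"
#     isUnassignable 'eval' # => "'eval' can't be assigned"
#     isUnassignable 'foo'  # => false
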
# Constants
# ---------

# Keywords that CoffeeScript shares in common with JavaScript.
JS_KEYWORDS = [
  'true', 'false', 'null', 'this'
  'new', 'delete', 'typeof', 'in', 'instanceof'
  'return', 'throw', 'break', 'continue', 'debugger', 'yield', 'await'
  'if', 'else', 'switch', 'for', 'while', 'do', 'try', 'catch', 'finally'
  'class', 'extends', 'super'
  'import', 'export', 'default'
]

# CoffeeScript-only keywords.
COFFEE_KEYWORDS = [
  'undefined', 'Infinity', 'NaN'
  'then', 'unless', 'until', 'loop', 'of', 'by', 'when'
]

COFFEE_ALIAS_MAP =
  and  : '&&'
  or   : '||'
  is   : '=='
  isnt : '!='
  not  : '!'
  yes  : 'true'
  no   : 'false'
  on   : 'true'
  off  : 'false'

COFFEE_ALIASES  = (key for key of COFFEE_ALIAS_MAP)
COFFEE_KEYWORDS = COFFEE_KEYWORDS.concat COFFEE_ALIASES

# The list of keywords that are reserved by JavaScript, but not used, or are
# used by CoffeeScript internally. We throw an error when these are encountered,
# to avoid having a JavaScript error at runtime.
RESERVED = [
  'case', 'function', 'var', 'void', 'with', 'const', 'let', 'enum'
  'native', 'implements', 'interface', 'package', 'private'
  'protected', 'public', 'static'
]

STRICT_PROSCRIBED = ['arguments', 'eval']

# The superset of both JavaScript keywords and reserved words, none of which may
# be used as identifiers or properties.
exports.JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED).concat(STRICT_PROSCRIBED)

# The character code of the nasty Microsoft madness otherwise known as the BOM.
BOM = 65279

# Token matching regexes.
IDENTIFIER = /// ^
  (?!\d)
  ( (?: (?!\s)[$\w\x7f-\uffff] )+ )
  ( [^\n\S]* : (?!:) )?  # Is this a property name?
///

CSX_IDENTIFIER = /// ^
  (?![\d<]) # Must not start with `<`.
  ( (?: (?!\s)[\.\-$\w\x7f-\uffff] )+ ) # Like `IDENTIFIER`, but includes `-`s and `.`s.
///

# Fragment: <>
CSX_FRAGMENT_IDENTIFIER = /// ^
  ()> # Ends immediately with `>`.
///

CSX_ATTRIBUTE = /// ^
  (?!\d)
  ( (?: (?!\s)[\-$\w\x7f-\uffff] )+ ) # Like `IDENTIFIER`, but includes `-`s.
  ( [^\S]* = (?!=) )?  # Is this an attribute with a value?
///

NUMBER = ///
  ^ 0b[01]+    |              # binary
  ^ 0o[0-7]+   |              # octal
  ^ 0x[\da-f]+ |              # hex
  ^ \d*\.?\d+ (?:e[+-]?\d+)?  # decimal
///i

OPERATOR = /// ^ (
  ?: [-=]>             # function
   | [-+*/%<>&|^!?=]=  # compound assign / compare
   | >>>=?             # zero-fill right shift
   | ([-+:])\1         # doubles
   | ([&|<>*/%])\2=?   # logic / shift / power / floor division / modulo
   | \?(\.|::)         # soak access
   | \.{2,3}           # range or splat
) ///

WHITESPACE = /^[^\n\S]+/

COMMENT    = /^\s*###([^#][\s\S]*?)(?:###[^\n\S]*|###$)|^(?:\s*#(?!##[^#]).*)+/

CODE       = /^[-=]>/

MULTI_DENT = /^(?:\n[^\n\S]*)+/

JSTOKEN      = ///^ `(?!``) ((?: [^`\\] | \\[\s\S]           )*) `   ///
HERE_JSTOKEN = ///^ ```     ((?: [^`\\] | \\[\s\S] | `(?!``) )*) ``` ///

# String-matching-regexes.
STRING_START = /^(?:'''|"""|'|")/

STRING_SINGLE  = /// ^(?: [^\\']  | \\[\s\S]                      )* ///
STRING_DOUBLE  = /// ^(?: [^\\"#] | \\[\s\S] |           \#(?!\{) )* ///
HEREDOC_SINGLE = /// ^(?: [^\\']  | \\[\s\S] | '(?!'')            )* ///
HEREDOC_DOUBLE = /// ^(?: [^\\"#] | \\[\s\S] | "(?!"") | \#(?!\{) )* ///

INSIDE_CSX = /// ^(?:
    [^
      \{ # Start of CoffeeScript interpolation.
      <  # Maybe CSX tag (`<` not allowed even if bare).
    ]
  )* /// # Similar to `HEREDOC_DOUBLE` but there is no escaping.
CSX_INTERPOLATION = /// ^(?:
      \{     # CoffeeScript interpolation.
    | <(?!/) # CSX opening tag.
  )///

HEREDOC_INDENT = /\n+([^\n\S]*)(?=\S)/g

# Regex-matching-regexes.
REGEX = /// ^
  / (?!/) ((
  ?: [^ [ / \n \\ ]  # Every other thing.
   | \\[^\n]         # Anything but newlines escaped.
   | \[              # Character class.
       (?: \\[^\n] | [^ \] \n \\ ] )*
     \]
  )*) (/)?
///

REGEX_FLAGS = /^\w*/
VALID_FLAGS = /^(?!.*(.).*\1)[gimsuy]*$/
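
# The backreference in VALID_FLAGS rejects duplicated flags, e.g.:
#
#     VALID_FLAGS.test 'gim' # => true
#     VALID_FLAGS.test 'gg'  # => false (duplicate flag)
#     VALID_FLAGS.test 'x'   # => false (unknown flag)
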
HEREGEX = /// ^
  (?:
      # Match any character, except those that need special handling below.
      [^\\/#\s]
      # Match `\` followed by any character.
    | \\[\s\S]
      # Match any `/` except `///`.
    | /(?!//)
      # Match `#` which is not part of interpolation, e.g. `#{}`.
    | \#(?!\{)
      # Comments consume everything until the end of the line, including `///`.
    | \s+(?:#(?!\{).*)?
  )*
///

REGEX_ILLEGAL = /// ^ ( / | /{3}\s*) (\*) ///

POSSIBLY_DIVISION = /// ^ /=?\s ///

# Other regexes.
HERECOMMENT_ILLEGAL = /\*\//

LINE_CONTINUER = /// ^ \s* (?: , | \??\.(?![.\d]) | \??:: ) ///

STRING_INVALID_ESCAPE = ///
  ( (?:^|[^\\]) (?:\\\\)* )             # Make sure the escape isn’t escaped.
  \\ (
     ?: (0[0-7]|[1-7])                  # octal escape
      | (x(?![\da-fA-F]{2}).{0,2})      # hex escape
      | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
      | (u(?!\{|[\da-fA-F]{4}).{0,4})   # unicode escape
  ) ///
REGEX_INVALID_ESCAPE = ///
  ( (?:^|[^\\]) (?:\\\\)* )             # Make sure the escape isn’t escaped.
  \\ (
     ?: (0[0-7])                        # octal escape
      | (x(?![\da-fA-F]{2}).{0,2})      # hex escape
      | (u\{(?![\da-fA-F]{1,}\})[^}]*\}?) # unicode code point escape
      | (u(?!\{|[\da-fA-F]{4}).{0,4})   # unicode escape
  ) ///

TRAILING_SPACES = /\s+$/

# Compound assignment tokens.
COMPOUND_ASSIGN = [
  '-=', '+=', '/=', '*=', '%=', '||=', '&&=', '?=', '<<=', '>>=', '>>>='
  '&=', '^=', '|=', '**=', '//=', '%%='
]

# Unary tokens.
UNARY = ['NEW', 'TYPEOF', 'DELETE']
UNARY_MATH = ['!', '~']

# Bit-shifting tokens.
SHIFT = ['<<', '>>', '>>>']

# Comparison tokens.
COMPARE = ['==', '!=', '<', '>', '<=', '>=']

# Mathematical tokens.
MATH = ['*', '/', '%', '//', '%%']

# Relational tokens that are negatable with `not` prefix.
RELATION = ['IN', 'OF', 'INSTANCEOF']

# Boolean tokens.
BOOL = ['TRUE', 'FALSE']

# Tokens which could legitimately be invoked or indexed. An opening
# parentheses or bracket following these tokens will be recorded as the start
# of a function invocation or indexing operation.
CALLABLE  = ['IDENTIFIER', 'PROPERTY', ')', ']', '?', '@', 'THIS', 'SUPER', 'DYNAMIC_IMPORT']
INDEXABLE = CALLABLE.concat [
  'NUMBER', 'INFINITY', 'NAN', 'STRING', 'STRING_END', 'REGEX', 'REGEX_END'
  'BOOL', 'NULL', 'UNDEFINED', '}', '::'
]

# Tokens which can be the left-hand side of a less-than comparison, i.e. `a<b`.
COMPARABLE_LEFT_SIDE = ['IDENTIFIER', ')', ']', 'NUMBER']

# Tokens which a regular expression will never immediately follow (except spaced
# CALLABLEs in some cases), but which a division operator can.
#
# See: http://www-archive.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
NOT_REGEX = INDEXABLE.concat ['++', '--']

# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN`
# occurs at the start of a line. We disambiguate these from trailing whens to
# avoid an ambiguity in the grammar.
LINE_BREAK = ['INDENT', 'OUTDENT', 'TERMINATOR']

# Additional indent in front of these is ignored.
INDENTABLE_CLOSERS = [')', '}', ']']

# Tokens that, when appearing at the end of a line, mean the expression is
# unfinished and continues onto the next line.
UNFINISHED = ['\\', '.', '?.', '?::', 'UNARY', 'DO', 'DO_IIFE', 'MATH', 'UNARY_MATH', '+', '-',
  'YIELD', '**', 'SHIFT', 'RELATION', 'COMPARE', '&', '^', '|', '&&', '||',
  'BIN?', 'EXTENDS']
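
# For example, a trailing token from UNFINISHED keeps the expression open, so
# no TERMINATOR is emitted between these two lines:
#
#     total = 1 +
#       2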