From a4f7a5e248795f66beee2bed382116ef24bdf748 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Sun, 7 Mar 2010 14:41:52 -0500 Subject: [PATCH] documenting and cleaning up the Rewriter --- documentation/docs/docco.css | 4 +- documentation/docs/lexer.html | 173 +++++++++++----------- documentation/docs/rewriter.html | 133 +++++++++-------- lib/lexer.js | 112 ++++++++------- lib/repl.js | 6 +- lib/rewriter.js | 237 ++++++++++++++++--------------- src/lexer.coffee | 181 +++++++++++------------ src/rewriter.coffee | 176 ++++++++++++----------- test/test_everything.coffee | 2 + 9 files changed, 533 insertions(+), 491 deletions(-) diff --git a/documentation/docs/docco.css b/documentation/docs/docco.css index a510806b..1faf0ad1 100644 --- a/documentation/docs/docco.css +++ b/documentation/docs/docco.css @@ -72,7 +72,7 @@ table td { max-width: 500px; min-width: 500px; min-height: 5px; - padding: 10px 30px 1px 50px; + padding: 10px 25px 1px 50px; vertical-align: top; text-align: left; } @@ -103,7 +103,7 @@ table td { opacity: 1; } td.code, th.code { - padding: 14px 15px 16px 35px; + padding: 14px 15px 16px 50px; width: 100%; vertical-align: top; background: #f5f5ff; diff --git a/documentation/docs/lexer.html b/documentation/docs/lexer.html index 95a31271..2fec663c 100644 --- a/documentation/docs/lexer.html +++ b/documentation/docs/lexer.html @@ -10,61 +10,9 @@ form:

Rewriter: require('./rewriter').Rewriter else this.exports: this - Rewriter: this.Rewriter
#

Constants

#

Keywords that CoffeeScript shares in common with JavaScript.

JS_KEYWORDS: [
-  "if", "else",
-  "true", "false",
-  "new", "return",
-  "try", "catch", "finally", "throw",
-  "break", "continue",
-  "for", "in", "while",
-  "delete", "instanceof", "typeof",
-  "switch", "super", "extends", "class"
-]
#

CoffeeScript-only keywords, which we're more relaxed about allowing. They can't -be used standalone, but you can reference them as an attached property.

COFFEE_KEYWORDS: [
-  "then", "unless",
-  "yes", "no", "on", "off",
-  "and", "or", "is", "isnt", "not",
-  "of", "by", "where", "when"
-]
#

The combined list of keywords is the superset that gets passed verbatim to -the parser.

KEYWORDS: JS_KEYWORDS.concat COFFEE_KEYWORDS
#

The list of keywords that are reserved by JavaScript, but not used, or are -used by CoffeeScript internally. We throw an error when these are encountered, -to avoid having a JavaScript error at runtime.

RESERVED: [
-  "case", "default", "do", "function", "var", "void", "with"
-  "const", "let", "debugger", "enum", "export", "import", "native",
-  "__extends", "__hasProp"
-]
#

The superset of both JavaScript keywords and reserved words, none of which may -be used as identifiers or properties.

JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED
#

Token matching regexes.

IDENTIFIER    : /^([a-zA-Z$_](\w|\$)*)/
-NUMBER        : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
-HEREDOC       : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
-INTERPOLATION : /^\$([a-zA-Z_@]\w*)/
-OPERATOR      : /^([+\*&|\/\-%=<>:!?]+)/
-WHITESPACE    : /^([ \t]+)/
-COMMENT       : /^(((\n?[ \t]*)?#[^\n]*)+)/
-CODE          : /^((-|=)>)/
-REGEX         : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/
-MULTI_DENT    : /^((\n([ \t]*))+)(\.)?/
-LAST_DENTS    : /\n([ \t]*)/g
-LAST_DENT     : /\n([ \t]*)/
-ASSIGNMENT    : /^(:|=)$/
#

Token cleaning regexes.

JS_CLEANER      : /(^`|`$)/g
-MULTILINER      : /\n/g
-STRING_NEWLINES : /\n[ \t]*/g
-COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg
-NO_NEWLINE      : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/
-HEREDOC_INDENT  : /^[ \t]+/mg
#

Tokens which a regular expression will never immediately follow, but which -a division operator might.

- -

See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions

- -

Our list is shorter, due to sans-parentheses method calls.

NOT_REGEX: [
-  'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE'
-]
#

Tokens which could legitimately be invoked or indexed. An opening -parenthesis or bracket following these tokens will be recorded as the start -of a function invocation or indexing operation.

CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@']
#

Tokens that indicate an access -- keywords immediately following will be -treated as identifiers.

ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']
#

Tokens that, when immediately preceding a WHEN, indicate that the WHEN -occurs at the start of a line. We disambiguate these from trailing whens to -avoid an ambiguity in the grammar.

BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR']
#

The Lexer Class

#

The Lexer class reads a stream of CoffeeScript and divvies it up into tagged -tokens. A minor bit of the ambiguity in the grammar has been avoided by -pushing some extra smarts into the Lexer.

exports.Lexer: class Lexer
#

tokenize is the Lexer's main method. Scan by attempting to match tokens + Rewriter: this.Rewriter

#

The Lexer Class

#

The Lexer class reads a stream of CoffeeScript and divvies it up into tagged +tokens. Some potential ambiguity in the grammar has been avoided by +pushing some extra smarts into the Lexer.

exports.Lexer: class Lexer
#

tokenize is the Lexer's main method. Scan by attempting to match tokens one at a time, using a regular expression anchored at the start of the remaining code, or a custom recursive token-matching method (for interpolations). When the next token has been recorded, we move forward @@ -72,7 +20,10 @@ within the code past the token, and begin again.

Each tokenizing method is responsible for incrementing @i by the number of characters it has consumed. @i can be thought of as our finger on the page -of source.

  tokenize: (code, options) ->
+of source.

+ +

Before returning the token stream, run it through the Rewriter +unless explicitly asked not to.

  tokenize: (code, options) ->
     o        : options or {}
     @code    : code         # The remainder of the source code.
     @i       : 0            # Current character position we're parsing.
@@ -85,7 +36,7 @@ of source.

@extract_next_token() @close_indentation() return @tokens if o.rewrite is off - (new Rewriter()).rewrite @tokens
#

At every position, run through this list of attempted matches, + (new Rewriter()).rewrite @tokens

#

At every position, run through this list of attempted matches, short-circuiting if any of them succeed. Their order determines precedence: @literal_token is the fallback catch-all.

  extract_next_token: ->
     return if @identifier_token()
@@ -97,7 +48,7 @@ short-circuiting if any of them succeed. Their order determines precedence:
     return if @whitespace_token()
     return if @js_token()
     return if @string_token()
-    return    @literal_token()
#

Tokenizers

#

Matches identifying literals: variables, keywords, method names, etc. + return @literal_token()

#

Tokenizers

#

Matches identifying literals: variables, keywords, method names, etc. Check to ensure that JavaScript reserved words aren't being used as identifiers. Because CoffeeScript reserves a handful of keywords that are allowed in JavaScript, we're careful not to tag them as keywords when @@ -112,11 +63,11 @@ though is means === otherwise.

tag: 'LEADING_WHEN' if tag is 'WHEN' and include BEFORE_WHEN, @tag() @token(tag, id) @i += id.length - true
#

Matches numbers, including decimals, hex, and exponential notation.

  number_token: ->
+    true
#

Matches numbers, including decimals, hex, and exponential notation.

  number_token: ->
     return false unless number: @match NUMBER, 1
     @token 'NUMBER', number
     @i += number.length
-    true
#

Matches strings, including multi-line strings. Ensures that quotation marks + true

#

Matches strings, including multi-line strings. Ensures that quotation marks are balanced within the string's contents, and within nested interpolations.

  string_token: ->
     return false unless starts(@chunk, '"') or starts(@chunk, "'")
     string: @balanced_token ['"', '"'], ['${', '}']
@@ -125,28 +76,28 @@ are balanced within the string's contents, and within nested interpolations.

@interpolate_string string.replace STRING_NEWLINES, " \\\n" @line += count string, "\n" @i += string.length - true
#

Matches heredocs, adjusting indentation to the correct level, as heredocs + true

#

Matches heredocs, adjusting indentation to the correct level, as heredocs preserve whitespace, but ignore indentation to the left.

  heredoc_token: ->
     return false unless match = @chunk.match(HEREDOC)
     doc: @sanitize_heredoc match[2] or match[4]
     @token 'STRING', "\"$doc\""
     @line += count match[1], "\n"
     @i += match[1].length
-    true
#

Matches JavaScript interpolated directly into the source via backticks.

  js_token: ->
+    true
#

Matches JavaScript interpolated directly into the source via backticks.

  js_token: ->
     return false unless starts @chunk, '`'
     return false unless script: @balanced_token ['`', '`']
     @token 'JS', script.replace(JS_CLEANER, '')
     @i += script.length
-    true
#

Matches regular expression literals. Lexing regular expressions is difficult + true

#

Matches regular expression literals. Lexing regular expressions is difficult to distinguish from division, so we borrow some basic heuristics from JavaScript and Ruby.

  regex_token: ->
     return false unless regex: @match REGEX, 1
     return false if include NOT_REGEX, @tag()
     @token 'REGEX', regex
     @i += regex.length
-    true
#

Matches a token in which which the passed delimiter pairs must be correctly + true

#

Matches a token in which the passed delimiter pairs must be correctly balanced (i.e. strings, JS literals).

  balanced_token: (delimited...) ->
-    @balanced_string @chunk, delimited...
#

Matches and conumes comments. We pass through comments into JavaScript, + @balanced_string @chunk, delimited...

#

Matches and consumes comments. We pass through comments into JavaScript, so they're treated as real tokens, like any other part of the language.

  comment_token: ->
     return false unless comment: @match COMMENT, 1
     @line += (comment.match(MULTILINER) or []).length
@@ -154,7 +105,7 @@ so they're treated as real tokens, like any other part of the language.

@token 'COMMENT', compact lines @token 'TERMINATOR', "\n" @i += comment.length - true
#

Matches newlines, indents, and outdents, and determines which is which. + true

#

Matches newlines, indents, and outdents, and determines which is which. If we can detect that the current line is continued onto the next line, then the newline is suppressed:

@@ -184,25 +135,25 @@ can close multiple indents, so we need to know how far in we happen to be.

else @outdent_token @indent - size, no_newlines @indent: size - true
#

Record an outdent token or multiple tokens, if we happen to be moving back + true

#

Record an outdent token or multiple tokens, if we happen to be moving back inwards past several recorded indents.

  outdent_token: (move_out, no_newlines) ->
     while move_out > 0 and @indents.length
       last_indent: @indents.pop()
       @token 'OUTDENT', last_indent
       move_out -= last_indent
     @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR' or no_newlines
-    true
#

Matches and consumes non-meaningful whitespace. Tag the previous token + true

#

Matches and consumes non-meaningful whitespace. Tag the previous token as being "spaced", because there are some cases where it makes a difference.

  whitespace_token: ->
     return false unless space: @match WHITESPACE, 1
     prev: @prev()
     prev.spaced: true if prev
     @i += space.length
-    true
#

Generate a newline token. Consecutive newlines get merged together.

  newline_token: (newlines) ->
+    true
#

Generate a newline token. Consecutive newlines get merged together.

  newline_token: (newlines) ->
     @token 'TERMINATOR', "\n" unless @tag() is 'TERMINATOR'
-    true
#

Use a \ at a line-ending to suppress the newline. + true

#

Use a \ at a line-ending to suppress the newline. The backslash is removed here once its job is done.

  suppress_newlines: ->
     @tokens.pop() if @value() is "\\"
-    true
#

We treat all other single characters as a token. Eg.: ( ) , . ! + true

#

We treat all other single characters as a token. Eg.: ( ) , . ! Multi-character operators are also literal tokens, so that Jison can assign the proper order of operations. There are some symbols that we tag specially here. ; and newlines are both treated as a TERMINATOR, we distinguish @@ -230,7 +181,7 @@ parentheses that indicate a method call from regular parentheses, and so on.

tag: 'INDEX_START' if value is '[' @token tag, value @i += value.length - true
#

Token Manipulators

#

As we consume a new IDENTIFIER, look at the previous token to determine + true

#

Token Manipulators

#

As we consume a new IDENTIFIER, look at the previous token to determine if it's a special kind of accessor.

  name_access_type: ->
     @tag(1, 'PROTOTYPE_ACCESS') if @value() is '::'
     if @value() is '.' and not (@value(2) is '.')
@@ -238,12 +189,12 @@ if it's a special kind of accessor.

@tag(1, 'SOAK_ACCESS') @tokens.splice(-2, 1) else - @tag 1, 'PROPERTY_ACCESS'
#

Sanitize a heredoc by escaping internal double quotes and erasing all + @tag 1, 'PROPERTY_ACCESS'

#

Sanitize a heredoc by escaping internal double quotes and erasing all external indentation on the left-hand side.

  sanitize_heredoc: (doc) ->
     indent: (doc.match(HEREDOC_INDENT) or ['']).sort()[0]
     doc.replace(new RegExp("^" +indent, 'gm'), '')
        .replace(MULTILINER, "\\n")
-       .replace(/"/g, '\\"')
#

A source of ambiguity in our grammar used to be parameter lists in function + .replace(/"/g, '\\"')

#

A source of ambiguity in our grammar used to be parameter lists in function definitions versus argument lists in function calls. Walk backwards, tagging parameters specially in order to make things easier for the parser.

  tag_parameters: ->
     return if @tag() isnt ')'
@@ -256,12 +207,12 @@ parameters specially in order to make things easier for the parser.

when 'IDENTIFIER' then tok[0]: 'PARAM' when ')' then tok[0]: 'PARAM_END' when '(' then return tok[0]: 'PARAM_START' - true
#

Close up all remaining open blocks at the end of the file.

  close_indentation: ->
-    @outdent_token(@indent)
#

The error for when you try to use a forbidden word in JavaScript as + true

#

Close up all remaining open blocks at the end of the file.

  close_indentation: ->
+    @outdent_token(@indent)
#

The error for when you try to use a forbidden word in JavaScript as an identifier.

  identifier_error: (word) ->
-    throw new Error "SyntaxError: Reserved word \"$word\" on line ${@line + 1}"
#

The error for when you try to assign to a reserved word in JavaScript, + throw new Error "SyntaxError: Reserved word \"$word\" on line ${@line + 1}"

#

The error for when you try to assign to a reserved word in JavaScript, like "function" or "default".

  assignment_error: ->
-    throw new Error "SyntaxError: Reserved word \"${@value()}\" on line ${@line + 1} can't be assigned"
#

Matches a balanced group such as a single or double-quoted string. Pass in + throw new Error "SyntaxError: Reserved word \"${@value()}\" on line ${@line + 1} can't be assigned"

#

Matches a balanced group such as a single or double-quoted string. Pass in a series of delimiters, all of which must be nested correctly within the contents of the string. This method allows us to have strings within interpolations within strings etc...

  balanced_string: (str, delimited...) ->
@@ -286,7 +237,7 @@ interpolations within strings etc...

i += 1 throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" if levels.length return false if i is 0 - return str.substring(0, i)
#

Expand variables and expressions inside double-quoted strings using + return str.substring(0, i)

#

Expand variables and expressions inside double-quoted strings using ECMA Harmony's interpolation syntax for substitution of bare variables as well as arbitrary expressions.

@@ -329,18 +280,70 @@ token stream.

@tokens: @tokens.concat each[1] else @token each[0], each[1] - @token '+', '+' if i < tokens.length - 1
#

Helpers

#

Add a token to the results, taking note of the line number.

  token: (tag, value) ->
-    @tokens.push([tag, value, @line])
#

Peek at a tag in the current token stream.

  tag: (index, tag) ->
+        @token '+', '+' if i < tokens.length - 1
#

Helpers

#

Add a token to the results, taking note of the line number.

  token: (tag, value) ->
+    @tokens.push([tag, value, @line])
#

Peek at a tag in the current token stream.

  tag: (index, tag) ->
     return unless tok: @prev(index)
     return tok[0]: tag if tag?
-    tok[0]
#

Peek at a value in the current token stream.

  value: (index, val) ->
+    tok[0]
#

Peek at a value in the current token stream.

  value: (index, val) ->
     return unless tok: @prev(index)
     return tok[1]: val if val?
-    tok[1]
#

Peek at a previous token, entire.

  prev: (index) ->
-    @tokens[@tokens.length - (index or 1)]
#

Attempt to match a string against the current chunk, returning the indexed + tok[1]

#

Peek at a previous token, entire.

  prev: (index) ->
+    @tokens[@tokens.length - (index or 1)]
#

Attempt to match a string against the current chunk, returning the indexed match if successful, and false otherwise.

  match: (regex, index) ->
     return false unless m: @chunk.match(regex)
-    if m then m[index] else false
#

Utility Functions

#

Does a list include a value?

include: (list, value) ->
+    if m then m[index] else false
#

Constants

#

Keywords that CoffeeScript shares in common with JavaScript.

JS_KEYWORDS: [
+  "if", "else",
+  "true", "false",
+  "new", "return",
+  "try", "catch", "finally", "throw",
+  "break", "continue",
+  "for", "in", "while",
+  "delete", "instanceof", "typeof",
+  "switch", "super", "extends", "class"
+]
#

CoffeeScript-only keywords, which we're more relaxed about allowing. They can't +be used standalone, but you can reference them as an attached property.

COFFEE_KEYWORDS: [
+  "then", "unless",
+  "yes", "no", "on", "off",
+  "and", "or", "is", "isnt", "not",
+  "of", "by", "where", "when"
+]
#

The combined list of keywords is the superset that gets passed verbatim to +the parser.

KEYWORDS: JS_KEYWORDS.concat COFFEE_KEYWORDS
#

The list of keywords that are reserved by JavaScript, but not used, or are +used by CoffeeScript internally. We throw an error when these are encountered, +to avoid having a JavaScript error at runtime.

RESERVED: [
+  "case", "default", "do", "function", "var", "void", "with"
+  "const", "let", "debugger", "enum", "export", "import", "native",
+  "__extends", "__hasProp"
+]
#

The superset of both JavaScript keywords and reserved words, none of which may +be used as identifiers or properties.

JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED
#

Token matching regexes.

IDENTIFIER    : /^([a-zA-Z$_](\w|\$)*)/
+NUMBER        : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
+HEREDOC       : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
+INTERPOLATION : /^\$([a-zA-Z_@]\w*)/
+OPERATOR      : /^([+\*&|\/\-%=<>:!?]+)/
+WHITESPACE    : /^([ \t]+)/
+COMMENT       : /^(((\n?[ \t]*)?#[^\n]*)+)/
+CODE          : /^((-|=)>)/
+REGEX         : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/
+MULTI_DENT    : /^((\n([ \t]*))+)(\.)?/
+LAST_DENTS    : /\n([ \t]*)/g
+LAST_DENT     : /\n([ \t]*)/
+ASSIGNMENT    : /^(:|=)$/
#

Token cleaning regexes.

JS_CLEANER      : /(^`|`$)/g
+MULTILINER      : /\n/g
+STRING_NEWLINES : /\n[ \t]*/g
+COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg
+NO_NEWLINE      : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/
+HEREDOC_INDENT  : /^[ \t]+/mg
#

Tokens which a regular expression will never immediately follow, but which +a division operator might.

+ +

See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions

+ +

Our list is shorter, due to sans-parentheses method calls.

NOT_REGEX: [
+  'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE'
+]
#

Tokens which could legitimately be invoked or indexed. An opening +parenthesis or bracket following these tokens will be recorded as the start +of a function invocation or indexing operation.

CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@']
#

Tokens that indicate an access -- keywords immediately following will be +treated as identifiers.

ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']
#

Tokens that, when immediately preceding a WHEN, indicate that the WHEN +occurs at the start of a line. We disambiguate these from trailing whens to +avoid an ambiguity in the grammar.

BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR']
#

Utility Functions

#

Does a list include a value?

include: (list, value) ->
   list.indexOf(value) >= 0
#

Peek at the beginning of a given string to see if it matches a sequence.

starts: (string, literal, start) ->
   string.substring(start, (start or 0) + literal.length) is literal
#

Trim out all falsy values from an array.

compact: (array) -> item for item in array when item
#

Count the number of occurrences of a character in a string.

count: (string, letter) ->
   num: 0
diff --git a/documentation/docs/rewriter.html b/documentation/docs/rewriter.html
index e19f4de3..4d4e04a7 100644
--- a/documentation/docs/rewriter.html
+++ b/documentation/docs/rewriter.html
@@ -1,42 +1,36 @@
-      rewriter.coffee           

rewriter.coffee

#
this.exports: this unless process?
#

Tokens that must be balanced.

BALANCED_PAIRS: [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'],
-  ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'],
-  ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']]
#

Tokens that signal the start of a balanced pair.

EXPRESSION_START: pair[0] for pair in BALANCED_PAIRS
#

Tokens that signal the end of a balanced pair.

EXPRESSION_TAIL: pair[1] for pair in BALANCED_PAIRS
#

Tokens that indicate the close of a clause of an expression.

EXPRESSION_CLOSE: ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_TAIL)
#

Token pairs that, in immediate succession, indicate an implicit call.

IMPLICIT_FUNC: ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END']
-IMPLICIT_BLOCK:['->', '=>', '{', '[', ',']
-IMPLICIT_END:  ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT']
-IMPLICIT_CALL: ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START',
-                 'TRY', 'DELETE', 'TYPEOF', 'SWITCH',
-                 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT',
-                 '@', '->', '=>', '[', '(', '{']
#

The inverse mappings of token pairs we're trying to fix up.

INVERSES: {}
-for pair in BALANCED_PAIRS
-  INVERSES[pair[0]]: pair[1]
-  INVERSES[pair[1]]: pair[0]
#

Single-line flavors of block expressions that have unclosed endings. -The grammar can't disambiguate them, so we insert the implicit indentation.

SINGLE_LINERS: ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN']
-SINGLE_CLOSERS: ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN']
#

In order to keep the grammar simple, the stream of tokens that the Lexer -emits is rewritten by the Rewriter, smoothing out ambiguities, mis-nested -indentation, and single-line flavors of expressions.

exports.Rewriter: class Rewriter
#

Rewrite the token stream in multiple passes, one logical filter at + rewriter.coffee

@tokens.splice(i,0,['TERMINATOR',"\n",prev[2]])return2else - return1

rewriter.coffee

#

The CoffeeScript language has a decent amount of optional syntax, +implicit syntax, and shorthand syntax. These things can greatly complicate a +grammar and bloat the resulting parse table. Instead of making the parser +handle it all, we take a series of passes over the token stream, +using this Rewriter to convert shorthand into the unambiguous long form, +add implicit indentation and parentheses, balance incorrect nestings, and +generally clean things up.

#

Set up exported variables for both Node.js and the browser.

this.exports: this unless process?
#

The Rewriter class is used by the Lexer, directly against +its internal array of tokens.

exports.Rewriter: class Rewriter
#

Rewrite the token stream in multiple passes, one logical filter at a time. This could certainly be changed into a single pass through the -stream, with a big ol' efficient switch, but it's much nicer like this.

  rewrite: (tokens) ->
+stream, with a big ol' efficient switch, but it's much nicer to work with
+like this. The order of these passes matters -- indentation must be
+corrected before implicit parentheses can be wrapped around blocks of code.

  rewrite: (tokens) ->
     @tokens: tokens
     @adjust_comments()
     @remove_leading_newlines()
     @remove_mid_expression_newlines()
-    @move_commas_outside_outdents()
     @close_open_calls_and_indexes()
     @add_implicit_indentation()
     @add_implicit_parentheses()
     @ensure_balance(BALANCED_PAIRS)
     @rewrite_closing_parens()
-    @tokens
#

Rewrite the token stream, looking one token ahead and behind. + @tokens

#

Rewrite the token stream, looking one token ahead and behind. Allow the return value of the block to tell us how many tokens to move forwards (or backwards) in the stream, to make sure we don't miss anything -as the stream changes length under our feet.

  scan_tokens: (block) ->
+as tokens are inserted and removed, and the stream changes length under
+our feet.

  scan_tokens: (block) ->
     i: 0
     while true
       break unless @tokens[i]
       move: block(@tokens[i - 1], @tokens[i], @tokens[i + 1], i)
       i += move
-    true
#

Massage newlines and indentations so that comments don't have to be -correctly indented, or appear on their own line.

  adjust_comments: ->
+    true
#

Massage newlines and indentations so that comments don't have to be +correctly indented, or appear on a line of their own.

  adjust_comments: ->
     @scan_tokens (prev, token, post, i) =>
       return 1 unless token[0] is 'COMMENT'
       after:  @tokens[i + 2]
@@ -48,19 +42,16 @@ correctly indented, or appear on their own line.

#

Leading newlines would introduce an ambiguity in the grammar, so we + return 1

#

Leading newlines would introduce an ambiguity in the grammar, so we dispatch them here.

  remove_leading_newlines: ->
-    @tokens.shift() if @tokens[0][0] is 'TERMINATOR'
#

Some blocks occur in the middle of expressions -- when we're expecting + @tokens.shift() while @tokens[0][0] is 'TERMINATOR'

#

Some blocks occur in the middle of expressions -- when we're expecting this, remove their trailing newlines.

  remove_mid_expression_newlines: ->
     @scan_tokens (prev, token, post, i) =>
-      return 1 unless post and EXPRESSION_CLOSE.indexOf(post[0]) >= 0 and token[0] is 'TERMINATOR'
+      return 1 unless post and include(EXPRESSION_CLOSE, post[0]) and token[0] is 'TERMINATOR'
       @tokens.splice(i, 1)
-      return 0
#

Make sure that we don't accidentally break trailing commas, which need -to go on the outside of expression closers.

  move_commas_outside_outdents: ->
-    @scan_tokens (prev, token, post, i) =>
-      @tokens.splice(i, 1, token) if token[0] is 'OUTDENT' and prev[0] is ','
-      return 1
#

We've tagged the opening parenthesis of a method call, and the opening -bracket of an indexing operation. Match them with their close.

  close_open_calls_and_indexes: ->
+      return 0
#

The lexer has tagged the opening parenthesis of a method call, and the +opening bracket of an indexing operation. Match them with their paired +close.

  close_open_calls_and_indexes: ->
     parens:   [0]
     brackets: [0]
     @scan_tokens (prev, token, post, i) =>
@@ -81,7 +72,7 @@ bracket of an indexing operation. Match them with their close.

< token[0]: 'INDEX_END' else brackets[brackets.length - 1] -= 1 - return 1
#

Methods may be optionally called without parentheses, for simple cases. + return 1

#

Methods may be optionally called without parentheses, for simple cases. Insert the implicit parentheses here, so that the parser doesn't have to deal with them.

  add_implicit_parentheses: ->
     stack: [0]
@@ -91,25 +82,26 @@ deal with them.

if tag is 'OUTDENT' last: stack.pop() stack[stack.length - 1] += last - if IMPLICIT_END.indexOf(tag) >= 0 or !post? - return 1 if tag is 'INDENT' and prev and IMPLICIT_BLOCK.indexOf(prev[0]) >= 0 + if !post? or include IMPLICIT_END, tag + return 1 if tag is 'INDENT' and prev and include IMPLICIT_BLOCK, prev[0] if stack[stack.length - 1] > 0 or tag is 'INDENT' - idx: if tag is 'OUTDENT' then i +1 else i + idx: if tag is 'OUTDENT' then i + 1 else i stack_pointer: if tag is 'INDENT' then 2 else 1 for tmp in [0...stack[stack.length - stack_pointer]] @tokens.splice(idx, 0, ['CALL_END', ')', token[2]]) size: stack[stack.length - stack_pointer] + 1 stack[stack.length - stack_pointer]: 0 return size - return 1 unless prev and IMPLICIT_FUNC.indexOf(prev[0]) >= 0 and IMPLICIT_CALL.indexOf(tag) >= 0 + return 1 unless prev and include(IMPLICIT_FUNC, prev[0]) and include IMPLICIT_CALL, tag @tokens.splice(i, 0, ['CALL_START', '(', token[2]]) stack[stack.length - 1] += 1 - return 2
#

Because our grammar is LALR(1), it can't handle some single-line -expressions that lack ending delimiters. Use the lexer to add the implicit -blocks, so it doesn't need to. -')' can close a single-line block, but we need to make sure it's balanced.

  add_implicit_indentation: ->
+      return 2
#

Because our grammar is LALR(1), it can't handle some single-line +expressions that lack ending delimiters. The Rewriter adds the implicit +blocks, so it doesn't need to. ')' can close a single-line block, +but we need to make sure it's balanced.

  add_implicit_indentation: ->
     @scan_tokens (prev, token, post, i) =>
-      return 1 unless SINGLE_LINERS.indexOf(token[0]) >= 0 and post[0] isnt 'INDENT' and
+      return 1 unless include(SINGLE_LINERS, token[0]) and
+        post[0] isnt 'INDENT' and
         not (token[0] is 'ELSE' and post[0] is 'IF')
       starter: token[0]
       @tokens.splice(i + 1, 0, ['INDENT', 2, token[2]])
@@ -120,7 +112,7 @@ blocks, so it doesn't need to.
         tok: @tokens[idx]
         pre: @tokens[idx - 1]
         if (not tok or
-            (SINGLE_CLOSERS.indexOf(tok[0]) >= 0 and tok[1] isnt ';') or
+            (include(SINGLE_CLOSERS, tok[0]) and tok[1] isnt ';') or
             (tok[0] is ')' && parens is 0)) and
             not (starter is 'ELSE' and tok[0] is 'ELSE')
           insertion: if pre[0] is "," then idx - 1 else idx
@@ -130,7 +122,7 @@ blocks, so it doesn't need to.
         parens -= 1 if tok[0] is ')'
       return 1 unless token[0] is 'THEN'
       @tokens.splice(i, 1)
-      return 0
#

Ensure that all listed pairs of tokens are correctly balanced throughout + return 0

#

Ensure that all listed pairs of tokens are correctly balanced throughout the course of the token stream.

  ensure_balance: (pairs) ->
     levels: {}
     @scan_tokens (prev, token, post, i) =>
@@ -142,10 +134,13 @@ the course of the token stream.

throw new Error("too many ${token[1]} on line ${token[2] + 1}") if levels[open] < 0 return 1 unclosed: key for key, value of levels when value > 0 - throw new Error("unclosed ${unclosed[0]}") if unclosed.length
#

We'd like to support syntax like this: - el.click((event) -> - el.hide()) -In order to accomplish this, move outdents that follow closing parens + throw new Error("unclosed ${unclosed[0]}") if unclosed.length

#

We'd like to support syntax like this:

+ +
el.click((event) ->
+  el.hide())
+
+ +

In order to accomplish this, move outdents that follow closing parens inwards, safely. The steps to accomplish this are:

    @@ -161,20 +156,36 @@ up balanced in the end. (debt[key]: 0) for key, val of INVERSES @scan_tokens (prev, token, post, i) => tag: token[0] - inv: INVERSES[token[0]]
#

Push openers onto the stack.

      if EXPRESSION_START.indexOf(tag) >= 0
-        stack.push(token)
-        return 1
#

The end of an expression, check stack and debt for a pair.

      else if EXPRESSION_TAIL.indexOf(tag) >= 0
#

If the tag is already in our debt, swallow it.

        if debt[inv] > 0
-          debt[inv] -= 1
-          @tokens.splice(i, 1)
-          return 0
-        else
#

Pop the stack of open delimiters.

          match: stack.pop()
-          mtag:  match[0]
#

Continue onwards if it's the expected tag.

          if tag is INVERSES[mtag]
-            return 1
-          else
#

Unexpected close, insert correct close, adding to the debt.

            debt[mtag] += 1
-            val: if mtag is 'INDENT' then match[1] else INVERSES[mtag]
-            @tokens.splice(i, 0, [INVERSES[mtag], val])
-            return 1
-      else
+      inv: INVERSES[token[0]]
+      if include EXPRESSION_START, tag
+        stack.push token
         return 1
+      else if include EXPRESSION_END, tag
+        if debt[inv] > 0
+          debt[inv] -= 1
+          @tokens.splice i, 1
+          return 0
+        else
+          match: stack.pop()
+          mtag:  match[0]
+          return 1 if tag is INVERSES[mtag]
+          debt[mtag] += 1
+          val: if mtag is 'INDENT' then match[1] else INVERSES[mtag]
+          @tokens.splice i, 0, [INVERSES[mtag], val]
+          return 1
+      else
+        return 1
#

Constants

#

List of the token pairs that must be balanced.

BALANCED_PAIRS: [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'],
+  ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'],
+  ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']]
#

The inverse mappings of BALANCED_PAIRS we're trying to fix up, so we can +look things up from either end.

INVERSES: {}
+for pair in BALANCED_PAIRS
+  INVERSES[pair[0]]: pair[1]
+  INVERSES[pair[1]]: pair[0]
#

The tokens that signal the start of a balanced pair.

EXPRESSION_START: pair[0] for pair in BALANCED_PAIRS
#

The tokens that signal the end of a balanced pair.

EXPRESSION_END:   pair[1] for pair in BALANCED_PAIRS
#

Tokens that indicate the close of a clause of an expression.

EXPRESSION_CLOSE: ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat EXPRESSION_END
#

Tokens that, if followed by an IMPLICIT_CALL, indicate a function invocation.

IMPLICIT_FUNC:  ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END']
#

Tokens that, if preceded by an IMPLICIT_FUNC, indicate a function invocation.

IMPLICIT_CALL:  ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START',
+                 'TRY', 'DELETE', 'TYPEOF', 'SWITCH',
+                 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT',
+                 '@', '->', '=>', '[', '(', '{']
#

Tokens indicating that the implicit call must enclose a block of expressions.

IMPLICIT_BLOCK: ['->', '=>', '{', '[', ',']
#

Tokens that always mark the end of an implicit call for single-liners.

IMPLICIT_END:   ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT']
#

Single-line flavors of block expressions that have unclosed endings. +The grammar can't disambiguate them, so we insert the implicit indentation.

SINGLE_LINERS: ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN']
+SINGLE_CLOSERS: ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN']
#

Utility Functions

#

Does a list include a value?

include: (list, value) ->
+  list.indexOf(value) >= 0
 
 
\ No newline at end of file diff --git a/lib/lexer.js b/lib/lexer.js index 9acd6fc0..b2f9cbad 100644 --- a/lib/lexer.js +++ b/lib/lexer.js @@ -13,64 +13,10 @@ this.exports = this; Rewriter = this.Rewriter; } - // Constants - // --------- - // Keywords that CoffeeScript shares in common with JavaScript. - JS_KEYWORDS = ["if", "else", "true", "false", "new", "return", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "while", "delete", "instanceof", "typeof", "switch", "super", "extends", "class"]; - // CoffeeScript-only keywords, which we're more relaxed about allowing. They can't - // be used standalone, but you can reference them as an attached property. - COFFEE_KEYWORDS = ["then", "unless", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", "of", "by", "where", "when"]; - // The combined list of keywords is the superset that gets passed verbatim to - // the parser. - KEYWORDS = JS_KEYWORDS.concat(COFFEE_KEYWORDS); - // The list of keywords that are reserved by JavaScript, but not used, or are - // used by CoffeeScript internally. We throw an error when these are encountered, - // to avoid having a JavaScript error at runtime. - RESERVED = ["case", "default", "do", "function", "var", "void", "with", "const", "let", "debugger", "enum", "export", "import", "native", "__extends", "__hasProp"]; - // The superset of both JavaScript keywords and reserved words, none of which may - // be used as identifiers or properties. - JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED); - // Token matching regexes. 
- IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; - NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; - HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; - INTERPOLATION = /^\$([a-zA-Z_@]\w*)/; - OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; - WHITESPACE = /^([ \t]+)/; - COMMENT = /^(((\n?[ \t]*)?#[^\n]*)+)/; - CODE = /^((-|=)>)/; - REGEX = /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/; - MULTI_DENT = /^((\n([ \t]*))+)(\.)?/; - LAST_DENTS = /\n([ \t]*)/g; - LAST_DENT = /\n([ \t]*)/; - ASSIGNMENT = /^(:|=)$/; - // Token cleaning regexes. - JS_CLEANER = /(^`|`$)/g; - MULTILINER = /\n/g; - STRING_NEWLINES = /\n[ \t]*/g; - COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg; - NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/; - HEREDOC_INDENT = /^[ \t]+/mg; - // Tokens which a regular expression will never immediately follow, but which - // a division operator might. - // See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions - // Our list is shorter, due to sans-parentheses method calls. - NOT_REGEX = ['NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE']; - // Tokens which could legitimately be invoked or indexed. A opening - // parentheses or bracket following these tokens will be recorded as the start - // of a function invocation or indexing operation. - CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@']; - // Tokens that indicate an access -- keywords immediately following will be - // treated as identifiers. - ACCESSORS = ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']; - // Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN` - // occurs at the start of a line. We disambiguate these from trailing whens to - // avoid an ambiguity in the grammar. 
- BEFORE_WHEN = ['INDENT', 'OUTDENT', 'TERMINATOR']; // The Lexer Class // --------------- // The Lexer class reads a stream of CoffeeScript and divvys it up into tagged - // tokens. A minor bit of the ambiguity in the grammar has been avoided by + // tokens. Some potential ambiguity in the grammar has been avoided by // pushing some extra smarts into the Lexer. exports.Lexer = (function() { Lexer = function Lexer() { }; @@ -82,6 +28,8 @@ // Each tokenizing method is responsible for incrementing `@i` by the number of // characters it has consumed. `@i` can be thought of as our finger on the page // of source. + // Before returning the token stream, run it through the [Rewriter](rewriter.html) + // unless explicitly asked not to. Lexer.prototype.tokenize = function tokenize(code, options) { var o; o = options || {}; @@ -598,6 +546,60 @@ }; return Lexer; }).call(this); + // Constants + // --------- + // Keywords that CoffeeScript shares in common with JavaScript. + JS_KEYWORDS = ["if", "else", "true", "false", "new", "return", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "while", "delete", "instanceof", "typeof", "switch", "super", "extends", "class"]; + // CoffeeScript-only keywords, which we're more relaxed about allowing. They can't + // be used standalone, but you can reference them as an attached property. + COFFEE_KEYWORDS = ["then", "unless", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", "of", "by", "where", "when"]; + // The combined list of keywords is the superset that gets passed verbatim to + // the parser. + KEYWORDS = JS_KEYWORDS.concat(COFFEE_KEYWORDS); + // The list of keywords that are reserved by JavaScript, but not used, or are + // used by CoffeeScript internally. We throw an error when these are encountered, + // to avoid having a JavaScript error at runtime. 
+ RESERVED = ["case", "default", "do", "function", "var", "void", "with", "const", "let", "debugger", "enum", "export", "import", "native", "__extends", "__hasProp"]; + // The superset of both JavaScript keywords and reserved words, none of which may + // be used as identifiers or properties. + JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED); + // Token matching regexes. + IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; + NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; + HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; + INTERPOLATION = /^\$([a-zA-Z_@]\w*)/; + OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; + WHITESPACE = /^([ \t]+)/; + COMMENT = /^(((\n?[ \t]*)?#[^\n]*)+)/; + CODE = /^((-|=)>)/; + REGEX = /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/; + MULTI_DENT = /^((\n([ \t]*))+)(\.)?/; + LAST_DENTS = /\n([ \t]*)/g; + LAST_DENT = /\n([ \t]*)/; + ASSIGNMENT = /^(:|=)$/; + // Token cleaning regexes. + JS_CLEANER = /(^`|`$)/g; + MULTILINER = /\n/g; + STRING_NEWLINES = /\n[ \t]*/g; + COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg; + NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/; + HEREDOC_INDENT = /^[ \t]+/mg; + // Tokens which a regular expression will never immediately follow, but which + // a division operator might. + // See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions + // Our list is shorter, due to sans-parentheses method calls. + NOT_REGEX = ['NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE']; + // Tokens which could legitimately be invoked or indexed. A opening + // parentheses or bracket following these tokens will be recorded as the start + // of a function invocation or indexing operation. + CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@']; + // Tokens that indicate an access -- keywords immediately following will be + // treated as identifiers. 
+ ACCESSORS = ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@']; + // Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN` + // occurs at the start of a line. We disambiguate these from trailing whens to + // avoid an ambiguity in the grammar. + BEFORE_WHEN = ['INDENT', 'OUTDENT', 'TERMINATOR']; // Utility Functions // ----------------- // Does a list include a value? diff --git a/lib/repl.js b/lib/repl.js index 862758b9..a97dbd6b 100644 --- a/lib/repl.js +++ b/lib/repl.js @@ -2,6 +2,8 @@ var CoffeeScript, prompt, quit, run; // A very simple Read-Eval-Print-Loop. Compiles one line at a time to JavaScript // and evaluates it. Good for simple tests, or poking around the **Node.js** API. + // Using it looks like this: + // coffee> puts "$num bottles of beer" for num in [99..1] // Require the **coffee-script** module to get access to the compiler. CoffeeScript = require('coffee-script'); // Our prompt. @@ -10,7 +12,7 @@ quit = function quit() { return process.exit(0); }; - // The main REPL function. `run` is called every time a line of code is entered. + // The main REPL function. **run** is called every time a line of code is entered. // Attempt to evaluate the command. If there's an exception, print it out instead // of exiting. run = function run(code) { @@ -28,7 +30,7 @@ } return print(prompt); }; - // Start up the REPL. + // Start up the REPL by opening **stdio** and listening for input. 
process.stdio.addListener('data', run); process.stdio.open(); print(prompt); diff --git a/lib/rewriter.js b/lib/rewriter.js index dea82090..cbf63c26 100644 --- a/lib/rewriter.js +++ b/lib/rewriter.js @@ -1,62 +1,31 @@ (function(){ - var BALANCED_PAIRS, EXPRESSION_CLOSE, EXPRESSION_START, EXPRESSION_TAIL, IMPLICIT_BLOCK, IMPLICIT_CALL, IMPLICIT_END, IMPLICIT_FUNC, INVERSES, Rewriter, SINGLE_CLOSERS, SINGLE_LINERS, _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, pair; + var BALANCED_PAIRS, EXPRESSION_CLOSE, EXPRESSION_END, EXPRESSION_START, IMPLICIT_BLOCK, IMPLICIT_CALL, IMPLICIT_END, IMPLICIT_FUNC, INVERSES, Rewriter, SINGLE_CLOSERS, SINGLE_LINERS, _a, _b, _c, _d, _e, _f, _g, _h, _i, _j, _k, include, pair; var __hasProp = Object.prototype.hasOwnProperty; + // The CoffeeScript language has a decent amount of optional syntax, + // implicit syntax, and shorthand syntax. These things can greatly complicate a + // grammar and bloat the resulting parse table. Instead of making the parser + // handle it all, we take a series of passes over the token stream, + // using this **Rewriter** to convert shorthand into the unambiguous long form, + // add implicit indentation and parentheses, balance incorrect nestings, and + // generally clean things up. + // Set up exported variables for both Node.js and the browser. if (!((typeof process !== "undefined" && process !== null))) { this.exports = this; } - // Tokens that must be balanced. - BALANCED_PAIRS = [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']]; - // Tokens that signal the start of a balanced pair. - EXPRESSION_START = (function() { - _a = []; _b = BALANCED_PAIRS; - for (_c = 0, _d = _b.length; _c < _d; _c++) { - pair = _b[_c]; - _a.push(pair[0]); - } - return _a; - }).call(this); - // Tokens that signal the end of a balanced pair. 
- EXPRESSION_TAIL = (function() { - _e = []; _f = BALANCED_PAIRS; - for (_g = 0, _h = _f.length; _g < _h; _g++) { - pair = _f[_g]; - _e.push(pair[1]); - } - return _e; - }).call(this); - // Tokens that indicate the close of a clause of an expression. - EXPRESSION_CLOSE = ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_TAIL); - // Tokens pairs that, in immediate succession, indicate an implicit call. - IMPLICIT_FUNC = ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END']; - IMPLICIT_BLOCK = ['->', '=>', '{', '[', ',']; - IMPLICIT_END = ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT']; - IMPLICIT_CALL = ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', '@', '->', '=>', '[', '(', '{']; - // The inverse mappings of token pairs we're trying to fix up. - INVERSES = {}; - _i = BALANCED_PAIRS; - for (_j = 0, _k = _i.length; _j < _k; _j++) { - pair = _i[_j]; - INVERSES[pair[0]] = pair[1]; - INVERSES[pair[1]] = pair[0]; - } - // Single-line flavors of block expressions that have unclosed endings. - // The grammar can't disambiguate them, so we insert the implicit indentation. - SINGLE_LINERS = ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN']; - SINGLE_CLOSERS = ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN']; - // In order to keep the grammar simple, the stream of tokens that the Lexer - // emits is rewritten by the Rewriter, smoothing out ambiguities, mis-nested - // indentation, and single-line flavors of expressions. + // The **Rewriter** class is used by the [Lexer](lexer.html), directly against + // its internal array of tokens. exports.Rewriter = (function() { Rewriter = function Rewriter() { }; // Rewrite the token stream in multiple passes, one logical filter at // a time. 
This could certainly be changed into a single pass through the - // stream, with a big ol' efficient switch, but it's much nicer like this. + // stream, with a big ol' efficient switch, but it's much nicer to work with + // like this. The order of these passes matters -- indentation must be + // corrected before implicit parentheses can be wrapped around blocks of code. Rewriter.prototype.rewrite = function rewrite(tokens) { this.tokens = tokens; this.adjust_comments(); this.remove_leading_newlines(); this.remove_mid_expression_newlines(); - this.move_commas_outside_outdents(); this.close_open_calls_and_indexes(); this.add_implicit_indentation(); this.add_implicit_parentheses(); @@ -67,7 +36,8 @@ // Rewrite the token stream, looking one token ahead and behind. // Allow the return value of the block to tell us how many tokens to move // forwards (or backwards) in the stream, to make sure we don't miss anything - // as the stream changes length under our feet. + // as tokens are inserted and removed, and the stream changes length under + // our feet. Rewriter.prototype.scan_tokens = function scan_tokens(block) { var i, move; i = 0; @@ -81,7 +51,7 @@ return true; }; // Massage newlines and indentations so that comments don't have to be - // correctly indented, or appear on their own line. + // correctly indented, or appear on a line of their own. Rewriter.prototype.adjust_comments = function adjust_comments() { return this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { @@ -109,16 +79,19 @@ // Leading newlines would introduce an ambiguity in the grammar, so we // dispatch them here. 
Rewriter.prototype.remove_leading_newlines = function remove_leading_newlines() { - if (this.tokens[0][0] === 'TERMINATOR') { - return this.tokens.shift(); + var _a; + _a = []; + while (this.tokens[0][0] === 'TERMINATOR') { + _a.push(this.tokens.shift()); } + return _a; }; // Some blocks occur in the middle of expressions -- when we're expecting // this, remove their trailing newlines. Rewriter.prototype.remove_mid_expression_newlines = function remove_mid_expression_newlines() { return this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { - if (!(post && EXPRESSION_CLOSE.indexOf(post[0]) >= 0 && token[0] === 'TERMINATOR')) { + if (!(post && include(EXPRESSION_CLOSE, post[0]) && token[0] === 'TERMINATOR')) { return 1; } this.tokens.splice(i, 1); @@ -129,46 +102,32 @@ }); })(this)); }; - // Make sure that we don't accidentally break trailing commas, which need - // to go on the outside of expression closers. - Rewriter.prototype.move_commas_outside_outdents = function move_commas_outside_outdents() { - return this.scan_tokens((function(__this) { - var __func = function(prev, token, post, i) { - if (token[0] === 'OUTDENT' && prev[0] === ',') { - this.tokens.splice(i, 1, token); - } - return 1; - }; - return (function() { - return __func.apply(__this, arguments); - }); - })(this)); - }; - // We've tagged the opening parenthesis of a method call, and the opening - // bracket of an indexing operation. Match them with their close. + // The lexer has tagged the opening parenthesis of a method call, and the + // opening bracket of an indexing operation. Match them with their paired + // close. 
Rewriter.prototype.close_open_calls_and_indexes = function close_open_calls_and_indexes() { var brackets, parens; parens = [0]; brackets = [0]; return this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { - var _l; - if ((_l = token[0]) === 'CALL_START') { + var _a; + if ((_a = token[0]) === 'CALL_START') { parens.push(0); - } else if (_l === 'INDEX_START') { + } else if (_a === 'INDEX_START') { brackets.push(0); - } else if (_l === '(') { + } else if (_a === '(') { parens[parens.length - 1] += 1; - } else if (_l === '[') { + } else if (_a === '[') { brackets[brackets.length - 1] += 1; - } else if (_l === ')') { + } else if (_a === ')') { if (parens[parens.length - 1] === 0) { parens.pop(); token[0] = 'CALL_END'; } else { parens[parens.length - 1] -= 1; } - } else if (_l === ']') { + } else if (_a === ']') { if (brackets[brackets.length - 1] === 0) { brackets.pop(); token[0] = 'INDEX_END'; @@ -191,7 +150,7 @@ stack = [0]; return this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { - var _l, _m, _n, _o, idx, last, size, stack_pointer, tag, tmp; + var _a, _b, _c, _d, idx, last, size, stack_pointer, tag, tmp; tag = token[0]; if (tag === 'INDENT') { stack.push(0); @@ -200,15 +159,15 @@ last = stack.pop(); stack[stack.length - 1] += last; } - if (IMPLICIT_END.indexOf(tag) >= 0 || !(typeof post !== "undefined" && post !== null)) { - if (tag === 'INDENT' && prev && IMPLICIT_BLOCK.indexOf(prev[0]) >= 0) { + if (!(typeof post !== "undefined" && post !== null) || include(IMPLICIT_END, tag)) { + if (tag === 'INDENT' && prev && include(IMPLICIT_BLOCK, prev[0])) { return 1; } if (stack[stack.length - 1] > 0 || tag === 'INDENT') { idx = tag === 'OUTDENT' ? i + 1 : i; stack_pointer = tag === 'INDENT' ? 2 : 1; - _n = 0; _o = stack[stack.length - stack_pointer]; - for (_m = 0, tmp = _n; (_n <= _o ? tmp < _o : tmp > _o); (_n <= _o ? 
tmp += 1 : tmp -= 1), _m++) { + _c = 0; _d = stack[stack.length - stack_pointer]; + for (_b = 0, tmp = _c; (_c <= _d ? tmp < _d : tmp > _d); (_c <= _d ? tmp += 1 : tmp -= 1), _b++) { this.tokens.splice(idx, 0, ['CALL_END', ')', token[2]]); } size = stack[stack.length - stack_pointer] + 1; @@ -216,7 +175,7 @@ return size; } } - if (!(prev && IMPLICIT_FUNC.indexOf(prev[0]) >= 0 && IMPLICIT_CALL.indexOf(tag) >= 0)) { + if (!(prev && include(IMPLICIT_FUNC, prev[0]) && include(IMPLICIT_CALL, tag))) { return 1; } this.tokens.splice(i, 0, ['CALL_START', '(', token[2]]); @@ -229,14 +188,14 @@ })(this)); }; // Because our grammar is LALR(1), it can't handle some single-line - // expressions that lack ending delimiters. Use the lexer to add the implicit - // blocks, so it doesn't need to. - // ')' can close a single-line block, but we need to make sure it's balanced. + // expressions that lack ending delimiters. The **Rewriter** adds the implicit + // blocks, so it doesn't need to. ')' can close a single-line block, + // but we need to make sure it's balanced. Rewriter.prototype.add_implicit_indentation = function add_implicit_indentation() { return this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { var idx, insertion, parens, pre, starter, tok; - if (!(SINGLE_LINERS.indexOf(token[0]) >= 0 && post[0] !== 'INDENT' && !(token[0] === 'ELSE' && post[0] === 'IF'))) { + if (!(include(SINGLE_LINERS, token[0]) && post[0] !== 'INDENT' && !(token[0] === 'ELSE' && post[0] === 'IF'))) { return 1; } starter = token[0]; @@ -247,7 +206,7 @@ idx += 1; tok = this.tokens[idx]; pre = this.tokens[idx - 1]; - if ((!tok || (SINGLE_CLOSERS.indexOf(tok[0]) >= 0 && tok[1] !== ';') || (tok[0] === ')' && parens === 0)) && !(starter === 'ELSE' && tok[0] === 'ELSE')) { + if ((!tok || (include(SINGLE_CLOSERS, tok[0]) && tok[1] !== ';') || (tok[0] === ')' && parens === 0)) && !(starter === 'ELSE' && tok[0] === 'ELSE')) { insertion = pre[0] === "," ? 
idx - 1 : idx; this.tokens.splice(insertion, 0, ['OUTDENT', 2, token[2]]); break; @@ -273,17 +232,17 @@ // Ensure that all listed pairs of tokens are correctly balanced throughout // the course of the token stream. Rewriter.prototype.ensure_balance = function ensure_balance(pairs) { - var _l, _m, key, levels, unclosed, value; + var _a, _b, key, levels, unclosed, value; levels = {}; this.scan_tokens((function(__this) { var __func = function(prev, token, post, i) { - var _l, _m, _n, _o, close, open; - _l = pairs; - for (_m = 0, _n = _l.length; _m < _n; _m++) { - pair = _l[_m]; - _o = pair; - open = _o[0]; - close = _o[1]; + var _a, _b, _c, _d, close, open, pair; + _a = pairs; + for (_b = 0, _c = _a.length; _b < _c; _b++) { + pair = _a[_b]; + _d = pair; + open = _d[0]; + close = _d[1]; levels[open] = levels[open] || 0; if (token[0] === open) { levels[open] += 1; @@ -302,22 +261,22 @@ }); })(this)); unclosed = (function() { - _l = []; _m = levels; - for (key in _m) { if (__hasProp.call(_m, key)) { - value = _m[key]; + _a = []; _b = levels; + for (key in _b) { if (__hasProp.call(_b, key)) { + value = _b[key]; if (value > 0) { - _l.push(key); + _a.push(key); } }} - return _l; + return _a; }).call(this); if (unclosed.length) { throw new Error("unclosed " + (unclosed[0])); } }; // We'd like to support syntax like this: - // el.click((event) -> - // el.hide()) + // el.click((event) -> + // el.hide()) // In order to accomplish this, move outdents that follow closing parens // inwards, safely. The steps to accomplish this are: // 1. Check that all paired tokens are balanced and in order. @@ -327,12 +286,12 @@ // 3. Keep track of "debt" for tokens that we fake, to make sure we end // up balanced in the end. 
Rewriter.prototype.rewrite_closing_parens = function rewrite_closing_parens() { - var _l, debt, key, stack, val; + var _a, debt, key, stack, val; stack = []; debt = {}; - _l = INVERSES; - for (key in _l) { if (__hasProp.call(_l, key)) { - val = _l[key]; + _a = INVERSES; + for (key in _a) { if (__hasProp.call(_a, key)) { + val = _a[key]; ((debt[key] = 0)); }} return this.scan_tokens((function(__this) { @@ -340,31 +299,24 @@ var inv, match, mtag, tag; tag = token[0]; inv = INVERSES[token[0]]; - // Push openers onto the stack. - if (EXPRESSION_START.indexOf(tag) >= 0) { + if (include(EXPRESSION_START, tag)) { stack.push(token); return 1; - // The end of an expression, check stack and debt for a pair. - } else if (EXPRESSION_TAIL.indexOf(tag) >= 0) { - // If the tag is already in our debt, swallow it. + } else if (include(EXPRESSION_END, tag)) { if (debt[inv] > 0) { debt[inv] -= 1; this.tokens.splice(i, 1); return 0; } else { - // Pop the stack of open delimiters. match = stack.pop(); mtag = match[0]; - // Continue onwards if it's the expected tag. if (tag === INVERSES[mtag]) { return 1; - } else { - // Unexpected close, insert correct close, adding to the debt. - debt[mtag] += 1; - val = mtag === 'INDENT' ? match[1] : INVERSES[mtag]; - this.tokens.splice(i, 0, [INVERSES[mtag], val]); - return 1; } + debt[mtag] += 1; + val = mtag === 'INDENT' ? match[1] : INVERSES[mtag]; + this.tokens.splice(i, 0, [INVERSES[mtag], val]); + return 1; } } else { return 1; @@ -377,4 +329,55 @@ }; return Rewriter; }).call(this); + // Constants + // --------- + // List of the token pairs that must be balanced. + BALANCED_PAIRS = [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']]; + // The inverse mappings of `BALANCED_PAIRS` we're trying to fix up, so we can + // look things up from either end. 
+ INVERSES = {}; + _a = BALANCED_PAIRS; + for (_b = 0, _c = _a.length; _b < _c; _b++) { + pair = _a[_b]; + INVERSES[pair[0]] = pair[1]; + INVERSES[pair[1]] = pair[0]; + } + // The tokens that signal the start of a balanced pair. + EXPRESSION_START = (function() { + _d = []; _e = BALANCED_PAIRS; + for (_f = 0, _g = _e.length; _f < _g; _f++) { + pair = _e[_f]; + _d.push(pair[0]); + } + return _d; + }).call(this); + // The tokens that signal the end of a balanced pair. + EXPRESSION_END = (function() { + _h = []; _i = BALANCED_PAIRS; + for (_j = 0, _k = _i.length; _j < _k; _j++) { + pair = _i[_j]; + _h.push(pair[1]); + } + return _h; + }).call(this); + // Tokens that indicate the close of a clause of an expression. + EXPRESSION_CLOSE = ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_END); + // Tokens that, if followed by an `IMPLICIT_CALL`, indicate a function invocation. + IMPLICIT_FUNC = ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END']; + // If preceded by an `IMPLICIT_FUNC`, indicates a function invocation. + IMPLICIT_CALL = ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', '@', '->', '=>', '[', '(', '{']; + // Tokens indicating that the implicit call must enclose a block of expressions. + IMPLICIT_BLOCK = ['->', '=>', '{', '[', ',']; + // Tokens that always mark the end of an implicit call for single-liners. + IMPLICIT_END = ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT']; + // Single-line flavors of block expressions that have unclosed endings. + // The grammar can't disambiguate them, so we insert the implicit indentation. + SINGLE_LINERS = ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN']; + SINGLE_CLOSERS = ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN']; + // Utility Functions + // ----------------- + // Does a list include a value? 
+ include = function include(list, value) { + return list.indexOf(value) >= 0; + }; })(); diff --git a/src/lexer.coffee b/src/lexer.coffee index 718847a8..daf1ce5d 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -14,99 +14,11 @@ else this.exports: this Rewriter: this.Rewriter -# Constants -# --------- - -# Keywords that CoffeeScript shares in common with JavaScript. -JS_KEYWORDS: [ - "if", "else", - "true", "false", - "new", "return", - "try", "catch", "finally", "throw", - "break", "continue", - "for", "in", "while", - "delete", "instanceof", "typeof", - "switch", "super", "extends", "class" -] - -# CoffeeScript-only keywords, which we're more relaxed about allowing. They can't -# be used standalone, but you can reference them as an attached property. -COFFEE_KEYWORDS: [ - "then", "unless", - "yes", "no", "on", "off", - "and", "or", "is", "isnt", "not", - "of", "by", "where", "when" -] - -# The combined list of keywords is the superset that gets passed verbatim to -# the parser. -KEYWORDS: JS_KEYWORDS.concat COFFEE_KEYWORDS - -# The list of keywords that are reserved by JavaScript, but not used, or are -# used by CoffeeScript internally. We throw an error when these are encountered, -# to avoid having a JavaScript error at runtime. -RESERVED: [ - "case", "default", "do", "function", "var", "void", "with" - "const", "let", "debugger", "enum", "export", "import", "native", - "__extends", "__hasProp" -] - -# The superset of both JavaScript keywords and reserved words, none of which may -# be used as identifiers or properties. -JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED - -# Token matching regexes. 
-IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ -NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i -HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ -INTERPOLATION : /^\$([a-zA-Z_@]\w*)/ -OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ -WHITESPACE : /^([ \t]+)/ -COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/ -CODE : /^((-|=)>)/ -REGEX : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/ -MULTI_DENT : /^((\n([ \t]*))+)(\.)?/ -LAST_DENTS : /\n([ \t]*)/g -LAST_DENT : /\n([ \t]*)/ -ASSIGNMENT : /^(:|=)$/ - -# Token cleaning regexes. -JS_CLEANER : /(^`|`$)/g -MULTILINER : /\n/g -STRING_NEWLINES : /\n[ \t]*/g -COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg -NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/ -HEREDOC_INDENT : /^[ \t]+/mg - -# Tokens which a regular expression will never immediately follow, but which -# a division operator might. -# -# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions -# -# Our list is shorter, due to sans-parentheses method calls. -NOT_REGEX: [ - 'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE' -] - -# Tokens which could legitimately be invoked or indexed. A opening -# parentheses or bracket following these tokens will be recorded as the start -# of a function invocation or indexing operation. -CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@'] - -# Tokens that indicate an access -- keywords immediately following will be -# treated as identifiers. -ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@'] - -# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN` -# occurs at the start of a line. We disambiguate these from trailing whens to -# avoid an ambiguity in the grammar. -BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR'] - # The Lexer Class # --------------- # The Lexer class reads a stream of CoffeeScript and divvys it up into tagged -# tokens. 
A minor bit of the ambiguity in the grammar has been avoided by +# tokens. Some potential ambiguity in the grammar has been avoided by # pushing some extra smarts into the Lexer. exports.Lexer: class Lexer @@ -119,6 +31,9 @@ exports.Lexer: class Lexer # Each tokenizing method is responsible for incrementing `@i` by the number of # characters it has consumed. `@i` can be thought of as our finger on the page # of source. + # + # Before returning the token stream, run it through the [Rewriter](rewriter.html) + # unless explicitly asked not to. tokenize: (code, options) -> o : options or {} @code : code # The remainder of the source code. @@ -480,6 +395,94 @@ exports.Lexer: class Lexer return false unless m: @chunk.match(regex) if m then m[index] else false +# Constants +# --------- + +# Keywords that CoffeeScript shares in common with JavaScript. +JS_KEYWORDS: [ + "if", "else", + "true", "false", + "new", "return", + "try", "catch", "finally", "throw", + "break", "continue", + "for", "in", "while", + "delete", "instanceof", "typeof", + "switch", "super", "extends", "class" +] + +# CoffeeScript-only keywords, which we're more relaxed about allowing. They can't +# be used standalone, but you can reference them as an attached property. +COFFEE_KEYWORDS: [ + "then", "unless", + "yes", "no", "on", "off", + "and", "or", "is", "isnt", "not", + "of", "by", "where", "when" +] + +# The combined list of keywords is the superset that gets passed verbatim to +# the parser. +KEYWORDS: JS_KEYWORDS.concat COFFEE_KEYWORDS + +# The list of keywords that are reserved by JavaScript, but not used, or are +# used by CoffeeScript internally. We throw an error when these are encountered, +# to avoid having a JavaScript error at runtime. 
+RESERVED: [ + "case", "default", "do", "function", "var", "void", "with" + "const", "let", "debugger", "enum", "export", "import", "native", + "__extends", "__hasProp" +] + +# The superset of both JavaScript keywords and reserved words, none of which may +# be used as identifiers or properties. +JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED + +# Token matching regexes. +IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ +NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i +HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ +INTERPOLATION : /^\$([a-zA-Z_@]\w*)/ +OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ +WHITESPACE : /^([ \t]+)/ +COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/ +CODE : /^((-|=)>)/ +REGEX : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/ +MULTI_DENT : /^((\n([ \t]*))+)(\.)?/ +LAST_DENTS : /\n([ \t]*)/g +LAST_DENT : /\n([ \t]*)/ +ASSIGNMENT : /^(:|=)$/ + +# Token cleaning regexes. +JS_CLEANER : /(^`|`$)/g +MULTILINER : /\n/g +STRING_NEWLINES : /\n[ \t]*/g +COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg +NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/ +HEREDOC_INDENT : /^[ \t]+/mg + +# Tokens which a regular expression will never immediately follow, but which +# a division operator might. +# +# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions +# +# Our list is shorter, due to sans-parentheses method calls. +NOT_REGEX: [ + 'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE' +] + +# Tokens which could legitimately be invoked or indexed. An opening +# parenthesis or bracket following these tokens will be recorded as the start +# of a function invocation or indexing operation. +CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING', '@'] + +# Tokens that indicate an access -- keywords immediately following will be +# treated as identifiers. 
+ACCESSORS: ['PROPERTY_ACCESS', 'PROTOTYPE_ACCESS', 'SOAK_ACCESS', '@'] + +# Tokens that, when immediately preceding a `WHEN`, indicate that the `WHEN` +# occurs at the start of a line. We disambiguate these from trailing whens to +# avoid an ambiguity in the grammar. +BEFORE_WHEN: ['INDENT', 'OUTDENT', 'TERMINATOR'] + # Utility Functions # ----------------- diff --git a/src/rewriter.coffee b/src/rewriter.coffee index df932922..1fe3a8d8 100644 --- a/src/rewriter.coffee +++ b/src/rewriter.coffee @@ -1,53 +1,28 @@ +# The CoffeeScript language has a decent amount of optional syntax, +# implicit syntax, and shorthand syntax. These things can greatly complicate a +# grammar and bloat the resulting parse table. Instead of making the parser +# handle it all, we take a series of passes over the token stream, +# using this **Rewriter** to convert shorthand into the unambiguous long form, +# add implicit indentation and parentheses, balance incorrect nestings, and +# generally clean things up. + +# Set up exported variables for both Node.js and the browser. this.exports: this unless process? -# Tokens that must be balanced. -BALANCED_PAIRS: [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], - ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], - ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']] - -# Tokens that signal the start of a balanced pair. -EXPRESSION_START: pair[0] for pair in BALANCED_PAIRS - -# Tokens that signal the end of a balanced pair. -EXPRESSION_TAIL: pair[1] for pair in BALANCED_PAIRS - -# Tokens that indicate the close of a clause of an expression. -EXPRESSION_CLOSE: ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_TAIL) - -# Tokens pairs that, in immediate succession, indicate an implicit call. 
-IMPLICIT_FUNC: ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END'] -IMPLICIT_BLOCK:['->', '=>', '{', '[', ','] -IMPLICIT_END: ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT'] -IMPLICIT_CALL: ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', - 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', - 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', - '@', '->', '=>', '[', '(', '{'] - -# The inverse mappings of token pairs we're trying to fix up. -INVERSES: {} -for pair in BALANCED_PAIRS - INVERSES[pair[0]]: pair[1] - INVERSES[pair[1]]: pair[0] - -# Single-line flavors of block expressions that have unclosed endings. -# The grammar can't disambiguate them, so we insert the implicit indentation. -SINGLE_LINERS: ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN'] -SINGLE_CLOSERS: ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN'] - -# In order to keep the grammar simple, the stream of tokens that the Lexer -# emits is rewritten by the Rewriter, smoothing out ambiguities, mis-nested -# indentation, and single-line flavors of expressions. +# The **Rewriter** class is used by the [Lexer](lexer.html), directly against +# its internal array of tokens. exports.Rewriter: class Rewriter # Rewrite the token stream in multiple passes, one logical filter at # a time. This could certainly be changed into a single pass through the - # stream, with a big ol' efficient switch, but it's much nicer like this. + # stream, with a big ol' efficient switch, but it's much nicer to work with + # like this. The order of these passes matters -- indentation must be + # corrected before implicit parentheses can be wrapped around blocks of code. 
rewrite: (tokens) -> @tokens: tokens @adjust_comments() @remove_leading_newlines() @remove_mid_expression_newlines() - @move_commas_outside_outdents() @close_open_calls_and_indexes() @add_implicit_indentation() @add_implicit_parentheses() @@ -58,7 +33,8 @@ exports.Rewriter: class Rewriter # Rewrite the token stream, looking one token ahead and behind. # Allow the return value of the block to tell us how many tokens to move # forwards (or backwards) in the stream, to make sure we don't miss anything - # as the stream changes length under our feet. + # as tokens are inserted and removed, and the stream changes length under + # our feet. scan_tokens: (block) -> i: 0 while true @@ -68,7 +44,7 @@ exports.Rewriter: class Rewriter true # Massage newlines and indentations so that comments don't have to be - # correctly indented, or appear on their own line. + # correctly indented, or appear on a line of their own. adjust_comments: -> @scan_tokens (prev, token, post, i) => return 1 unless token[0] is 'COMMENT' @@ -86,25 +62,19 @@ exports.Rewriter: class Rewriter # Leading newlines would introduce an ambiguity in the grammar, so we # dispatch them here. remove_leading_newlines: -> - @tokens.shift() if @tokens[0][0] is 'TERMINATOR' + @tokens.shift() while @tokens[0][0] is 'TERMINATOR' # Some blocks occur in the middle of expressions -- when we're expecting # this, remove their trailing newlines. remove_mid_expression_newlines: -> @scan_tokens (prev, token, post, i) => - return 1 unless post and EXPRESSION_CLOSE.indexOf(post[0]) >= 0 and token[0] is 'TERMINATOR' + return 1 unless post and include(EXPRESSION_CLOSE, post[0]) and token[0] is 'TERMINATOR' @tokens.splice(i, 1) return 0 - # Make sure that we don't accidentally break trailing commas, which need - # to go on the outside of expression closers. 
- move_commas_outside_outdents: -> - @scan_tokens (prev, token, post, i) => - @tokens.splice(i, 1, token) if token[0] is 'OUTDENT' and prev[0] is ',' - return 1 - - # We've tagged the opening parenthesis of a method call, and the opening - # bracket of an indexing operation. Match them with their close. + # The lexer has tagged the opening parenthesis of a method call, and the + # opening bracket of an indexing operation. Match them with their paired + # close. close_open_calls_and_indexes: -> parens: [0] brackets: [0] @@ -139,28 +109,29 @@ exports.Rewriter: class Rewriter if tag is 'OUTDENT' last: stack.pop() stack[stack.length - 1] += last - if IMPLICIT_END.indexOf(tag) >= 0 or !post? - return 1 if tag is 'INDENT' and prev and IMPLICIT_BLOCK.indexOf(prev[0]) >= 0 + if !post? or include IMPLICIT_END, tag + return 1 if tag is 'INDENT' and prev and include IMPLICIT_BLOCK, prev[0] if stack[stack.length - 1] > 0 or tag is 'INDENT' - idx: if tag is 'OUTDENT' then i +1 else i + idx: if tag is 'OUTDENT' then i + 1 else i stack_pointer: if tag is 'INDENT' then 2 else 1 for tmp in [0...stack[stack.length - stack_pointer]] @tokens.splice(idx, 0, ['CALL_END', ')', token[2]]) size: stack[stack.length - stack_pointer] + 1 stack[stack.length - stack_pointer]: 0 return size - return 1 unless prev and IMPLICIT_FUNC.indexOf(prev[0]) >= 0 and IMPLICIT_CALL.indexOf(tag) >= 0 + return 1 unless prev and include(IMPLICIT_FUNC, prev[0]) and include IMPLICIT_CALL, tag @tokens.splice(i, 0, ['CALL_START', '(', token[2]]) stack[stack.length - 1] += 1 return 2 # Because our grammar is LALR(1), it can't handle some single-line - # expressions that lack ending delimiters. Use the lexer to add the implicit - # blocks, so it doesn't need to. - # ')' can close a single-line block, but we need to make sure it's balanced. + # expressions that lack ending delimiters. The **Rewriter** adds the implicit + # blocks, so it doesn't need to. 
')' can close a single-line block, + # but we need to make sure it's balanced. add_implicit_indentation: -> @scan_tokens (prev, token, post, i) => - return 1 unless SINGLE_LINERS.indexOf(token[0]) >= 0 and post[0] isnt 'INDENT' and + return 1 unless include(SINGLE_LINERS, token[0]) and + post[0] isnt 'INDENT' and not (token[0] is 'ELSE' and post[0] is 'IF') starter: token[0] @tokens.splice(i + 1, 0, ['INDENT', 2, token[2]]) @@ -171,7 +142,7 @@ exports.Rewriter: class Rewriter tok: @tokens[idx] pre: @tokens[idx - 1] if (not tok or - (SINGLE_CLOSERS.indexOf(tok[0]) >= 0 and tok[1] isnt ';') or + (include(SINGLE_CLOSERS, tok[0]) and tok[1] isnt ';') or (tok[0] is ')' && parens is 0)) and not (starter is 'ELSE' and tok[0] is 'ELSE') insertion: if pre[0] is "," then idx - 1 else idx @@ -199,8 +170,10 @@ exports.Rewriter: class Rewriter throw new Error("unclosed ${unclosed[0]}") if unclosed.length # We'd like to support syntax like this: - # el.click((event) -> - # el.hide()) + # + # el.click((event) -> + # el.hide()) + # # In order to accomplish this, move outdents that follow closing parens # inwards, safely. The steps to accomplish this are: # @@ -218,29 +191,72 @@ exports.Rewriter: class Rewriter @scan_tokens (prev, token, post, i) => tag: token[0] inv: INVERSES[token[0]] - # Push openers onto the stack. - if EXPRESSION_START.indexOf(tag) >= 0 - stack.push(token) + if include EXPRESSION_START, tag + stack.push token return 1 - # The end of an expression, check stack and debt for a pair. - else if EXPRESSION_TAIL.indexOf(tag) >= 0 - # If the tag is already in our debt, swallow it. + else if include EXPRESSION_END, tag if debt[inv] > 0 debt[inv] -= 1 - @tokens.splice(i, 1) + @tokens.splice i, 1 return 0 else - # Pop the stack of open delimiters. match: stack.pop() mtag: match[0] - # Continue onwards if it's the expected tag. - if tag is INVERSES[mtag] - return 1 - else - # Unexpected close, insert correct close, adding to the debt. 
- debt[mtag] += 1 - val: if mtag is 'INDENT' then match[1] else INVERSES[mtag] - @tokens.splice(i, 0, [INVERSES[mtag], val]) - return 1 + return 1 if tag is INVERSES[mtag] + debt[mtag] += 1 + val: if mtag is 'INDENT' then match[1] else INVERSES[mtag] + @tokens.splice i, 0, [INVERSES[mtag], val] + return 1 else return 1 + +# Constants +# --------- + +# List of the token pairs that must be balanced. +BALANCED_PAIRS: [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], + ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], + ['INDEX_START', 'INDEX_END'], ['SOAKED_INDEX_START', 'SOAKED_INDEX_END']] + +# The inverse mappings of `BALANCED_PAIRS` we're trying to fix up, so we can +# look things up from either end. +INVERSES: {} +for pair in BALANCED_PAIRS + INVERSES[pair[0]]: pair[1] + INVERSES[pair[1]]: pair[0] + +# The tokens that signal the start of a balanced pair. +EXPRESSION_START: pair[0] for pair in BALANCED_PAIRS + +# The tokens that signal the end of a balanced pair. +EXPRESSION_END: pair[1] for pair in BALANCED_PAIRS + +# Tokens that indicate the close of a clause of an expression. +EXPRESSION_CLOSE: ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat EXPRESSION_END + +# Tokens that, if followed by an `IMPLICIT_CALL`, indicate a function invocation. +IMPLICIT_FUNC: ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END'] + +# If preceded by an `IMPLICIT_FUNC`, indicates a function invocation. +IMPLICIT_CALL: ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', + 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', + 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', + '@', '->', '=>', '[', '(', '{'] + +# Tokens indicating that the implicit call must enclose a block of expressions. +IMPLICIT_BLOCK: ['->', '=>', '{', '[', ','] + +# Tokens that always mark the end of an implicit call for single-liners. 
+IMPLICIT_END: ['IF', 'UNLESS', 'FOR', 'WHILE', 'TERMINATOR', 'INDENT', 'OUTDENT'] + +# Single-line flavors of block expressions that have unclosed endings. +# The grammar can't disambiguate them, so we insert the implicit indentation. +SINGLE_LINERS: ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN'] +SINGLE_CLOSERS: ['TERMINATOR', 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN'] + +# Utility Functions +# ----------------- + +# Does a list include a value? +include: (list, value) -> + list.indexOf(value) >= 0 diff --git a/test/test_everything.coffee b/test/test_everything.coffee index a08c8a85..818f6d2c 100644 --- a/test/test_everything.coffee +++ b/test/test_everything.coffee @@ -1,3 +1,5 @@ + + func: -> a: 3 b: []