From 84feab3492514721aa67d3029f29f59bfeb24686 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Sat, 30 Jan 2010 17:02:05 -0500 Subject: [PATCH] first little piece of the rewriter --- lib/coffee_script/coffee-script.js | 3 +- lib/coffee_script/lexer.js | 39 ++++++---- lib/coffee_script/rewriter.js | 104 +++++++++++++++++++++++++ src/coffee-script.coffee | 1 - src/lexer.coffee | 15 +++- src/rewriter.coffee | 120 +++++++++++++++++++++++++++++ 6 files changed, 263 insertions(+), 19 deletions(-) create mode 100644 lib/coffee_script/rewriter.js create mode 100644 src/rewriter.coffee diff --git a/lib/coffee_script/coffee-script.js b/lib/coffee_script/coffee-script.js index 7be28108..58bbe318 100644 --- a/lib/coffee_script/coffee-script.js +++ b/lib/coffee_script/coffee-script.js @@ -1,7 +1,6 @@ (function(){ - var compiler, path, sys; + var compiler, path; // Executes the `coffee` Ruby program to convert from CoffeeScript to JavaScript. - sys = require('sys'); path = require('path'); // The path to the CoffeeScript executable. compiler = path.normalize(path.dirname(__filename) + '/../../bin/coffee'); diff --git a/lib/coffee_script/lexer.js b/lib/coffee_script/lexer.js index bc9a74cb..1028280b 100644 --- a/lib/coffee_script/lexer.js +++ b/lib/coffee_script/lexer.js @@ -1,6 +1,7 @@ (function(){ - var ASSIGNMENT, CALLABLE, CODE, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, KEYWORDS, LAST_DENT, LAST_DENTS, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, STRING, STRING_NEWLINES, WHITESPACE, lex, sys; + var ASSIGNMENT, CALLABLE, CODE, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, KEYWORDS, LAST_DENT, LAST_DENTS, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, Rewriter, STRING, STRING_NEWLINES, WHITESPACE, lex, sys; sys = require('sys'); + Rewriter = require('./rewriter').Rewriter; // The lexer reads a stream of CoffeeScript and divvys it up into tagged // tokens. A minor bit of the ambiguity in the grammar has been avoided by // pushing some extra smarts into the Lexer. @@ -56,8 +57,7 @@ } // sys.puts "original stream: " + this.tokens if process.ENV['VERBOSE'] this.close_indentation(); - // (new Rewriter()).rewrite(this.tokens) - return this.tokens; + return (new Rewriter()).rewrite(this.tokens); }; // At every position, run through this list of attempted matches, // short-circuiting if any of them succeed. @@ -116,7 +116,8 @@ } } this.token(tag, id); - return this.i += id.length; + this.i += id.length; + return true; }; // Matches numbers, including decimals, hex, and exponential notation. lex.prototype.number_token = function number_token() { @@ -125,7 +126,8 @@ return false; } this.token('NUMBER', number); - return this.i += number.length; + this.i += number.length; + return true; }; // Matches strings, including multi-line strings. lex.prototype.string_token = function string_token() { @@ -136,7 +138,8 @@ escaped = string.replace(STRING_NEWLINES, " \\\n"); this.token('STRING', escaped); this.line += this.count(string, "\n"); - return this.i += string.length; + this.i += string.length; + return true; }; // Matches heredocs, adjusting indentation to the correct level. lex.prototype.heredoc_token = function heredoc_token() { @@ -149,7 +152,8 @@ doc = doc.replace(new RegExp("^" + indent, 'g'), '').replace(MULTILINER, "\\n").replace('"', '\\"'); this.token('STRING', '"' + doc + '"'); this.line += this.count(match[1], "\n"); - return this.i += match[1].length; + this.i += match[1].length; + return true; }; // Matches interpolated JavaScript. lex.prototype.js_token = function js_token() { @@ -158,7 +162,8 @@ return false; } this.token('JS', script.replace(JS_CLEANER, '')); - return this.i += script.length; + this.i += script.length; + return true; }; // Matches regular expression literals. lex.prototype.regex_token = function regex_token() { @@ -170,7 +175,8 @@ return false; } this.token('REGEX', regex); - return this.i += regex.length; + this.i += regex.length; + return true; }; // Matches and conumes comments. lex.prototype.comment_token = function comment_token() { @@ -181,7 +187,8 @@ this.line += comment.match(MULTILINER).length; this.token('COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER)); this.token("\n", "\n"); - return this.i += comment.length; + this.i += comment.length; + return true; }; // Record tokens for indentation differing from the previous line. lex.prototype.indent_token = function indent_token() { @@ -207,7 +214,8 @@ } else { this.outdent_token(this.indent - size); } - return this.indent = size; + this.indent = size; + return true; }; // Record an oudent token or tokens, if we're moving back inwards past // multiple recorded indents. @@ -218,7 +226,8 @@ this.token('OUTDENT', last_indent); move_out -= last_indent; } - return this.token("\n", "\n"); + this.token("\n", "\n"); + return true; }; // Matches and consumes non-meaningful whitespace. lex.prototype.whitespace_token = function whitespace_token() { @@ -227,7 +236,8 @@ return false; } this.value().spaced = true; - return this.i += space.length; + this.i += space.length; + return true; }; // Multiple newlines get merged together. // Use a trailing \ to escape newlines. @@ -265,7 +275,8 @@ } } this.token(tag, value); - return this.i += value.length; + this.i += value.length; + return true; }; // Helpers ============================================================= // Add a token to the results, taking note of the line number. diff --git a/lib/coffee_script/rewriter.js b/lib/coffee_script/rewriter.js new file mode 100644 index 00000000..ef2c92c1 --- /dev/null +++ b/lib/coffee_script/rewriter.js @@ -0,0 +1,104 @@ +(function(){ + var BALANCED_PAIRS, EXPRESSION_CLOSE, EXPRESSION_START, EXPRESSION_TAIL, IMPLICIT_CALL, IMPLICIT_END, IMPLICIT_FUNC, INVERSES, SINGLE_CLOSERS, SINGLE_LINERS, __a, __b, __c, __d, __e, __f, __g, __h, pair, re; + // In order to keep the grammar simple, the stream of tokens that the Lexer + // emits is rewritten by the Rewriter, smoothing out ambiguities, mis-nested + // indentation, and single-line flavors of expressions. + exports.Rewriter = (re = function re() { }); + // Tokens that must be balanced. + BALANCED_PAIRS = [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], ['INDEX_START', 'INDEX_END']]; + // Tokens that signal the start of a balanced pair. + EXPRESSION_START = (function() { + __a = []; __b = BALANCED_PAIRS; + for (__c = 0; __c < __b.length; __c++) { + pair = __b[__c]; + __a.push(pair[0]); + } + return __a; + }).call(this); + // Tokens that signal the end of a balanced pair. + EXPRESSION_TAIL = (function() { + __d = []; __e = BALANCED_PAIRS; + for (__f = 0; __f < __e.length; __f++) { + pair = __e[__f]; + __d.push(pair[1]); + } + return __d; + }).call(this); + // Tokens that indicate the close of a clause of an expression. + EXPRESSION_CLOSE = ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_TAIL); + // Tokens pairs that, in immediate succession, indicate an implicit call. + IMPLICIT_FUNC = ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END']; + IMPLICIT_END = ['IF', 'UNLESS', 'FOR', 'WHILE', "\n", 'OUTDENT']; + IMPLICIT_CALL = ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', 'ARGUMENTS', 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', '->', '=>', '[', '(', '{']; + // The inverse mappings of token pairs we're trying to fix up. + INVERSES = { + }; + __g = BALANCED_PAIRS; + for (__h = 0; __h < __g.length; __h++) { + pair = __g[__h]; + INVERSES[pair[0]] = pair[1]; + INVERSES[pair[1]] = pair[0]; + } + // Single-line flavors of block expressions that have unclosed endings. + // The grammar can't disambiguate them, so we insert the implicit indentation. + SINGLE_LINERS = ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN']; + SINGLE_CLOSERS = ["\n", 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN', 'PARAM_START']; + // Rewrite the token stream in multiple passes, one logical filter at + // a time. This could certainly be changed into a single pass through the + // stream, with a big ol' efficient switch, but it's much nicer like this. + re.prototype.rewrite = function rewrite(tokens) { + this.tokens = tokens; + this.adjust_comments(); + // this.remove_leading_newlines() + // this.remove_mid_expression_newlines() + // this.move_commas_outside_outdents() + // this.close_open_calls_and_indexes() + // this.add_implicit_parentheses() + // this.add_implicit_indentation() + // this.ensure_balance(BALANCED_PAIRS) + // this.rewrite_closing_parens() + return this.tokens; + }; + // Rewrite the token stream, looking one token ahead and behind. + // Allow the return value of the block to tell us how many tokens to move + // forwards (or backwards) in the stream, to make sure we don't miss anything + // as the stream changes length under our feet. + re.prototype.scan_tokens = function scan_tokens(yield) { + var i, move; + i = 0; + while (true) { + if (!(this.tokens[i])) { + break; + } + move = yield(this.tokens[i - 1], this.tokens[i], this.tokens[i + 1], i); + i += move; + } + return true; + }; + // Massage newlines and indentations so that comments don't have to be + // correctly indented, or appear on their own line. + re.prototype.adjust_comments = function adjust_comments() { + return this.scan_tokens(function(prev, token, post, i) { + var after, before; + if (!(token[0] === 'COMMENT')) { + return 1; + } + before = this.tokens[i - 2]; + after = this.tokens[i + 2]; + if (before && after && ((before[0] === 'INDENT' && after[0] === 'OUTDENT') || (before[0] === 'OUTDENT' && after[0] === 'INDENT')) && before[1] === after[1]) { + this.tokens.splice(i + 2, 1); + this.tokens.splice(i - 2, 1); + return 0; + } else if (prev[0] === "\n" && after[0] === 'INDENT') { + this.tokens.splice(i + 2, 1); + this.tokens[i - 1] = after; + return 1; + } else if (prev[0] !== "\n" && prev[0] !== 'INDENT' && prev[0] !== 'OUTDENT') { + this.tokens.splice(i, 0, ["\n", "\n"]); + return 2; + } else { + return 1; + } + }); + }; +})(); \ No newline at end of file diff --git a/src/coffee-script.coffee b/src/coffee-script.coffee index 0594c6c6..c2519717 100644 --- a/src/coffee-script.coffee +++ b/src/coffee-script.coffee @@ -1,6 +1,5 @@ # Executes the `coffee` Ruby program to convert from CoffeeScript to JavaScript. -sys: require('sys') path: require('path') # The path to the CoffeeScript executable. diff --git a/src/lexer.coffee b/src/lexer.coffee index c40dec14..dbe14504 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -1,4 +1,5 @@ sys: require 'sys' +Rewriter: require('./rewriter').Rewriter # The lexer reads a stream of CoffeeScript and divvys it up into tagged # tokens. A minor bit of the ambiguity in the grammar has been avoided by @@ -70,8 +71,7 @@ lex::tokenize: (code) -> this.extract_next_token() # sys.puts "original stream: " + this.tokens if process.ENV['VERBOSE'] this.close_indentation() - # (new Rewriter()).rewrite(this.tokens) - this.tokens + (new Rewriter()).rewrite this.tokens # At every position, run through this list of attempted matches, # short-circuiting if any of them succeed. @@ -105,12 +105,14 @@ lex::identifier_token: -> this.tag(-1, 'PROPERTY_ACCESS') this.token(tag, id) this.i += id.length + true # Matches numbers, including decimals, hex, and exponential notation. lex::number_token: -> return false unless number: this.match NUMBER, 1 this.token 'NUMBER', number this.i += number.length + true # Matches strings, including multi-line strings. lex::string_token: -> @@ -119,6 +121,7 @@ lex::string_token: -> this.token 'STRING', escaped this.line += this.count string, "\n" this.i += string.length + true # Matches heredocs, adjusting indentation to the correct level. lex::heredoc_token: -> @@ -131,12 +134,14 @@ lex::heredoc_token: -> this.token 'STRING', '"' + doc + '"' this.line += this.count match[1], "\n" this.i += match[1].length + true # Matches interpolated JavaScript. lex::js_token: -> return false unless script: this.match JS, 1 this.token 'JS', script.replace(JS_CLEANER, '') this.i += script.length + true # Matches regular expression literals. lex::regex_token: -> @@ -144,6 +149,7 @@ lex::regex_token: -> return false if NOT_REGEX.indexOf(this.tag()) >= 0 this.token 'REGEX', regex this.i += regex.length + true # Matches and conumes comments. lex::comment_token: -> @@ -152,6 +158,7 @@ lex::comment_token: -> this.token 'COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER) this.token "\n", "\n" this.i += comment.length + true # Record tokens for indentation differing from the previous line. lex::indent_token: -> @@ -170,6 +177,7 @@ lex::indent_token: -> else this.outdent_token this.indent - size this.indent: size + true # Record an oudent token or tokens, if we're moving back inwards past # multiple recorded indents. @@ -179,12 +187,14 @@ lex::outdent_token: (move_out) -> this.token 'OUTDENT', last_indent move_out -= last_indent this.token "\n", "\n" + true # Matches and consumes non-meaningful whitespace. lex::whitespace_token: -> return false unless space: this.match WHITESPACE, 1 this.value().spaced: true this.i += space.length + true # Multiple newlines get merged together. # Use a trailing \ to escape newlines. @@ -211,6 +221,7 @@ lex::literal_token: -> tag: 'INDEX_START' if value is '[' this.token tag, value this.i += value.length + true # Helpers ============================================================= diff --git a/src/rewriter.coffee b/src/rewriter.coffee new file mode 100644 index 00000000..21775b72 --- /dev/null +++ b/src/rewriter.coffee @@ -0,0 +1,120 @@ +# In order to keep the grammar simple, the stream of tokens that the Lexer +# emits is rewritten by the Rewriter, smoothing out ambiguities, mis-nested +# indentation, and single-line flavors of expressions. +exports.Rewriter: re: -> + +# Tokens that must be balanced. +BALANCED_PAIRS: [['(', ')'], ['[', ']'], ['{', '}'], ['INDENT', 'OUTDENT'], + ['PARAM_START', 'PARAM_END'], ['CALL_START', 'CALL_END'], ['INDEX_START', 'INDEX_END']] + +# Tokens that signal the start of a balanced pair. +EXPRESSION_START: pair[0] for pair in BALANCED_PAIRS + +# Tokens that signal the end of a balanced pair. +EXPRESSION_TAIL: pair[1] for pair in BALANCED_PAIRS + +# Tokens that indicate the close of a clause of an expression. +EXPRESSION_CLOSE: ['CATCH', 'WHEN', 'ELSE', 'FINALLY'].concat(EXPRESSION_TAIL) + +# Tokens pairs that, in immediate succession, indicate an implicit call. +IMPLICIT_FUNC: ['IDENTIFIER', 'SUPER', ')', 'CALL_END', ']', 'INDEX_END'] +IMPLICIT_END: ['IF', 'UNLESS', 'FOR', 'WHILE', "\n", 'OUTDENT'] +IMPLICIT_CALL: ['IDENTIFIER', 'NUMBER', 'STRING', 'JS', 'REGEX', 'NEW', 'PARAM_START', + 'TRY', 'DELETE', 'TYPEOF', 'SWITCH', 'ARGUMENTS', + 'TRUE', 'FALSE', 'YES', 'NO', 'ON', 'OFF', '!', '!!', 'NOT', + '->', '=>', '[', '(', '{'] + +# The inverse mappings of token pairs we're trying to fix up. +INVERSES: {} +for pair in BALANCED_PAIRS + INVERSES[pair[0]]: pair[1] + INVERSES[pair[1]]: pair[0] + +# Single-line flavors of block expressions that have unclosed endings. +# The grammar can't disambiguate them, so we insert the implicit indentation. +SINGLE_LINERS: ['ELSE', "->", "=>", 'TRY', 'FINALLY', 'THEN'] +SINGLE_CLOSERS: ["\n", 'CATCH', 'FINALLY', 'ELSE', 'OUTDENT', 'LEADING_WHEN', 'PARAM_START'] + +# Rewrite the token stream in multiple passes, one logical filter at +# a time. This could certainly be changed into a single pass through the +# stream, with a big ol' efficient switch, but it's much nicer like this. +re::rewrite: (tokens) -> + this.tokens: tokens + this.adjust_comments() + # this.remove_leading_newlines() + # this.remove_mid_expression_newlines() + # this.move_commas_outside_outdents() + # this.close_open_calls_and_indexes() + # this.add_implicit_parentheses() + # this.add_implicit_indentation() + # this.ensure_balance(BALANCED_PAIRS) + # this.rewrite_closing_parens() + this.tokens + +# Rewrite the token stream, looking one token ahead and behind. +# Allow the return value of the block to tell us how many tokens to move +# forwards (or backwards) in the stream, to make sure we don't miss anything +# as the stream changes length under our feet. +re::scan_tokens: (yield) -> + i = 0 + while true + break unless this.tokens[i] + move: yield(this.tokens[i - 1], this.tokens[i], this.tokens[i + 1], i) + i += move + true + +# Massage newlines and indentations so that comments don't have to be +# correctly indented, or appear on their own line. +re::adjust_comments: -> + this.scan_tokens (prev, token, post, i) -> + return 1 unless token[0] is 'COMMENT' + before: this.tokens[i - 2] + after: this.tokens[i + 2] + if before and after and + ((before[0] is 'INDENT' and after[0] is 'OUTDENT') or + (before[0] is 'OUTDENT' and after[0] is 'INDENT')) and + before[1] is after[1] + this.tokens.splice(i + 2, 1) + this.tokens.splice(i - 2, 1) + return 0 + else if prev[0] is "\n" and after[0] is 'INDENT' + this.tokens.splice(i + 2, 1) + this.tokens[i - 1]: after + return 1 + else if prev[0] isnt "\n" and prev[0] isnt 'INDENT' and prev[0] isnt 'OUTDENT' + this.tokens.splice(i, 0, ["\n", "\n"]) + return 2 + else + return 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +