From babeebcc1a5cf390aa0ed6a93b9b364b7ad51739 Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Sat, 30 Jan 2010 15:56:40 -0500 Subject: [PATCH] more progress with the lexer, perhaps it's done --- lib/coffee_script/coffee-script.js | 8 +- lib/coffee_script/lexer.js | 180 +++++++++++++++++++++-------- lib/coffee_script/runner.js | 1 - src/coffee-script.coffee | 4 + src/lexer.coffee | 174 ++++++++++++++++------------ src/runner.coffee | 1 - 6 files changed, 247 insertions(+), 121 deletions(-) diff --git a/lib/coffee_script/coffee-script.js b/lib/coffee_script/coffee-script.js index e69de174..7be28108 100644 --- a/lib/coffee_script/coffee-script.js +++ b/lib/coffee_script/coffee-script.js @@ -23,7 +23,7 @@ }; // Compile a list of CoffeeScript files on disk. exports.compile_files = function compile_files(paths, callback) { - var coffee, js; + var coffee, exit_ran, js; js = ''; coffee = process.createChildProcess(compiler, ['--print'].concat(paths)); coffee.addListener('output', function(results) { @@ -31,7 +31,13 @@ return js += results; } }); + // NB: we have to add a mutex to make sure it doesn't get called twice. + exit_ran = false; return coffee.addListener('exit', function() { + if (exit_ran) { + return null; + } + exit_ran = true; return callback(js); }); }; diff --git a/lib/coffee_script/lexer.js b/lib/coffee_script/lexer.js index 00c261ed..bc9a74cb 100644 --- a/lib/coffee_script/lexer.js +++ b/lib/coffee_script/lexer.js @@ -1,39 +1,41 @@ (function(){ - var lex, sys; + var ASSIGNMENT, CALLABLE, CODE, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, KEYWORDS, LAST_DENT, LAST_DENTS, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, STRING, STRING_NEWLINES, WHITESPACE, lex, sys; sys = require('sys'); // The lexer reads a stream of CoffeeScript and divvys it up into tagged // tokens. A minor bit of the ambiguity in the grammar has been avoided by // pushing some extra smarts into the Lexer. exports.Lexer = (lex = function lex() { }); + // Constants ============================================================ // The list of keywords passed verbatim to the parser. - lex.KEYWORDS = ["if", "else", "then", "unless", "true", "false", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", "new", "return", "arguments", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "of", "by", "where", "while", "delete", "instanceof", "typeof", "switch", "when", "super", "extends"]; + KEYWORDS = ["if", "else", "then", "unless", "true", "false", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", "new", "return", "arguments", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "of", "by", "where", "while", "delete", "instanceof", "typeof", "switch", "when", "super", "extends"]; // Token matching regexes. - lex.IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; - lex.NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; - lex.STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/; - lex.HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; - lex.JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; - lex.OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; - lex.WHITESPACE = /^([ \t]+)/; - lex.COMMENT = /^(((\n?[ \t]*)?#.*$)+)/; - lex.CODE = /^((-|=)>)/; - lex.REGEX = /^(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/; - lex.MULTI_DENT = /^((\n([ \t]*))+)(\.)?/; - lex.LAST_DENT = /\n([ \t]*)/; - lex.ASSIGNMENT = /^(:|=)$/; + IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; + NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; + STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/; + HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; + JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; + OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; + WHITESPACE = /^([ \t]+)/; + COMMENT = /^(((\n?[ \t]*)?#.*$)+)/; + CODE = /^((-|=)>)/; + REGEX = /^(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/; + MULTI_DENT = /^((\n([ \t]*))+)(\.)?/; + LAST_DENTS = /\n([ \t]*)/g; + LAST_DENT = /\n([ \t]*)/; + ASSIGNMENT = /^(:|=)$/; // Token cleaning regexes. - lex.JS_CLEANER = /(^`|`$)/g; - lex.MULTILINER = /\n/g; - lex.STRING_NEWLINES = /\n[ \t]*/g; - lex.COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg; - lex.NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/; - lex.HEREDOC_INDENT = /^[ \t]+/g; + JS_CLEANER = /(^`|`$)/g; + MULTILINER = /\n/g; + STRING_NEWLINES = /\n[ \t]*/g; + COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg; + NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/; + HEREDOC_INDENT = /^[ \t]+/g; // Tokens which a regular expression will never immediately follow, but which // a division operator might. // See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions - lex.NOT_REGEX = ['IDENTIFIER', 'NUMBER', 'REGEX', 'STRING', ')', '++', '--', ']', '}', 'FALSE', 'NULL', 'TRUE']; + NOT_REGEX = ['IDENTIFIER', 'NUMBER', 'REGEX', 'STRING', ')', '++', '--', ']', '}', 'FALSE', 'NULL', 'TRUE']; // Tokens which could legitimately be invoked or indexed. - lex.CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING']; + CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING']; // Scan by attempting to match tokens one character at a time. Slow and steady. lex.prototype.tokenize = function tokenize(code) { this.code = code; @@ -52,8 +54,8 @@ this.chunk = this.code.slice(this.i); this.extract_next_token(); } - // sys.puts "original stream: #{@tokens.inspect}" if process.ENV['VERBOSE'] - // this.close_indentation() + // sys.puts "original stream: " + this.tokens if process.ENV['VERBOSE'] + this.close_indentation(); // (new Rewriter()).rewrite(this.tokens) return this.tokens; }; @@ -78,7 +80,9 @@ if (this.regex_token()) { return null; } - // return if this.indent_token() + if (this.indent_token()) { + return null; + } if (this.comment_token()) { return null; } @@ -91,12 +95,12 @@ // Matches identifying literals: variables, keywords, method names, etc. lex.prototype.identifier_token = function identifier_token() { var id, tag; - if (!((id = this.match(lex.IDENTIFIER, 1)))) { + if (!((id = this.match(IDENTIFIER, 1)))) { return false; } // Keywords are special identifiers tagged with their own name, // 'if' will result in an ['IF', "if"] token. - tag = lex.KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER'; + tag = KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER'; if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) { tag = 'LEADING_WHEN'; } @@ -117,7 +121,7 @@ // Matches numbers, including decimals, hex, and exponential notation. lex.prototype.number_token = function number_token() { var number; - if (!((number = this.match(lex.NUMBER, 1)))) { + if (!((number = this.match(NUMBER, 1)))) { return false; } this.token('NUMBER', number); @@ -126,7 +130,7 @@ // Matches strings, including multi-line strings. lex.prototype.string_token = function string_token() { var escaped, string; - if (!((string = this.match(lex.STRING, 1)))) { + if (!((string = this.match(STRING, 1)))) { return false; } escaped = string.replace(STRING_NEWLINES, " \\\n"); @@ -137,12 +141,12 @@ // Matches heredocs, adjusting indentation to the correct level. lex.prototype.heredoc_token = function heredoc_token() { var doc, indent, match; - if (!((match = this.chunk.match(lex.HEREDOC)))) { + if (!((match = this.chunk.match(HEREDOC)))) { return false; } doc = match[2] || match[4]; - indent = doc.match(lex.HEREDOC_INDENT).sort()[0]; - doc = doc.replace(new RegExp("^" + indent, 'g'), '').replace(lex.MULTILINER, "\\n").replace('"', '\\"'); + indent = doc.match(HEREDOC_INDENT).sort()[0]; + doc = doc.replace(new RegExp("^" + indent, 'g'), '').replace(MULTILINER, "\\n").replace('"', '\\"'); this.token('STRING', '"' + doc + '"'); this.line += this.count(match[1], "\n"); return this.i += match[1].length; @@ -150,19 +154,19 @@ // Matches interpolated JavaScript. lex.prototype.js_token = function js_token() { var script; - if (!((script = this.match(lex.JS, 1)))) { + if (!((script = this.match(JS, 1)))) { return false; } - this.token('JS', script.replace(lex.JS_CLEANER, '')); + this.token('JS', script.replace(JS_CLEANER, '')); return this.i += script.length; }; // Matches regular expression literals. lex.prototype.regex_token = function regex_token() { var regex; - if (!((regex = this.match(lex.REGEX, 1)))) { + if (!((regex = this.match(REGEX, 1)))) { return false; } - if (lex.NOT_REGEX.indexOf(this.tag()) >= 0) { + if (NOT_REGEX.indexOf(this.tag()) >= 0) { return false; } this.token('REGEX', regex); @@ -171,36 +175,88 @@ // Matches and conumes comments. lex.prototype.comment_token = function comment_token() { var comment; - if (!((comment = this.match(lex.COMMENT, 1)))) { + if (!((comment = this.match(COMMENT, 1)))) { return false; } - this.line += comment.match(lex.MULTILINER).length; - this.token('COMMENT', comment.replace(lex.COMMENT_CLEANER, '').split(lex.MULTILINER)); + this.line += comment.match(MULTILINER).length; + this.token('COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER)); this.token("\n", "\n"); return this.i += comment.length; }; + // Record tokens for indentation differing from the previous line. + lex.prototype.indent_token = function indent_token() { + var diff, indent, next_character, no_newlines, size; + if (!((indent = this.match(MULTI_DENT, 1)))) { + return false; + } + this.line += indent.match(MULTILINER).length; + this.i += indent.length; + next_character = this.chunk.match(MULTI_DENT)[4]; + no_newlines = next_character === '.' || (this.value().match(NO_NEWLINE) && this.tokens[this.tokens.length - 2][0] !== '.' && !this.value().match(CODE)); + if (no_newlines) { + return this.suppress_newlines(indent); + } + size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length; + if (size === this.indent) { + return this.newline_token(indent); + } + if (size > this.indent) { + diff = size - this.indent; + this.token('INDENT', diff); + this.indents.push(diff); + } else { + this.outdent_token(this.indent - size); + } + return this.indent = size; + }; + // Record an oudent token or tokens, if we're moving back inwards past + // multiple recorded indents. + lex.prototype.outdent_token = function outdent_token(move_out) { + var last_indent; + while (move_out > 0 && this.indents.length) { + last_indent = this.indents.pop(); + this.token('OUTDENT', last_indent); + move_out -= last_indent; + } + return this.token("\n", "\n"); + }; // Matches and consumes non-meaningful whitespace. lex.prototype.whitespace_token = function whitespace_token() { var space; - if (!((space = this.match(lex.WHITESPACE, 1)))) { + if (!((space = this.match(WHITESPACE, 1)))) { return false; } this.value().spaced = true; return this.i += space.length; }; + // Multiple newlines get merged together. + // Use a trailing \ to escape newlines. + lex.prototype.newline_token = function newline_token(newlines) { + if (!(this.value() === "\n")) { + this.token("\n", "\n"); + } + return true; + }; + // Tokens to explicitly escape newlines are removed once their job is done. + lex.prototype.suppress_newlines = function suppress_newlines(newlines) { + if (this.value() === "\\") { + this.tokens.pop(); + } + return true; + }; // We treat all other single characters as a token. Eg.: ( ) , . ! // Multi-character operators are also literal tokens, so that Racc can assign // the proper order of operations. lex.prototype.literal_token = function literal_token() { var match, tag, value; - match = this.chunk.match(lex.OPERATOR); + match = this.chunk.match(OPERATOR); value = match && match[1]; - if (value && value.match(lex.CODE)) { - tag_parameters(); + if (value && value.match(CODE)) { + this.tag_parameters(); } value = value || this.chunk.substr(0, 1); - tag = value.match(lex.ASSIGNMENT) ? 'ASSIGN' : value; - if (this.value() && this.value().spaced && lex.CALLABLE.indexOf(this.tag() >= 0)) { + tag = value.match(ASSIGNMENT) ? 'ASSIGN' : value; + if (this.value() && this.value().spaced && CALLABLE.indexOf(this.tag() >= 0)) { if (value === '(') { tag = 'CALL_START'; } @@ -259,4 +315,36 @@ } return m ? m[index] : false; }; + // A source of ambiguity in our grammar was parameter lists in function + // definitions (as opposed to argument lists in function calls). Tag + // parameter identifiers in order to avoid this. Also, parameter lists can + // make use of splats. + lex.prototype.tag_parameters = function tag_parameters() { + var __a, i, tok; + if (this.tag() !== ')') { + return null; + } + i = 0; + __a = []; + while (true) { + i += 1; + tok = this.tokens[this.tokens.length - i]; + if (!tok) { + return null; + } + if (tok[0] === 'IDENTIFIER') { + tok[0] = 'PARAM'; + } else if (tok[0] === ')') { + tok[0] = 'PARAM_END'; + } else if (tok[0] === '(') { + return (tok[0] = 'PARAM_START'); + } + } + return __a; + }; + // Close up all remaining open blocks. IF the first token is an indent, + // axe it. + lex.prototype.close_indentation = function close_indentation() { + return this.outdent_token(this.indent); + }; })(); \ No newline at end of file diff --git a/lib/coffee_script/runner.js b/lib/coffee_script/runner.js index 10f6ac9b..cdda8327 100644 --- a/lib/coffee_script/runner.js +++ b/lib/coffee_script/runner.js @@ -2,7 +2,6 @@ var coffee, paths; // Quickie script to compile and run all the files given as arguments. coffee = require('./coffee-script'); - process.mixin(require('sys')); paths = process.ARGV; paths = paths.slice(2, paths.length); coffee.compile_files(paths, function(js) { diff --git a/src/coffee-script.coffee b/src/coffee-script.coffee index c1649364..0594c6c6 100644 --- a/src/coffee-script.coffee +++ b/src/coffee-script.coffee @@ -23,7 +23,11 @@ exports.compile_files: (paths, callback) -> coffee: process.createChildProcess compiler, ['--print'].concat(paths) coffee.addListener 'output', (results) -> js += results if results? + # NB: we have to add a mutex to make sure it doesn't get called twice. + exit_ran: false coffee.addListener 'exit', -> + return if exit_ran + exit_ran: true callback(js) diff --git a/src/lexer.coffee b/src/lexer.coffee index 096323a5..c40dec14 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -5,8 +5,10 @@ sys: require 'sys' # pushing some extra smarts into the Lexer. exports.Lexer: lex: -> +# Constants ============================================================ + # The list of keywords passed verbatim to the parser. -lex.KEYWORDS: [ +KEYWORDS: [ "if", "else", "then", "unless", "true", "false", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", @@ -20,39 +22,40 @@ lex.KEYWORDS: [ ] # Token matching regexes. -lex.IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ -lex.NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i -lex.STRING : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/ -lex.HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ -lex.JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/ -lex.OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ -lex.WHITESPACE : /^([ \t]+)/ -lex.COMMENT : /^(((\n?[ \t]*)?#.*$)+)/ -lex.CODE : /^((-|=)>)/ -lex.REGEX : /^(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/ -lex.MULTI_DENT : /^((\n([ \t]*))+)(\.)?/ -lex.LAST_DENT : /\n([ \t]*)/ -lex.ASSIGNMENT : /^(:|=)$/ +IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ +NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i +STRING : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/ +HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ +JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/ +OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ +WHITESPACE : /^([ \t]+)/ +COMMENT : /^(((\n?[ \t]*)?#.*$)+)/ +CODE : /^((-|=)>)/ +REGEX : /^(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/ +MULTI_DENT : /^((\n([ \t]*))+)(\.)?/ +LAST_DENTS : /\n([ \t]*)/g +LAST_DENT : /\n([ \t]*)/ +ASSIGNMENT : /^(:|=)$/ # Token cleaning regexes. -lex.JS_CLEANER : /(^`|`$)/g -lex.MULTILINER : /\n/g -lex.STRING_NEWLINES : /\n[ \t]*/g -lex.COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg -lex.NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/ -lex.HEREDOC_INDENT : /^[ \t]+/g +JS_CLEANER : /(^`|`$)/g +MULTILINER : /\n/g +STRING_NEWLINES : /\n[ \t]*/g +COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg +NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/ +HEREDOC_INDENT : /^[ \t]+/g # Tokens which a regular expression will never immediately follow, but which # a division operator might. # See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions -lex.NOT_REGEX: [ +NOT_REGEX: [ 'IDENTIFIER', 'NUMBER', 'REGEX', 'STRING', ')', '++', '--', ']', '}', 'FALSE', 'NULL', 'TRUE' ] # Tokens which could legitimately be invoked or indexed. -lex.CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING'] +CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING'] # Scan by attempting to match tokens one character at a time. Slow and steady. lex::tokenize: (code) -> @@ -65,8 +68,8 @@ lex::tokenize: (code) -> while this.i < this.code.length this.chunk: this.code.slice(this.i) this.extract_next_token() - # sys.puts "original stream: #{@tokens.inspect}" if process.ENV['VERBOSE'] - # this.close_indentation() + # sys.puts "original stream: " + this.tokens if process.ENV['VERBOSE'] + this.close_indentation() # (new Rewriter()).rewrite(this.tokens) this.tokens @@ -79,7 +82,7 @@ lex::extract_next_token: -> return if this.string_token() return if this.js_token() return if this.regex_token() - # return if this.indent_token() + return if this.indent_token() return if this.comment_token() return if this.whitespace_token() return this.literal_token() @@ -88,10 +91,10 @@ lex::extract_next_token: -> # Matches identifying literals: variables, keywords, method names, etc. lex::identifier_token: -> - return false unless id: this.match lex.IDENTIFIER, 1 + return false unless id: this.match IDENTIFIER, 1 # Keywords are special identifiers tagged with their own name, # 'if' will result in an ['IF', "if"] token. - tag: if lex.KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER' + tag: if KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER' tag: 'LEADING_WHEN' if tag is 'WHEN' and (this.tag() is 'OUTDENT' or this.tag() is 'INDENT') this.tag(-1, 'PROTOTYPE_ACCESS') if tag is 'IDENTIFIER' and this.value() is '::' if tag is 'IDENTIFIER' and this.value() is '.' and !(this.value(-2) is '.') @@ -105,13 +108,13 @@ lex::identifier_token: -> # Matches numbers, including decimals, hex, and exponential notation. lex::number_token: -> - return false unless number: this.match lex.NUMBER, 1 + return false unless number: this.match NUMBER, 1 this.token 'NUMBER', number this.i += number.length # Matches strings, including multi-line strings. lex::string_token: -> - return false unless string: this.match lex.STRING, 1 + return false unless string: this.match STRING, 1 escaped: string.replace STRING_NEWLINES, " \\\n" this.token 'STRING', escaped this.line += this.count string, "\n" @@ -119,11 +122,11 @@ lex::string_token: -> # Matches heredocs, adjusting indentation to the correct level. lex::heredoc_token: -> - return false unless match = this.chunk.match(lex.HEREDOC) + return false unless match = this.chunk.match(HEREDOC) doc: match[2] or match[4] - indent: doc.match(lex.HEREDOC_INDENT).sort()[0] + indent: doc.match(HEREDOC_INDENT).sort()[0] doc: doc.replace(new RegExp("^" + indent, 'g'), '') - .replace(lex.MULTILINER, "\\n") + .replace(MULTILINER, "\\n") .replace('"', '\\"') this.token 'STRING', '"' + doc + '"' this.line += this.count match[1], "\n" @@ -131,45 +134,79 @@ lex::heredoc_token: -> # Matches interpolated JavaScript. lex::js_token: -> - return false unless script: this.match lex.JS, 1 - this.token 'JS', script.replace(lex.JS_CLEANER, '') + return false unless script: this.match JS, 1 + this.token 'JS', script.replace(JS_CLEANER, '') this.i += script.length # Matches regular expression literals. lex::regex_token: -> - return false unless regex: this.match lex.REGEX, 1 - return false if lex.NOT_REGEX.indexOf(this.tag()) >= 0 + return false unless regex: this.match REGEX, 1 + return false if NOT_REGEX.indexOf(this.tag()) >= 0 this.token 'REGEX', regex this.i += regex.length # Matches and conumes comments. lex::comment_token: -> - return false unless comment: this.match lex.COMMENT, 1 - this.line += comment.match(lex.MULTILINER).length - this.token 'COMMENT', comment.replace(lex.COMMENT_CLEANER, '').split(lex.MULTILINER) + return false unless comment: this.match COMMENT, 1 + this.line += comment.match(MULTILINER).length + this.token 'COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER) this.token "\n", "\n" this.i += comment.length +# Record tokens for indentation differing from the previous line. +lex::indent_token: -> + return false unless indent: this.match MULTI_DENT, 1 + this.line += indent.match(MULTILINER).length + this.i += indent.length + next_character: this.chunk.match(MULTI_DENT)[4] + no_newlines: next_character is '.' or (this.value().match(NO_NEWLINE) and this.tokens[this.tokens.length - 2][0] isnt '.' and not this.value().match(CODE)) + return this.suppress_newlines(indent) if no_newlines + size: indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length + return this.newline_token(indent) if size is this.indent + if size > this.indent + diff: size - this.indent + this.token 'INDENT', diff + this.indents.push diff + else + this.outdent_token this.indent - size + this.indent: size - - +# Record an oudent token or tokens, if we're moving back inwards past +# multiple recorded indents. +lex::outdent_token: (move_out) -> + while move_out > 0 and this.indents.length + last_indent: this.indents.pop() + this.token 'OUTDENT', last_indent + move_out -= last_indent + this.token "\n", "\n" # Matches and consumes non-meaningful whitespace. lex::whitespace_token: -> - return false unless space: this.match lex.WHITESPACE, 1 + return false unless space: this.match WHITESPACE, 1 this.value().spaced: true this.i += space.length +# Multiple newlines get merged together. +# Use a trailing \ to escape newlines. +lex::newline_token: (newlines) -> + this.token "\n", "\n" unless this.value() is "\n" + true + +# Tokens to explicitly escape newlines are removed once their job is done. +lex::suppress_newlines: (newlines) -> + this.tokens.pop() if this.value() is "\\" + true + # We treat all other single characters as a token. Eg.: ( ) , . ! # Multi-character operators are also literal tokens, so that Racc can assign # the proper order of operations. lex::literal_token: -> - match: this.chunk.match(lex.OPERATOR) + match: this.chunk.match(OPERATOR) value: match and match[1] - tag_parameters() if value and value.match(lex.CODE) + this.tag_parameters() if value and value.match(CODE) value ||= this.chunk.substr(0, 1) - tag: if value.match(lex.ASSIGNMENT) then 'ASSIGN' else value - if this.value() and this.value().spaced and lex.CALLABLE.indexOf(this.tag() >= 0) + tag: if value.match(ASSIGNMENT) then 'ASSIGN' else value + if this.value() and this.value().spaced and CALLABLE.indexOf(this.tag() >= 0) tag: 'CALL_START' if value is '(' tag: 'INDEX_START' if value is '[' this.token tag, value @@ -209,30 +246,23 @@ lex::match: (regex, index) -> return false unless m: this.chunk.match(regex) if m then m[index] else false +# A source of ambiguity in our grammar was parameter lists in function +# definitions (as opposed to argument lists in function calls). Tag +# parameter identifiers in order to avoid this. Also, parameter lists can +# make use of splats. +lex::tag_parameters: -> + return if this.tag() isnt ')' + i: 0 + while true + i += 1 + tok: this.tokens[this.tokens.length - i] + return if not tok + switch tok[0] + when 'IDENTIFIER' then tok[0]: 'PARAM' + when ')' then tok[0]: 'PARAM_END' + when '(' then return tok[0]: 'PARAM_START' - - - - - - - - - - - - - - - - - - - - - - - - - - +# Close up all remaining open blocks. IF the first token is an indent, +# axe it. +lex::close_indentation: -> + this.outdent_token(this.indent) diff --git a/src/runner.coffee b/src/runner.coffee index aed68387..7e7b7b69 100644 --- a/src/runner.coffee +++ b/src/runner.coffee @@ -1,7 +1,6 @@ # Quickie script to compile and run all the files given as arguments. coffee: require './coffee-script' -process.mixin require 'sys' paths: process.ARGV paths: paths[2...paths.length]