diff --git a/lib/coffee_script/lexer.js b/lib/coffee_script/lexer.js
index cbedbdcc..00c261ed 100644
--- a/lib/coffee_script/lexer.js
+++ b/lib/coffee_script/lexer.js
@@ -22,12 +22,12 @@
   lex.LAST_DENT = /\n([ \t]*)/;
   lex.ASSIGNMENT = /^(:|=)$/;
   // Token cleaning regexes.
-  lex.JS_CLEANER = /(^`|`$)/;
-  lex.MULTILINER = /\n/;
-  lex.STRING_NEWLINES = /\n[ \t]*/;
-  lex.COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/;
+  lex.JS_CLEANER = /(^`|`$)/g;
+  lex.MULTILINER = /\n/g;
+  lex.STRING_NEWLINES = /\n[ \t]*/g;
+  lex.COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg;
   lex.NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/;
-  lex.HEREDOC_INDENT = /^[ \t]+/;
+  lex.HEREDOC_INDENT = /^[ \t]+/g;
   // Tokens which a regular expression will never immediately follow, but which
   // a division operator might.
   // See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
@@ -36,8 +36,8 @@
   lex.CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING'];
   // Scan by attempting to match tokens one character at a time. Slow and steady.
   lex.prototype.tokenize = function tokenize(code) {
-    this.code = code.chomp;
-    // Cleanup code by remove extra line breaks
+    this.code = code;
+    // Cleanup code by remove extra line breaks, TODO: chomp
     this.i = 0;
     // Current character position we're parsing
     this.line = 1;
@@ -48,17 +48,14 @@
     // The stack of all indent levels we are currently within.
     this.tokens = [];
     // Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
-    this.spaced = nil;
-    // The last value that has a space following it.
     while (this.i < this.code.length) {
-      this.chunk = this.code.slice(this.i, -1 + 1);
+      this.chunk = this.code.slice(this.i);
       this.extract_next_token();
     }
-    if (process.ENV['VERBOSE']) {
-      sys.puts("original stream: #{@tokens.inspect}");
-    }
-    this.close_indentation();
-    return (new Rewriter()).rewrite(this.tokens);
+    // sys.puts "original stream: #{@tokens.inspect}" if process.ENV['VERBOSE']
+    // this.close_indentation()
+    // (new Rewriter()).rewrite(this.tokens)
+    return this.tokens;
   };
   // At every position, run through this list of attempted matches,
   // short-circuiting if any of them succeed.
@@ -81,9 +78,7 @@
     if (this.regex_token()) {
      return null;
     }
-    if (this.indent_token()) {
-      return null;
-    }
+    // return if this.indent_token()
     if (this.comment_token()) {
      return null;
     }
@@ -92,39 +87,16 @@
     }
     return this.literal_token();
   };
-  // Look at a tag in the current token stream.
-  lex.prototype.tag = function tag(index, tag) {
-    var tok;
-    if (!((tok = this.tokens[index || -1]))) {
-      return null;
-    }
-    if ((typeof tag !== "undefined" && tag !== null)) {
-      return (tok[0] = tag);
-    }
-    return tok[0];
-  };
-  // Look at a value in the current token stream.
-  lex.prototype.value = function value(index, val) {
-    var tok;
-    if (!((tok = this.tokens[index || -1]))) {
-      return null;
-    }
-    if ((typeof val !== "undefined" && val !== null)) {
-      return (tok[1] = val);
-    }
-    return tok[1];
-  };
   // Tokenizers ==========================================================
   // Matches identifying literals: variables, keywords, method names, etc.
   lex.prototype.identifier_token = function identifier_token() {
-    var id, match, tag;
-    match = this.chunk.match(lex.IDENTIFIER);
-    if (!(match && (id = match[1]))) {
+    var id, tag;
+    if (!((id = this.match(lex.IDENTIFIER, 1)))) {
       return false;
     }
     // Keywords are special identifiers tagged with their own name,
     // 'if' will result in an ['IF', "if"] token.
-    tag = this.KEYWORDS.indexOf(id) >= 0 ?
-      id.toUpperCase() : 'IDENTIFIER';
+    tag = lex.KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER';
     if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) {
       tag = 'LEADING_WHEN';
     }
@@ -142,4 +114,149 @@
     this.token(tag, id);
     return this.i += id.length;
   };
+  // Matches numbers, including decimals, hex, and exponential notation.
+  lex.prototype.number_token = function number_token() {
+    var number;
+    if (!((number = this.match(lex.NUMBER, 1)))) {
+      return false;
+    }
+    this.token('NUMBER', number);
+    return this.i += number.length;
+  };
+  // Matches strings, including multi-line strings.
+  lex.prototype.string_token = function string_token() {
+    var escaped, string;
+    if (!((string = this.match(lex.STRING, 1)))) {
+      return false;
+    }
+    escaped = string.replace(lex.STRING_NEWLINES, " \\\n");
+    this.token('STRING', escaped);
+    this.line += this.count(string, "\n");
+    return this.i += string.length;
+  };
+  // Matches heredocs, adjusting indentation to the correct level.
+  lex.prototype.heredoc_token = function heredoc_token() {
+    var doc, indent, match;
+    if (!((match = this.chunk.match(lex.HEREDOC)))) {
+      return false;
+    }
+    doc = match[2] || match[4];
+    indent = doc.match(lex.HEREDOC_INDENT).sort()[0];
+    doc = doc.replace(new RegExp("^" + indent, 'g'), '').replace(lex.MULTILINER, "\\n").replace('"', '\\"');
+    this.token('STRING', '"' + doc + '"');
+    this.line += this.count(match[1], "\n");
+    return this.i += match[1].length;
+  };
+  // Matches interpolated JavaScript.
+  lex.prototype.js_token = function js_token() {
+    var script;
+    if (!((script = this.match(lex.JS, 1)))) {
+      return false;
+    }
+    this.token('JS', script.replace(lex.JS_CLEANER, ''));
+    return this.i += script.length;
+  };
+  // Matches regular expression literals.
+  lex.prototype.regex_token = function regex_token() {
+    var regex;
+    if (!((regex = this.match(lex.REGEX, 1)))) {
+      return false;
+    }
+    if (lex.NOT_REGEX.indexOf(this.tag()) >= 0) {
+      return false;
+    }
+    this.token('REGEX', regex);
+    return this.i += regex.length;
+  };
+  // Matches and consumes comments.
+  lex.prototype.comment_token = function comment_token() {
+    var comment;
+    if (!((comment = this.match(lex.COMMENT, 1)))) {
+      return false;
+    }
+    this.line += comment.match(lex.MULTILINER).length;
+    this.token('COMMENT', comment.replace(lex.COMMENT_CLEANER, '').split(lex.MULTILINER));
+    this.token("\n", "\n");
+    return this.i += comment.length;
+  };
+  // Matches and consumes non-meaningful whitespace.
+  lex.prototype.whitespace_token = function whitespace_token() {
+    var space;
+    if (!((space = this.match(lex.WHITESPACE, 1)))) {
+      return false;
+    }
+    this.value().spaced = true;
+    return this.i += space.length;
+  };
+  // We treat all other single characters as a token. Eg.: ( ) , . !
+  // Multi-character operators are also literal tokens, so that Racc can assign
+  // the proper order of operations.
+  lex.prototype.literal_token = function literal_token() {
+    var match, tag, value;
+    match = this.chunk.match(lex.OPERATOR);
+    value = match && match[1];
+    if (value && value.match(lex.CODE)) {
+      this.tag_parameters();
+    }
+    value = value || this.chunk.substr(0, 1);
+    tag = value.match(lex.ASSIGNMENT) ? 'ASSIGN' : value;
+    if (this.value() && this.value().spaced && lex.CALLABLE.indexOf(this.tag()) >= 0) {
+      if (value === '(') {
+        tag = 'CALL_START';
+      }
+      if (value === '[') {
+        tag = 'INDEX_START';
+      }
+    }
+    this.token(tag, value);
+    return this.i += value.length;
+  };
+  // Helpers =============================================================
+  // Add a token to the results, taking note of the line number.
+  lex.prototype.token = function token(tag, value) {
+    return this.tokens.push([tag, value]);
+    // this.tokens.push([tag, Value.new(value, @line)])
+  };
+  // Look at a tag in the current token stream.
+  lex.prototype.tag = function tag(index, tag) {
+    var tok;
+    if (!((tok = this.tokens[this.tokens.length - (index || 1)]))) {
+      return null;
+    }
+    if ((typeof tag !== "undefined" && tag !== null)) {
+      return (tok[0] = tag);
+    }
+    return tok[0];
+  };
+  // Look at a value in the current token stream.
+  lex.prototype.value = function value(index, val) {
+    var tok;
+    if (!((tok = this.tokens[this.tokens.length - (index || 1)]))) {
+      return null;
+    }
+    if ((typeof val !== "undefined" && val !== null)) {
+      return (tok[1] = val);
+    }
+    return tok[1];
+  };
+  // Count the occurrences of a character in a string.
+  lex.prototype.count = function count(string, char) {
+    var num, pos;
+    num = 0;
+    pos = string.indexOf(char);
+    while (pos !== -1) {
+      num += 1;
+      pos = string.indexOf(char, pos + 1);
+    }
+    return num;
+  };
+  // Attempt to match a string against the current chunk, returning the indexed
+  // match.
+  lex.prototype.match = function match(regex, index) {
+    var m;
+    if (!((m = this.chunk.match(regex)))) {
+      return false;
+    }
+    return m ? m[index] : false;
+  };
 })();
\ No newline at end of file
diff --git a/lib/coffee_script/lexer.rb b/lib/coffee_script/lexer.rb
index ecb56021..99f40a7f 100644
--- a/lib/coffee_script/lexer.rb
+++ b/lib/coffee_script/lexer.rb
@@ -229,8 +229,7 @@ module CoffeeScript
 
     # Helpers ==========================================================
 
-    # Add a token to the results, taking note of the line number, and
-    # immediately-preceding comment.
+    # Add a token to the results, taking note of the line number.
     def token(tag, value)
       @tokens << [tag, Value.new(value, @line)]
     end
diff --git a/src/lexer.coffee b/src/lexer.coffee
index 3e028331..096323a5 100644
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -35,12 +35,12 @@
 lex.LAST_DENT : /\n([ \t]*)/
 lex.ASSIGNMENT : /^(:|=)$/
 # Token cleaning regexes.
-lex.JS_CLEANER : /(^`|`$)/
-lex.MULTILINER : /\n/
-lex.STRING_NEWLINES : /\n[ \t]*/
-lex.COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/
+lex.JS_CLEANER : /(^`|`$)/g
+lex.MULTILINER : /\n/g
+lex.STRING_NEWLINES : /\n[ \t]*/g
+lex.COMMENT_CLEANER : /(^[ \t]*#|\n[ \t]*$)/mg
 lex.NO_NEWLINE : /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/
-lex.HEREDOC_INDENT : /^[ \t]+/
+lex.HEREDOC_INDENT : /^[ \t]+/g
 
 # Tokens which a regular expression will never immediately follow, but which
 # a division operator might.
@@ -56,19 +56,19 @@ lex.CALLABLE: ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING']
 
 # Scan by attempting to match tokens one character at a time. Slow and steady.
 lex::tokenize: (code) ->
-  this.code : code.chomp # Cleanup code by remove extra line breaks
+  this.code : code # Cleanup code by remove extra line breaks, TODO: chomp
   this.i : 0 # Current character position we're parsing
   this.line : 1 # The current line.
   this.indent : 0 # The current indent level.
  this.indents : [] # The stack of all indent levels we are currently within.
   this.tokens : [] # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
-  this.spaced : nil # The last value that has a space following it.
   while this.i < this.code.length
-    this.chunk: this.code[this.i..-1]
+    this.chunk: this.code.slice(this.i)
     this.extract_next_token()
-  sys.puts "original stream: #{@tokens.inspect}" if process.ENV['VERBOSE']
-  this.close_indentation()
-  (new Rewriter()).rewrite(this.tokens)
+  # sys.puts "original stream: #{@tokens.inspect}" if process.ENV['VERBOSE']
+  # this.close_indentation()
+  # (new Rewriter()).rewrite(this.tokens)
+  this.tokens
 
 # At every position, run through this list of attempted matches,
 # short-circuiting if any of them succeed.
@@ -79,32 +79,19 @@ lex::extract_next_token: ->
   return if this.string_token()
   return if this.js_token()
   return if this.regex_token()
-  return if this.indent_token()
+  # return if this.indent_token()
   return if this.comment_token()
   return if this.whitespace_token()
   return this.literal_token()
 
-# Look at a tag in the current token stream.
-lex::tag: (index, tag) ->
-  return unless tok: this.tokens[index || -1]
-  return tok[0]: tag if tag?
-  tok[0]
-
-# Look at a value in the current token stream.
-lex::value: (index, val) ->
-  return unless tok: this.tokens[index || -1]
-  return tok[1]: val if val?
-  tok[1]
-
 # Tokenizers ==========================================================
 
 # Matches identifying literals: variables, keywords, method names, etc.
 lex::identifier_token: ->
-  match: this.chunk.match(lex.IDENTIFIER)
-  return false unless match and id: match[1]
+  return false unless id: this.match lex.IDENTIFIER, 1
   # Keywords are special identifiers tagged with their own name,
   # 'if' will result in an ['IF', "if"] token.
-  tag: if this.KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER'
+  tag: if lex.KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER'
   tag: 'LEADING_WHEN' if tag is 'WHEN' and (this.tag() is 'OUTDENT' or this.tag() is 'INDENT')
   this.tag(-1, 'PROTOTYPE_ACCESS') if tag is 'IDENTIFIER' and this.value() is '::'
   if tag is 'IDENTIFIER' and this.value() is '.'
 and !(this.value(-2) is '.')
@@ -116,6 +103,115 @@ lex::identifier_token: ->
   this.token(tag, id)
   this.i += id.length
 
+# Matches numbers, including decimals, hex, and exponential notation.
+lex::number_token: ->
+  return false unless number: this.match lex.NUMBER, 1
+  this.token 'NUMBER', number
+  this.i += number.length
+
+# Matches strings, including multi-line strings.
+lex::string_token: ->
+  return false unless string: this.match lex.STRING, 1
+  escaped: string.replace lex.STRING_NEWLINES, " \\\n"
+  this.token 'STRING', escaped
+  this.line += this.count string, "\n"
+  this.i += string.length
+
+# Matches heredocs, adjusting indentation to the correct level.
+lex::heredoc_token: ->
+  return false unless match = this.chunk.match(lex.HEREDOC)
+  doc: match[2] or match[4]
+  indent: doc.match(lex.HEREDOC_INDENT).sort()[0]
+  doc: doc.replace(new RegExp("^" + indent, 'g'), '')
+          .replace(lex.MULTILINER, "\\n")
+          .replace('"', '\\"')
+  this.token 'STRING', '"' + doc + '"'
+  this.line += this.count match[1], "\n"
+  this.i += match[1].length
+
+# Matches interpolated JavaScript.
+lex::js_token: ->
+  return false unless script: this.match lex.JS, 1
+  this.token 'JS', script.replace(lex.JS_CLEANER, '')
+  this.i += script.length
+
+# Matches regular expression literals.
+lex::regex_token: ->
+  return false unless regex: this.match lex.REGEX, 1
+  return false if lex.NOT_REGEX.indexOf(this.tag()) >= 0
+  this.token 'REGEX', regex
+  this.i += regex.length
+
+# Matches and consumes comments.
+lex::comment_token: ->
+  return false unless comment: this.match lex.COMMENT, 1
+  this.line += comment.match(lex.MULTILINER).length
+  this.token 'COMMENT', comment.replace(lex.COMMENT_CLEANER, '').split(lex.MULTILINER)
+  this.token "\n", "\n"
+  this.i += comment.length
+
+
+
+
+
+# Matches and consumes non-meaningful whitespace.
+lex::whitespace_token: ->
+  return false unless space: this.match lex.WHITESPACE, 1
+  this.value().spaced: true
+  this.i += space.length
+
+# We treat all other single characters as a token. Eg.: ( ) , . !
+# Multi-character operators are also literal tokens, so that Racc can assign
+# the proper order of operations.
+lex::literal_token: ->
+  match: this.chunk.match(lex.OPERATOR)
+  value: match and match[1]
+  this.tag_parameters() if value and value.match(lex.CODE)
+  value ||= this.chunk.substr(0, 1)
+  tag: if value.match(lex.ASSIGNMENT) then 'ASSIGN' else value
+  if this.value() and this.value().spaced and lex.CALLABLE.indexOf(this.tag()) >= 0
+    tag: 'CALL_START' if value is '('
+    tag: 'INDEX_START' if value is '['
+  this.token tag, value
+  this.i += value.length
+
+# Helpers =============================================================
+
+# Add a token to the results, taking note of the line number.
+lex::token: (tag, value) ->
+  this.tokens.push([tag, value])
+  # this.tokens.push([tag, Value.new(value, @line)])
+
+# Look at a tag in the current token stream.
+lex::tag: (index, tag) ->
+  return unless tok: this.tokens[this.tokens.length - (index || 1)]
+  return tok[0]: tag if tag?
+  tok[0]
+
+# Look at a value in the current token stream.
+lex::value: (index, val) ->
+  return unless tok: this.tokens[this.tokens.length - (index || 1)]
+  return tok[1]: val if val?
+  tok[1]
+
+# Count the occurrences of a character in a string.
+lex::count: (string, char) ->
+  num: 0
+  pos: string.indexOf(char)
+  while pos isnt -1
+    num += 1
+    pos: string.indexOf(char, pos + 1)
+  num
+
+# Attempt to match a string against the current chunk, returning the indexed
+# match.
+lex::match: (regex, index) ->
+  return false unless m: this.chunk.match(regex)
+  if m then m[index] else false
+
+