From eff2f4b520602c88d6ef8fdf873bdb703d29e8ac Mon Sep 17 00:00:00 2001 From: Jeremy Ashkenas Date: Sat, 30 Jan 2010 00:37:38 -0500 Subject: [PATCH] a little further on with the lexer --- lib/coffee_script/lexer.js | 55 +++++++++++++++++++++++++++++++++++--- src/lexer.coffee | 36 ++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 7 deletions(-) diff --git a/lib/coffee_script/lexer.js b/lib/coffee_script/lexer.js index 9c01af22..cbedbdcc 100644 --- a/lib/coffee_script/lexer.js +++ b/lib/coffee_script/lexer.js @@ -10,9 +10,9 @@ // Token matching regexes. lex.IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/; lex.NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i; - lex.STRING = /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m; - lex.HEREDOC = /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m; - lex.JS = /^(``|`(.*?)([^\\]|\\\\)`)/m; + lex.STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/; + lex.HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; + lex.JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; lex.OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/; lex.WHITESPACE = /^([ \t]+)/; lex.COMMENT = /^(((\n?[ \t]*)?#.*$)+)/; @@ -92,5 +92,54 @@ } return this.literal_token(); }; + // Look at a tag in the current token stream. + lex.prototype.tag = function tag(index, tag) { + var tok; + if (!((tok = this.tokens[index || -1]))) { + return null; + } + if ((typeof tag !== "undefined" && tag !== null)) { + return (tok[0] = tag); + } + return tok[0]; + }; + // Look at a value in the current token stream. + lex.prototype.value = function value(index, val) { + var tok; + if (!((tok = this.tokens[index || -1]))) { + return null; + } + if ((typeof val !== "undefined" && val !== null)) { + return (tok[1] = val); + } + return tok[1]; + }; // Tokenizers ========================================================== + // Matches identifying literals: variables, keywords, method names, etc. + lex.prototype.identifier_token = function identifier_token() { + var id, match, tag; + match = this.chunk.match(lex.IDENTIFIER); + if (!(match && (id = match[1]))) { + return false; + } + // Keywords are special identifiers tagged with their own name, + // 'if' will result in an ['IF', "if"] token. + tag = this.KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER'; + if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) { + tag = 'LEADING_WHEN'; + } + if (tag === 'IDENTIFIER' && this.value() === '::') { + this.tag(-1, 'PROTOTYPE_ACCESS'); + } + if (tag === 'IDENTIFIER' && this.value() === '.' && !(this.value(-2) === '.')) { + if (this.tag(-2) === '?') { + this.tag(-1, 'SOAK_ACCESS'); + this.tokens.splice(-2, 1); + } else { + this.tag(-1, 'PROPERTY_ACCESS'); + } + } + this.token(tag, id); + return this.i += id.length; + }; })(); \ No newline at end of file diff --git a/src/lexer.coffee b/src/lexer.coffee index 0bf041c1..3e028331 100644 --- a/src/lexer.coffee +++ b/src/lexer.coffee @@ -22,9 +22,9 @@ lex.KEYWORDS: [ # Token matching regexes. lex.IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/ lex.NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i -lex.STRING : /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m -lex.HEREDOC : /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m -lex.JS : /^(``|`(.*?)([^\\]|\\\\)`)/m +lex.STRING : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/ +lex.HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/ +lex.JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/ lex.OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/ lex.WHITESPACE : /^([ \t]+)/ lex.COMMENT : /^(((\n?[ \t]*)?#.*$)+)/ @@ -84,9 +84,37 @@ lex::extract_next_token: -> return if this.whitespace_token() return this.literal_token() +# Look at a tag in the current token stream. +lex::tag: (index, tag) -> + return unless tok: this.tokens[index || -1] + return tok[0]: tag if tag? + tok[0] + +# Look at a value in the current token stream. +lex::value: (index, val) -> + return unless tok: this.tokens[index || -1] + return tok[1]: val if val? + tok[1] + # Tokenizers ========================================================== - +# Matches identifying literals: variables, keywords, method names, etc. +lex::identifier_token: -> + match: this.chunk.match(lex.IDENTIFIER) + return false unless match and id: match[1] + # Keywords are special identifiers tagged with their own name, + # 'if' will result in an ['IF', "if"] token. + tag: if this.KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER' + tag: 'LEADING_WHEN' if tag is 'WHEN' and (this.tag() is 'OUTDENT' or this.tag() is 'INDENT') + this.tag(-1, 'PROTOTYPE_ACCESS') if tag is 'IDENTIFIER' and this.value() is '::' + if tag is 'IDENTIFIER' and this.value() is '.' and !(this.value(-2) is '.') + if this.tag(-2) is '?' + this.tag(-1, 'SOAK_ACCESS') + this.tokens.splice(-2, 1) + else + this.tag(-1, 'PROPERTY_ACCESS') + this.token(tag, id) + this.i += id.length