From eff2f4b520602c88d6ef8fdf873bdb703d29e8ac Mon Sep 17 00:00:00 2001
From: Jeremy Ashkenas <jashkenas@gmail.com>
Date: Sat, 30 Jan 2010 00:37:38 -0500
Subject: [PATCH] a little further on with the lexer

---
 lib/coffee_script/lexer.js | 55 +++++++++++++++++++++++++++++++++++---
 src/lexer.coffee           | 36 ++++++++++++++++++++++---
 2 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/lib/coffee_script/lexer.js b/lib/coffee_script/lexer.js
index 9c01af22..cbedbdcc 100644
--- a/lib/coffee_script/lexer.js
+++ b/lib/coffee_script/lexer.js
@@ -10,9 +10,9 @@
   // Token matching regexes.
   lex.IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/;
   lex.NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i;
-  lex.STRING = /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m;
-  lex.HEREDOC = /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m;
-  lex.JS = /^(``|`(.*?)([^\\]|\\\\)`)/m;
+  lex.STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/; // [\s\S] rather than '.', so a string body may span newlines
+  lex.HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/; // same: heredoc bodies are multi-line
+  lex.JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/; // same: embedded JS may span newlines
   lex.OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/;
   lex.WHITESPACE = /^([ \t]+)/;
   lex.COMMENT = /^(((\n?[ \t]*)?#.*$)+)/;
@@ -92,5 +92,54 @@
     }
     return this.literal_token();
   };
+  // Look at a tag in the current token stream.
+  lex.prototype.tag = function tag(index, tag) {
+    var tok;
+    if (!((tok = this.tokens[index || -1]))) {
+      return null;
+    }
+    if ((typeof tag !== "undefined" && tag !== null)) {
+      return (tok[0] = tag);
+    }
+    return tok[0];
+  };
+  // Look at a value in the current token stream.
+  lex.prototype.value = function value(index, val) {
+    var tok;
+    if (!((tok = this.tokens[index || -1]))) {
+      return null;
+    }
+    if ((typeof val !== "undefined" && val !== null)) {
+      return (tok[1] = val);
+    }
+    return tok[1];
+  };
   // Tokenizers ==========================================================
+  // Matches identifying literals: variables, keywords, method names, etc.
+  lex.prototype.identifier_token = function identifier_token() {
+    var id, match, tag;
+    match = this.chunk.match(lex.IDENTIFIER);
+    if (!(match && (id = match[1]))) {
+      return false;
+    }
+    // Keywords are special identifiers tagged with their own name,
+    // 'if' will result in an ['IF', "if"] token.
+    tag = this.KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER';
+    if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) {
+      tag = 'LEADING_WHEN';
+    }
+    if (tag === 'IDENTIFIER' && this.value() === '::') {
+      this.tag(-1, 'PROTOTYPE_ACCESS');
+    }
+    if (tag === 'IDENTIFIER' && this.value() === '.' && !(this.value(-2) === '.')) {
+      if (this.tag(-2) === '?') {
+        this.tag(-1, 'SOAK_ACCESS');
+        this.tokens.splice(-2, 1);
+      } else {
+        this.tag(-1, 'PROPERTY_ACCESS');
+      }
+    }
+    this.token(tag, id);
+    return this.i += id.length;
+  };
 })();
\ No newline at end of file
diff --git a/src/lexer.coffee b/src/lexer.coffee
index 0bf041c1..3e028331 100644
--- a/src/lexer.coffee
+++ b/src/lexer.coffee
@@ -22,9 +22,9 @@ lex.KEYWORDS: [
 # Token matching regexes.
 lex.IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/
 lex.NUMBER     : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
-lex.STRING     : /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m
-lex.HEREDOC    : /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m
-lex.JS         : /^(``|`(.*?)([^\\]|\\\\)`)/m
+lex.STRING     : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/
+lex.HEREDOC    : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
+lex.JS         : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/
 lex.OPERATOR   : /^([+\*&|\/\-%=<>:!?]+)/
 lex.WHITESPACE : /^([ \t]+)/
 lex.COMMENT    : /^(((\n?[ \t]*)?#.*$)+)/
@@ -84,9 +84,37 @@ lex::extract_next_token: ->
   return if this.whitespace_token()
   return    this.literal_token()
 
+# Read the tag of a token in the stream, or overwrite it when `tag` is given. `index` defaults to -1 (the last token); a negative index counts back from the end, since JS arrays do not do that natively.
+lex::tag: (index, tag) ->
+  return unless tok: this.tokens[(index > 0 and index) or this.tokens.length + (index or -1)]
+  return tok[0]: tag if tag?
+  tok[0]
+
+# Read the value of a token in the stream, or overwrite it when `val` is given. `index` defaults to -1 (the last token); a negative index counts back from the end, since JS arrays do not do that natively.
+lex::value: (index, val) ->
+  return unless tok: this.tokens[(index > 0 and index) or this.tokens.length + (index or -1)]
+  return tok[1]: val if val?
+  tok[1]
+
 # Tokenizers ==========================================================
 
-
+# Matches identifying literals: variables, keywords, method names, etc. Returns false when the chunk does not start with an identifier; otherwise hands the token to this.token and advances this.i.
+lex::identifier_token: ->
+  match: this.chunk.match(lex.IDENTIFIER)
+  return false unless match and id: match[1]
+  # Keywords are special identifiers tagged with their own name: 'if' results in an
+  # ['IF', "if"] token. KEYWORDS is declared on lex itself (like IDENTIFIER), so look it up there.
+  tag: if lex.KEYWORDS.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER'
+  tag: 'LEADING_WHEN' if tag is 'WHEN' and (this.tag() is 'OUTDENT' or this.tag() is 'INDENT')
+  this.tag(-1, 'PROTOTYPE_ACCESS') if tag is 'IDENTIFIER' and this.value() is '::'
+  if tag is 'IDENTIFIER' and this.value() is '.' and !(this.value(-2) is '.')
+    if this.tag(-2) is '?'
+      this.tag(-1, 'SOAK_ACCESS')
+      this.tokens.splice(-2, 1)
+    else
+      this.tag(-1, 'PROPERTY_ACCESS')
+  this.token(tag, id)
+  this.i += id.length