Mirror of https://github.com/jashkenas/coffeescript.git, synced 2022-11-09 12:23:24 -05:00
Lexer now adds location data, including first/last line/column to all generated tokens.
commit bb94e02fad
parent 25126e2f99
4 changed files with 340 additions and 109 deletions
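For orientation, here is a minimal sketch of the token shape this commit introduces (illustrative values, not part of the diff): tokens remain arrays of [tag, value, first_line], and each token now also carries a locationData property whose line and column fields are zero-based.

    # Hypothetical token for an identifier "foo" starting at offset 4 of a chunk:
    token = ['IDENTIFIER', 'foo', 0]   # [tag, value, first_line], as before
    token.locationData =
      first_line: 0
      first_column: 4
      last_line: 0
      last_column: 7                   # computed from offset + value.length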
lib/helpers.js

@@ -107,4 +107,18 @@
     };
   };
+
+  exports.locationDataToString = function(obj) {
+    var locationData;
+    if ("locationData" in obj) {
+      locationData = obj.locationData;
+    } else if ("first_line" in obj) {
+      locationData = obj;
+    }
+    if (locationData) {
+      return ("" + (locationData.first_line + 1) + ":" + (locationData.first_column + 1) + "-") + ("" + (locationData.last_line + 1) + ":" + (locationData.last_column + 1));
+    } else {
+      return "No location data";
+    }
+  };
 
 }).call(this);

lib/lexer.js
@@ -1,18 +1,18 @@
 // Generated by CoffeeScript 1.4.0
 (function() {
-  var BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HEREDOC, HEREDOC_ILLEGAL, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDEXABLE, INVERSES, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NOT_SPACED_REGEX, NUMBER, OPERATOR, REGEX, RELATION, RESERVED, Rewriter, SHIFT, SIMPLESTR, STRICT_PROSCRIBED, TRAILING_SPACES, UNARY, WHITESPACE, compact, count, key, last, starts, _ref, _ref1,
+  var BOOL, CALLABLE, CODE, COFFEE_ALIASES, COFFEE_ALIAS_MAP, COFFEE_KEYWORDS, COMMENT, COMPARE, COMPOUND_ASSIGN, HEREDOC, HEREDOC_ILLEGAL, HEREDOC_INDENT, HEREGEX, HEREGEX_OMIT, IDENTIFIER, INDEXABLE, INVERSES, JSTOKEN, JS_FORBIDDEN, JS_KEYWORDS, LINE_BREAK, LINE_CONTINUER, LOGIC, Lexer, MATH, MULTILINER, MULTI_DENT, NOT_REGEX, NOT_SPACED_REGEX, NUMBER, OPERATOR, REGEX, RELATION, RESERVED, Rewriter, SHIFT, SIMPLESTR, STRICT_PROSCRIBED, TRAILING_SPACES, UNARY, WHITESPACE, compact, count, key, last, locationDataToString, starts, _ref, _ref1,
     __indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
 
   _ref = require('./rewriter'), Rewriter = _ref.Rewriter, INVERSES = _ref.INVERSES;
 
-  _ref1 = require('./helpers'), count = _ref1.count, starts = _ref1.starts, compact = _ref1.compact, last = _ref1.last;
+  _ref1 = require('./helpers'), count = _ref1.count, starts = _ref1.starts, compact = _ref1.compact, last = _ref1.last, locationDataToString = _ref1.locationDataToString;
 
   exports.Lexer = Lexer = (function() {
 
     function Lexer() {}
 
     Lexer.prototype.tokenize = function(code, opts) {
-      var i, tag;
+      var consumed, i, tag, _ref2;
       if (opts == null) {
         opts = {};
       }
@@ -22,6 +22,8 @@
       code = code.replace(/\r/g, '').replace(TRAILING_SPACES, '');
       this.code = code;
       this.line = opts.line || 0;
+      this.chunkLine = opts.line || 0;
+      this.chunkColumn = opts.column || 0;
       this.indent = 0;
       this.indebt = 0;
       this.outdebt = 0;
@@ -30,7 +32,9 @@
       this.tokens = [];
       i = 0;
       while (this.chunk = code.slice(i)) {
-        i += this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
+        consumed = this.identifierToken() || this.commentToken() || this.whitespaceToken() || this.lineToken() || this.heredocToken() || this.stringToken() || this.numberToken() || this.regexToken() || this.jsToken() || this.literalToken();
+        _ref2 = this.getLineAndColumnFromChunk(consumed), this.chunkLine = _ref2[0], this.chunkColumn = _ref2[1];
+        i += consumed;
       }
       this.closeIndentation();
       if (tag = this.ends.pop()) {
@@ -43,11 +47,13 @@
     };
 
     Lexer.prototype.identifierToken = function() {
-      var colon, forcedIdentifier, id, input, match, prev, tag, _ref2, _ref3;
+      var colon, colonOffset, forcedIdentifier, id, idLength, input, match, poppedToken, prev, tag, tagToken, _ref2, _ref3, _ref4;
       if (!(match = IDENTIFIER.exec(this.chunk))) {
         return 0;
       }
       input = match[0], id = match[1], colon = match[2];
+      idLength = id.length;
+      poppedToken = void 0;
       if (id === 'own' && this.tag() === 'FOR') {
         this.token('OWN', id);
         return id.length;
@@ -71,7 +77,7 @@
       } else {
         tag = 'RELATION';
         if (this.value() === '!') {
-          this.tokens.pop();
+          poppedToken = this.tokens.pop();
           id = '!' + id;
         }
       }
@@ -111,9 +117,13 @@
         }
       })();
       }
-      this.token(tag, id);
+      tagToken = this.token(tag, id, 0, idLength);
+      if (poppedToken) {
+        _ref4 = [poppedToken.locationData.first_line, poppedToken.locationData.first_column], tagToken.locationData.first_line = _ref4[0], tagToken.locationData.first_column = _ref4[1];
+      }
       if (colon) {
-        this.token(':', ':');
+        colonOffset = input.lastIndexOf(':');
+        this.token(':', ':', colonOffset, colon.length);
       }
       return input.length;
     };
@@ -140,7 +150,7 @@
       if (binaryLiteral = /^0b([01]+)/.exec(number)) {
         number = '0x' + (parseInt(binaryLiteral[1], 2)).toString(16);
       }
-      this.token('NUMBER', number);
+      this.token('NUMBER', number, 0, lexedLength);
       return lexedLength;
     };
 
@@ -151,16 +161,19 @@
           if (!(match = SIMPLESTR.exec(this.chunk))) {
             return 0;
           }
-          this.token('STRING', (string = match[0]).replace(MULTILINER, '\\\n'));
+          string = match[0];
+          this.token('STRING', string.replace(MULTILINER, '\\\n'), 0, string.length);
           break;
         case '"':
           if (!(string = this.balancedString(this.chunk, '"'))) {
            return 0;
           }
           if (0 < string.indexOf('#{', 1)) {
-            this.interpolateString(string.slice(1, -1));
+            this.interpolateString(string.slice(1, -1), {
+              offsetInChunk: 1
+            });
           } else {
-            this.token('STRING', this.escapeLines(string));
+            this.token('STRING', this.escapeLines(string, 0, string.length));
           }
           break;
         default:
@@ -186,10 +199,11 @@
       });
       if (quote === '"' && 0 <= doc.indexOf('#{')) {
         this.interpolateString(doc, {
-          heredoc: true
+          heredoc: true,
+          offsetInChunk: 3
         });
       } else {
-        this.token('STRING', this.makeString(doc, quote, true));
+        this.token('STRING', this.makeString(doc, quote, true), 0, heredoc.length);
       }
       this.line += count(heredoc, '\n');
       return heredoc.length;
@@ -205,7 +219,7 @@
         this.token('HERECOMMENT', this.sanitizeHeredoc(here, {
           herecomment: true,
           indent: Array(this.indent + 1).join(' ')
-        }));
+        }), 0, comment.length);
       }
       this.line += count(comment, '\n');
       return comment.length;
@@ -216,7 +230,7 @@
       if (!(this.chunk.charAt(0) === '`' && (match = JSTOKEN.exec(this.chunk)))) {
         return 0;
       }
-      this.token('JS', (script = match[0]).slice(1, -1));
+      this.token('JS', (script = match[0]).slice(1, -1), 0, script.length);
       this.line += count(script, '\n');
       return script.length;
     };
@@ -245,49 +259,61 @@
       if (regex === '//') {
         regex = '/(?:)/';
       }
-      this.token('REGEX', "" + regex + flags);
+      this.token('REGEX', "" + regex + flags, 0, match.length);
       return match.length;
     };
 
     Lexer.prototype.heregexToken = function(match) {
-      var body, flags, heregex, re, tag, tokens, value, _i, _len, _ref2, _ref3, _ref4, _ref5;
+      var body, flags, flagsOffset, heregex, plusToken, prev, re, tag, token, tokens, value, _i, _len, _ref2, _ref3, _ref4;
       heregex = match[0], body = match[1], flags = match[2];
       if (0 > body.indexOf('#{')) {
         re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/');
         if (re.match(/^\*/)) {
           this.error('regular expressions cannot begin with `*`');
         }
-        this.token('REGEX', "/" + (re || '(?:)') + "/" + flags);
+        this.token('REGEX', "/" + (re || '(?:)') + "/" + flags, 0, heregex.length);
         return heregex.length;
       }
-      this.token('IDENTIFIER', 'RegExp');
-      this.tokens.push(['CALL_START', '(']);
+      this.token('IDENTIFIER', 'RegExp', 0, 0);
+      this.token('CALL_START', '(', 0, 0);
       tokens = [];
       _ref2 = this.interpolateString(body, {
-        regex: true
+        regex: true,
+        offsetInChunk: 3
       });
       for (_i = 0, _len = _ref2.length; _i < _len; _i++) {
-        _ref3 = _ref2[_i], tag = _ref3[0], value = _ref3[1];
+        token = _ref2[_i];
+        tag = token[0], value = token[1];
         if (tag === 'TOKENS') {
           tokens.push.apply(tokens, value);
-        } else {
+        } else if (tag === 'NEOSTRING') {
          if (!(value = value.replace(HEREGEX_OMIT, ''))) {
             continue;
           }
           value = value.replace(/\\/g, '\\\\');
-          tokens.push(['STRING', this.makeString(value, '"', true)]);
+          token[0] = 'STRING';
+          token[1] = this.makeString(value, '"', true);
+          tokens.push(token);
+        } else {
+          this.error("Unexpected " + tag);
         }
-        tokens.push(['+', '+']);
+        prev = last(this.tokens);
+        plusToken = ['+', '+'];
+        plusToken.locationData = prev.locationData;
+        tokens.push(plusToken);
       }
       tokens.pop();
-      if (((_ref4 = tokens[0]) != null ? _ref4[0] : void 0) !== 'STRING') {
-        this.tokens.push(['STRING', '""'], ['+', '+']);
+      if (((_ref3 = tokens[0]) != null ? _ref3[0] : void 0) !== 'STRING') {
+        this.token('STRING', '""', 0, 0);
+        this.token('+', '+', 0, 0);
       }
-      (_ref5 = this.tokens).push.apply(_ref5, tokens);
+      (_ref4 = this.tokens).push.apply(_ref4, tokens);
       if (flags) {
-        this.tokens.push([',', ','], ['STRING', '"' + flags + '"']);
+        flagsOffset = heregex.lastIndexOf(flags);
+        this.token(',', ',', flagsOffset, 0);
+        this.token('STRING', '"' + flags + '"', flagsOffset, flags.length);
       }
-      this.token(')', ')');
+      this.token(')', ')', heregex.length - 1, 0);
       return heregex.length;
     };
 
@@ -304,7 +330,7 @@
       if (noNewlines) {
         this.suppressNewlines();
       } else {
-        this.newlineToken();
+        this.newlineToken(0);
       }
       this.line += count(indent, '\n');
       return indent.length;
@@ -317,19 +343,19 @@
         return indent.length;
       }
       diff = size - this.indent + this.outdebt;
-      this.token('INDENT', diff);
+      this.token('INDENT', diff, 0, indent.length);
       this.indents.push(diff);
       this.ends.push('OUTDENT');
       this.outdebt = this.indebt = 0;
       } else {
         this.indebt = 0;
-        this.outdentToken(this.indent - size, noNewlines);
+        this.outdentToken(this.indent - size, noNewlines, indent.length);
       }
       this.indent = size;
       return indent.length;
     };
 
-    Lexer.prototype.outdentToken = function(moveOut, noNewlines) {
+    Lexer.prototype.outdentToken = function(moveOut, noNewlines, outdentLength) {
       var dent, len;
       while (moveOut > 0) {
         len = this.indents.length - 1;
@@ -346,7 +372,7 @@
         moveOut -= dent;
         this.outdebt = 0;
         this.pair('OUTDENT');
-        this.token('OUTDENT', dent);
+        this.token('OUTDENT', dent, 0, outdentLength);
        }
      }
      if (dent) {
@@ -356,7 +382,7 @@
         this.tokens.pop();
       }
       if (!(this.tag() === 'TERMINATOR' || noNewlines)) {
-        this.token('TERMINATOR', '\n');
+        this.token('TERMINATOR', '\n', outdentLength, 0);
       }
       return this;
     };
@@ -377,12 +403,12 @@
       }
     };
 
-    Lexer.prototype.newlineToken = function() {
+    Lexer.prototype.newlineToken = function(offset) {
       while (this.value() === ';') {
         this.tokens.pop();
       }
       if (this.tag() !== 'TERMINATOR') {
-        this.token('TERMINATOR', '\n');
+        this.token('TERMINATOR', '\n', offset, 0);
       }
       return this;
     };
@@ -556,11 +582,18 @@
     };
 
     Lexer.prototype.interpolateString = function(str, options) {
-      var expr, heredoc, i, inner, interpolated, len, letter, nested, pi, regex, tag, tokens, value, _i, _len, _ref2, _ref3, _ref4;
+      var column, expr, heredoc, i, inner, interpolated, len, letter, lexedLength, line, nested, offsetInChunk, originalOffsetInChunk, pi, popped, regex, tag, token, tokens, value, _i, _len, _ref2, _ref3, _ref4;
       if (options == null) {
         options = {};
       }
-      heredoc = options.heredoc, regex = options.regex;
+      heredoc = options.heredoc, regex = options.regex, offsetInChunk = options.offsetInChunk;
+      originalOffsetInChunk = offsetInChunk;
+      lexedLength = str.length;
+      offsetInChunk = offsetInChunk || 0;
+      if (heredoc && str.length > 0 && str[0] === '\n') {
+        str = str.slice(1);
+        offsetInChunk++;
+      }
       tokens = [];
       pi = 0;
       i = -1;
@@ -573,22 +606,24 @@
           continue;
         }
         if (pi < i) {
-          tokens.push(['NEOSTRING', str.slice(pi, i)]);
+          tokens.push(this.makeToken('NEOSTRING', str.slice(pi, i), offsetInChunk + pi));
         }
         inner = expr.slice(1, -1);
         if (inner.length) {
+          _ref2 = this.getLineAndColumnFromChunk(offsetInChunk + i + 1), line = _ref2[0], column = _ref2[1];
           nested = new Lexer().tokenize(inner, {
-            line: this.line,
+            line: line,
+            column: column,
             rewrite: false
           });
-          nested.pop();
-          if (((_ref2 = nested[0]) != null ? _ref2[0] : void 0) === 'TERMINATOR') {
-            nested.shift();
+          popped = nested.pop();
+          if (((_ref3 = nested[0]) != null ? _ref3[0] : void 0) === 'TERMINATOR') {
+            popped = nested.shift();
           }
           if (len = nested.length) {
             if (len > 1) {
-              nested.unshift(['(', '(', this.line]);
-              nested.push([')', ')', this.line]);
+              nested.unshift(this.makeToken('(', '(', offsetInChunk + i + 1, 0));
+              nested.push(this.makeToken(')', ')', offsetInChunk + i + 1 + inner.length, 0));
             }
             tokens.push(['TOKENS', nested]);
           }
@@ -597,33 +632,38 @@
         pi = i + 1;
       }
       if ((i > pi && pi < str.length)) {
-        tokens.push(['NEOSTRING', str.slice(pi)]);
+        tokens.push(this.makeToken('NEOSTRING', str.slice(pi), offsetInChunk + pi));
       }
       if (regex) {
         return tokens;
       }
       if (!tokens.length) {
-        return this.token('STRING', '""');
+        return this.token('STRING', '""', originalOffsetInChunk, lexedLength);
       }
       if (tokens[0][0] !== 'NEOSTRING') {
-        tokens.unshift(['', '']);
+        tokens.unshift(this.makeToken('NEOSTRING', '', originalOffsetInChunk));
       }
       if (interpolated = tokens.length > 1) {
-        this.token('(', '(');
+        this.token('(', '(', originalOffsetInChunk, 0);
       }
       for (i = _i = 0, _len = tokens.length; _i < _len; i = ++_i) {
-        _ref3 = tokens[i], tag = _ref3[0], value = _ref3[1];
+        token = tokens[i];
+        tag = token[0], value = token[1];
         if (i) {
           this.token('+', '+');
         }
         if (tag === 'TOKENS') {
           (_ref4 = this.tokens).push.apply(_ref4, value);
+        } else if (tag === 'NEOSTRING') {
+          token[0] = 'STRING';
+          token[1] = this.makeString(value, '"', heredoc);
+          this.tokens.push(token);
         } else {
-          this.token('STRING', this.makeString(value, '"', heredoc));
+          this.error("Unexpected " + tag);
         }
       }
       if (interpolated) {
-        this.token(')', ')');
+        this.token(')', ')', originalOffsetInChunk + lexedLength, 0);
       }
       return tokens;
     };
@@ -641,8 +681,46 @@
       return this.ends.pop();
     };
 
-    Lexer.prototype.token = function(tag, value) {
-      return this.tokens.push([tag, value, this.line]);
+    Lexer.prototype.getLineAndColumnFromChunk = function(offset) {
+      var column, lineCount, lines, string;
+      if (offset === 0) {
+        return [this.chunkLine, this.chunkColumn];
+      }
+      if (offset >= this.chunk.length) {
+        string = this.chunk;
+      } else {
+        string = this.chunk.slice(0, +(offset - 1) + 1 || 9e9);
+      }
+      lineCount = count(string, '\n');
+      column = this.chunkColumn;
+      if (lineCount > 0) {
+        lines = string.split('\n');
+        column = (last(lines)).length;
+      } else {
+        column += string.length;
+      }
+      return [this.chunkLine + lineCount, column];
+    };
+
+    Lexer.prototype.makeToken = function(tag, value, offsetInChunk, length) {
+      var locationData, token, _ref2, _ref3;
+      offsetInChunk = offsetInChunk || 0;
+      if (length === void 0) {
+        length = value.length;
+      }
+      locationData = {};
+      _ref2 = this.getLineAndColumnFromChunk(offsetInChunk), locationData.first_line = _ref2[0], locationData.first_column = _ref2[1];
+      _ref3 = this.getLineAndColumnFromChunk(offsetInChunk + length), locationData.last_line = _ref3[0], locationData.last_column = _ref3[1];
+      token = [tag, value, locationData.first_line];
+      token.locationData = locationData;
+      return token;
+    };
+
+    Lexer.prototype.token = function(tag, value, offsetInChunk, length) {
+      var token;
+      token = this.makeToken(tag, value, offsetInChunk, length);
+      this.tokens.push(token);
+      return token;
     };
 
     Lexer.prototype.tag = function(index, tag) {

src/helpers.coffee
@@ -79,3 +79,17 @@ exports.addLocationDataFn = (first, last) ->
     obj.updateLocationDataIfMissing buildLocationData(first, last)
 
   return obj
+
+# Convert jison location data to a string.
+# `obj` can be a token, or a locationData.
+exports.locationDataToString = (obj) ->
+  if "locationData" of obj then locationData = obj.locationData
+  else if "first_line" of obj then locationData = obj
+
+  if locationData
+    "#{locationData.first_line + 1}:#{locationData.first_column + 1}-" +
+      "#{locationData.last_line + 1}:#{locationData.last_column + 1}"
+  else
+    "No location data"
+
+
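A usage sketch for the new helper (illustrative, not part of the diff); the zero-based fields render one-based:

    {locationDataToString} = require './helpers'
    locationDataToString first_line: 0, first_column: 4, last_line: 0, last_column: 7
    # => "1:5-1:8"
    locationDataToString {}
    # => "No location data"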

229 src/lexer.coffee
@@ -10,7 +10,7 @@
 {Rewriter, INVERSES} = require './rewriter'
 
 # Import the helpers we need.
-{count, starts, compact, last} = require './helpers'
+{count, starts, compact, last, locationDataToString} = require './helpers'
 
 # The Lexer Class
 # ---------------
@@ -35,8 +35,12 @@ exports.Lexer = class Lexer
     code = "\n#{code}" if WHITESPACE.test code
     code = code.replace(/\r/g, '').replace TRAILING_SPACES, ''
 
-    @code = code              # The remainder of the source code.
-    @line = opts.line or 0    # The current line.
+    @code = code              # The source code.
+    @line = opts.line or 0    # TODO: Remove
+    @chunkLine =
+      opts.line or 0          # The start line for the current chunk.
+    @chunkColumn =
+      opts.column or 0        # The start column of the current chunk.
     @indent = 0               # The current indentation level.
     @indebt = 0               # The over-indentation at the current level.
     @outdebt = 0              # The under-outdentation at the current level.
@@ -49,7 +53,8 @@ exports.Lexer = class Lexer
     # `@literalToken` is the fallback catch-all.
     i = 0
     while @chunk = code[i..]
-      i += @identifierToken() or
+      consumed = \
+           @identifierToken() or
            @commentToken() or
            @whitespaceToken() or
            @lineToken() or
@@ -60,9 +65,15 @@ exports.Lexer = class Lexer
            @jsToken() or
            @literalToken()
+
+      # Update position
+      [@chunkLine, @chunkColumn] = @getLineAndColumnFromChunk consumed
+
+      i += consumed
 
     @closeIndentation()
     @error "missing #{tag}" if tag = @ends.pop()
     return @tokens if opts.rewrite is off
+    # TODO: deal with Rewriter
     (new Rewriter).rewrite @tokens
 
   # Tokenizers
@@ -78,6 +89,9 @@ exports.Lexer = class Lexer
     return 0 unless match = IDENTIFIER.exec @chunk
     [input, id, colon] = match
+
+    idLength = id.length
+    poppedToken = undefined
 
     if id is 'own' and @tag() is 'FOR'
       @token 'OWN', id
       return id.length
@@ -103,7 +117,7 @@ exports.Lexer = class Lexer
       else
         tag = 'RELATION'
         if @value() is '!'
-          @tokens.pop()
+          poppedToken = @tokens.pop()
          id = '!' + id
 
     if id in JS_FORBIDDEN
@@ -124,8 +138,14 @@ exports.Lexer = class Lexer
         when 'break', 'continue' then 'STATEMENT'
         else tag
 
-    @token tag, id
-    @token ':', ':' if colon
+    tagToken = @token tag, id, 0, idLength
+    if poppedToken
+      [tagToken.locationData.first_line, tagToken.locationData.first_column] =
+        [poppedToken.locationData.first_line, poppedToken.locationData.first_column]
+    if colon
+      colonOffset = input.lastIndexOf ':'
+      @token ':', ':', colonOffset, colon.length
 
     input.length
 
   # Matches numbers, including decimals, hex, and exponential notation.
@@ -146,7 +166,7 @@ exports.Lexer = class Lexer
       number = '0x' + (parseInt octalLiteral[1], 8).toString 16
     if binaryLiteral = /^0b([01]+)/.exec number
       number = '0x' + (parseInt binaryLiteral[1], 2).toString 16
-    @token 'NUMBER', number
+    @token 'NUMBER', number, 0, lexedLength
     lexedLength
 
   # Matches strings, including multi-line strings. Ensures that quotation marks
@@ -155,13 +175,14 @@ exports.Lexer = class Lexer
     switch @chunk.charAt 0
       when "'"
         return 0 unless match = SIMPLESTR.exec @chunk
-        @token 'STRING', (string = match[0]).replace MULTILINER, '\\\n'
+        string = match[0]
+        @token 'STRING', string.replace(MULTILINER, '\\\n'), 0, string.length
       when '"'
         return 0 unless string = @balancedString @chunk, '"'
         if 0 < string.indexOf '#{', 1
-          @interpolateString string[1...-1]
+          @interpolateString string[1...-1], offsetInChunk: 1
         else
-          @token 'STRING', @escapeLines string
+          @token 'STRING', @escapeLines string, 0, string.length
       else
         return 0
     if octalEsc = /^(?:\\.|[^\\])*\\(?:0[0-7]|[1-7])/.test string
@@ -177,9 +198,9 @@ exports.Lexer = class Lexer
     quote = heredoc.charAt 0
     doc = @sanitizeHeredoc match[2], quote: quote, indent: null
     if quote is '"' and 0 <= doc.indexOf '#{'
-      @interpolateString doc, heredoc: yes
+      @interpolateString doc, heredoc: yes, offsetInChunk: 3
     else
-      @token 'STRING', @makeString doc, quote, yes
+      @token 'STRING', @makeString(doc, quote, yes), 0, heredoc.length
     @line += count heredoc, '\n'
     heredoc.length
 
@@ -188,15 +209,17 @@ exports.Lexer = class Lexer
     return 0 unless match = @chunk.match COMMENT
     [comment, here] = match
     if here
-      @token 'HERECOMMENT', @sanitizeHeredoc here,
-        herecomment: true, indent: Array(@indent + 1).join(' ')
+      @token 'HERECOMMENT',
+        (@sanitizeHeredoc here,
+          herecomment: true, indent: Array(@indent + 1).join(' ')),
+        0, comment.length
     @line += count comment, '\n'
     comment.length
 
   # Matches JavaScript interpolated directly into the source via backticks.
   jsToken: ->
     return 0 unless @chunk.charAt(0) is '`' and match = JSTOKEN.exec @chunk
-    @token 'JS', (script = match[0])[1...-1]
+    @token 'JS', (script = match[0])[1...-1], 0, script.length
     @line += count script, '\n'
     script.length
 
@@ -216,7 +239,7 @@ exports.Lexer = class Lexer
     [match, regex, flags] = match
     if regex[..1] is '/*' then @error 'regular expressions cannot begin with `*`'
     if regex is '//' then regex = '/(?:)/'
-    @token 'REGEX', "#{regex}#{flags}"
+    @token 'REGEX', "#{regex}#{flags}", 0, match.length
     match.length
 
   # Matches multiline extended regular expressions.
@@ -225,24 +248,45 @@ exports.Lexer = class Lexer
     if 0 > body.indexOf '#{'
       re = body.replace(HEREGEX_OMIT, '').replace(/\//g, '\\/')
       if re.match /^\*/ then @error 'regular expressions cannot begin with `*`'
-      @token 'REGEX', "/#{ re or '(?:)' }/#{flags}"
+      @token 'REGEX', "/#{ re or '(?:)' }/#{flags}", 0, heregex.length
       return heregex.length
-    @token 'IDENTIFIER', 'RegExp'
-    @tokens.push ['CALL_START', '(']
+    @token 'IDENTIFIER', 'RegExp', 0, 0
+    @token 'CALL_START', '(', 0, 0
     tokens = []
-    for [tag, value] in @interpolateString(body, regex: yes)
+    for token in @interpolateString(body, regex: yes, offsetInChunk: 3)
+      [tag, value] = token
       if tag is 'TOKENS'
         tokens.push value...
-      else
+      else if tag is 'NEOSTRING'
         continue unless value = value.replace HEREGEX_OMIT, ''
+        # Convert NEOSTRING into STRING
         value = value.replace /\\/g, '\\\\'
-        tokens.push ['STRING', @makeString(value, '"', yes)]
-      tokens.push ['+', '+']
+        token[0] = 'STRING'
+        token[1] = @makeString(value, '"', yes)
+        tokens.push token
+      else
+        @error "Unexpected #{tag}"
+
+      prev = last @tokens
+      plusToken = ['+', '+']
+      plusToken.locationData = prev.locationData
+      tokens.push plusToken
+
+    # Remove the extra "+"
     tokens.pop()
-    @tokens.push ['STRING', '""'], ['+', '+'] unless tokens[0]?[0] is 'STRING'
+
+    unless tokens[0]?[0] is 'STRING'
+      @token 'STRING', '""', 0, 0
+      @token '+', '+', 0, 0
     @tokens.push tokens...
-    @tokens.push [',', ','], ['STRING', '"' + flags + '"'] if flags
-    @token ')', ')'
+
+    if flags
+      # Find the flags in the heregex
+      flagsOffset = heregex.lastIndexOf flags
+      @token ',', ',', flagsOffset, 0
+      @token 'STRING', '"' + flags + '"', flagsOffset, flags.length
+
+    @token ')', ')', heregex.length-1, 0
     heregex.length
 
   # Matches newlines, indents, and outdents, and determines which is which.
@@ -262,7 +306,7 @@ exports.Lexer = class Lexer
     size = indent.length - 1 - indent.lastIndexOf '\n'
     noNewlines = @unfinished()
     if size - @indebt is @indent
-      if noNewlines then @suppressNewlines() else @newlineToken()
+      if noNewlines then @suppressNewlines() else @newlineToken 0
       # Advance @line line after the newlineToken, so the TERMINATOR shows up
       # on the right line.
       @line += count indent, '\n'
@@ -275,19 +319,19 @@ exports.Lexer = class Lexer
       @suppressNewlines()
       return indent.length
     diff = size - @indent + @outdebt
-      @token 'INDENT', diff
+      @token 'INDENT', diff, 0, indent.length
       @indents.push diff
       @ends.push 'OUTDENT'
       @outdebt = @indebt = 0
     else
       @indebt = 0
-      @outdentToken @indent - size, noNewlines
+      @outdentToken @indent - size, noNewlines, indent.length
     @indent = size
     indent.length
 
   # Record an outdent token or multiple tokens, if we happen to be moving back
   # inwards past several recorded indents.
-  outdentToken: (moveOut, noNewlines) ->
+  outdentToken: (moveOut, noNewlines, outdentLength) ->
     while moveOut > 0
       len = @indents.length - 1
       if @indents[len] is undefined
@@ -303,10 +347,11 @@ exports.Lexer = class Lexer
       moveOut -= dent
       @outdebt = 0
       @pair 'OUTDENT'
-      @token 'OUTDENT', dent
+      @token 'OUTDENT', dent, 0, outdentLength
     @outdebt -= moveOut if dent
     @tokens.pop() while @value() is ';'
-    @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR' or noNewlines
+
+    @token 'TERMINATOR', '\n', outdentLength, 0 unless @tag() is 'TERMINATOR' or noNewlines
     this
 
   # Matches and consumes non-meaningful whitespace. Tag the previous token
@@ -319,9 +364,9 @@ exports.Lexer = class Lexer
     if match then match[0].length else 0
 
   # Generate a newline token. Consecutive newlines get merged together.
-  newlineToken: ->
+  newlineToken: (offset) ->
     @tokens.pop() while @value() is ';'
-    @token 'TERMINATOR', '\n' unless @tag() is 'TERMINATOR'
+    @token 'TERMINATOR', '\n', offset, 0 unless @tag() is 'TERMINATOR'
     this
 
   # Use a `\` at a line-ending to suppress the newline.
@@ -458,7 +503,22 @@ exports.Lexer = class Lexer
   # new Lexer, tokenize the interpolated contents, and merge them into the
   # token stream.
   interpolateString: (str, options = {}) ->
-    {heredoc, regex} = options
+    {heredoc, regex, offsetInChunk} = options
+
+    # TODO: we pass in offsetInChunk, but we've already discarded the " or the
+    # """, or the /// that got us here. Those characters are not going to end
+    # up being part of any tokens.
+
+    originalOffsetInChunk = offsetInChunk
+    lexedLength = str.length
+
+    # Clip leading \n from heredoc
+    offsetInChunk = offsetInChunk || 0
+    if heredoc and str.length > 0 and str[0] == '\n'
+      str = str[1...]
+      offsetInChunk++
+
+    # Parse the string.
     tokens = []
     pi = 0
     i = -1
@@ -469,31 +529,51 @@ exports.Lexer = class Lexer
       unless letter is '#' and str.charAt(i+1) is '{' and
              (expr = @balancedString str[i + 1..], '}')
         continue
-      tokens.push ['NEOSTRING', str[pi...i]] if pi < i
+      # NEOSTRING is a fake token. This will be converted to a string below.
+      tokens.push @makeToken('NEOSTRING', str[pi...i], offsetInChunk + pi) if pi < i
       inner = expr[1...-1]
       if inner.length
-        nested = new Lexer().tokenize inner, line: @line, rewrite: off
-        nested.pop()
-        nested.shift() if nested[0]?[0] is 'TERMINATOR'
+        [line, column] = @getLineAndColumnFromChunk(offsetInChunk + i + 1)
+        nested = new Lexer().tokenize inner, line: line, column: column, rewrite: off
+        popped = nested.pop()
+        popped = nested.shift() if nested[0]?[0] is 'TERMINATOR'
         if len = nested.length
           if len > 1
-            nested.unshift ['(', '(', @line]
-            nested.push [')', ')', @line]
+            nested.unshift @makeToken '(', '(', offsetInChunk + i + 1, 0
+            nested.push @makeToken ')', ')', offsetInChunk + i + 1 + inner.length, 0
+          # Push a fake 'TOKENS' token, which will get turned into real tokens below.
          tokens.push ['TOKENS', nested]
       i += expr.length
       pi = i + 1
-    tokens.push ['NEOSTRING', str[pi..]] if i > pi < str.length
+    tokens.push @makeToken('NEOSTRING', str[pi..], offsetInChunk + pi) if i > pi < str.length
+
+    # If regex, then return now and let the regex code deal with all these fake tokens
     return tokens if regex
-    return @token 'STRING', '""' unless tokens.length
-    tokens.unshift ['', ''] unless tokens[0][0] is 'NEOSTRING'
-    @token '(', '(' if interpolated = tokens.length > 1
-    for [tag, value], i in tokens
+
+    # If we didn't find any tokens, then just return an empty string.
+    return @token 'STRING', '""', originalOffsetInChunk, lexedLength unless tokens.length
+
+    # If the first token is not a string, add a fake empty string to the beginning.
+    tokens.unshift @makeToken('NEOSTRING', '', originalOffsetInChunk) unless tokens[0][0] is 'NEOSTRING'
+
+    @token '(', '(', originalOffsetInChunk, 0 if interpolated = tokens.length > 1
+    # Push all the tokens
+    for token, i in tokens
+      [tag, value] = token
+      # TODO: this needs location data.
       @token '+', '+' if i
       if tag is 'TOKENS'
+        # Push all the tokens in the fake 'TOKENS' token. These already have
+        # sane location data.
         @tokens.push value...
+      else if tag is 'NEOSTRING'
+        # Convert NEOSTRING into STRING
+        token[0] = 'STRING'
+        token[1] = @makeString value, '"', heredoc
+        @tokens.push token
       else
-        @token 'STRING', @makeString value, '"', heredoc
-    @token ')', ')' if interpolated
+        @error "Unexpected #{tag}"
+    @token ')', ')', originalOffsetInChunk + lexedLength, 0 if interpolated
     tokens
 
   # Pairs up a closing token, ensuring that all listed pairs of tokens are
@@ -514,9 +594,54 @@ exports.Lexer = class Lexer
   # Helpers
   # -------
 
-  # Add a token to the results, taking note of the line number.
-  token: (tag, value) ->
-    @tokens.push [tag, value, @line]
+  # Returns the line and column number from an offset into the current chunk.
+  getLineAndColumnFromChunk: (offset) ->
+    if offset is 0
+      return [@chunkLine, @chunkColumn]
+
+    if offset >= @chunk.length
+      string = @chunk
+    else
+      string = @chunk[..offset-1]
+
+    lineCount = count string, '\n'
+
+    column = @chunkColumn
+    if lineCount > 0
+      lines = string.split '\n'
+      column = (last lines).length
+    else
+      column += string.length
+
+    return [@chunkLine + lineCount, column]
+
+  # Same as "token", exception this just returns the token without adding it
+  # to the results.
+  makeToken: (tag, value, offsetInChunk, length) ->
+    offsetInChunk = offsetInChunk || 0
+    if length is undefined then length = value.length
+
+    locationData = {}
+    [locationData.first_line, locationData.first_column] =
+      @getLineAndColumnFromChunk offsetInChunk
+    [locationData.last_line, locationData.last_column] =
+      @getLineAndColumnFromChunk offsetInChunk + length
+
+    token = [tag, value, locationData.first_line]
+    token.locationData = locationData
+
+    return token
+
+  # Add a token to the results.
+  # `offset` is the offset into the current @chunk where the token starts.
+  # `length` is the length of the token in the @chunk, after the offset. If
+  # not specified, the length of `value` will be used.
+  #
+  # Returns the new token.
+  token: (tag, value, offsetInChunk, length) ->
+    token = @makeToken tag, value, offsetInChunk, length
+    @tokens.push token
+    return token
 
   # Peek at a tag in the current token stream.
   tag: (index, tag) ->
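A worked example of the arithmetic in getLineAndColumnFromChunk (illustrative values, not part of the diff): suppose @chunkLine is 5, @chunkColumn is 10, and @chunk is "a b\ncd".

    @getLineAndColumnFromChunk 2   # consumed "a " has no newline: column advances, so [5, 12]
    @getLineAndColumnFromChunk 5   # consumed "a b\nc" has one newline: column restarts, so [6, 1]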