jashkenas--coffeescript/lib/coffee_script/lexer.rb

module CoffeeScript

  # The lexer reads a stream of CoffeeScript and divvies it up into tagged
  # tokens. A minor bit of the ambiguity in the grammar has been avoided by
  # pushing some extra smarts into the Lexer.
  #
  # Each token is a two-element array of the form [tag, Value], where the tag
  # is either a Symbol (:IDENTIFIER, :NUMBER, :STRING, an upcased keyword such
  # as :IF, ...) or, for literal characters and operators, the matched text
  # itself. The Value wraps the matched text together with a line number.
  class Lexer

    # The list of keywords passed verbatim to the parser.
    KEYWORDS   = ["if", "else", "then", "unless",
                  "true", "false", "null",
                  "and", "or", "is", "aint", "not",
                  "new", "return",
                  "try", "catch", "finally", "throw",
                  "break", "continue",
                  "for", "in", "while",
                  "switch", "case",
                  "super",
                  "delete"].freeze

    # Token matching regexes. Every pattern is anchored at the start of the
    # remaining input and exposes the full match as capture group 1.
    IDENTIFIER = /\A([a-zA-Z$_]\w*)/
    NUMBER     = /\A\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?))\b/i
    # A quoted string is a run of ordinary characters and backslash escapes.
    # Consuming each "\x" pair as a unit lets a string end in an escaped
    # backslash ("a\\") while still treating \" and \' as part of the body.
    STRING     = /\A("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')/m
    JS         = /\A(`(.*?)`)/
    OPERATOR   = /\A([+\*&|\/\-%=<>:]+)/
    WHITESPACE = /\A([ \t\r]+)/
    NEWLINE    = /\A(\n+)/
    COMMENT    = /\A((#[^\n]*\s*)+)/m
    CODE       = /\A(=>)/
    # A regex literal needs a non-empty, single-line body; escapes are paired
    # the same way as in STRING so that /a\\/ terminates at its second slash.
    REGEX      = /\A(\/(?:[^\/\\\n]|\\.)+\/[imgy]{0,4})/

    # Token cleaning regexes.
    JS_CLEANER = /(\A`|`\Z)/
    MULTILINER = /\n/
    COMMENT_CLEANER = /(^\s*#|\n\s*$)/

    # Tokens that always constitute the start of an expression.
    EXP_START  = ['{', '(', '['].freeze

    # Tokens that always constitute the end of an expression.
    EXP_END    = ['}', ')', ']'].freeze

    # Scan the source from left to right, extracting one token (or one run of
    # skipped whitespace) per iteration. Slow and steady.
    #
    # code - the CoffeeScript source as a String.
    #
    # Returns the Array of [tag, Value] tokens.
    def tokenize(code)
      @code = code.chomp  # Drop a single trailing line break, if present.
      @i = 0              # Current character position we're parsing
      @line = 1           # The current line.
      @tokens = []        # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
      while @i < @code.length
        # The matchers all work against the not-yet-consumed remainder.
        @chunk = @code[@i..-1]
        extract_next_token
      end
      @tokens
    end

    # At every position, run this list of match attempts, short-circuiting if
    # any of them succeed. The order matters: the first matcher that accepts
    # the chunk wins. Each matcher returns false on a miss and the advanced
    # position (always truthy) on a hit; literal_token is the catch-all.
    def extract_next_token
      return if identifier_token
      return if number_token
      return if string_token
      return if js_token
      return if regex_token
      return if comment_token
      return if whitespace_token
      return    literal_token
    end

    # Matches identifying literals: variables, keywords, method names, etc.
    def identifier_token
      return false unless identifier = @chunk[IDENTIFIER, 1]
      # Keywords are special identifiers tagged with their own name, 'if' will result
      # in an [:IF, "if"] token
      tag = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
      # A '.' directly before a plain identifier is retagged as a property
      # access. The comparison relies on Value#== (defined outside this file)
      # accepting a String.
      @tokens[-1][0] = :PROPERTY_ACCESS if tag == :IDENTIFIER && last_value == '.'
      token(tag, identifier)
      @i += identifier.length
    end

    # Matches numbers, including decimals, hex, and exponential notation.
    def number_token
      return false unless number = @chunk[NUMBER, 1]
      token(:NUMBER, number)
      @i += number.length
    end

    # Matches strings, including multi-line strings. Every raw newline inside
    # the literal is counted and rewritten as a backslash followed by the
    # newline in the emitted token value.
    def string_token
      return false unless string = @chunk[STRING, 1]
      escaped = string.gsub(MULTILINER) do
        @line += 1
        "\\\n"
      end
      token(:STRING, escaped)
      # Advance by the length of the source text, not of the rewritten value.
      @i += string.length
    end

    # Matches interpolated JavaScript. The surrounding backticks are stripped
    # from the token value.
    def js_token
      return false unless script = @chunk[JS, 1]
      token(:JS, script.gsub(JS_CLEANER, ''))
      @i += script.length
    end

    # Matches regular expression literals.
    def regex_token
      return false unless regex = @chunk[REGEX, 1]
      token(:REGEX, regex)
      @i += regex.length
    end

    # Matches and consumes comments. A run of consecutive comment lines becomes
    # a single :COMMENT token whose value is an Array of the comment lines with
    # the leading '#' removed, followed by a "\n" token.
    def comment_token
      return false unless comment = @chunk[COMMENT, 1]
      # The line counter is advanced first, so both tokens carry the line
      # number reached after the comment.
      @line += comment.scan(MULTILINER).length
      token(:COMMENT, comment.gsub(COMMENT_CLEANER, '').split(MULTILINER))
      token("\n", "\n")
      @i += comment.length
    end

    # Matches and consumes non-meaningful whitespace. No token is emitted.
    def whitespace_token
      return false unless whitespace = @chunk[WHITESPACE, 1]
      @i += whitespace.length
    end

    # We treat all other single characters as a token. Eg.: ( ) , . !
    # Multi-character operators are also literal tokens, so that Racc can assign
    # the proper order of operations. Multiple newlines get merged together.
    def literal_token
      value = @chunk[NEWLINE, 1]
      if value
        @line += value.length
        # Collapse runs of newlines (and a newline after a comment) into one.
        token("\n", "\n") unless last_value == "\n"
        return @i += value.length
      end
      value = @chunk[OPERATOR, 1]
      # A function arrow retroactively turns the preceding identifiers into
      # parameters.
      tag_parameters if value && value.match(CODE)
      # Not an operator: fall back to the single next character.
      value ||= @chunk[0,1]
      skip_following_newlines if EXP_START.include?(value)
      remove_leading_newlines if EXP_END.include?(value)
      token(value, value)
      @i += value.length
    end

    # Add a token to the results, wrapping the value together with the current
    # line number.
    def token(tag, value)
      @tokens << [tag, Value.new(value, @line)]
    end

    # Peek at the value of the previous token (nil when there is none).
    def last_value
      @tokens.last && @tokens.last[1]
    end

    # A source of ambiguity in our grammar was parameter lists in function
    # definitions (as opposed to argument lists in function calls). Tag
    # parameter identifiers in order to avoid this.
    #
    # Walks backwards from the most recent token, skipping commas and
    # retagging :IDENTIFIER tokens as :PARAM, and stops at the first token
    # that is neither (or when the tokens run out).
    def tag_parameters
      @tokens.reverse_each do |tok|
        next if tok[0] == ','
        break unless tok[0] == :IDENTIFIER
        tok[0] = :PARAM
      end
      nil
    end

    # Consume and ignore newlines immediately after this point. Called before
    # the current one-character token has been consumed, hence the +1.
    def skip_following_newlines
      newlines = @code[(@i+1)..-1][NEWLINE, 1]
      if newlines
        @line += newlines.length
        @i += newlines.length
      end
    end

    # Discard newlines immediately before this point.
    def remove_leading_newlines
      @tokens.pop if last_value == "\n"
    end

  end

end
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`module CoffeeScript`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# The lexer reads a stream of CoffeeScript and divvys it up into tagged`
			`# tokens. A minor bit of the ambiguity in the grammar has been avoided by`
			`# pushing some extra smarts into the Lexer.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`class Lexer`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# The list of keywords passed verbatim to the parser.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`KEYWORDS = ["if", "else", "then", "unless",`
			`"true", "false", "null",`
			`"and", "or", "is", "aint", "not",`
			`"new", "return",`
			`"try", "catch", "finally", "throw",`
			`"break", "continue",`
			`"for", "in", "while",`
			`"switch", "case",`
removed all traces of 'extends' -- it's not any shorter or more convenient than just setting the prototype 2009-12-22 12:08:29 -05:00			`"super",`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`"delete"]`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Token matching regexes.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`IDENTIFIER = /\A([a-zA-Z$_]\w*)/`
			`NUMBER = /\A\b((0(x\|X)[0-9a-fA-F]+)\|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?))\b/i`
little fixes more examples 2009-12-18 09:55:31 -05:00			`STRING = /\A(""\|''\|"(.?)[^\\]"\|'(.?)[^\\]')/m`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			JS = /\A(`(.*?)`)/
broken waypoint, but fixed line numbers with the new JS comments 2009-12-23 19:42:44 -05:00			`OPERATOR = /\A([+\*&\|\/\-%=<>:]+)/`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`WHITESPACE = /\A([ \t\r]+)/`
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`NEWLINE = /\A(\n+)/`
			`COMMENT = /\A((#[^\n]\s)+)/m`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`CODE = /\A(=>)/`
			`REGEX = /\A(\/(.*?)[^\\]\/[imgy]{0,4})/`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Token cleaning regexes.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			JS_CLEANER = /(\A`\|`\Z)/
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`MULTILINER = /\n/`
got comments within object and array literals working out 2009-12-22 11:50:43 -05:00			`COMMENT_CLEANER = /(^\s#\|\n\s$)/`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Tokens that always constitute the start of an expression.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`EXP_START = ['{', '(', '[']`
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00
			`# Tokens that always constitute the end of an expression.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`EXP_END = ['}', ')', ']']`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Scan by attempting to match tokens one character at a time. Slow and steady.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def tokenize(code)`
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`@code = code.chomp # Cleanup code by remove extra line breaks`
			`@i = 0 # Current character position we're parsing`
			`@line = 1 # The current line.`
			`@tokens = [] # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`while @i < @code.length`
			`@chunk = @code[@i..-1]`
			`extract_next_token`
			`end`
			`@tokens`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00			`end`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# At every position, run this list of match attempts, short-circuiting if`
			`# any of them succeed.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def extract_next_token`
			`return if identifier_token`
			`return if number_token`
			`return if string_token`
			`return if js_token`
			`return if regex_token`
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`return if comment_token`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`return if whitespace_token`
			`return literal_token`
			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches identifying literals: variables, keywords, method names, etc.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def identifier_token`
			`return false unless identifier = @chunk[IDENTIFIER, 1]`
			`# Keywords are special identifiers tagged with their own name, 'if' will result`
			`# in an [:IF, "if"] token`
			`tag = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER`
			`@tokens[-1][0] = :PROPERTY_ACCESS if tag == :IDENTIFIER && last_value == '.'`
			`token(tag, identifier)`
			`@i += identifier.length`
			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches numbers, including decimals, hex, and exponential notation.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def number_token`
			`return false unless number = @chunk[NUMBER, 1]`
			`token(:NUMBER, number)`
			`@i += number.length`
			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches strings, including multi-line strings.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def string_token`
			`return false unless string = @chunk[STRING, 1]`
			`escaped = string.gsub(MULTILINER) do \|match\|`
			`@line += 1`
			`"\\\n"`
			`end`
			`token(:STRING, escaped)`
			`@i += string.length`
cleaned up lexer in order to add line numbers 2009-12-17 09:29:49 -05:00			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches interpolated JavaScript.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def js_token`
			`return false unless script = @chunk[JS, 1]`
			`token(:JS, script.gsub(JS_CLEANER, ''))`
			`@i += script.length`
			`end`
with shelling out to javascript 2009-12-15 09:11:27 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches regular expression literals.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def regex_token`
			`return false unless regex = @chunk[REGEX, 1]`
			`token(:REGEX, regex)`
			`@i += regex.length`
			`end`
moving right along 2009-12-13 18:37:29 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches and consumes comments.`
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`def comment_token`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`return false unless comment = @chunk[COMMENT, 1]`
broken waypoint, but fixed line numbers with the new JS comments 2009-12-23 19:42:18 -05:00			`@line += comment.scan(MULTILINER).length`
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`token(:COMMENT, comment.gsub(COMMENT_CLEANER, '').split(MULTILINER))`
			`token("\n", "\n")`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`@i += comment.length`
			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Matches and consumes non-meaningful whitespace.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def whitespace_token`
			`return false unless whitespace = @chunk[WHITESPACE, 1]`
			`@i += whitespace.length`
			`end`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`# We treat all other single characters as a token. Eg.: ( ) , . !`
			`# Multi-character operators are also literal tokens, so that Racc can assign`
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# the proper order of operations. Multiple newlines get merged together.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def literal_token`
			`value = @chunk[NEWLINE, 1]`
			`if value`
			`@line += value.length`
			`token("\n", "\n") unless last_value == "\n"`
			`return @i += value.length`
			`end`
			`value = @chunk[OPERATOR, 1]`
			`tag_parameters if value && value.match(CODE)`
			`value \|\|= @chunk[0,1]`
			`skip_following_newlines if EXP_START.include?(value)`
			`remove_leading_newlines if EXP_END.include?(value)`
			`token(value, value)`
			`@i += value.length`
initial commit of the mystery language 2009-12-13 17:07:16 -05:00			`end`

passing through comments as tags on Values, but not printing them out quite yet... 2009-12-22 10:48:58 -05:00			`# Add a token to the results, taking note of the line number, and`
			`# immediately-preceding comment.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def token(tag, value)`
first draft of parsing and printing along comments -- unfortunately, not yet working within objects and arrays 2009-12-22 11:27:19 -05:00			`@tokens << [tag, Value.new(value, @line)]`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`end`
cleaned up lexer in order to add line numbers 2009-12-17 09:29:49 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Peek at the previous token.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def last_value`
			`@tokens.last && @tokens.last[1]`
			`end`
cleaned up lexer in order to add line numbers 2009-12-17 09:29:49 -05:00
finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# A source of ambiguity in our grammar was parameter lists in function`
			`# definitions (as opposed to argument lists in function calls). Tag`
			`# parameter identifiers in order to avoid this.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def tag_parameters`
			`index = 0`
			`loop do`
			`tok = @tokens[index -= 1]`
parser test raises some minor improvements (remove unnecessary ValueNode arrays, etc 2009-12-18 07:11:01 -05:00			`return if !tok`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`next if tok[0] == ','`
parser test raises some minor improvements (remove unnecessary ValueNode arrays, etc 2009-12-18 07:11:01 -05:00			`return if tok[0] != :IDENTIFIER`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`tok[0] = :PARAM`
			`end`
more more, including &&=, \|\|= 2009-12-13 20:29:44 -05:00			`end`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Consume and ignore newlines immediately after this point.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def skip_following_newlines`
			`newlines = @code[(@i+1)..-1][NEWLINE, 1]`
			`if newlines`
			`@line += newlines.length`
			`@i += newlines.length`
			`end`
			`end`

finished commenting everything but the nodes -- they're up next 2009-12-17 22:54:24 -05:00			`# Discard newlines immediately before this point.`
many more comments, plus a fix for inner-assignment indentation 2009-12-17 22:13:29 -05:00			`def remove_leading_newlines`
			`@tokens.pop if last_value == "\n"`
cleaned up lexer in order to add line numbers 2009-12-17 09:29:49 -05:00			`end`
a smarter lexer brings us down to three shift/reduces 2009-12-16 20:48:37 -05:00
			`end`

initial commit of the mystery language 2009-12-13 17:07:16 -05:00			`end`