a little further on with the lexer

This commit is contained in:
Jeremy Ashkenas 2010-01-30 00:37:38 -05:00
parent 854c796fd6
commit eff2f4b520
2 changed files with 84 additions and 7 deletions

View File

@ -10,9 +10,9 @@
// Token matching regexes.
// Every pattern below starts with ^, so it can only match at the very
// beginning of the string it is applied to.
lex.IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/;
lex.NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i;
// NOTE: the next three assignments (STRING, HEREDOC, JS) use `.`, which never
// matches a newline in JavaScript, and each is reassigned immediately below.
lex.STRING = /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m;
lex.HEREDOC = /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m;
lex.JS = /^(``|`(.*?)([^\\]|\\\\)`)/m;
// These versions use [\s\S] ("any character, newlines included") in place of
// `.`, so the quoted body may span several lines. The /m flag is dropped, so
// ^ anchors only at the start of the input rather than at every line start.
lex.STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/;
lex.HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/;
lex.JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/;
lex.OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/;
lex.WHITESPACE = /^([ \t]+)/;
// NOTE(review): COMMENT uses $ without the /m flag, so $ only matches at the
// end of the whole input — confirm this is intended.
lex.COMMENT = /^(((\n?[ \t]*)?#.*$)+)/;
@ -92,5 +92,54 @@
}
return this.literal_token();
};
// Look at (or overwrite) a tag in the current token stream.
//   index - position in this.tokens; negative values count back from the end
//           (-1 is the most recent token). Defaults to -1 when omitted.
//   tag   - optional; when given (not null/undefined) it replaces the token's tag.
// Returns the newly written tag, the existing tag, or null when there is no
// token at the requested position.
lex.prototype.tag = function tag(index, tag) {
  var pos, tok;
  pos = (typeof index !== "undefined" && index !== null) ? index : -1;
  // JavaScript arrays do not support negative subscripts (tokens[-1] is always
  // undefined), so convert "from the end" offsets into real indices.
  if (pos < 0) {
    pos += this.tokens.length;
  }
  if (!((tok = this.tokens[pos]))) {
    return null;
  }
  if ((typeof tag !== "undefined" && tag !== null)) {
    return (tok[0] = tag);
  }
  return tok[0];
};
// Look at (or overwrite) a value in the current token stream.
//   index - position in this.tokens; negative values count back from the end
//           (-1 is the most recent token). Defaults to -1 when omitted.
//   val   - optional; when given (not null/undefined) it replaces the token's value.
// Returns the newly written value, the existing value, or null when there is
// no token at the requested position.
lex.prototype.value = function value(index, val) {
  var pos, tok;
  pos = (typeof index !== "undefined" && index !== null) ? index : -1;
  // JavaScript arrays do not support negative subscripts (tokens[-1] is always
  // undefined), so convert "from the end" offsets into real indices.
  if (pos < 0) {
    pos += this.tokens.length;
  }
  if (!((tok = this.tokens[pos]))) {
    return null;
  }
  if ((typeof val !== "undefined" && val !== null)) {
    return (tok[1] = val);
  }
  return tok[1];
};
// Tokenizers ==========================================================
// Matches identifying literals: variables, keywords, method names, etc.
// Tries lex.IDENTIFIER against this.chunk. On a match it may retag the
// preceding '::' / '.' token as an access token, pushes the new token via
// this.token(tag, id), advances this.i by the identifier's length and returns
// the new this.i. Returns false when the chunk does not start with an
// identifier.
lex.prototype.identifier_token = function identifier_token() {
  var id, keywords, match, tag;
  match = this.chunk.match(lex.IDENTIFIER);
  if (!(match && (id = match[1]))) {
    return false;
  }
  // The token regexes live on the constructor (lex.IDENTIFIER, ...); fall back
  // to lex.KEYWORDS the same way when the instance carries no KEYWORDS of its
  // own, instead of throwing on undefined.indexOf.
  keywords = this.KEYWORDS || lex.KEYWORDS;
  // Keywords are special identifiers tagged with their own name,
  // 'if' will result in an ['IF', "if"] token.
  tag = keywords.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER';
  // A 'when' directly after an indentation change gets its own tag.
  if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) {
    tag = 'LEADING_WHEN';
  }
  if (tag === 'IDENTIFIER' && this.value() === '::') {
    this.tag(-1, 'PROTOTYPE_ACCESS');
  }
  // A single '.' before an identifier is a property access; a '.' that itself
  // follows another '.' is left untouched.
  if (tag === 'IDENTIFIER' && this.value() === '.' && !(this.value(-2) === '.')) {
    if (this.tag(-2) === '?') {
      // Fold the '?' '.' pair into one SOAK_ACCESS token by retagging the '.'
      // and removing the '?' that precedes it.
      this.tag(-1, 'SOAK_ACCESS');
      this.tokens.splice(-2, 1);
    } else {
      this.tag(-1, 'PROPERTY_ACCESS');
    }
  }
  this.token(tag, id);
  return this.i += id.length;
};
})();

View File

@ -22,9 +22,9 @@ lex.KEYWORDS: [
# Token matching regexes.
# Every pattern below starts with ^, so it can only match at the very
# beginning of the string it is applied to.
lex.IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/
lex.NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
# NOTE: the next three assignments (STRING, HEREDOC, JS) use `.`, which never
# matches a newline in JavaScript, and each is reassigned immediately below.
lex.STRING : /^(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m
lex.HEREDOC : /^("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m
lex.JS : /^(``|`(.*?)([^\\]|\\\\)`)/m
# These versions use [\s\S] ("any character, newlines included") in place of
# `.`, so the quoted body may span several lines. The /m flag is dropped, so
# ^ anchors only at the start of the input rather than at every line start.
lex.STRING : /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/
lex.HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
lex.JS : /^(``|`([\s\S]*?)([^\\]|\\\\)`)/
lex.OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/
lex.WHITESPACE : /^([ \t]+)/
# NOTE(review): COMMENT uses $ without the /m flag, so $ only matches at the
# end of the whole input — confirm this is intended.
lex.COMMENT : /^(((\n?[ \t]*)?#.*$)+)/
@ -84,9 +84,37 @@ lex::extract_next_token: ->
return if this.whitespace_token()
return this.literal_token()
# Look at (or overwrite) a tag in the current token stream.
# `index` is a position in this.tokens; negative values count back from the
# end (-1 is the most recent token) and it defaults to -1 when omitted.
# When `tag` is given it replaces the token's tag. Returns nothing when there
# is no token at the requested position.
lex::tag: (index, tag) ->
  index: -1 unless index?
  # JavaScript arrays have no negative subscripts (tokens[-1] is always
  # undefined), so convert "from the end" offsets into real indices.
  index: this.tokens.length + index if index < 0
  return unless tok: this.tokens[index]
  return tok[0]: tag if tag?
  tok[0]
# Look at (or overwrite) a value in the current token stream.
# `index` is a position in this.tokens; negative values count back from the
# end (-1 is the most recent token) and it defaults to -1 when omitted.
# When `val` is given it replaces the token's value. Returns nothing when
# there is no token at the requested position.
lex::value: (index, val) ->
  index: -1 unless index?
  # JavaScript arrays have no negative subscripts (tokens[-1] is always
  # undefined), so convert "from the end" offsets into real indices.
  index: this.tokens.length + index if index < 0
  return unless tok: this.tokens[index]
  return tok[1]: val if val?
  tok[1]
# Tokenizers ==========================================================
# Matches identifying literals: variables, keywords, method names, etc.
# Tries lex.IDENTIFIER against this.chunk. On a match it may retag the
# preceding '::' / '.' token as an access token, pushes the new token via
# this.token(tag, id) and advances this.i by the identifier's length.
# Returns false when the chunk does not start with an identifier.
lex::identifier_token: ->
  match: this.chunk.match(lex.IDENTIFIER)
  return false unless match and id: match[1]
  # The keyword list is declared on the constructor (lex.KEYWORDS), like the
  # token regexes; fall back to it when the instance carries none of its own.
  keywords: this.KEYWORDS or lex.KEYWORDS
  # Keywords are special identifiers tagged with their own name,
  # 'if' will result in an ['IF', "if"] token.
  tag: if keywords.indexOf(id) >= 0 then id.toUpperCase() else 'IDENTIFIER'
  # A 'when' directly after an indentation change gets its own tag.
  tag: 'LEADING_WHEN' if tag is 'WHEN' and (this.tag() is 'OUTDENT' or this.tag() is 'INDENT')
  this.tag(-1, 'PROTOTYPE_ACCESS') if tag is 'IDENTIFIER' and this.value() is '::'
  # A single '.' before an identifier is a property access; a '.' that itself
  # follows another '.' is left untouched.
  if tag is 'IDENTIFIER' and this.value() is '.' and !(this.value(-2) is '.')
    if this.tag(-2) is '?'
      # Fold the '?' '.' pair into one SOAK_ACCESS token by retagging the '.'
      # and removing the '?' that precedes it.
      this.tag(-1, 'SOAK_ACCESS')
      this.tokens.splice(-2, 1)
    else
      this.tag(-1, 'PROPERTY_ACCESS')
  this.token(tag, id)
  this.i += id.length