Regular expression interpolations; fixed bug in string interpolations when all tokens were identifiers.

This commit is contained in:
Stan Angeloff 2010-03-08 20:05:02 +02:00
parent 830d1fb42b
commit 81af8f296e
6 changed files with 146 additions and 57 deletions

View File

@ -1,5 +1,5 @@
(function(){
var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include, starts;
var ACCESSORS, ASSIGNMENT, BEFORE_WHEN, CALLABLE, CODE, COFFEE_KEYWORDS, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, INTERPOLATION, JS_CLEANER, JS_FORBIDDEN, JS_KEYWORDS, KEYWORDS, LAST_DENT, LAST_DENTS, Lexer, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, RESERVED, Rewriter, STRING_NEWLINES, WHITESPACE, compact, count, include, starts;
// The CoffeeScript Lexer. Uses a series of token-matching regexes to attempt
// matches against the beginning of the source code. When a match is found,
// a token is produced, we consume the match, and start again. Tokens are in the
@ -129,18 +129,18 @@
// Matches strings, including multi-line strings. Ensures that quotation marks
// are balanced within the string's contents, and within nested interpolations.
Lexer.prototype.string_token = function string_token() {
var string;
var merge, string, supress;
if (!(starts(this.chunk, '"') || starts(this.chunk, "'"))) {
return false;
}
string = this.balanced_token(['"', '"'], ['${', '}']);
string = this.balanced_token((supress = false), ['"', '"'], ['${', '}']);
if (!(string)) {
string = this.balanced_token(["'", "'"]);
string = this.balanced_token((supress = false), ["'", "'"]);
}
if (!(string)) {
return false;
}
this.interpolate_string(string.replace(STRING_NEWLINES, " \\\n"));
this.interpolate_string(string.replace(STRING_NEWLINES, " \\\n"), (merge = true));
this.line += count(string, "\n");
this.i += string.length;
return true;
@ -160,11 +160,11 @@
};
// Matches JavaScript interpolated directly into the source via backticks.
Lexer.prototype.js_token = function js_token() {
var script;
var script, supress;
if (!(starts(this.chunk, '`'))) {
return false;
}
if (!((script = this.balanced_token(['`', '`'])))) {
if (!((script = this.balanced_token((supress = false), ['`', '`'])))) {
return false;
}
this.token('JS', script.replace(JS_CLEANER, ''));
@ -175,23 +175,57 @@
// to distinguish from division, so we borrow some basic heuristics from
// JavaScript and Ruby.
Lexer.prototype.regex_token = function regex_token() {
var regex;
if (!((regex = this.match(REGEX, 1)))) {
var _a, _b, _c, _d, _e, each, flags, i, index, interp_tokens, merge, regex, str, supress;
if (!((regex = this.balanced_token((supress = true), ['/', '/'])))) {
return false;
}
if (regex.length < 3 || regex.match(/^\/\s+|\n/)) {
return false;
}
if (include(NOT_REGEX, this.tag())) {
return false;
}
this.token('REGEX', regex);
flags = ['i', 'm', 'g', 'y'];
while (((index = flags.indexOf(this.chunk.substr(regex.length, 1)))) >= 0) {
regex += flags[index];
flags.splice(index, 1);
}
if (((0 < (_e = regex.indexOf('${'))) && (_e < regex.indexOf('}'))) || regex.match(/[^\\]\$[a-zA-Z_@]/)) {
_a = regex.substring(1).split('/');
str = _a[0];
flags = _a[1];
str = str.replace(/\\[^\$]/g, function(escaped) {
return '\\' + escaped;
});
this.tokens = this.tokens.concat([['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]);
interp_tokens = this.interpolate_string("\"" + str + "\"", (merge = false));
_b = interp_tokens;
for (i = 0, _c = _b.length; i < _c; i++) {
each = _b[i];
if ((_d = each[0]) === 'TOKENS') {
this.tokens = this.tokens.concat(each[1]);
} else if (_d === 'STRING') {
this.token(each[0], each[1].substring(0, 1) + each[1].substring(1, each[1].length - 1).replace(/"/g, '\\"') + each[1].substring(0, 1));
} else {
this.token(each[0], each[1]);
}
if (i < interp_tokens.length - 1) {
this.token('+', '+');
}
}
this.tokens = this.tokens.concat([[',', ','], ['STRING', "'" + flags + "'"], [')', ')'], [')', ')']]);
} else {
this.token('REGEX', regex);
}
this.i += regex.length;
return true;
};
// Matches a token in which the passed delimiter pairs must be correctly
// balanced (ie. strings, JS literals).
Lexer.prototype.balanced_token = function balanced_token() {
Lexer.prototype.balanced_token = function balanced_token(supress) {
var delimited;
delimited = Array.prototype.slice.call(arguments, 0);
return this.balanced_string.apply(this, [this.chunk].concat(delimited));
delimited = Array.prototype.slice.call(arguments, 1);
return this.balanced_string.apply(this, [this.chunk].concat([supress]).concat(delimited));
};
// Matches and consumes comments. We pass through comments into JavaScript,
// so they're treated as real tokens, like any other part of the language.
@ -395,9 +429,9 @@
// a series of delimiters, all of which must be nested correctly within the
// contents of the string. This method allows us to have strings within
// interpolations within strings etc...
Lexer.prototype.balanced_string = function balanced_string(str) {
Lexer.prototype.balanced_string = function balanced_string(str, supress) {
var _a, _b, _c, _d, close, delimited, i, levels, open, pair;
delimited = Array.prototype.slice.call(arguments, 1);
delimited = Array.prototype.slice.call(arguments, 2);
levels = [];
i = 0;
while (i < str.length) {
@ -429,7 +463,10 @@
i += 1;
}
if (levels.length) {
throw new Error("SyntaxError: Unterminated " + (levels.pop()[0]) + " starting on line " + (this.line + 1));
if (!(supress)) {
throw new Error("SyntaxError: Unterminated " + (levels.pop()[0]) + " starting on line " + (this.line + 1));
}
return false;
}
if (i === 0) {
return false;
@ -444,8 +481,8 @@
// If it encounters an interpolation, this method will recursively create a
// new Lexer, tokenize the interpolated contents, and merge them into the
// token stream.
Lexer.prototype.interpolate_string = function interpolate_string(str) {
var _a, _b, _c, _d, _e, each, expr, group, i, inner, interp, lexer, match, nested, pi, quote, tokens;
Lexer.prototype.interpolate_string = function interpolate_string(str, merge) {
var _a, _b, _c, _d, _e, _f, _g, each, expr, group, has_string, i, inner, interp, lexer, match, nested, pi, quote, supress, tokens;
if (str.length < 3 || !starts(str, '"')) {
return this.token('STRING', str);
} else {
@ -466,14 +503,14 @@
interp = "this." + (interp.substring(1));
}
if (pi < i) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
tokens.push(['IDENTIFIER', interp]);
i += group.length - 1;
pi = i + 1;
} else if (((expr = this.balanced_string(str.substring(i), ['${', '}'])))) {
} else if (((expr = this.balanced_string(str.substring(i), (supress = false), ['${', '}'])))) {
if (pi < i) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
inner = expr.substring(2, expr.length - 1);
if (inner.length) {
@ -484,7 +521,7 @@
nested.pop();
tokens.push(['TOKENS', nested]);
} else {
tokens.push(['STRING', quote + quote]);
tokens.push(['STRING', '' + quote + quote]);
}
i += expr.length - 1;
pi = i + 1;
@ -492,19 +529,27 @@
i += 1;
}
if (pi < i && pi < str.length - 1) {
tokens.push(['STRING', quote + (str.substring(pi, i)) + quote]);
tokens.push(['STRING', '' + quote + (str.substring(pi, i)) + quote]);
}
_c = []; _d = tokens;
for (i = 0, _e = _d.length; i < _e; i++) {
each = _d[i];
_c.push((function() {
_c = tokens;
for (_d = 0, _e = _c.length; _d < _e; _d++) {
each = _c[_d];
each[0] === 'STRING' ? ((has_string = true)) : null;
}
if (!has_string) {
tokens.unshift(['STRING', "''"]);
}
if (((typeof merge !== "undefined" && merge !== null) ? merge : true)) {
_f = tokens;
for (i = 0, _g = _f.length; i < _g; i++) {
each = _f[i];
each[0] === 'TOKENS' ? (this.tokens = this.tokens.concat(each[1])) : this.token(each[0], each[1]);
if (i < tokens.length - 1) {
return this.token('+', '+');
this.token('+', '+');
}
}).call(this));
}
}
return _c;
return tokens;
}
};
// Helpers
@ -568,7 +613,7 @@
// be used as identifiers or properties.
JS_FORBIDDEN = JS_KEYWORDS.concat(RESERVED);
// Token matching regexes.
IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/;
IDENTIFIER = /^([a-zA-Z\$_](\w|\$)*)/;
NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i;
HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/;
INTERPOLATION = /^\$([a-zA-Z_@]\w*(\.\w+)*)/;
@ -576,7 +621,6 @@
WHITESPACE = /^([ \t]+)/;
COMMENT = /^(((\n?[ \t]*)?#[^\n]*)+)/;
CODE = /^((-|=)>)/;
REGEX = /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/;
MULTI_DENT = /^((\n([ \t]*))+)(\.)?/;
LAST_DENTS = /\n([ \t]*)/g;
LAST_DENT = /\n([ \t]*)/;

View File

@ -289,7 +289,7 @@ idt += TAB
var end, idt;
idt = this.is_statement() ? this.idt() : '';
end = this.is_statement() ? ';' : '';
return idt + this.value + end;
return '' + idt + this.value + end;
};
LiteralNode.prototype.toString = function toString(idt) {
return " \"" + this.value + "\"";
@ -762,7 +762,7 @@ idt += TAB
props = props.empty() ? '' : props.compile(o) + '\n';
extension = extension ? this.idt() + extension.compile(o) + ';\n' : '';
returns = ret ? '\n' + this.idt() + 'return ' + this.variable.compile(o) + ';' : '';
return construct + extension + props + returns;
return '' + construct + extension + props + returns;
};
return ClassNode;
}).call(this);
@ -1540,7 +1540,7 @@ idt += TAB
indent: this.idt(),
chain_child: true
})) : " else {\n" + (Expressions.wrap([this.else_body]).compile(o)) + "\n" + this.tab + "}";
return if_part + else_part;
return '' + if_part + else_part;
};
// Compile the IfNode as a ternary operator.
IfNode.prototype.compile_ternary = function compile_ternary(o) {
@ -1559,7 +1559,7 @@ idt += TAB
// with Git.
TRAILING_WHITESPACE = /\s+$/gm;
// Keep this identifier regex in sync with the Lexer.
IDENTIFIER = /^[a-zA-Z$_](\w|\$)*$/;
IDENTIFIER = /^[a-zA-Z\$_](\w|\$)*$/;
// Utility Functions
// -----------------
// Merge objects, returning a fresh copy with attributes from both sides.

View File

@ -96,10 +96,10 @@ exports.Lexer: class Lexer
# are balanced within the string's contents, and within nested interpolations.
string_token: ->
return false unless starts(@chunk, '"') or starts(@chunk, "'")
string: @balanced_token ['"', '"'], ['${', '}']
string: @balanced_token ["'", "'"] unless string
string: @balanced_token supress: false, ['"', '"'], ['${', '}']
string: @balanced_token supress: false, ["'", "'"] unless string
return false unless string
@interpolate_string string.replace STRING_NEWLINES, " \\\n"
@interpolate_string string.replace(STRING_NEWLINES, " \\\n"), merge: true
@line += count string, "\n"
@i += string.length
true
@ -117,7 +117,7 @@ exports.Lexer: class Lexer
# Matches JavaScript interpolated directly into the source via backticks.
js_token: ->
return false unless starts @chunk, '`'
return false unless script: @balanced_token ['`', '`']
return false unless script: @balanced_token supress: false, ['`', '`']
@token 'JS', script.replace(JS_CLEANER, '')
@i += script.length
true
@ -126,16 +126,34 @@ exports.Lexer: class Lexer
# to distinguish from division, so we borrow some basic heuristics from
# JavaScript and Ruby.
regex_token: ->
return false unless regex: @match REGEX, 1
return false unless regex: @balanced_token supress: true, ['/', '/']
return false if regex.length < 3 or regex.match /^\/\s+|\n/
return false if include NOT_REGEX, @tag()
@token 'REGEX', regex
flags: ['i', 'm', 'g', 'y']
while (index: flags.indexOf @chunk.substr regex.length, 1) >= 0
regex += flags[index]
flags.splice index, 1
if (0 < regex.indexOf('${') < regex.indexOf('}')) or regex.match /[^\\]\$[a-zA-Z_@]/
[str, flags]: regex.substring(1).split('/')
str: str.replace /\\[^\$]/g, (escaped) -> '\\' + escaped
@tokens: @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
interp_tokens: @interpolate_string "\"$str\"", merge: false
for each, i in interp_tokens
switch each[0]
when 'TOKENS' then @tokens: @tokens.concat each[1]
when 'STRING' then @token each[0], each[1].substring(0, 1) + each[1].substring(1, each[1].length - 1).replace(/"/g, '\\"') + each[1].substring(0, 1)
else @token each[0], each[1]
@token '+', '+' if i < interp_tokens.length - 1
@tokens: @tokens.concat [[',', ','], ['STRING', "'$flags'"], [')', ')'], [')', ')']]
else
@token 'REGEX', regex
@i += regex.length
true
# Matches a token in which the passed delimiter pairs must be correctly
# balanced (ie. strings, JS literals).
balanced_token: (delimited...) ->
@balanced_string @chunk, delimited...
balanced_token: (supress, delimited...) ->
@balanced_string @chunk, supress, delimited...
# Matches and consumes comments. We pass through comments into JavaScript,
# so they're treated as real tokens, like any other part of the language.
@ -297,7 +315,7 @@ exports.Lexer: class Lexer
# a series of delimiters, all of which must be nested correctly within the
# contents of the string. This method allows us to have strings within
# interpolations within strings etc...
balanced_string: (str, delimited...) ->
balanced_string: (str, supress, delimited...) ->
levels: []
i: 0
while i < str.length
@ -317,7 +335,9 @@ exports.Lexer: class Lexer
break
break unless levels.length
i += 1
throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" if levels.length
if levels.length
throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" unless supress
return false
return false if i is 0
return str.substring(0, i)
@ -331,7 +351,7 @@ exports.Lexer: class Lexer
# If it encounters an interpolation, this method will recursively create a
# new Lexer, tokenize the interpolated contents, and merge them into the
# token stream.
interpolate_string: (str) ->
interpolate_string: (str, merge) ->
if str.length < 3 or not starts str, '"'
@token 'STRING', str
else
@ -349,7 +369,7 @@ exports.Lexer: class Lexer
tokens.push ['IDENTIFIER', interp]
i += group.length - 1
pi: i + 1
else if (expr: @balanced_string str.substring(i), ['${', '}'])
else if (expr: @balanced_string str.substring(i), supress: false, ['${', '}'])
tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
inner: expr.substring(2, expr.length - 1)
if inner.length
@ -362,12 +382,16 @@ exports.Lexer: class Lexer
pi: i + 1
i += 1
tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i and pi < str.length - 1
for each, i in tokens
if each[0] is 'TOKENS'
@tokens: @tokens.concat each[1]
else
@token each[0], each[1]
@token '+', '+' if i < tokens.length - 1
(has_string: yes) for each in tokens when each[0] is 'STRING'
tokens.unshift ['STRING', "''"] if not has_string
if (merge ? true)
for each, i in tokens
if each[0] is 'TOKENS'
@tokens: @tokens.concat each[1]
else
@token each[0], each[1]
@token '+', '+' if i < tokens.length - 1
tokens
# Helpers
# -------
@ -440,7 +464,7 @@ RESERVED: [
JS_FORBIDDEN: JS_KEYWORDS.concat RESERVED
# Token matching regexes.
IDENTIFIER : /^([a-zA-Z$_](\w|\$)*)/
IDENTIFIER : /^([a-zA-Z\$_](\w|\$)*)/
NUMBER : /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
HEREDOC : /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/
INTERPOLATION : /^\$([a-zA-Z_@]\w*(\.\w+)*)/
@ -448,7 +472,6 @@ OPERATOR : /^([+\*&|\/\-%=<>:!?]+)/
WHITESPACE : /^([ \t]+)/
COMMENT : /^(((\n?[ \t]*)?#[^\n]*)+)/
CODE : /^((-|=)>)/
REGEX : /^(\/(\S.*?)?([^\\]|\\\\)\/[imgy]{0,4})/
MULTI_DENT : /^((\n([ \t]*))+)(\.)?/
LAST_DENTS : /\n([ \t]*)/g
LAST_DENT : /\n([ \t]*)/

View File

@ -1192,7 +1192,7 @@ TAB: ' '
TRAILING_WHITESPACE: /\s+$/gm
# Keep this identifier regex in sync with the Lexer.
IDENTIFIER: /^[a-zA-Z$_](\w|\$)*$/
IDENTIFIER: /^[a-zA-Z\$_](\w|\$)*$/
# Utility Functions
# -----------------

View File

@ -0,0 +1,17 @@
name: 'Bob'
ok not not '"Bob"'.match(/^"${name}"$/i)
ok '"Bobby"'.match(/^"${name}"$/i) is null
ok not not 'Bob'.match(/^$name$/)
ok 'Bobby'.match(/^$name/)
ok 'Bobby'.match(/${"${"${"$name"}"}"}/imgy)
ok '$a$b$c'.match(/\$A\$B\$C/i)
a: 1
b: 2
c: 3
ok '123'.match(/$a$b$c/i)

View File

@ -61,3 +61,8 @@ ok "Where is ${"the nested ${obj["name"]}"}?" is 'Where is the nested Joe?'
ok "Hello ${world ? "$hello"}" is 'Hello World'
ok "Hello ${"${"${obj["name"]}" + '!'}"}" is 'Hello Joe!'
a: 1
b: 2
c: 3
ok "$a$b$c" is '123'