mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
423 lines
10 KiB
Text
423 lines
10 KiB
Text
# Copyright (c) 2011 SUSE
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person
|
|
# obtaining a copy of this software and associated documentation
|
|
# files (the "Software"), to deal in the Software without
|
|
# restriction, including without limitation the rights to use,
|
|
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
# copies of the Software, and to permit persons to whom the
|
|
# Software is furnished to do so, subject to the following
|
|
# conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be
|
|
# included in all copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
# OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
class Machete::Parser

# Terminal symbols produced by the lexer (see COMPLEX_TOKENS in the inner
# section). Single-character and operator tokens are referenced in the rules
# as quoted strings and need no declaration.
token NIL TRUE FALSE
token INTEGER SYMBOL STRING REGEXP
token ANY EVEN ODD
token METHOD_NAME CLASS_NAME

# A whole pattern is a single expression.
start expression
|
|
|
|
rule

# A primary, or several primaries separated by "|". The rule is
# left-recursive, so when the left operand is already a ChoiceMatcher the new
# alternative is appended to its existing alternatives array (via <<) and a
# single flat ChoiceMatcher is built instead of a nested one.
expression : primary
           | expression "|" primary {
               alternatives =
                 if val[0].is_a?(ChoiceMatcher)
                   val[0].alternatives << val[2]
                 else
                   [val[0], val[2]]
                 end

               result = ChoiceMatcher.new(alternatives)
             }
|
|
|
|
# The four basic pattern forms. There are no actions, so the value of the
# matched alternative is passed through unchanged.
primary : node | array | literal | any
|
|
|
|
# A node pattern: a class name, optionally followed by attribute constraints
# in angle brackets. The class name is handed to NodeMatcher as a symbol.
node : CLASS_NAME               { result = NodeMatcher.new(val[0].to_sym) }
     | CLASS_NAME "<" attrs ">" { result = NodeMatcher.new(val[0].to_sym, val[2]) }
|
|
|
|
# Comma-separated attribute constraints. Each attr yields a one-entry hash;
# the hashes are merged left to right into a single hash.
attrs : attr
      | attrs "," attr {
          result = val[0].merge(val[2])
        }
|
|
|
|
# A single attribute constraint. Every alternative yields a one-entry hash
# mapping the attribute name (as a symbol) to a matcher:
#
#   name =  expression        the parsed expression itself
#   name ^= SYMBOL / STRING   value starts with the given text
#   name $= SYMBOL / STRING   value ends with the given text
#   name *= SYMBOL / STRING   value contains the given text
#   name *= REGEXP            value matches the given regexp
#
# The prefix and suffix forms are anchored with \A and \z rather than ^ and $:
# in Ruby ^ and $ match at every line boundary, which would let a multi-line
# value match in the middle of the string.
attr : method_name "=" expression { result = { val[0].to_sym => val[2] } }
     | method_name "^=" SYMBOL {
         result = {
           val[0].to_sym => SymbolRegexpMatcher.new(
             Regexp.new("\\A" + Regexp.escape(symbol_value(val[2]).to_s))
           )
         }
       }
     | method_name "$=" SYMBOL {
         result = {
           val[0].to_sym => SymbolRegexpMatcher.new(
             Regexp.new(Regexp.escape(symbol_value(val[2]).to_s) + "\\z")
           )
         }
       }
     | method_name "*=" SYMBOL {
         result = {
           val[0].to_sym => SymbolRegexpMatcher.new(
             Regexp.new(Regexp.escape(symbol_value(val[2]).to_s))
           )
         }
       }
     | method_name "^=" STRING {
         result = {
           val[0].to_sym => StringRegexpMatcher.new(
             Regexp.new("\\A" + Regexp.escape(string_value(val[2])))
           )
         }
       }
     | method_name "$=" STRING {
         result = {
           val[0].to_sym => StringRegexpMatcher.new(
             Regexp.new(Regexp.escape(string_value(val[2])) + "\\z")
           )
         }
       }
     | method_name "*=" STRING {
         result = {
           val[0].to_sym => StringRegexpMatcher.new(
             Regexp.new(Regexp.escape(string_value(val[2])))
           )
         }
       }
     | method_name "*=" REGEXP {
         result = {
           val[0].to_sym => IndifferentRegexpMatcher.new(
             Regexp.new(regexp_value(val[2]))
           )
         }
       }
|
|
|
|
# Attribute names. This is a workaround for two lexer facts: some operator
# names are lexed as simple (string) tokens rather than METHOD_NAME tokens,
# and the "reserved words" are lexed as their own token kinds. All of them
# are accepted here so they can still be used as attribute names.
method_name : METHOD_NAME
            | NIL | TRUE | FALSE
            | ANY | EVEN | ODD
            | "*" | "+" | "<" | ">" | "^" | "|"
|
|
|
|
# An array pattern: a bracketed, possibly empty, comma-separated item list.
array : "[" items_opt "]" {
          result = ArrayMatcher.new(val[1])
        }

# Zero items yields an empty array; otherwise the items array is passed through.
items_opt : /* empty */ { result = [] }
          | items

# Collects items into one array; later items are appended in place with <<.
items : item           { result = [val[0]] }
      | items "," item { result = val[0] << val[2] }

# An item is an expression, optionally wrapped in a Quantifier. The
# three-element quantifier value is splatted into the constructor arguments.
item : expression
     | expression quantifier {
         result = Quantifier.new(val[0], *val[1])
       }
|
|
|
|
# Repetition suffix for an array item. Each alternative yields a
# three-element array that the item rule splats into Quantifier.new;
# presumably [minimum, maximum (nil = unbounded), step] -- confirm against
# Quantifier.
quantifier : "*" { result = [0, nil, 1] }
           | "+" { result = [1, nil, 1] }
           | "?" { result = [0, 1, 1] }
           | "{" INTEGER "}" {
               exact = integer_value(val[1])
               result = [exact, exact, 1]
             }
           | "{" INTEGER "," "}" {
               lower = integer_value(val[1])
               result = [lower, nil, 1]
             }
           | "{" "," INTEGER "}" {
               upper = integer_value(val[2])
               result = [0, upper, 1]
             }
           | "{" INTEGER "," INTEGER "}" {
               lower = integer_value(val[1])
               upper = integer_value(val[3])
               result = [lower, upper, 1]
             }
           | "{" EVEN "}" { result = [0, nil, 2] }
           | "{" ODD "}"  { result = [1, nil, 2] }
|
|
|
|
# Literal values. The token text is converted to the corresponding Ruby
# object by the *_value helpers and wrapped in a LiteralMatcher.
literal : NIL     { result = LiteralMatcher.new(nil) }
        | TRUE    { result = LiteralMatcher.new(true) }
        | FALSE   { result = LiteralMatcher.new(false) }
        | INTEGER { result = LiteralMatcher.new(integer_value(val[0])) }
        | SYMBOL  { result = LiteralMatcher.new(symbol_value(val[0])) }
        | STRING  { result = LiteralMatcher.new(string_value(val[0])) }
        | REGEXP  { result = LiteralMatcher.new(regexp_value(val[0])) }

# The "any" keyword yields an AnyMatcher.
any : ANY {
        result = AnyMatcher.new
      }
|
|
|
|
---- inner
|
|
|
|
include Matchers
|
|
|
|
# Raised by the lexer (next_token) for an unexpected character and by the
# parser error hook (on_error) for an unexpected token. Being nested in the
# parser class, it is distinct from Ruby's builtin ::SyntaxError and, unlike
# that one, is a StandardError.
class SyntaxError < StandardError; end
|
|
|
|
# Parses +input+ and returns whatever do_parse returns.
#
# Resets the lexer state (the text and the read position) before handing
# control to do_parse.
def parse(input)
  @input, @pos = input, 0

  do_parse
end
|
|
|
|
private
|
|
|
|
# Converts the text of an INTEGER token into an Integer.
#
# Handles an optional leading sign, the radix prefixes 0b/0B (binary),
# 0o/0O (octal), 0d/0D (decimal) and 0x/0X (hexadecimal), and a bare leading
# zero (octal). Underscore digit separators are handled by String#to_i.
#
# The sign is split off before the prefix is inspected. Without that, signed
# prefixed literals such as "-0x1F" would be read as base-10 and yield 0, and
# "-017" would yield -17 instead of -15.
def integer_value(value)
  negative = value.start_with?("-")
  digits   = value.sub(/\A[+-]/, "")

  magnitude =
    case digits
    when /\A0[bB]/ then digits[2..-1].to_i(2)
    when /\A0[oO]/ then digits[2..-1].to_i(8)
    when /\A0[dD]/ then digits[2..-1].to_i(10)
    when /\A0[xX]/ then digits[2..-1].to_i(16)
    when /\A0/     then digits.to_i(8)
    else                digits.to_i
    end

  negative ? -magnitude : magnitude
end
|
|
|
|
# Converts the text of a SYMBOL token into a Symbol by dropping its first
# character (the leading colon in a lexed token) and interning the rest.
def symbol_value(value)
  name = value[1..-1]
  name.to_sym
end
|
|
|
|
# One-character escapes understood inside double-quoted strings, keyed by
# the character that follows the backslash.
DOUBLE_QUOTED_ESCAPES = {
  "\\" => "\\",
  '"'  => '"',
  "n"  => "\n",
  "t"  => "\t",
  "r"  => "\r",
  "f"  => "\f",
  "v"  => "\v",
  "a"  => "\a",
  "e"  => "\e",
  "b"  => "\b",
  "s"  => "\s"
}.freeze

# Converts the text of a STRING token (including its quotes) into the String
# it denotes.
#
# Single-quoted strings understand only the \\ and \' escapes. Double-quoted
# strings understand the one-character escapes in DOUBLE_QUOTED_ESCAPES,
# octal escapes (\N, \NN, \NNN) and hexadecimal escapes (\xH, \xHH).
#
# Escapes are decoded in a single left-to-right pass. Decoding them with a
# chain of gsub calls is wrong because the output of one substitution is
# re-scanned by the next: an escaped backslash followed by "n" would turn
# into a newline instead of a backslash and an "n".
#
# Raises RuntimeError if the text does not start with a quote character.
def string_value(value)
  quote = value[0..0]
  body  = value[1..-2]

  case quote
  when "'"
    body.gsub(/\\([\\'])/) { $1 }
  when '"'
    body.gsub(/\\(?:([\\"ntrfvaebs])|([0-7]{1,3})|x([0-9a-fA-F]{1,2}))/) do
      if $1
        DOUBLE_QUOTED_ESCAPES[$1]
      elsif $2
        $2.to_i(8).chr
      else
        $3.to_i(16).chr
      end
    end
  else
    raise "Unknown quote: #{quote.inspect}."
  end
end
|
|
|
|
# Maps regexp literal flag characters to the corresponding Regexp option bits.
REGEXP_OPTIONS = {
  'i' => Regexp::IGNORECASE,
  'm' => Regexp::MULTILINE,
  'x' => Regexp::EXTENDED
}.freeze

# Converts the text of a REGEXP token (e.g. "/ab+/i") into a Regexp.
#
# The text is split into the pattern between the first and the last slash
# and the trailing flag characters, which are OR-ed together into the option
# bits (0 when there are no flags).
#
# The splitting regexp uses /m so that "." also matches newlines; without it
# a regexp token spanning several lines would fail to split and the method
# would crash on nil.
#
# Raises ArgumentError if the text is not of the form /pattern/flags.
def regexp_value(value)
  match = /\A\/(.*)\/([imx]*)\z/m.match(value)
  raise ArgumentError, "Invalid regexp literal: #{value.inspect}." unless match

  pattern, flags = match[1], match[2]
  options = flags.each_char.inject(0) { |bits, flag| bits | REGEXP_OPTIONS[flag] }

  Regexp.new(pattern, options)
end
|
|
|
|
# Tokens recognized by literal prefix comparison; the token text doubles as
# its type. They are tried in the listed order, so each two-character token
# ("^=", "*=") is listed before its one-character prefix ("^", "*").
#
# "^" needs to be here because if it were among operators recognized by
# METHOD_NAME, "^=" would be recognized as two tokens.
SIMPLE_TOKENS = %w(| < > , = ^= ^ $= [ ] *= * + ? { })
|
|
|
|
# Tokens recognized by regular expression, as [type, regexp] pairs tried in
# the listed order against the remaining input.
#
# Every pattern is anchored with \A, not ^. In Ruby ^ matches at the start of
# any line, so with ^ a pattern could match a token that begins on a later
# line of the remaining input instead of at the current position.
#
# Keyword patterns end with \b so that names which merely start with a
# keyword (e.g. "nil_value", "anything", "odds") are lexed as one METHOD_NAME
# instead of a keyword followed by a name.
COMPLEX_TOKENS = [
  [:NIL,   /\Anil\b/],
  [:TRUE,  /\Atrue\b/],
  [:FALSE, /\Afalse\b/],
  # INTEGER needs to be before METHOD_NAME, otherwise e.g. "+1" would be
  # recognized as two tokens.
  [
    :INTEGER,
    /\A
      [+-]?                                 # sign
      (
        0[bB][01]+(_[01]+)*                 # binary (prefixed)
        |
        0[oO][0-7]+(_[0-7]+)*               # octal (prefixed)
        |
        0[dD]\d+(_\d+)*                     # decimal (prefixed)
        |
        0[xX][0-9a-fA-F]+(_[0-9a-fA-F]+)*   # hexadecimal (prefixed)
        |
        0[0-7]*(_[0-7]+)*                   # octal (unprefixed)
        |
        [1-9]\d*(_\d+)*                     # decimal (unprefixed)
      )
    /x
  ],
  [
    :SYMBOL,
    /\A
      :
      (
        # class name
        [A-Z][a-zA-Z0-9_]*
        |
        # regular method name
        [a-z_][a-zA-Z0-9_]*[?!=]?
        |
        # instance variable name
        @[a-zA-Z_][a-zA-Z0-9_]*
        |
        # class variable name
        @@[a-zA-Z_][a-zA-Z0-9_]*
        |
        # operator (sorted by length, then alphabetically)
        (<=>|===|\[\]=|\*\*|\+@|-@|<<|<=|==|=~|>=|>>|\[\]|[%&*+\-\/<>^`|~])
      )
    /x
  ],
  [
    :STRING,
    /\A
      (
        '                 # single-quoted string
          (
            \\[\\']       # escape
            |
            [^']          # regular character
          )*
        '
        |
        "                 # double-quoted string
          (
            \\            # escape
            (
              [\\"ntrfvaebs]      # one-character escape
              |
              [0-7]{1,3}          # octal number escape
              |
              x[0-9a-fA-F]{1,2}   # hexadecimal number escape
            )
            |
            [^"]          # regular character
          )*
        "
      )
    /x
  ],
  [
    :REGEXP,
    /\A
      \/
        (
          \\              # escape
          (
            [\\\/ntrfvaebs\(\)\[\]\{\}\-\.\?\*\+\|\^\$] # one-character escape
            |
            [0-7]{2,3}                                  # octal number escape
            |
            x[0-9a-fA-F]{1,2}                           # hexadecimal number escape
          )
          |
          [^\/]           # regular character
        )*
      \/
      [imx]*
    /x
  ],
  # ANY, EVEN and ODD need to be before METHOD_NAME, otherwise they would be
  # recognized as method names.
  [:ANY,  /\Aany\b/],
  [:EVEN, /\Aeven\b/],
  [:ODD,  /\Aodd\b/],
  # We exclude "*", "+", "<", ">", "^" and "|" from method names since they are
  # lexed as simple tokens. This is because they have also other meanings in
  # Machete patterns beside Ruby method names.
  [
    :METHOD_NAME,
    /\A
      (
        # regular name
        [a-z_][a-zA-Z0-9_]*[?!=]?
        |
        # operator (sorted by length, then alphabetically)
        (<=>|===|\[\]=|\*\*|\+@|-@|<<|<=|==|=~|>=|>>|\[\]|[%&\-\/`~])
      )
    /x
  ],
  [:CLASS_NAME, /\A[A-Z][a-zA-Z0-9_]*/]
].freeze
|
|
|
|
# Returns the next token as a [type, text] pair, or false at end of input.
#
# Leading whitespace is skipped first. Regexp-defined tokens
# (COMPLEX_TOKENS) are then tried in order, followed by the literal tokens
# (SIMPLE_TOKENS); for a literal token the text doubles as the type. @pos is
# advanced past the returned token.
#
# Raises SyntaxError if nothing matches at the current position.
def next_token
  skip_whitespace

  # Slice the remaining input once instead of once per candidate token.
  input = remaining_input
  return false if input.empty?

  # Complex tokens need to be before simple tokens, otherwise e.g. "<<" would be
  # recognized as two tokens.
  COMPLEX_TOKENS.each do |type, regexp|
    match = regexp.match(input)
    # Accept only a match starting exactly at the current position. A match
    # found further into the input (possible with a line-anchored pattern and
    # multi-line input) must not be consumed: @pos would advance by the match
    # length from the wrong place.
    next unless match && match.begin(0).zero?

    @pos += match[0].length
    return [type, match[0]]
  end

  SIMPLE_TOKENS.each do |token|
    next unless input.start_with?(token)

    @pos += token.length
    return [token, token]
  end

  raise SyntaxError, "Unexpected character: #{input[0..0].inspect}."
end
|
|
|
|
# Advances @pos past any run of spaces, tabs, carriage returns and newlines
# at the start of the remaining input. Does nothing when there is none.
def skip_whitespace
  blanks = remaining_input[/\A^[ \t\r\n]+/]
  @pos += blanks.length if blanks
end
|
|
|
|
# Returns the part of the input text from the current read position (@pos)
# to the end.
def remaining_input
  @input.slice(@pos..-1)
end
|
|
|
|
# Parser error hook: reports the offending token value by raising
# SyntaxError. The token id and the value stack are not used.
def on_error(error_token_id, error_value, value_stack)
  message = "Unexpected token: #{error_value.inspect}."
  raise SyntaxError, message
end
|