Optimize substrings generated from Regexp

This commit is contained in:
Thomas Walpole 2018-11-06 13:32:08 -08:00
parent df1be804dc
commit 9e15cf95f5
2 changed files with 89 additions and 16 deletions

View File

@ -12,18 +12,48 @@ module Capybara
# Returns the alternated ("or'd") series of required substrings for the regexp, memoized.
#
# The series come from `process(alternation: true)` and are reduced in place by
# `remove_or_covered`. If any remaining series is empty the whole result is `[]`.
# NOTE(review): presumably an empty series means "this alternative requires nothing",
# which makes the alternation useless as a filter - confirm against the caller.
#
# @return [Array<Array<String>>] one array of required strings per alternative, or []
def alternated_substrings
  @alternated_substrings ||= begin
    # Run the disassembly once; a second, discarded `process` call only repeated the work.
    or_strings = process(alternation: true)
    remove_or_covered(or_strings) # reduces or_strings in place
    or_strings.any?(&:empty?) ? [] : or_strings
  end
end
# Returns the substrings required by the regexp when alternation is not expanded, memoized.
#
# Takes the first series produced by `process(alternation: false)` and passes it through
# `remove_and_covered`, so strings contained in another required string are dropped.
#
# @return [Array<String>] the value returned by `remove_and_covered`
def substrings
  @substrings ||= begin
    # Run the disassembly once; a second, discarded `process` call only repeated the work.
    strs = process(alternation: false).first
    remove_and_covered(strs)
  end
end
end
private
# Drops every required string that is already implied by another required string.
#
# If both "ab" and "abcd" are required, finding "abcd" guarantees "ab", so only "abcd"
# needs to be checked. Mutates +strings+ in place and returns it.
#
# @param strings [Array<String>] substrings that must all be present
# @return [Array<String>] the same array, reduced to strings not contained in any other entry
def remove_and_covered(strings)
  # Collapse equal strings first: with only an identity check, two equal but distinct
  # String objects would each count as the other's cover and both would be deleted.
  strings.uniq!

  # Compare against a snapshot so the result never depends on the array's contents
  # while delete_if is still rewriting it.
  candidates = strings.dup
  strings.delete_if do |sub_string|
    candidates.any? do |cover_string|
      !sub_string.equal?(cover_string) && cover_string.include?(sub_string)
    end
  end
end

# Reduces an alternation of "and" series to the series that are actually worth matching.
#
# When matching `("a" and "b") or ("ade" and "bce")`, anything satisfying the second series
# also satisfies the first, so only `("a" and "b")` is kept. Each series is first minimised
# with `remove_and_covered`. Series that imply each other are equivalent; the earliest one
# is kept. Mutates +or_series+ (and each series in it) in place and returns it.
#
# @param or_series [Array<Array<String>>] alternatives, each a list of required strings
# @return [Array<Array<String>>] the same array with redundant series removed
def remove_or_covered(or_series)
  # Ensure minimum sets of strings are being or'd
  or_series.each { |strs| remove_and_covered(strs) }

  # Decide redundancy against the untouched array, then swap the survivors in afterwards.
  survivors = or_series.each_with_index.reject do |and_strs, idx|
    or_series.each_with_index.any? do |other, other_idx|
      next false if idx == other_idx || !series_implies?(and_strs, other)

      # `other` is implied by `and_strs`; drop `and_strs` unless the two are equivalent
      # and `and_strs` is the first of them.
      other_idx < idx || !series_implies?(other, and_strs)
    end
  end
  or_series.replace(survivors.map(&:first))
end

# True when satisfying every string in +and_strs+ guarantees every string in +other+,
# i.e. each string of +other+ is contained in some string of +and_strs+.
def series_implies?(and_strs, other)
  other.all? { |needed| and_strs.any? { |present| present.include?(needed) } }
end
def process(alternation:)
strs = extract_strings(Regexp::Parser.parse(@regexp), alternation: alternation)
strs = collapse(combine(strs).map(&:flatten))
@ -68,8 +98,8 @@ module Capybara
end
def extract_strings(expression, strings = [], alternation: false)
expression.each do |exp|
if optional?(exp)
expression.each do |exp| # rubocop:disable Metrics/BlockLength
if optional?(exp) && !(alternation && zero_or_one?(exp))
strings.push(nil)
next
end
@ -87,12 +117,25 @@ module Capybara
if exp.terminal?
case exp.type
when :literal
strings.push(exp.text * min_repeat(exp))
if zero_or_one?(exp)
strings.push(Set.new([[''], [exp.text]]))
next
else
strings.push(exp.text * min_repeat(exp))
end
when :escape
strings.push(exp.char * min_repeat(exp))
if zero_or_one?(exp)
strings.push(Set.new([[''], [exp.text]]))
next
else
strings.push(exp.char * min_repeat(exp))
end
else
strings.push(nil)
end
elsif alternation && zero_or_one?(exp)
strings.push(Set.new([[''], extract_strings(exp, alternation: true)]))
next
else
min_repeat(exp).times { extract_strings(exp, strings, alternation: alternation) }
end
@ -101,6 +144,10 @@ module Capybara
strings
end
# Reports whether +exp+'s quantity is exactly [0, 1].
# NOTE(review): presumably the pair is [min, max] repetitions, i.e. a `?` quantifier -
# confirm against the expression class in use.
#
# @param exp [#quantity] a parsed regexp expression
# @return [Boolean]
def zero_or_one?(exp)
  bounds = exp.quantity
  bounds == [0, 1]
end
def alternative_strings(expression)
alternatives = expression.alternatives.map { |sub_exp| extract_strings(sub_exp, alternation: true) }
if alternatives.all?(&:any?)

View File

@ -27,18 +27,37 @@ RSpec.describe Capybara::Selector::RegexpDisassembler do
/abc./ => %w[abc],
/abc.*/ => %w[abc],
/abc.def/ => %w[abc def],
/abc.def.ghi/ => %w[abc def ghi]
/abc.def.ghi/ => %w[abc def ghi],
/abc.abcd.abcde/ => %w[abcde],
/.*/ => []
)
end
it 'handles optional characters' do
verify_strings(
it 'ignores optional characters for substrings' do
{
/abc*def/ => %w[ab def],
/abc*/ => %w[ab],
/c*/ => [],
/abc?def/ => %w[ab def],
/abc?/ => %w[ab],
/abc?def?/ => %w[ab de],
/abc?def?g/ => %w[ab de g]
/abc?def?g/ => %w[ab de g],
/d?/ => []
}.each do |regexp, expected|
expect(Capybara::Selector::RegexpDisassembler.new(regexp).substrings).to eq expected
end
end
it 'handles optional characters for #alternated_substrings' do
# Each key is a regexp and each value is the expected #alternated_substrings result:
# a list of alternatives, each itself a list of required strings.
# In these cases a `?` piece followed by more text yields both the "without" and "with"
# variants (/abc?def/), a trailing `?` piece is simply dropped (/abc?/), and a `*` piece
# splits the surrounding text into separate required strings (/abc*def/).
# A regexp that requires nothing (/c*/, /d?/) is expected to produce [].
verify_alternated_strings(
/abc*def/ => [%w[ab def]],
/abc*/ => [%w[ab]],
/c*/ => [],
/abc?def/ => [%w[abdef], %w[abcdef]],
/abc?/ => [%w[ab]],
/abc?def?/ => [%w[abde], %w[abcde]],
/abc?def?g/ => [%w[abdeg], %w[abdefg], %w[abcdeg], %w[abcdefg]],
/d?/ => []
)
end
@ -111,24 +130,31 @@ RSpec.describe Capybara::Selector::RegexpDisassembler do
end
end
it 'handles alternation for #options' do
it 'handles alternation for #alternated_substrings' do
verify_alternated_strings(
/abc|def/ => [%w[abc], %w[def]],
/ab(?:c|d)/ => [%w[abc], %w[abd]],
/ab(c|d|e)fg/ => [%w[abcfg], %w[abdfg], %w[abefg]],
/ab?(c|d)fg/ => [%w[a cfg], %w[a dfg]],
/ab?(c|d)fg/ => [%w[acfg], %w[adfg], %w[abcfg], %w[abdfg]],
/ab(c|d)ef/ => [%w[abcef], %w[abdef]],
/ab(cd?|ef)g/ => [%w[abc g], %w[abefg]],
/ab(cd?|ef)g/ => [%w[abcg], %w[abcdg], %w[abefg]],
/ab(cd|ef*)g/ => [%w[abcdg], %w[abe g]],
/ab|cd*/ => [%w[ab], %w[c]],
/cd(?:ef|gh)|xyz/ => [%w[cdef], %w[cdgh], %w[xyz]],
/(cd(?:ef|gh)|xyz)/ => [%w[cdef], %w[cdgh], %w[xyz]],
/cd(ef|gh)+/ => [%w[cdef], %w[cdgh]],
/cd(ef|gh)?/ => [%w[cd]],
/cd(ef|gh)?ij/ => [%w[cd ij]],
/cd(ef|gh)?ij/ => [%w[cdij], %w[cdefij], %w[cdghij]],
/cd(ef|gh)+ij/ => [%w[cdef ij], %w[cdgh ij]],
/cd(ef|gh){2}ij/ => [%w[cdefefij], %w[cdefghij], %w[cdghefij], %w[cdghghij]],
/(cd(ef|g*))/ => [%w[cd]]
/(cd(ef|g*))/ => [%w[cd]],
/a|b*/ => [],
/ab(?:c|d?)/ => [%w[ab]],
/ab(c|d)|a*/ => [],
/(abc)?(d|e)/ => [%w[d], %w[e]],
/(abc*de)?(d|e)/ => [%w[d], %w[e]],
/(abc*de)?(d|e?)/ => [],
/(abc)?(d|e?)/ => []
)
end
@ -193,7 +219,7 @@ RSpec.describe Capybara::Selector::RegexpDisassembler do
# Asserts that each regexp in +hsh+ disassembles to the expected #alternated_substrings value.
#
# @param hsh [Hash{Regexp => Array}] regexp mapped to its expected result
# @param wrap [Boolean] when true, a non-empty expectation is wrapped in one outer array,
#   so a single series can be written without the alternation level; [] is left as is
def verify_alternated_strings(hsh, wrap: false)
  hsh.each do |regexp, expected|
    # Wrap exactly once, and never wrap the empty expectation.
    wanted = wrap && (expected != []) ? [expected] : expected
    expect(Capybara::Selector::RegexpDisassembler.new(regexp).alternated_substrings).to eq wanted
  end
end