From 16a80882f9c18243332c95c9e856868088ae5495 Mon Sep 17 00:00:00 2001 From: Theo Julienne Date: Wed, 16 Dec 2020 08:57:10 +1100 Subject: [PATCH] actionpack: Improve performance by allowing routes with custom regexes in the FSM. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FSM used to find matching routes was previously limited to patterns that contained parameters with the default regexp / no constraints. In large route sets where many parameters are constrained by custom regexp, these routes all fall back on a slow linear search over the route list. These custom regexes were not previously able to be included in the FSM because it transitioned between nodes using only fragments of the URI, or path separators [/.?], but a custom regex may cross a path separator boundary. To work around this, the TransitionTable is improved to support remembering a point within the matching string that we started, and continuing to attempt to match from that point up to the end of each token. Only parameters not on a path separator boundary must still match with a linear search after this change (e.g. `/foo-:bar/`). This results in performance for constrainted routes that matches that of ones using the default regexp. Benchmark: https://gist.github.com/theojulienne/e91fc338d180e1710e29c81a5d701fab Before: ``` Calculating ------------------------------------- without params 6.466k (±12.7%) i/s - 31.648k in 5.009453s params without constraints 5.867k (±12.9%) i/s - 28.842k in 5.032637s params with constraints 909.661 (± 7.9%) i/s - 4.536k in 5.023534s ``` After: ``` Calculating ------------------------------------- without params 6.387k (±11.9%) i/s - 31.728k in 5.068939s params without constraints 5.824k (±13.2%) i/s - 28.650k in 5.043701s params with constraints 5.406k (±11.7%) i/s - 26.931k in 5.076412s ``` For github.com which has many constrainted parameters, a random sampling of 10 URL patterns can be matched approximately 2-4x faster than before. This commit fixes symbols as constrains as tested in https://github.com/rails/rails/commit/6ab985da28fd4e2b5c93b50ed4952f43459f8bce --- .../action_dispatch/journey/gtg/builder.rb | 23 +++-- .../action_dispatch/journey/gtg/simulator.rb | 14 ++- .../journey/gtg/transition_table.rb | 90 +++++++++++++++---- .../lib/action_dispatch/journey/nodes/node.rb | 9 ++ .../action_dispatch/journey/path/pattern.rb | 21 +++++ .../lib/action_dispatch/journey/route.rb | 2 + .../lib/action_dispatch/journey/router.rb | 2 +- .../lib/action_dispatch/journey/routes.rb | 2 +- .../action_dispatch/journey/visualizer/fsm.js | 83 +++++++++++------ actionpack/test/journey/gtg/builder_test.rb | 27 ++++-- .../test/journey/gtg/transition_table_test.rb | 2 +- actionpack/test/journey/routes_test.rb | 14 ++- 12 files changed, 216 insertions(+), 73 deletions(-) diff --git a/actionpack/lib/action_dispatch/journey/gtg/builder.rb b/actionpack/lib/action_dispatch/journey/gtg/builder.rb index 794cfb7de5..e83cda55a9 100644 --- a/actionpack/lib/action_dispatch/journey/gtg/builder.rb +++ b/actionpack/lib/action_dispatch/journey/gtg/builder.rb @@ -6,13 +6,13 @@ module ActionDispatch module Journey # :nodoc: module GTG # :nodoc: class Builder # :nodoc: - DUMMY = Nodes::Dummy.new + DUMMY_END_NODE = Nodes::Dummy.new attr_reader :root, :ast, :endpoints def initialize(root) @root = root - @ast = Nodes::Cat.new root, DUMMY + @ast = Nodes::Cat.new root, DUMMY_END_NODE @followpos = build_followpos end @@ -28,12 +28,12 @@ module ActionDispatch marked[s] = true # mark s s.group_by { |state| symbol(state) }.each do |sym, ps| - u = ps.flat_map { |l| @followpos[l] } + u = ps.flat_map { |l| @followpos[l] }.uniq next if u.empty? from = state_id[s] - if u.all? { |pos| pos == DUMMY } + if u.all? { |pos| pos == DUMMY_END_NODE } to = state_id[Object.new] dtrans[from, to] = sym dtrans.add_accepting(to) @@ -43,9 +43,9 @@ module ActionDispatch to = state_id[u] dtrans[from, to] = sym - if u.include?(DUMMY) + if u.include?(DUMMY_END_NODE) ps.each do |state| - if @followpos[state].include?(DUMMY) + if @followpos[state].include?(DUMMY_END_NODE) dtrans.add_memo(to, state.memo) end end @@ -66,7 +66,10 @@ module ActionDispatch when Nodes::Group true when Nodes::Star - true + # the default star regex is /(.+)/ which is NOT nullable + # but since different constraints can be provided we must + # actually check if this is the case or not. + node.regexp.match?("") when Nodes::Or node.children.any? { |c| nullable?(c) } when Nodes::Cat @@ -104,7 +107,7 @@ module ActionDispatch def lastpos(node) case node when Nodes::Star - firstpos(node.left) + lastpos(node.left) when Nodes::Or node.children.flat_map { |c| lastpos(c) }.tap(&:uniq!) when Nodes::Cat @@ -131,10 +134,6 @@ module ActionDispatch lastpos(n.left).each do |i| table[i] += firstpos(n.right) end - when Nodes::Star - lastpos(n).each do |i| - table[i] += firstpos(n) - end end end table diff --git a/actionpack/lib/action_dispatch/journey/gtg/simulator.rb b/actionpack/lib/action_dispatch/journey/gtg/simulator.rb index 6038cff3c1..67aa399c88 100644 --- a/actionpack/lib/action_dispatch/journey/gtg/simulator.rb +++ b/actionpack/lib/action_dispatch/journey/gtg/simulator.rb @@ -14,7 +14,7 @@ module ActionDispatch end class Simulator # :nodoc: - INITIAL_STATE = [0].freeze + INITIAL_STATE = [ [0, nil] ].freeze attr_reader :tt @@ -25,13 +25,19 @@ module ActionDispatch def memos(string) input = StringScanner.new(string) state = INITIAL_STATE + start_index = 0 while sym = input.scan(%r([/.?]|[^/.?]+)) - state = tt.move(state, sym) + end_index = start_index + sym.length + + state = tt.move(state, string, start_index, end_index) + + start_index = end_index end - acceptance_states = state.each_with_object([]) do |s, memos| - memos.concat(tt.memo(s)) if tt.accepting?(s) + acceptance_states = state.each_with_object([]) do |s_d, memos| + s, idx = s_d + memos.concat(tt.memo(s)) if idx.nil? && tt.accepting?(s) end acceptance_states.empty? ? yield : acceptance_states diff --git a/actionpack/lib/action_dispatch/journey/gtg/transition_table.rb b/actionpack/lib/action_dispatch/journey/gtg/transition_table.rb index 2db6523e2f..f3c93d5340 100644 --- a/actionpack/lib/action_dispatch/journey/gtg/transition_table.rb +++ b/actionpack/lib/action_dispatch/journey/gtg/transition_table.rb @@ -10,11 +10,15 @@ module ActionDispatch attr_reader :memos + DEFAULT_EXP = /[^.\/?]+/ + DEFAULT_EXP_ANCHORED = Regexp.new(/\A#{DEFAULT_EXP}\Z/) + def initialize - @regexp_states = {} - @string_states = {} - @accepting = {} - @memos = Hash.new { |h, k| h[k] = [] } + @stdparam_states = {} + @regexp_states = {} + @string_states = {} + @accepting = {} + @memos = Hash.new { |h, k| h[k] = [] } end def add_accepting(state) @@ -41,22 +45,54 @@ module ActionDispatch Array(t) end - def move(t, a) + def move(t, full_string, start_index, end_index) return [] if t.empty? - regexps = [] - strings = [] + next_states = [] - t.each { |s| - if states = @regexp_states[s] - states.each { |re, v| regexps << v if re.match?(a) && !v.nil? } + tok = full_string.slice(start_index, end_index - start_index) + token_matches_default_component = DEFAULT_EXP_ANCHORED.match?(tok) + + t.each { |s, previous_start| + if previous_start.nil? + # In the simple case of a "default" param regex do this fast-path + # and add all next states. + if token_matches_default_component && states = @stdparam_states[s] + states.each { |re, v| next_states << [v, nil].freeze if !v.nil? } + end + + # When we have a literal string, we can just pull the next state + if states = @string_states[s] + next_states << [states[tok], nil].freeze unless states[tok].nil? + end end - if states = @string_states[s] - strings << states[a] unless states[a].nil? + # For regexes that aren't the "default" style, they may potentially + # not be terminated by the first "token" [./?], so we need to continue + # to attempt to match this regexp as well as any successful paths that + # continue out of it. both paths could be valid. + if states = @regexp_states[s] + slice_start = if previous_start.nil? + start_index + else + previous_start + end + + slice_length = end_index - slice_start + curr_slice = full_string.slice(slice_start, slice_length) + + states.each { |re, v| + # if we match, we can try moving past this + next_states << [v, nil].freeze if !v.nil? && re.match?(curr_slice) + } + + # and regardless, we must continue accepting tokens and retrying this regexp. + # we need to remember where we started as well so we can take bigger slices. + next_states << [s, slice_start].freeze end } - strings.concat regexps + + next_states end def as_json(options = nil) @@ -69,9 +105,10 @@ module ActionDispatch end { - regexp_states: simple_regexp, - string_states: @string_states, - accepting: @accepting + regexp_states: simple_regexp, + string_states: @string_states, + stdparam_states: @stdparam_states, + accepting: @accepting } end @@ -125,18 +162,29 @@ module ActionDispatch def []=(from, to, sym) to_mappings = states_hash_for(sym)[from] ||= {} + case sym + when Regexp + # we must match the whole string to a token boundary + sym = Regexp.new(/\A#{sym}\Z/) + when Symbol + # account for symbols in the constraints the same as strings + sym = sym.to_s + end to_mappings[sym] = to end def states ss = @string_states.keys + @string_states.values.flat_map(&:values) + ps = @stdparam_states.keys + @stdparam_states.values.flat_map(&:values) rs = @regexp_states.keys + @regexp_states.values.flat_map(&:values) - (ss + rs).uniq + (ss + ps + rs).uniq end def transitions @string_states.flat_map { |from, hash| hash.map { |s, to| [from, s, to] } + } + @stdparam_states.flat_map { |from, hash| + hash.map { |s, to| [from, s, to] } } + @regexp_states.flat_map { |from, hash| hash.map { |s, to| [from, s, to] } } @@ -145,10 +193,14 @@ module ActionDispatch private def states_hash_for(sym) case sym - when String + when String, Symbol @string_states when Regexp - @regexp_states + if sym == DEFAULT_EXP + @stdparam_states + else + @regexp_states + end else raise ArgumentError, "unknown symbol: %s" % sym.class end diff --git a/actionpack/lib/action_dispatch/journey/nodes/node.rb b/actionpack/lib/action_dispatch/journey/nodes/node.rb index bfc9fbfb5e..ea0a6835e9 100644 --- a/actionpack/lib/action_dispatch/journey/nodes/node.rb +++ b/actionpack/lib/action_dispatch/journey/nodes/node.rb @@ -104,6 +104,15 @@ module ActionDispatch end class Star < Unary # :nodoc: + attr_accessor :regexp + + def initialize(left) + super(left) + + # By default wildcard routes are non-greedy and must match something. + @regexp = /.+?/ + end + def star?; true; end def type; :STAR; end diff --git a/actionpack/lib/action_dispatch/journey/path/pattern.rb b/actionpack/lib/action_dispatch/journey/path/pattern.rb index 36b205cee6..7d47800932 100644 --- a/actionpack/lib/action_dispatch/journey/path/pattern.rb +++ b/actionpack/lib/action_dispatch/journey/path/pattern.rb @@ -39,6 +39,27 @@ module ActionDispatch @spec end + def requirements_anchored? + # each required param must not be surrounded by a literal, otherwise it isn't simple to chunk-match the url piecemeal + terminals = ast.find_all { |t| t.is_a?(Nodes::Terminal) } + + terminals.each_with_index { |s, index| + next if index < 1 + next unless s.symbol? + + back = terminals[index - 1] + fwd = terminals[index + 1] + + # we also don't support this yet, constraints must be regexps + return false if s.regexp.is_a?(Array) + + return false if back.literal? + return false if !fwd.nil? && fwd.literal? + } + + true + end + def names @names ||= spec.find_all(&:symbol?).map(&:name) end diff --git a/actionpack/lib/action_dispatch/journey/route.rb b/actionpack/lib/action_dispatch/journey/route.rb index 21ee766e5c..38dfcea956 100644 --- a/actionpack/lib/action_dispatch/journey/route.rb +++ b/actionpack/lib/action_dispatch/journey/route.rb @@ -84,6 +84,8 @@ module ActionDispatch @decorated_ast ||= begin decorated_ast = path.ast decorated_ast.find_all(&:terminal?).each { |n| n.memo = self } + # inject any regexp requirements for `star` nodes so they can be determined nullable, which requires knowing if the regex accepts an empty string. + decorated_ast.find_all(&:star?).each { |n| n.regexp = path.requirements[n.name.to_sym] unless path.requirements[n.name.to_sym].nil? } decorated_ast end end diff --git a/actionpack/lib/action_dispatch/journey/router.rb b/actionpack/lib/action_dispatch/journey/router.rb index b6b71720db..27671b65da 100644 --- a/actionpack/lib/action_dispatch/journey/router.rb +++ b/actionpack/lib/action_dispatch/journey/router.rb @@ -85,7 +85,7 @@ module ActionDispatch private def partitioned_routes routes.partition { |r| - r.path.anchored && r.ast.grep(Nodes::Symbol).all? { |n| n.default_regexp? } + r.path.anchored && r.path.requirements_anchored? } end diff --git a/actionpack/lib/action_dispatch/journey/routes.rb b/actionpack/lib/action_dispatch/journey/routes.rb index 3f055db66d..9dfea744e8 100644 --- a/actionpack/lib/action_dispatch/journey/routes.rb +++ b/actionpack/lib/action_dispatch/journey/routes.rb @@ -41,7 +41,7 @@ module ActionDispatch end def partition_route(route) - if route.path.anchored && route.ast.grep(Nodes::Symbol).all?(&:default_regexp?) + if route.path.anchored && route.path.requirements_anchored? anchored_routes << route else custom_routes << route diff --git a/actionpack/lib/action_dispatch/journey/visualizer/fsm.js b/actionpack/lib/action_dispatch/journey/visualizer/fsm.js index d9bcaef928..6daf0cb1c1 100644 --- a/actionpack/lib/action_dispatch/journey/visualizer/fsm.js +++ b/actionpack/lib/action_dispatch/journey/visualizer/fsm.js @@ -68,7 +68,7 @@ function highlight_state(index, color) { } function highlight_finish(index) { - svg_nodes[index].select('polygon') + svg_nodes[index].select('ellipse') .style("fill", "while") .transition().duration(500) .style("fill", "blue"); @@ -76,54 +76,79 @@ function highlight_finish(index) { function match(input) { reset_graph(); - var table = tt(); - var states = [0]; - var regexp_states = table['regexp_states']; - var string_states = table['string_states']; - var accepting = table['accepting']; + var table = tt(); + var states = [[0, null]]; + var regexp_states = table['regexp_states']; + var string_states = table['string_states']; + var stdparam_states = table['stdparam_states']; + var accepting = table['accepting']; + var default_re = new RegExp("^[^.\/?]+$"); + var start_index = 0; highlight_state(0); tokenize(input, function(token) { + var end_index = start_index + token.length; + var new_states = []; for(var key in states) { - var state = states[key]; + var state_parts = states[key]; + var state = state_parts[0]; + var previous_start = state_parts[1]; - if(string_states[state] && string_states[state][token]) { - var new_state = string_states[state][token]; - highlight_edge(state, new_state); - highlight_state(new_state); - new_states.push(new_state); - } + if(previous_start == null) { + if(string_states[state] && string_states[state][token]) { + var new_state = string_states[state][token]; + highlight_edge(state, new_state); + highlight_state(new_state); + new_states.push([new_state, null]); + } - if(regexp_states[state]) { - for(var key in regexp_states[state]) { - var re = new RegExp("^" + key + "$"); - if(re.test(token)) { - var new_state = regexp_states[state][key]; + if(stdparam_states[state] && default_re.test(token)) { + for(var key in stdparam_states[state]) { + var new_state = stdparam_states[state][key]; highlight_edge(state, new_state); highlight_state(new_state); - new_states.push(new_state); + new_states.push([new_state, null]); } } } + + if(regexp_states[state]) { + var slice_start = previous_start != null ? previous_start : start_index; + + for(var key in regexp_states[state]) { + var re = new RegExp("^" + key + "$"); + + var accumulation = input.slice(slice_start, end_index); + + if(re.test(accumulation)) { + var new_state = regexp_states[state][key]; + highlight_edge(state, new_state); + highlight_state(new_state); + new_states.push([new_state, null]); + } + + // retry the same regexp with the accumulated data either way + new_states.push([state, slice_start]); + } + } } - if(new_states.length == 0) { - return; - } states = new_states; + start_index = end_index; }); for(var key in states) { - var state = states[key]; + var state_parts = states[key]; + var state = state_parts[0]; + var slice_start = state_parts[1]; + + // we must ignore ones that are still accepting more data + if (slice_start != null) continue; + if(accepting[state]) { - for(var mkey in svg_edges[state]) { - if(!regexp_states[mkey] && !string_states[mkey]) { - highlight_edge(state, mkey); - highlight_finish(mkey); - } - } + highlight_finish(state); } else { highlight_state(state, "red"); } diff --git a/actionpack/test/journey/gtg/builder_test.rb b/actionpack/test/journey/gtg/builder_test.rb index 9dc786e261..719c3c9959 100644 --- a/actionpack/test/journey/gtg/builder_test.rb +++ b/actionpack/test/journey/gtg/builder_test.rb @@ -8,13 +8,13 @@ module ActionDispatch class TestBuilder < ActiveSupport::TestCase def test_following_states_multi table = tt ["a|a"] - assert_equal 1, table.move([0], "a").length + assert_equal 1, table.move([[0, nil]], "a", 0, 1).length end def test_following_states_multi_regexp table = tt [":a|b"] - assert_equal 1, table.move([0], "fooo").length - assert_equal 2, table.move([0], "b").length + assert_equal 1, table.move([[0, nil]], "fooo", 0, 4).length + assert_equal 2, table.move([[0, nil]], "b", 0, 1).length end def test_multi_path @@ -25,8 +25,8 @@ module ActionDispatch [2, "b"], [2, "/"], [1, "c"], - ].inject([0]) { |state, (exp, sym)| - new = table.move(state, sym) + ].inject([[0, nil]]) { |state, (exp, sym)| + new = table.move(state, sym, 0, sym.length) assert_equal exp, new.length new } @@ -60,6 +60,23 @@ module ActionDispatch assert_equal 2, memos.length end + def test_catchall + table = tt %w{ + / + /*unmatched_route + } + + sim = Simulator.new table + + # matches just the /*unmatched_route + memos = sim.memos "/test" + assert_equal 1, memos.length + + # matches just the / + memos = sim.memos "/" + assert_equal 1, memos.length + end + private def ast(strings) parser = Journey::Parser.new diff --git a/actionpack/test/journey/gtg/transition_table_test.rb b/actionpack/test/journey/gtg/transition_table_test.rb index 9044934f05..7394764a4c 100644 --- a/actionpack/test/journey/gtg/transition_table_test.rb +++ b/actionpack/test/journey/gtg/transition_table_test.rb @@ -89,7 +89,7 @@ module ActionDispatch sim = GTG::Simulator.new builder.transition_table memos = sim.memos "/articles/new" - assert_equal [paths[1], paths[3]], memos + assert_equal Set[paths[1], paths[3]], Set.new(memos) end private diff --git a/actionpack/test/journey/routes_test.rb b/actionpack/test/journey/routes_test.rb index d5c81a8421..1b23d72849 100644 --- a/actionpack/test/journey/routes_test.rb +++ b/actionpack/test/journey/routes_test.rb @@ -45,12 +45,24 @@ module ActionDispatch assert_equal 1, @routes.anchored_routes.length assert_empty @routes.custom_routes - mapper.get "/hello/:who", to: "foo#bar", as: "bar", who: /\d/ + mapper.get "/not_anchored/hello/:who-notanchored", to: "foo#bar", as: "bar", who: /\d/, anchor: false assert_equal 1, @routes.custom_routes.length assert_equal 1, @routes.anchored_routes.length end + def test_custom_anchored_not_partition_route + mapper.get "/foo/:bar", to: "foo#bar", as: "aaron" + + assert_equal 1, @routes.anchored_routes.length + assert_empty @routes.custom_routes + + mapper.get "/:user/:repo", to: "foo#bar", as: "bar", repo: /[\w.]+/ + + assert_equal 2, @routes.anchored_routes.length + assert_empty @routes.custom_routes + end + def test_first_name_wins mapper.get "/hello", to: "foo#bar", as: "aaron" assert_raise(ArgumentError) do