mirror of
https://github.com/rails/rails.git
synced 2022-11-09 12:12:34 -05:00
Add jitter to :exponentially_longer
Prior to this change, exponentially_longer had adverse consequences during system-wide downstream failures. This change adds a random value to the back off calculation in order to prevent the thundering herd problem, whereby all retry jobs would retry at the same time. Specifically this change adds a jitter option to retry_on to enable users of it to scope the randomness calculation to a reasonable amount. The default is 15% of the exponential back off calculation.
This commit is contained in:
parent
be2473b2ef
commit
5f7621878d
3 changed files with 66 additions and 43 deletions
|
@ -34,5 +34,16 @@
|
|||
|
||||
*Vlado Cingel*
|
||||
|
||||
* Add jitter to :exponentially_longer
|
||||
|
||||
ActiveJob::Exceptions.retry_on with :exponentially_longer now uses a random amount of jitter in order to
|
||||
prevent the [thundering herd effect](https://en.wikipedia.org/wiki/Thundering_herd_problem). Defaults to
|
||||
15% (represented as 0.15) but overridable via the `:jitter` option when using `retry_on`.
|
||||
Jitter is applied when an `Integer`, an `ActiveSupport::Duration`, or `:exponentially_longer` is passed to the `wait` argument in `retry_on`.
|
||||
|
||||
retry_on(MyError, wait: :exponentially_longer, jitter: 0.30)
|
||||
|
||||
*Anthony Ross*
|
||||
|
||||
|
||||
Please check [6-0-stable](https://github.com/rails/rails/blob/6-0-stable/activejob/CHANGELOG.md) for previous changes.
|
||||
|
|
|
@ -19,22 +19,24 @@ module ActiveJob
|
|||
# ==== Options
|
||||
# * <tt>:wait</tt> - Re-enqueues the job with a delay specified either in seconds (default: 3 seconds),
|
||||
# as a computing proc that takes the number of executions so far as an argument, or as a symbol reference of
|
||||
# <tt>:exponentially_longer</tt>, which applies the wait algorithm of <tt>(executions ** 4) + 2</tt>
|
||||
# (first wait 3s, then 18s, then 83s, etc)
|
||||
# <tt>:exponentially_longer</tt>, which applies the wait algorithm of <tt>((executions**4) + (Kernel.rand((executions**4) * jitter))) + 2</tt>
|
||||
# (first wait ~3s, then ~18s, then ~83s, etc)
|
||||
# * <tt>:attempts</tt> - Re-enqueues the job the specified number of times (default: 5 attempts)
|
||||
# * <tt>:queue</tt> - Re-enqueues the job on a different queue
|
||||
# * <tt>:priority</tt> - Re-enqueues the job with a different priority
|
||||
# * <tt>:jitter</tt> - A random delay added to the wait time when calculating backoff, expressed as a fraction of the wait time. The default is 0.15 (15%), which is the upper bound of the extra delay
|
||||
#
|
||||
# ==== Examples
|
||||
#
|
||||
# class RemoteServiceJob < ActiveJob::Base
|
||||
# retry_on CustomAppException # defaults to 3s wait, 5 attempts
|
||||
# retry_on CustomAppException # defaults to ~3s wait, 5 attempts
|
||||
# retry_on AnotherCustomAppException, wait: ->(executions) { executions * 2 }
|
||||
#
|
||||
# retry_on ActiveRecord::Deadlocked, wait: 5.seconds, attempts: 3
|
||||
# retry_on Net::OpenTimeout, Timeout::Error, wait: :exponentially_longer, attempts: 10 # retries at most 10 times for Net::OpenTimeout and Timeout::Error combined
|
||||
# # To retry at most 10 times for each individual exception:
|
||||
# # retry_on Net::OpenTimeout, wait: :exponentially_longer, attempts: 10
|
||||
# # retry_on Net::ReadTimeout, wait: 5.seconds, jitter: 0.30, attempts: 10
|
||||
# # retry_on Timeout::Error, wait: :exponentially_longer, attempts: 10
|
||||
#
|
||||
# retry_on(YetAnotherCustomAppException) do |job, error|
|
||||
|
@ -47,12 +49,11 @@ module ActiveJob
|
|||
# # Might raise Net::OpenTimeout or Timeout::Error when the remote service is down
|
||||
# end
|
||||
# end
|
||||
def retry_on(*exceptions, wait: 3.seconds, attempts: 5, queue: nil, priority: nil)
|
||||
def retry_on(*exceptions, wait: 3.seconds, attempts: 5, queue: nil, priority: nil, jitter: 0.15)
|
||||
rescue_from(*exceptions) do |error|
|
||||
executions = executions_for(exceptions)
|
||||
|
||||
if executions < attempts
|
||||
retry_job wait: determine_delay(seconds_or_duration_or_algorithm: wait, executions: executions), queue: queue, priority: priority, error: error
|
||||
retry_job wait: determine_delay(seconds_or_duration_or_algorithm: wait, executions: executions, jitter: jitter), queue: queue, priority: priority, error: error
|
||||
else
|
||||
if block_given?
|
||||
instrument :retry_stopped, error: error do
|
||||
|
@ -121,16 +122,16 @@ module ActiveJob
|
|||
end
|
||||
|
||||
private
|
||||
def determine_delay(seconds_or_duration_or_algorithm:, executions:)
|
||||
def determine_delay(seconds_or_duration_or_algorithm:, executions:, jitter:)
|
||||
case seconds_or_duration_or_algorithm
|
||||
when :exponentially_longer
|
||||
(executions**4) + 2
|
||||
((executions**4) + (Kernel.rand((executions**4) * jitter))) + 2
|
||||
when ActiveSupport::Duration
|
||||
duration = seconds_or_duration_or_algorithm
|
||||
duration.to_i
|
||||
duration = seconds_or_duration_or_algorithm.to_i
|
||||
duration + Kernel.rand(duration * jitter)
|
||||
when Integer
|
||||
seconds = seconds_or_duration_or_algorithm
|
||||
seconds
|
||||
seconds + (Kernel.rand(seconds * jitter).ceil)
|
||||
when Proc
|
||||
algorithm = seconds_or_duration_or_algorithm
|
||||
algorithm.call(executions)
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
require "helper"
|
||||
require "jobs/retry_job"
|
||||
require "models/person"
|
||||
require "minitest/mock"
|
||||
|
||||
class ExceptionsTest < ActiveSupport::TestCase
|
||||
setup do
|
||||
|
@ -94,32 +95,38 @@ class ExceptionsTest < ActiveSupport::TestCase
|
|||
|
||||
test "long wait job" do
|
||||
travel_to Time.now
|
||||
random_amount = 1
|
||||
|
||||
RetryJob.perform_later "LongWaitError", 2, :log_scheduled_at
|
||||
|
||||
assert_equal [
|
||||
"Raised LongWaitError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3600.seconds).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
Kernel.stub(:rand, random_amount) do
|
||||
RetryJob.perform_later "LongWaitError", 2, :log_scheduled_at
|
||||
assert_equal [
|
||||
"Raised LongWaitError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3600.seconds + random_amount).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
end
|
||||
end
|
||||
|
||||
test "exponentially retrying job" do
|
||||
test "exponentially retrying job includes jitter" do
|
||||
travel_to Time.now
|
||||
|
||||
RetryJob.perform_later "ExponentialWaitTenAttemptsError", 5, :log_scheduled_at
|
||||
random_amount = 2
|
||||
|
||||
assert_equal [
|
||||
"Raised ExponentialWaitTenAttemptsError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3.seconds).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 2nd time",
|
||||
"Next execution scheduled at #{(Time.now + 18.seconds).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
|
||||
"Next execution scheduled at #{(Time.now + 83.seconds).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 4th time",
|
||||
"Next execution scheduled at #{(Time.now + 258.seconds).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
Kernel.stub(:rand, random_amount) do
|
||||
RetryJob.perform_later "ExponentialWaitTenAttemptsError", 5, :log_scheduled_at
|
||||
|
||||
assert_equal [
|
||||
"Raised ExponentialWaitTenAttemptsError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3.seconds + random_amount).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 2nd time",
|
||||
"Next execution scheduled at #{(Time.now + 18.seconds + random_amount).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
|
||||
"Next execution scheduled at #{(Time.now + 83.seconds + random_amount).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 4th time",
|
||||
"Next execution scheduled at #{(Time.now + 258.seconds + random_amount).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
end
|
||||
end
|
||||
|
||||
test "custom wait retrying job" do
|
||||
|
@ -145,19 +152,23 @@ class ExceptionsTest < ActiveSupport::TestCase
|
|||
|
||||
exceptions_to_raise = %w(ExponentialWaitTenAttemptsError CustomWaitTenAttemptsError ExponentialWaitTenAttemptsError CustomWaitTenAttemptsError)
|
||||
|
||||
RetryJob.perform_later exceptions_to_raise, 5, :log_scheduled_at
|
||||
random_amount = 1
|
||||
|
||||
assert_equal [
|
||||
"Raised ExponentialWaitTenAttemptsError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3.seconds).to_f}",
|
||||
"Raised CustomWaitTenAttemptsError for the 2nd time",
|
||||
"Next execution scheduled at #{(Time.now + 2.seconds).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
|
||||
"Next execution scheduled at #{(Time.now + 18.seconds).to_f}",
|
||||
"Raised CustomWaitTenAttemptsError for the 4th time",
|
||||
"Next execution scheduled at #{(Time.now + 4.seconds).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
Kernel.stub(:rand, random_amount) do
|
||||
RetryJob.perform_later exceptions_to_raise, 5, :log_scheduled_at
|
||||
|
||||
assert_equal [
|
||||
"Raised ExponentialWaitTenAttemptsError for the 1st time",
|
||||
"Next execution scheduled at #{(Time.now + 3.seconds + random_amount).to_f}",
|
||||
"Raised CustomWaitTenAttemptsError for the 2nd time",
|
||||
"Next execution scheduled at #{(Time.now + 2.seconds).to_f}",
|
||||
"Raised ExponentialWaitTenAttemptsError for the 3rd time",
|
||||
"Next execution scheduled at #{(Time.now + 18.seconds + random_amount).to_f}",
|
||||
"Raised CustomWaitTenAttemptsError for the 4th time",
|
||||
"Next execution scheduled at #{(Time.now + 4.seconds).to_f}",
|
||||
"Successfully completed job"
|
||||
], JobBuffer.values
|
||||
end
|
||||
end
|
||||
|
||||
test "successfully retry job throwing one of two retryable exceptions" do
|
||||
|
|
Loading…
Reference in a new issue