# frozen_string_literal: true

require "sidekiq/scheduled"
require "sidekiq/api"

require "zlib"
require "base64"

module Sidekiq
  ##
  # Automatically retry jobs that fail in Sidekiq.
  # Sidekiq's retry support assumes a typical development lifecycle:
  #
  #   0. Push some code changes with a bug in them.
  #   1. The bug causes job processing to fail; Sidekiq's middleware captures
  #      the job and pushes it onto a retry queue.
  #   2. Sidekiq retries jobs in the retry queue multiple times with
  #      an exponential delay; the job continues to fail.
  #   3. After a few days, a developer deploys a fix. The job is
  #      reprocessed successfully.
  #   4. Once retries are exhausted, Sidekiq will give up and move the
  #      job to the Dead Job Queue (aka morgue) where it must be dealt with
  #      manually in the Web UI.
  #   5. After 6 months on the DJQ, Sidekiq will discard the job.
  #
  # A job looks like:
  #
  #     { 'class' => 'HardJob', 'args' => [1, 2, 'foo'], 'retry' => true }
  #
  # The 'retry' option also accepts a number (in place of 'true'):
  #
  #     { 'class' => 'HardJob', 'args' => [1, 2, 'foo'], 'retry' => 5 }
  #
  # The job will be retried this number of times before giving up. (If simply
  # 'true', Sidekiq retries 25 times.)
  #
  # Relevant options for job retries:
  #
  #  * 'queue' - the queue for the initial job
  #  * 'retry_queue' - if job retries should be pushed to a different (e.g. lower priority) queue
  #  * 'retry_count' - number of times we've retried so far
  #  * 'error_message' - the message from the exception
  #  * 'error_class' - the exception class
  #  * 'failed_at' - the first time it failed
  #  * 'retried_at' - the last time it was retried
  #  * 'backtrace' - the number of lines of error backtrace to store
  #
  # We don't store the backtrace by default as that can add a lot of overhead
  # to the job and everyone is using an error service, right?
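  #
  # If you do want a backtrace kept with the retry payload, the 'backtrace'
  # option can be set to true (store the full backtrace) or to the number of
  # lines to keep, e.g. (illustrative):
  #
  #   sidekiq_options backtrace: 20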
  #
  # The default number of retries is 25, which works out to about 3 weeks of retries.
  # You can change the default maximum number of retries in your initializer:
  #
  #   Sidekiq.options[:max_retries] = 7
  #
  # or limit the number of retries for a particular job and send retries to
  # a low priority queue with:
  #
  #   class MyJob
  #     include Sidekiq::Job
  #     sidekiq_options retry: 10, retry_queue: 'low'
  #   end
  #
  class JobRetry
    class Handled < ::RuntimeError; end
    class Skip < Handled; end

    include Sidekiq::Component

    DEFAULT_MAX_RETRY_ATTEMPTS = 25

    def initialize(options)
      @config = options
      @max_retries = @config[:max_retries] || DEFAULT_MAX_RETRY_ATTEMPTS
    end

    # The global retry handler requires only the barest of data.
    # We want to be able to retry as much as possible so we don't
    # require the job to be instantiated.
    def global(jobstr, queue)
      yield
    rescue Handled => ex
      raise ex
    rescue Sidekiq::Shutdown => ey
      # ignore, will be pushed back onto queue during hard_shutdown
      raise ey
    rescue Exception => e
      # ignore, will be pushed back onto queue during hard_shutdown
      raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)

      msg = Sidekiq.load_json(jobstr)
      if msg["retry"]
        attempt_retry(nil, msg, queue, e)
      else
        Sidekiq.death_handlers.each do |handler|
          handler.call(msg, e)
        rescue => handler_ex
          handle_exception(handler_ex, {context: "Error calling death handler", job: msg})
        end
      end

      raise Handled
    end

    # The local retry support means that any errors that occur within
    # this block can be associated with the given job instance.
    # This is required to support the `sidekiq_retries_exhausted` block.
    #
    # Note that any exception from the block is wrapped in the Skip
    # exception so the global block does not reprocess the error. The
    # Skip exception is unwrapped within Sidekiq::Processor#process before
    # calling the handle_exception handlers.
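    #
    # For reference, a job class can react to retry exhaustion with a
    # `sidekiq_retries_exhausted` block; a sketch (HardJob is hypothetical):
    #
    #   class HardJob
    #     include Sidekiq::Job
    #     sidekiq_retries_exhausted do |msg, exception|
    #       # runs once, when Sidekiq stops retrying this job
    #     end
    #   end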
    def local(jobinst, jobstr, queue)
      yield
    rescue Handled => ex
      raise ex
    rescue Sidekiq::Shutdown => ey
      # ignore, will be pushed back onto queue during hard_shutdown
      raise ey
    rescue Exception => e
      # ignore, will be pushed back onto queue during hard_shutdown
      raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)

      msg = Sidekiq.load_json(jobstr)
      if msg["retry"].nil?
        msg["retry"] = jobinst.class.get_sidekiq_options["retry"]
      end

      raise e unless msg["retry"]
      attempt_retry(jobinst, msg, queue, e)
      # We've handled this error associated with this job, don't
      # need to handle it at the global level
      raise Skip
    end

    private

    # Note that +jobinst+ can be nil here if an error is raised before we can
    # instantiate the job instance. All access must be guarded and
    # best effort.
    def attempt_retry(jobinst, msg, queue, exception)
      max_retry_attempts = retry_attempts_from(msg["retry"], @max_retries)

      msg["queue"] = (msg["retry_queue"] || queue)

      m = exception_message(exception)
      if m.respond_to?(:scrub!)
        m.force_encoding("utf-8")
        m.scrub!
      end

      msg["error_message"] = m
      msg["error_class"] = exception.class.name
      count = if msg["retry_count"]
        msg["retried_at"] = Time.now.to_f
        msg["retry_count"] += 1
      else
        msg["failed_at"] = Time.now.to_f
        msg["retry_count"] = 0
      end

      if msg["backtrace"]
        lines = if msg["backtrace"] == true
          exception.backtrace
        else
          exception.backtrace[0...msg["backtrace"].to_i]
        end

        msg["error_backtrace"] = compress_backtrace(lines)
      end

      if count < max_retry_attempts
        delay = delay_for(jobinst, count, exception)
        # Logging here can break retries if the logging device raises ENOSPC #3979
        # logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
        retry_at = Time.now.to_f + delay
        payload = Sidekiq.dump_json(msg)
        Sidekiq.redis do |conn|
          conn.zadd("retry", retry_at.to_s, payload)
        end
      else
        # Goodbye dear message, you (re)tried your best I'm sure.
        retries_exhausted(jobinst, msg, exception)
      end
    end

    def retries_exhausted(jobinst, msg, exception)
      begin
        block = jobinst&.sidekiq_retries_exhausted_block
        block&.call(msg, exception)
      rescue => e
        handle_exception(e, {context: "Error calling retries_exhausted", job: msg})
      end

      send_to_morgue(msg) unless msg["dead"] == false
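
      # Death handlers still run even when the job skips the morgue
      # ('dead' => false, set per job via `sidekiq_options dead: false`).
      # Registering one might look roughly like this (ErrorTracker is a placeholder):
      #
      #   Sidekiq.death_handlers << ->(job, ex) { ErrorTracker.notify(ex, job) }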
      Sidekiq.death_handlers.each do |handler|
        handler.call(msg, exception)
      rescue => e
        handle_exception(e, {context: "Error calling death handler", job: msg})
      end
    end

    def send_to_morgue(msg)
      logger.info { "Adding dead #{msg["class"]} job #{msg["jid"]}" }
      payload = Sidekiq.dump_json(msg)
      DeadSet.new.kill(payload, notify_failure: false)
    end

    def retry_attempts_from(msg_retry, default)
      if msg_retry.is_a?(Integer)
        msg_retry
      else
        default
      end
    end
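
    # The default backoff below works out to (count**4) + 15 seconds plus a
    # small amount of jitter: roughly 15s, 16s, 31s, 96s, 271s, ... which is
    # how 25 retries add up to about three weeks.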
    def delay_for(jobinst, count, exception)
      jitter = rand(10) * (count + 1)
      if jobinst&.sidekiq_retry_in_block
        custom_retry_in = retry_in(jobinst, count, exception).to_i
        return custom_retry_in + jitter if custom_retry_in > 0
      end
      (count**4) + 15 + jitter
    end
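
    # A job can override that schedule with a `sidekiq_retry_in` block; returning
    # nil or 0 falls back to the default above. A sketch (HardJob is hypothetical):
    #
    #   class HardJob
    #     include Sidekiq::Job
    #     sidekiq_retry_in { |count, exception| 10 * (count + 1) }
    #   end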
    def retry_in(jobinst, count, exception)
      jobinst.sidekiq_retry_in_block.call(count, exception)
    rescue Exception => e
      handle_exception(e, {context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{jobinst.class.name}, falling back to default"})
      nil
    end

    def exception_caused_by_shutdown?(e, checked_causes = [])
      return false unless e.cause

      # Handle circular causes
      checked_causes << e.object_id
      return false if checked_causes.include?(e.cause.object_id)

      e.cause.instance_of?(Sidekiq::Shutdown) ||
        exception_caused_by_shutdown?(e.cause, checked_causes)
    end

    # Extract message from exception.
    # Set a default if the message raises an error
    def exception_message(exception)
      # App code can stuff all sorts of crazy binary data into the error message
      # that won't convert to JSON.
      exception.message.to_s[0, 10_000]
    rescue
      +"!!! ERROR MESSAGE THREW AN ERROR !!!"
    end
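
    # 'error_backtrace' is stored as deflated, Base64-encoded JSON to keep the
    # payload small. Reading it back is simply the reverse (sketch):
    #
    #   Sidekiq.load_json(Zlib::Inflate.inflate(Base64.decode64(blob)))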
    def compress_backtrace(backtrace)
      serialized = Sidekiq.dump_json(backtrace)
      compressed = Zlib::Deflate.deflate(serialized)
      Base64.encode64(compressed)
    end
  end
end