mperham--sidekiq/lib/sidekiq/middleware/server/retry_jobs.rb
Eugen Kuksa eb2115237d Handle circular error causes
Should fix #2284. While checking whether an application error or a
Sidekiq shutdown caused the exception, we keep track of the causes that
have already been checked. If a cause has already been checked, we can stop,
because the whole transitive closure of its causes has been checked as well.
2015-04-04 09:27:13 +02:00


require 'sidekiq/scheduled'
require 'sidekiq/api'
module Sidekiq
module Middleware
module Server
##
# Automatically retry jobs that fail in Sidekiq.
# Sidekiq's retry support assumes a typical development lifecycle:
#
# 0. push some code changes with a bug in them
# 1. bug causes job processing to fail, sidekiq's middleware captures
# the job and pushes it onto a retry queue
# 2. sidekiq retries jobs in the retry queue multiple times with
# an exponential delay, the job continues to fail
# 3. after a few days, a developer deploys a fix. the job is
# reprocessed successfully.
# 4. once retries are exhausted, sidekiq will give up and move the
# job to the Dead Job Queue (aka morgue) where it must be dealt with
# manually in the Web UI.
# 5. After 6 months on the DJQ, Sidekiq will discard the job.
#
# A job looks like:
#
# { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true }
#
# The 'retry' option also accepts a number (in place of 'true'):
#
# { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => 5 }
#
# The job will be retried this number of times before giving up. (If simply
# 'true', Sidekiq retries 25 times)
#
# We'll add a bit more data to the job to support retries:
#
# * 'queue' - the queue to use
# * 'retry_count' - number of times we've retried so far.
# * 'error_message' - the message from the exception
# * 'error_class' - the exception class
# * 'failed_at' - the first time it failed
# * 'retried_at' - the last time it was retried
# * 'backtrace' - the number of lines of error backtrace to store
#
# We don't store the backtrace by default as that can add a lot of overhead
# to the job and everyone is using an error service, right?
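#
# For illustration only (assuming one failed run and one retry so far), a
# retried job might carry roughly this extra data alongside 'class' and 'args':
#
#   { 'queue' => 'default',
#     'retry_count' => 1,
#     'error_class' => 'NoMethodError',
#     'error_message' => "undefined method `foo' for nil:NilClass",
#     'failed_at' => 1428132433.0,
#     'retried_at' => 1428132522.0 }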
#
# The default number of retry attempts is 25 which works out to about 3 weeks
# of retries. You can pass a value for the max number of retry attempts when
# adding the middleware using the options hash:
#
# Sidekiq.configure_server do |config|
# config.server_middleware do |chain|
# chain.add Sidekiq::Middleware::Server::RetryJobs, :max_retries => 7
# end
# end
#
# or limit the number of retries for a particular worker with:
#
# class MyWorker
# include Sidekiq::Worker
# sidekiq_options :retry => 10
# end
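#
# Other per-job keys read by this middleware can be set the same way. An
# illustrative (not exhaustive) sketch:
#
#   class MyWorker
#     include Sidekiq::Worker
#     # :retry_queue - push retries onto a different queue
#     # :backtrace   - true, or the number of backtrace lines to store
#     # :dead        - false to skip the Dead Job Queue once retries are exhausted
#     sidekiq_options :retry => 10, :retry_queue => 'low', :backtrace => 20, :dead => false
#   end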
#
class RetryJobs
include Sidekiq::Util
DEFAULT_MAX_RETRY_ATTEMPTS = 25
def initialize(options = {})
@max_retries = options.fetch(:max_retries, DEFAULT_MAX_RETRY_ATTEMPTS)
end
def call(worker, msg, queue)
yield
rescue Sidekiq::Shutdown
# ignore, will be pushed back onto queue during hard_shutdown
raise
rescue Exception => e
# re-raise as Sidekiq::Shutdown if the shutdown ultimately caused this exception;
# the job will be pushed back onto the queue during hard_shutdown
raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)
raise e unless msg['retry']
attempt_retry(worker, msg, queue, e)
end
private
def attempt_retry(worker, msg, queue, exception)
max_retry_attempts = retry_attempts_from(msg['retry'], @max_retries)
msg['queue'] = if msg['retry_queue']
msg['retry_queue']
else
queue
end
# App code can stuff all sorts of crazy binary data into the error message
# that won't convert to JSON.
m = exception.message[0..10_000]
if m.respond_to?(:scrub!)
m.force_encoding("utf-8")
m.scrub!
end
msg['error_message'] = m
msg['error_class'] = exception.class.name
count = if msg['retry_count']
msg['retried_at'] = Time.now.to_f
msg['retry_count'] += 1
else
msg['failed_at'] = Time.now.to_f
msg['retry_count'] = 0
end
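# 'backtrace' may be true (store the whole backtrace) or an integer limit;
# e.g. :backtrace => 20 via sidekiq_options keeps only the first 20 lines.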
if msg['backtrace'] == true
msg['error_backtrace'] = exception.backtrace
elsif !msg['backtrace']
# do nothing
elsif msg['backtrace'].to_i != 0
msg['error_backtrace'] = exception.backtrace[0...msg['backtrace'].to_i]
end
if count < max_retry_attempts
delay = delay_for(worker, count)
logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
retry_at = Time.now.to_f + delay
payload = Sidekiq.dump_json(msg)
Sidekiq.redis do |conn|
conn.zadd('retry', retry_at.to_s, payload)
end
else
# Goodbye dear message, you (re)tried your best I'm sure.
retries_exhausted(worker, msg)
end
raise exception
end
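# Workers can hook into retry exhaustion with a sidekiq_retries_exhausted
# block. A minimal sketch (the block receives the job hash):
#
#   class MyWorker
#     include Sidekiq::Worker
#     sidekiq_retries_exhausted do |msg|
#       Sidekiq.logger.warn "Giving up on #{msg['class']} with #{msg['args']}: #{msg['error_message']}"
#     end
#   end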
def retries_exhausted(worker, msg)
logger.debug { "Dropping message after hitting the retry maximum: #{msg}" }
begin
if worker.sidekiq_retries_exhausted_block?
worker.sidekiq_retries_exhausted_block.call(msg)
end
rescue => e
handle_exception(e, { context: "Error calling retries_exhausted for #{worker.class}", job: msg })
end
send_to_morgue(msg) unless msg['dead'] == false
end
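# Dead jobs land in the 'dead' sorted set, trimmed below by age and size.
# Assuming the API required above, it can be inspected from a console, e.g.
# Sidekiq::DeadSet.new.size, or browsed in the Web UI's "Dead" tab.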
def send_to_morgue(msg)
Sidekiq.logger.info { "Adding dead #{msg['class']} job #{msg['jid']}" }
payload = Sidekiq.dump_json(msg)
now = Time.now.to_f
Sidekiq.redis do |conn|
conn.multi do
conn.zadd('dead', now, payload)
conn.zremrangebyscore('dead', '-inf', now - DeadSet.timeout)
conn.zremrangebyrank('dead', 0, -DeadSet.max_jobs)
end
end
end
def retry_attempts_from(msg_retry, default)
if msg_retry.is_a?(Fixnum)
msg_retry
else
default
end
end
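# Workers can override the retry delay with a sidekiq_retry_in block; if the
# block is missing, raises or returns nil, we fall back to seconds_to_delay.
# A minimal sketch:
#
#   class MyWorker
#     include Sidekiq::Worker
#     sidekiq_retry_in { |count| 10 * (count + 1) } # seconds
#   end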
def delay_for(worker, count)
worker.sidekiq_retry_in_block? && retry_in(worker, count) || seconds_to_delay(count)
end
# delayed_job uses the same basic formula
def seconds_to_delay(count)
(count ** 4) + 15 + (rand(30)*(count+1))
end
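# Rough check of the "about 3 weeks" figure above: the count**4 term dominates,
# and summing count**4 for count 0..24 gives ~1.76 million seconds (about 20
# days) before the fixed 15s and the random jitter are added per attempt.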
def retry_in(worker, count)
begin
worker.sidekiq_retry_in_block.call(count)
rescue Exception => e
handle_exception(e, { context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{worker.class.name}, falling back to default" })
nil
end
end
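# Walk the exception's cause chain looking for Sidekiq::Shutdown, remembering
# which exceptions have already been visited so a circular cause chain
# (see #2284) cannot recurse forever.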
def exception_caused_by_shutdown?(e, checked_causes = [])
# Exception#cause only exists in Ruby 2.1+; on older Rubies there is nothing to check.
return false unless defined?(e.cause)
# Handle circular causes
checked_causes << e.object_id
return false if checked_causes.include?(e.cause.object_id)
e.cause.instance_of?(Sidekiq::Shutdown) ||
exception_caused_by_shutdown?(e.cause, checked_causes)
end
end
end
end
end