sidekiq: terminate child processes at shutdown
Sidekiq jobs frequently spawn long-lived child processes to do work. In some circumstances, these can be reparented to init when sidekiq is terminated, leading to duplication of work and strange concurrency problems. This commit changes sidekiq so that, if run as a process group leader, it will forward `INT` and `TERM` signals to the whole process group. If the memory killer is active, it will also use the process group when resorting to `kill -9` to shut down. These changes mean that a naive `kill <pid-of-sidekiq>` will now do the right thing, killing any child processes spawned by sidekiq, as long as the process supervisor placed it in its own process group. If sidekiq isn't a process group leader, this new code is skipped.
This commit is contained in:
parent
6b507626ed
commit
f0c52df5e5
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
title: 'sidekiq: terminate child processes at shutdown'
|
||||
merge_request: 25669
|
||||
author:
|
||||
type: added
|
|
@ -77,6 +77,9 @@ Sidekiq.configure_server do |config|
|
|||
# Avoid autoload issue such as 'Mail::Parsers::AddressStruct'
|
||||
# https://github.com/mikel/mail/issues/912#issuecomment-214850355
|
||||
Mail.eager_autoload!
|
||||
|
||||
# Ensure the whole process group is terminated if possible
|
||||
Gitlab::SidekiqSignals.install!(Sidekiq::CLI::SIGNAL_HANDLERS)
|
||||
end
|
||||
|
||||
Sidekiq.configure_client do |config|
|
||||
|
|
|
@ -17,6 +17,11 @@ With the default settings, the MemoryKiller will cause a Sidekiq restart no
|
|||
more often than once every 15 minutes, with the restart causing about one
|
||||
minute of delay for incoming background jobs.
|
||||
|
||||
Some background jobs rely on long-running external processes. To ensure these
|
||||
are cleanly terminated when Sidekiq is restarted, each Sidekiq process should be
|
||||
run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the
|
||||
`bin/background_jobs` script with `runit` installed, this is handled for you.
|
||||
|
||||
## Configuring the MemoryKiller
|
||||
|
||||
The MemoryKiller is controlled using environment variables.
|
||||
|
|
|
@ -36,11 +36,13 @@ module Gitlab
|
|||
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
|
||||
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
|
||||
# moments to finish, killing and requeuing them if they didn't, and
|
||||
# then terminating itself.
|
||||
# then terminating itself. Sidekiq will replicate the TERM to all its
|
||||
# children if it can.
|
||||
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
|
||||
|
||||
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
|
||||
wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
|
||||
# Kill the whole pgroup, so we can be sure no children are left behind
|
||||
wait_and_signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -53,6 +55,17 @@ module Gitlab
|
|||
output.to_i
|
||||
end
|
||||
|
||||
# If this sidekiq process is pgroup leader, signal to the whole pgroup
|
||||
def wait_and_signal_pgroup(time, signal, explanation)
|
||||
return wait_and_signal(time, signal, explanation) unless Process.getpgrp == pid
|
||||
|
||||
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
|
||||
sleep(time)
|
||||
|
||||
Sidekiq.logger.warn "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
|
||||
Process.kill(signal, "-#{pid}")
|
||||
end
|
||||
|
||||
def wait_and_signal(time, signal, explanation)
|
||||
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
|
||||
sleep(time)
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module Gitlab
|
||||
# As a process group leader, we can ensure that children of sidekiq are killed
|
||||
# at the same time as sidekiq itself, to stop long-lived children from being
|
||||
# reparented to init and "escaping". To do this, we override the default
|
||||
# handlers used by sidekiq for INT and TERM signals
|
||||
module SidekiqSignals
|
||||
REPLACE_SIGNALS = %w[INT TERM].freeze
|
||||
|
||||
SIDEKIQ_CHANGED_MESSAGE =
|
||||
"Intercepting signal handlers: #{REPLACE_SIGNALS.join(", ")} failed. " \
|
||||
"Sidekiq should have registered them, but appears not to have done so."
|
||||
|
||||
def self.install!(sidekiq_handlers)
|
||||
# This only works if we're process group leader
|
||||
return unless Process.getpgrp == Process.pid
|
||||
|
||||
raise SIDEKIQ_CHANGED_MESSAGE unless
|
||||
REPLACE_SIGNALS == sidekiq_handlers.keys & REPLACE_SIGNALS
|
||||
|
||||
REPLACE_SIGNALS.each do |signal|
|
||||
old_handler = sidekiq_handlers[signal]
|
||||
sidekiq_handlers[signal] = ->(cli) do
|
||||
blindly_signal_pgroup!(signal)
|
||||
old_handler.call(cli)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# The process group leader can forward INT and TERM signals to the whole
|
||||
# group. However, the forwarded signal is *also* received by the leader,
|
||||
# which could lead to an infinite loop. We can avoid this by temporarily
|
||||
# ignoring the forwarded signal. This may cause us to miss some repeated
|
||||
# signals from outside the process group, but that isn't fatal.
|
||||
def self.blindly_signal_pgroup!(signal)
|
||||
old_trap = trap(signal, 'IGNORE')
|
||||
Process.kill(signal, "-#{Process.getpgrp}")
|
||||
trap(signal, old_trap)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -35,7 +35,7 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
|
|||
stub_const("#{described_class}::MAX_RSS", 5.kilobytes)
|
||||
end
|
||||
|
||||
it 'sends the STP, TERM and KILL signals at expected times' do
|
||||
it 'sends the TSTP, TERM and KILL signals at expected times' do
|
||||
expect(subject).to receive(:sleep).with(15 * 60).ordered
|
||||
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
|
||||
|
||||
|
@ -47,6 +47,17 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
|
|||
|
||||
run
|
||||
end
|
||||
|
||||
it 'sends TSTP and TERM to the pid, but KILL to the pgroup, when running as process leader' do
|
||||
allow(Process).to receive(:getpgrp) { pid }
|
||||
allow(subject).to receive(:sleep)
|
||||
|
||||
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
|
||||
expect(Process).to receive(:kill).with('SIGTERM', pid).ordered
|
||||
expect(Process).to receive(:kill).with('SIGKILL', "-#{pid}").ordered
|
||||
|
||||
run
|
||||
end
|
||||
end
|
||||
|
||||
context 'when MAX_RSS is not exceeded' do
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::SidekiqSignals do
|
||||
describe '.install' do
|
||||
let(:result) { Hash.new { |h, k| h[k] = 0 } }
|
||||
let(:int_handler) { -> (_) { result['INT'] += 1 } }
|
||||
let(:term_handler) { -> (_) { result['TERM'] += 1 } }
|
||||
let(:other_handler) { -> (_) { result['OTHER'] += 1 } }
|
||||
let(:signals) { { 'INT' => int_handler, 'TERM' => term_handler, 'OTHER' => other_handler } }
|
||||
|
||||
context 'not a process group leader' do
|
||||
before do
|
||||
allow(Process).to receive(:getpgrp) { Process.pid * 2 }
|
||||
end
|
||||
|
||||
it 'does nothing' do
|
||||
expect { described_class.install!(signals) }
|
||||
.not_to change { signals }
|
||||
end
|
||||
end
|
||||
|
||||
context 'as a process group leader' do
|
||||
before do
|
||||
allow(Process).to receive(:getpgrp) { Process.pid }
|
||||
end
|
||||
|
||||
it 'installs its own signal handlers for TERM and INT only' do
|
||||
described_class.install!(signals)
|
||||
|
||||
expect(signals['INT']).not_to eq(int_handler)
|
||||
expect(signals['TERM']).not_to eq(term_handler)
|
||||
expect(signals['OTHER']).to eq(other_handler)
|
||||
end
|
||||
|
||||
%w[INT TERM].each do |signal|
|
||||
it "installs a forwarding signal handler for #{signal}" do
|
||||
described_class.install!(signals)
|
||||
|
||||
expect(described_class)
|
||||
.to receive(:trap)
|
||||
.with(signal, 'IGNORE')
|
||||
.and_return(:original_trap)
|
||||
.ordered
|
||||
|
||||
expect(Process)
|
||||
.to receive(:kill)
|
||||
.with(signal, "-#{Process.pid}")
|
||||
.ordered
|
||||
|
||||
expect(described_class)
|
||||
.to receive(:trap)
|
||||
.with(signal, :original_trap)
|
||||
.ordered
|
||||
|
||||
signals[signal].call(:cli)
|
||||
|
||||
expect(result[signal]).to eq(1)
|
||||
end
|
||||
|
||||
it "raises if sidekiq no longer traps SIG#{signal}" do
|
||||
signals.delete(signal)
|
||||
|
||||
expect { described_class.install!(signals) }
|
||||
.to raise_error(/Sidekiq should have registered/)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue