sidekiq: terminate child processes at shutdown

Sidekiq jobs frequently spawn long-lived child processes to do work.
In some circumstances, these can be reparented to init when sidekiq is
terminated, leading to duplication of work and strange concurrency
problems.

This commit changes sidekiq so that, if run as a process group leader,
it will forward `INT` and `TERM` signals to the whole process group. If
the memory killer is active, it will also use the process group when
resorting to `kill -9` to shut down.

These changes mean that a naive `kill <pid-of-sidekiq>` will now do the
right thing, killing any child processes spawned by sidekiq, as long as
the process supervisor placed it in its own process group.

If sidekiq isn't a process group leader, this new code is skipped.
This commit is contained in:
Nick Thomas 2019-02-28 23:25:37 +00:00 committed by Stan Hu
parent 6b507626ed
commit f0c52df5e5
7 changed files with 151 additions and 3 deletions

View File

@ -0,0 +1,5 @@
---
title: 'sidekiq: terminate child processes at shutdown'
merge_request: 25669
author:
type: added

View File

@ -77,6 +77,9 @@ Sidekiq.configure_server do |config|
# Avoid autoload issue such as 'Mail::Parsers::AddressStruct'
# https://github.com/mikel/mail/issues/912#issuecomment-214850355
Mail.eager_autoload!
# Ensure the whole process group is terminated if possible
Gitlab::SidekiqSignals.install!(Sidekiq::CLI::SIGNAL_HANDLERS)
end
Sidekiq.configure_client do |config|

View File

@ -17,6 +17,11 @@ With the default settings, the MemoryKiller will cause a Sidekiq restart no
more often than once every 15 minutes, with the restart causing about one
minute of delay for incoming background jobs.
Some background jobs rely on long-running external processes. To ensure these
are cleanly terminated when Sidekiq is restarted, each Sidekiq process should be
run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the
`bin/background_jobs` script with `runit` installed, this is handled for you.
## Configuring the MemoryKiller
The MemoryKiller is controlled using environment variables.

View File

@ -36,11 +36,13 @@ module Gitlab
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
# moments to finish, killing and requeuing them if they didn't, and
# then terminating itself.
# then terminating itself. Sidekiq will replicate the TERM to all its
# children if it can.
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
# Kill the whole pgroup, so we can be sure no children are left behind
wait_and_signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
end
end
@ -53,6 +55,17 @@ module Gitlab
output.to_i
end
# If this sidekiq process is pgroup leader, signal to the whole pgroup
def wait_and_signal_pgroup(time, signal, explanation)
return wait_and_signal(time, signal, explanation) unless Process.getpgrp == pid
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
sleep(time)
Sidekiq.logger.warn "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})"
Process.kill(signal, "-#{pid}")
end
def wait_and_signal(time, signal, explanation)
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
sleep(time)

View File

@ -0,0 +1,42 @@
# frozen_string_literal: true
module Gitlab
# As a process group leader, we can ensure that children of sidekiq are killed
# at the same time as sidekiq itself, to stop long-lived children from being
# reparented to init and "escaping". To do this, we override the default
# handlers used by sidekiq for INT and TERM signals
module SidekiqSignals
REPLACE_SIGNALS = %w[INT TERM].freeze
SIDEKIQ_CHANGED_MESSAGE =
"Intercepting signal handlers: #{REPLACE_SIGNALS.join(", ")} failed. " \
"Sidekiq should have registered them, but appears not to have done so."
def self.install!(sidekiq_handlers)
# This only works if we're process group leader
return unless Process.getpgrp == Process.pid
raise SIDEKIQ_CHANGED_MESSAGE unless
REPLACE_SIGNALS == sidekiq_handlers.keys & REPLACE_SIGNALS
REPLACE_SIGNALS.each do |signal|
old_handler = sidekiq_handlers[signal]
sidekiq_handlers[signal] = ->(cli) do
blindly_signal_pgroup!(signal)
old_handler.call(cli)
end
end
end
# The process group leader can forward INT and TERM signals to the whole
# group. However, the forwarded signal is *also* received by the leader,
# which could lead to an infinite loop. We can avoid this by temporarily
# ignoring the forwarded signal. This may cause us to miss some repeated
# signals from outside the process group, but that isn't fatal.
def self.blindly_signal_pgroup!(signal)
old_trap = trap(signal, 'IGNORE')
Process.kill(signal, "-#{Process.getpgrp}")
trap(signal, old_trap)
end
end
end

View File

@ -35,7 +35,7 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
stub_const("#{described_class}::MAX_RSS", 5.kilobytes)
end
it 'sends the STP, TERM and KILL signals at expected times' do
it 'sends the TSTP, TERM and KILL signals at expected times' do
expect(subject).to receive(:sleep).with(15 * 60).ordered
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
@ -47,6 +47,17 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do
run
end
it 'sends TSTP and TERM to the pid, but KILL to the pgroup, when running as process leader' do
allow(Process).to receive(:getpgrp) { pid }
allow(subject).to receive(:sleep)
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
expect(Process).to receive(:kill).with('SIGTERM', pid).ordered
expect(Process).to receive(:kill).with('SIGKILL', "-#{pid}").ordered
run
end
end
context 'when MAX_RSS is not exceeded' do

View File

@ -0,0 +1,69 @@
require 'spec_helper'
describe Gitlab::SidekiqSignals do
describe '.install' do
let(:result) { Hash.new { |h, k| h[k] = 0 } }
let(:int_handler) { -> (_) { result['INT'] += 1 } }
let(:term_handler) { -> (_) { result['TERM'] += 1 } }
let(:other_handler) { -> (_) { result['OTHER'] += 1 } }
let(:signals) { { 'INT' => int_handler, 'TERM' => term_handler, 'OTHER' => other_handler } }
context 'not a process group leader' do
before do
allow(Process).to receive(:getpgrp) { Process.pid * 2 }
end
it 'does nothing' do
expect { described_class.install!(signals) }
.not_to change { signals }
end
end
context 'as a process group leader' do
before do
allow(Process).to receive(:getpgrp) { Process.pid }
end
it 'installs its own signal handlers for TERM and INT only' do
described_class.install!(signals)
expect(signals['INT']).not_to eq(int_handler)
expect(signals['TERM']).not_to eq(term_handler)
expect(signals['OTHER']).to eq(other_handler)
end
%w[INT TERM].each do |signal|
it "installs a forwarding signal handler for #{signal}" do
described_class.install!(signals)
expect(described_class)
.to receive(:trap)
.with(signal, 'IGNORE')
.and_return(:original_trap)
.ordered
expect(Process)
.to receive(:kill)
.with(signal, "-#{Process.pid}")
.ordered
expect(described_class)
.to receive(:trap)
.with(signal, :original_trap)
.ordered
signals[signal].call(:cli)
expect(result[signal]).to eq(1)
end
it "raises if sidekiq no longer traps SIG#{signal}" do
signals.delete(signal)
expect { described_class.install!(signals) }
.to raise_error(/Sidekiq should have registered/)
end
end
end
end
end