diff --git a/changelogs/unreleased/40396-sidekiq-in-process-group.yml b/changelogs/unreleased/40396-sidekiq-in-process-group.yml new file mode 100644 index 00000000000..e41557e20d0 --- /dev/null +++ b/changelogs/unreleased/40396-sidekiq-in-process-group.yml @@ -0,0 +1,5 @@ +--- +title: 'sidekiq: terminate child processes at shutdown' +merge_request: 25669 +author: +type: added diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb index 4a8d72e37a5..2e4aa9c1053 100644 --- a/config/initializers/sidekiq.rb +++ b/config/initializers/sidekiq.rb @@ -77,6 +77,9 @@ Sidekiq.configure_server do |config| # Avoid autoload issue such as 'Mail::Parsers::AddressStruct' # https://github.com/mikel/mail/issues/912#issuecomment-214850355 Mail.eager_autoload! + + # Ensure the whole process group is terminated if possible + Gitlab::SidekiqSignals.install!(Sidekiq::CLI::SIGNAL_HANDLERS) end Sidekiq.configure_client do |config| diff --git a/doc/administration/operations/sidekiq_memory_killer.md b/doc/administration/operations/sidekiq_memory_killer.md index cbffd883774..8eac42f2fe2 100644 --- a/doc/administration/operations/sidekiq_memory_killer.md +++ b/doc/administration/operations/sidekiq_memory_killer.md @@ -17,6 +17,11 @@ With the default settings, the MemoryKiller will cause a Sidekiq restart no more often than once every 15 minutes, with the restart causing about one minute of delay for incoming background jobs. +Some background jobs rely on long-running external processes. To ensure these +are cleanly terminated when Sidekiq is restarted, each Sidekiq process should be +run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the +`bin/background_jobs` script with `runit` installed, this is handled for you. + ## Configuring the MemoryKiller The MemoryKiller is controlled using environment variables. diff --git a/lib/gitlab/sidekiq_middleware/memory_killer.rb b/lib/gitlab/sidekiq_middleware/memory_killer.rb index 47333d257eb..ed2c7ee9a2d 100644 --- a/lib/gitlab/sidekiq_middleware/memory_killer.rb +++ b/lib/gitlab/sidekiq_middleware/memory_killer.rb @@ -36,11 +36,13 @@ module Gitlab # Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish. # Then, tell Sidekiq to gracefully shut down by giving jobs a few more # moments to finish, killing and requeuing them if they didn't, and - # then terminating itself. + # then terminating itself. Sidekiq will replicate the TERM to all its + # children if it can. wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down') # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't. - wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die') + # Kill the whole pgroup, so we can be sure no children are left behind + wait_and_signal_pgroup(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die') end end @@ -53,6 +55,17 @@ module Gitlab output.to_i end + # If this sidekiq process is pgroup leader, signal to the whole pgroup + def wait_and_signal_pgroup(time, signal, explanation) + return wait_and_signal(time, signal, explanation) unless Process.getpgrp == pid + + Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})" + sleep(time) + + Sidekiq.logger.warn "sending Sidekiq worker PGRP-#{pid} #{signal} (#{explanation})" + Process.kill(signal, "-#{pid}") + end + def wait_and_signal(time, signal, explanation) Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})" sleep(time) diff --git a/lib/gitlab/sidekiq_signals.rb b/lib/gitlab/sidekiq_signals.rb new file mode 100644 index 00000000000..b704ee9a0a9 --- /dev/null +++ b/lib/gitlab/sidekiq_signals.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +module Gitlab + # As a process group leader, we can ensure that children of sidekiq are killed + # at the same time as sidekiq itself, to stop long-lived children from being + # reparented to init and "escaping". To do this, we override the default + # handlers used by sidekiq for INT and TERM signals + module SidekiqSignals + REPLACE_SIGNALS = %w[INT TERM].freeze + + SIDEKIQ_CHANGED_MESSAGE = + "Intercepting signal handlers: #{REPLACE_SIGNALS.join(", ")} failed. " \ + "Sidekiq should have registered them, but appears not to have done so." + + def self.install!(sidekiq_handlers) + # This only works if we're process group leader + return unless Process.getpgrp == Process.pid + + raise SIDEKIQ_CHANGED_MESSAGE unless + REPLACE_SIGNALS == sidekiq_handlers.keys & REPLACE_SIGNALS + + REPLACE_SIGNALS.each do |signal| + old_handler = sidekiq_handlers[signal] + sidekiq_handlers[signal] = ->(cli) do + blindly_signal_pgroup!(signal) + old_handler.call(cli) + end + end + end + + # The process group leader can forward INT and TERM signals to the whole + # group. However, the forwarded signal is *also* received by the leader, + # which could lead to an infinite loop. We can avoid this by temporarily + # ignoring the forwarded signal. This may cause us to miss some repeated + # signals from outside the process group, but that isn't fatal. + def self.blindly_signal_pgroup!(signal) + old_trap = trap(signal, 'IGNORE') + Process.kill(signal, "-#{Process.getpgrp}") + trap(signal, old_trap) + end + end +end diff --git a/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb b/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb index 5df56178df2..ff8c0825ee4 100644 --- a/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb +++ b/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb @@ -35,7 +35,7 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do stub_const("#{described_class}::MAX_RSS", 5.kilobytes) end - it 'sends the STP, TERM and KILL signals at expected times' do + it 'sends the TSTP, TERM and KILL signals at expected times' do expect(subject).to receive(:sleep).with(15 * 60).ordered expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered @@ -47,6 +47,17 @@ describe Gitlab::SidekiqMiddleware::MemoryKiller do run end + + it 'sends TSTP and TERM to the pid, but KILL to the pgroup, when running as process leader' do + allow(Process).to receive(:getpgrp) { pid } + allow(subject).to receive(:sleep) + + expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered + expect(Process).to receive(:kill).with('SIGTERM', pid).ordered + expect(Process).to receive(:kill).with('SIGKILL', "-#{pid}").ordered + + run + end end context 'when MAX_RSS is not exceeded' do diff --git a/spec/lib/gitlab/sidekiq_signals_spec.rb b/spec/lib/gitlab/sidekiq_signals_spec.rb new file mode 100644 index 00000000000..4483224f49d --- /dev/null +++ b/spec/lib/gitlab/sidekiq_signals_spec.rb @@ -0,0 +1,69 @@ +require 'spec_helper' + +describe Gitlab::SidekiqSignals do + describe '.install' do + let(:result) { Hash.new { |h, k| h[k] = 0 } } + let(:int_handler) { -> (_) { result['INT'] += 1 } } + let(:term_handler) { -> (_) { result['TERM'] += 1 } } + let(:other_handler) { -> (_) { result['OTHER'] += 1 } } + let(:signals) { { 'INT' => int_handler, 'TERM' => term_handler, 'OTHER' => other_handler } } + + context 'not a process group leader' do + before do + allow(Process).to receive(:getpgrp) { Process.pid * 2 } + end + + it 'does nothing' do + expect { described_class.install!(signals) } + .not_to change { signals } + end + end + + context 'as a process group leader' do + before do + allow(Process).to receive(:getpgrp) { Process.pid } + end + + it 'installs its own signal handlers for TERM and INT only' do + described_class.install!(signals) + + expect(signals['INT']).not_to eq(int_handler) + expect(signals['TERM']).not_to eq(term_handler) + expect(signals['OTHER']).to eq(other_handler) + end + + %w[INT TERM].each do |signal| + it "installs a forwarding signal handler for #{signal}" do + described_class.install!(signals) + + expect(described_class) + .to receive(:trap) + .with(signal, 'IGNORE') + .and_return(:original_trap) + .ordered + + expect(Process) + .to receive(:kill) + .with(signal, "-#{Process.pid}") + .ordered + + expect(described_class) + .to receive(:trap) + .with(signal, :original_trap) + .ordered + + signals[signal].call(:cli) + + expect(result[signal]).to eq(1) + end + + it "raises if sidekiq no longer traps SIG#{signal}" do + signals.delete(signal) + + expect { described_class.install!(signals) } + .to raise_error(/Sidekiq should have registered/) + end + end + end + end +end