From ccb5bad6b8ff73df978b03a1ede8051ea78b2ee9 Mon Sep 17 00:00:00 2001 From: Douwe Maan Date: Tue, 31 Oct 2017 10:33:30 +0100 Subject: [PATCH] Send SIGSTP before SIGTERM to actually give Sidekiq jobs 30s to finish when the memory killer kicks in --- .../operations/sidekiq_memory_killer.md | 4 +- .../sidekiq_middleware/memory_killer.rb | 41 +++++++----- .../sidekiq_middleware/memory_killer_spec.rb | 63 +++++++++++++++++++ 3 files changed, 91 insertions(+), 17 deletions(-) create mode 100644 spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb diff --git a/doc/administration/operations/sidekiq_memory_killer.md b/doc/administration/operations/sidekiq_memory_killer.md index b5e78348989..cbffd883774 100644 --- a/doc/administration/operations/sidekiq_memory_killer.md +++ b/doc/administration/operations/sidekiq_memory_killer.md @@ -28,7 +28,7 @@ The MemoryKiller is controlled using environment variables. delayed shutdown is triggered. The default value for Omnibus packages is set [in the omnibus-gitlab repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb). -- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults 900 seconds (15 minutes). When +- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When a shutdown is triggered, the Sidekiq process will keep working normally for another 15 minutes. - `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace @@ -36,5 +36,3 @@ The MemoryKiller is controlled using environment variables. Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must restart Sidekiq. -- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL`: defaults to `SIGKILL`. The name of - the final signal sent to the Sidekiq process when we want it to shut down. diff --git a/lib/gitlab/sidekiq_middleware/memory_killer.rb b/lib/gitlab/sidekiq_middleware/memory_killer.rb index d7d24eeb37b..2bfb7caefd9 100644 --- a/lib/gitlab/sidekiq_middleware/memory_killer.rb +++ b/lib/gitlab/sidekiq_middleware/memory_killer.rb @@ -7,7 +7,6 @@ module Gitlab GRACE_TIME = (ENV['SIDEKIQ_MEMORY_KILLER_GRACE_TIME'] || 15 * 60).to_s.to_i # Wait 30 seconds for running jobs to finish during graceful shutdown SHUTDOWN_WAIT = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT'] || 30).to_s.to_i - SHUTDOWN_SIGNAL = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL'] || 'SIGKILL').to_s # Create a mutex used to ensure there will be only one thread waiting to # shut Sidekiq down @@ -15,6 +14,7 @@ module Gitlab def call(worker, job, queue) yield + current_rss = get_rss return unless MAX_RSS > 0 && current_rss > MAX_RSS @@ -23,32 +23,45 @@ module Gitlab # Return if another thread is already waiting to shut Sidekiq down return unless MUTEX.try_lock - Sidekiq.logger.warn "current RSS #{current_rss} exceeds maximum RSS "\ - "#{MAX_RSS}" - Sidekiq.logger.warn "this thread will shut down PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']} "\ - "in #{GRACE_TIME} seconds" - sleep(GRACE_TIME) + Sidekiq.logger.warn "Sidekiq worker PID-#{pid} current RSS #{current_rss}"\ + " exceeds maximum RSS #{MAX_RSS} after finishing job #{worker.class} JID-#{job['jid']}" + Sidekiq.logger.warn "Sidekiq worker PID-#{pid} will stop fetching new jobs in #{GRACE_TIME} seconds, and will be shut down #{SHUTDOWN_WAIT} seconds later" - Sidekiq.logger.warn "sending SIGTERM to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}" - Process.kill('SIGTERM', Process.pid) + # Wait `GRACE_TIME` to give the memory intensive job time to finish. + # Then, tell Sidekiq to stop fetching new jobs. + wait_and_signal(GRACE_TIME, 'SIGSTP', 'stop fetching new jobs') - Sidekiq.logger.warn "waiting #{SHUTDOWN_WAIT} seconds before sending "\ - "#{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}" - sleep(SHUTDOWN_WAIT) + # Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish. + # Then, tell Sidekiq to gracefully shut down by giving jobs a few more + # moments to finish, killing and requeuing them if they didn't, and + # then terminating itself. + wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down') - Sidekiq.logger.warn "sending #{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}" - Process.kill(SHUTDOWN_SIGNAL, Process.pid) + # Wait for Sidekiq to shutdown gracefully, and kill it if it didn't. + wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die') end end private def get_rss - output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{Process.pid})) + output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid})) return 0 unless status.zero? output.to_i end + + def wait_and_signal(time, signal, explanation) + Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})" + sleep(time) + + Sidekiq.logger.warn "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})" + Process.kill(signal, pid) + end + + def pid + Process.pid + end end end end diff --git a/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb b/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb new file mode 100644 index 00000000000..8fdbbacd04d --- /dev/null +++ b/spec/lib/gitlab/sidekiq_middleware/memory_killer_spec.rb @@ -0,0 +1,63 @@ +require 'spec_helper' + +describe Gitlab::SidekiqMiddleware::MemoryKiller do + subject { described_class.new } + let(:pid) { 999 } + + let(:worker) { double(:worker, class: 'TestWorker') } + let(:job) { { 'jid' => 123 } } + let(:queue) { 'test_queue' } + + def run + thread = subject.call(worker, job, queue) { nil } + thread&.join + end + + before do + allow(subject).to receive(:get_rss).and_return(10.kilobytes) + allow(subject).to receive(:pid).and_return(pid) + end + + context 'when MAX_RSS is set to 0' do + before do + stub_const("#{described_class}::MAX_RSS", 0) + end + + it 'does nothing' do + expect(subject).not_to receive(:sleep) + + run + end + end + + context 'when MAX_RSS is exceeded' do + before do + stub_const("#{described_class}::MAX_RSS", 5.kilobytes) + end + + it 'sends the STP, TERM and KILL signals at expected times' do + expect(subject).to receive(:sleep).with(15 * 60).ordered + expect(Process).to receive(:kill).with('SIGSTP', pid).ordered + + expect(subject).to receive(:sleep).with(30).ordered + expect(Process).to receive(:kill).with('SIGTERM', pid).ordered + + expect(subject).to receive(:sleep).with(10).ordered + expect(Process).to receive(:kill).with('SIGKILL', pid).ordered + + run + end + end + + context 'when MAX_RSS is not exceeded' do + before do + stub_const("#{described_class}::MAX_RSS", 15.kilobytes) + end + + it 'does nothing' do + expect(subject).not_to receive(:sleep) + + run + end + end +end