Merge branch 'dm-sidekiq-sigstp' into 'master'

Send SIGSTP before SIGTERM to actually give Sidekiq jobs 30s to finish when the memory killer kicks in

See merge request gitlab-org/gitlab-ce!15102
This commit is contained in:
Sean McGivern 2017-11-02 15:33:53 +00:00
commit 62ab17798d
3 changed files with 91 additions and 17 deletions

View File

@ -28,7 +28,7 @@ The MemoryKiller is controlled using environment variables.
delayed shutdown is triggered. The default value for Omnibus packages is set
[in the omnibus-gitlab
repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults 900 seconds (15 minutes). When
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When
a shutdown is triggered, the Sidekiq process will keep working normally for
another 15 minutes.
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
@ -36,5 +36,3 @@ The MemoryKiller is controlled using environment variables.
Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
restart Sidekiq.
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL`: defaults to `SIGKILL`. The name of
the final signal sent to the Sidekiq process when we want it to shut down.

View File

@ -7,7 +7,6 @@ module Gitlab
GRACE_TIME = (ENV['SIDEKIQ_MEMORY_KILLER_GRACE_TIME'] || 15 * 60).to_s.to_i
# Wait 30 seconds for running jobs to finish during graceful shutdown
SHUTDOWN_WAIT = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT'] || 30).to_s.to_i
SHUTDOWN_SIGNAL = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL'] || 'SIGKILL').to_s
# Create a mutex used to ensure there will be only one thread waiting to
# shut Sidekiq down
@ -15,6 +14,7 @@ module Gitlab
def call(worker, job, queue)
yield
current_rss = get_rss
return unless MAX_RSS > 0 && current_rss > MAX_RSS
@ -23,32 +23,45 @@ module Gitlab
# Return if another thread is already waiting to shut Sidekiq down
return unless MUTEX.try_lock
Sidekiq.logger.warn "current RSS #{current_rss} exceeds maximum RSS "\
"#{MAX_RSS}"
Sidekiq.logger.warn "this thread will shut down PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']} "\
"in #{GRACE_TIME} seconds"
sleep(GRACE_TIME)
Sidekiq.logger.warn "Sidekiq worker PID-#{pid} current RSS #{current_rss}"\
" exceeds maximum RSS #{MAX_RSS} after finishing job #{worker.class} JID-#{job['jid']}"
Sidekiq.logger.warn "Sidekiq worker PID-#{pid} will stop fetching new jobs in #{GRACE_TIME} seconds, and will be shut down #{SHUTDOWN_WAIT} seconds later"
Sidekiq.logger.warn "sending SIGTERM to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
Process.kill('SIGTERM', Process.pid)
# Wait `GRACE_TIME` to give the memory intensive job time to finish.
# Then, tell Sidekiq to stop fetching new jobs.
wait_and_signal(GRACE_TIME, 'SIGSTP', 'stop fetching new jobs')
Sidekiq.logger.warn "waiting #{SHUTDOWN_WAIT} seconds before sending "\
"#{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
sleep(SHUTDOWN_WAIT)
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
# moments to finish, killing and requeuing them if they didn't, and
# then terminating itself.
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
Sidekiq.logger.warn "sending #{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
Process.kill(SHUTDOWN_SIGNAL, Process.pid)
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
end
end
private
def get_rss
output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{Process.pid}))
output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}))
return 0 unless status.zero?
output.to_i
end
def wait_and_signal(time, signal, explanation)
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
sleep(time)
Sidekiq.logger.warn "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
Process.kill(signal, pid)
end
def pid
Process.pid
end
end
end
end

View File

@ -0,0 +1,63 @@
require 'spec_helper'
describe Gitlab::SidekiqMiddleware::MemoryKiller do
subject { described_class.new }
let(:pid) { 999 }
let(:worker) { double(:worker, class: 'TestWorker') }
let(:job) { { 'jid' => 123 } }
let(:queue) { 'test_queue' }
def run
thread = subject.call(worker, job, queue) { nil }
thread&.join
end
before do
allow(subject).to receive(:get_rss).and_return(10.kilobytes)
allow(subject).to receive(:pid).and_return(pid)
end
context 'when MAX_RSS is set to 0' do
before do
stub_const("#{described_class}::MAX_RSS", 0)
end
it 'does nothing' do
expect(subject).not_to receive(:sleep)
run
end
end
context 'when MAX_RSS is exceeded' do
before do
stub_const("#{described_class}::MAX_RSS", 5.kilobytes)
end
it 'sends the STP, TERM and KILL signals at expected times' do
expect(subject).to receive(:sleep).with(15 * 60).ordered
expect(Process).to receive(:kill).with('SIGSTP', pid).ordered
expect(subject).to receive(:sleep).with(30).ordered
expect(Process).to receive(:kill).with('SIGTERM', pid).ordered
expect(subject).to receive(:sleep).with(10).ordered
expect(Process).to receive(:kill).with('SIGKILL', pid).ordered
run
end
end
context 'when MAX_RSS is not exceeded' do
before do
stub_const("#{described_class}::MAX_RSS", 15.kilobytes)
end
it 'does nothing' do
expect(subject).not_to receive(:sleep)
run
end
end
end