Merge branch 'dm-sidekiq-sigstp' into 'master'
Send SIGTSTP before SIGTERM to actually give Sidekiq jobs 30s to finish when the memory killer kicks in. See merge request gitlab-org/gitlab-ce!15102
This commit is contained in:
commit
62ab17798d
|
@ -28,7 +28,7 @@ The MemoryKiller is controlled using environment variables.
|
||||||
delayed shutdown is triggered. The default value for Omnibus packages is set
|
delayed shutdown is triggered. The default value for Omnibus packages is set
|
||||||
[in the omnibus-gitlab
|
[in the omnibus-gitlab
|
||||||
repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
|
repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
|
||||||
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults 900 seconds (15 minutes). When
|
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When
|
||||||
a shutdown is triggered, the Sidekiq process will keep working normally for
|
a shutdown is triggered, the Sidekiq process will keep working normally for
|
||||||
another 15 minutes.
|
another 15 minutes.
|
||||||
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
|
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
|
||||||
|
@ -36,5 +36,3 @@ The MemoryKiller is controlled using environment variables.
|
||||||
Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
|
Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
|
||||||
Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
|
Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
|
||||||
restart Sidekiq.
|
restart Sidekiq.
|
||||||
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL`: defaults to `SIGKILL`. The name of
|
|
||||||
the final signal sent to the Sidekiq process when we want it to shut down.
|
|
||||||
|
|
|
@ -7,7 +7,6 @@ module Gitlab
|
||||||
GRACE_TIME = (ENV['SIDEKIQ_MEMORY_KILLER_GRACE_TIME'] || 15 * 60).to_s.to_i
|
GRACE_TIME = (ENV['SIDEKIQ_MEMORY_KILLER_GRACE_TIME'] || 15 * 60).to_s.to_i
|
||||||
# Wait 30 seconds for running jobs to finish during graceful shutdown
|
# Wait 30 seconds for running jobs to finish during graceful shutdown
|
||||||
SHUTDOWN_WAIT = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT'] || 30).to_s.to_i
|
SHUTDOWN_WAIT = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT'] || 30).to_s.to_i
|
||||||
SHUTDOWN_SIGNAL = (ENV['SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL'] || 'SIGKILL').to_s
|
|
||||||
|
|
||||||
# Create a mutex used to ensure there will be only one thread waiting to
|
# Create a mutex used to ensure there will be only one thread waiting to
|
||||||
# shut Sidekiq down
|
# shut Sidekiq down
|
||||||
|
@ -15,6 +14,7 @@ module Gitlab
|
||||||
|
|
||||||
def call(worker, job, queue)
|
def call(worker, job, queue)
|
||||||
yield
|
yield
|
||||||
|
|
||||||
current_rss = get_rss
|
current_rss = get_rss
|
||||||
|
|
||||||
return unless MAX_RSS > 0 && current_rss > MAX_RSS
|
return unless MAX_RSS > 0 && current_rss > MAX_RSS
|
||||||
|
@ -23,32 +23,45 @@ module Gitlab
|
||||||
# Return if another thread is already waiting to shut Sidekiq down
|
# Return if another thread is already waiting to shut Sidekiq down
|
||||||
return unless MUTEX.try_lock
|
return unless MUTEX.try_lock
|
||||||
|
|
||||||
Sidekiq.logger.warn "current RSS #{current_rss} exceeds maximum RSS "\
|
Sidekiq.logger.warn "Sidekiq worker PID-#{pid} current RSS #{current_rss}"\
|
||||||
"#{MAX_RSS}"
|
" exceeds maximum RSS #{MAX_RSS} after finishing job #{worker.class} JID-#{job['jid']}"
|
||||||
Sidekiq.logger.warn "this thread will shut down PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']} "\
|
Sidekiq.logger.warn "Sidekiq worker PID-#{pid} will stop fetching new jobs in #{GRACE_TIME} seconds, and will be shut down #{SHUTDOWN_WAIT} seconds later"
|
||||||
"in #{GRACE_TIME} seconds"
|
|
||||||
sleep(GRACE_TIME)
|
|
||||||
|
|
||||||
Sidekiq.logger.warn "sending SIGTERM to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
|
# Wait `GRACE_TIME` to give the memory intensive job time to finish.
|
||||||
Process.kill('SIGTERM', Process.pid)
|
# Then, tell Sidekiq to stop fetching new jobs.
|
||||||
|
# The terminal-stop signal is spelled TSTP; 'SIGSTP' is not a signal name Ruby
# knows, so Process.kill would raise ArgumentError instead of signalling.
wait_and_signal(GRACE_TIME, 'SIGTSTP', 'stop fetching new jobs')
|
||||||
|
|
||||||
Sidekiq.logger.warn "waiting #{SHUTDOWN_WAIT} seconds before sending "\
|
# Wait `SHUTDOWN_WAIT` to give already fetched jobs time to finish.
|
||||||
"#{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
|
# Then, tell Sidekiq to gracefully shut down by giving jobs a few more
|
||||||
sleep(SHUTDOWN_WAIT)
|
# moments to finish, killing and requeuing them if they didn't, and
|
||||||
|
# then terminating itself.
|
||||||
|
wait_and_signal(SHUTDOWN_WAIT, 'SIGTERM', 'gracefully shut down')
|
||||||
|
|
||||||
Sidekiq.logger.warn "sending #{SHUTDOWN_SIGNAL} to PID #{Process.pid} - Worker #{worker.class} - JID-#{job['jid']}"
|
# Wait for Sidekiq to shutdown gracefully, and kill it if it didn't.
|
||||||
Process.kill(SHUTDOWN_SIGNAL, Process.pid)
|
wait_and_signal(Sidekiq.options[:timeout] + 2, 'SIGKILL', 'die')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def get_rss
|
def get_rss
|
||||||
output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{Process.pid}))
|
output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}))
|
||||||
return 0 unless status.zero?
|
return 0 unless status.zero?
|
||||||
|
|
||||||
output.to_i
|
output.to_i
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def wait_and_signal(time, signal, explanation)
|
||||||
|
Sidekiq.logger.warn "waiting #{time} seconds before sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
|
||||||
|
sleep(time)
|
||||||
|
|
||||||
|
Sidekiq.logger.warn "sending Sidekiq worker PID-#{pid} #{signal} (#{explanation})"
|
||||||
|
Process.kill(signal, pid)
|
||||||
|
end
|
||||||
|
|
||||||
|
def pid
|
||||||
|
Process.pid
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Gitlab::SidekiqMiddleware::MemoryKiller do
|
||||||
|
subject { described_class.new }
|
||||||
|
let(:pid) { 999 }
|
||||||
|
|
||||||
|
let(:worker) { double(:worker, class: 'TestWorker') }
|
||||||
|
let(:job) { { 'jid' => 123 } }
|
||||||
|
let(:queue) { 'test_queue' }
|
||||||
|
|
||||||
|
def run
|
||||||
|
thread = subject.call(worker, job, queue) { nil }
|
||||||
|
thread&.join
|
||||||
|
end
|
||||||
|
|
||||||
|
before do
|
||||||
|
allow(subject).to receive(:get_rss).and_return(10.kilobytes)
|
||||||
|
allow(subject).to receive(:pid).and_return(pid)
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when MAX_RSS is set to 0' do
|
||||||
|
before do
|
||||||
|
stub_const("#{described_class}::MAX_RSS", 0)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'does nothing' do
|
||||||
|
expect(subject).not_to receive(:sleep)
|
||||||
|
|
||||||
|
run
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when MAX_RSS is exceeded' do
|
||||||
|
before do
|
||||||
|
stub_const("#{described_class}::MAX_RSS", 5.kilobytes)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'sends the STP, TERM and KILL signals at expected times' do
|
||||||
|
expect(subject).to receive(:sleep).with(15 * 60).ordered
|
||||||
|
# Expect the real signal name: 'SIGTSTP' exists in Signal.list, 'SIGSTP' does not.
expect(Process).to receive(:kill).with('SIGTSTP', pid).ordered
|
||||||
|
|
||||||
|
expect(subject).to receive(:sleep).with(30).ordered
|
||||||
|
expect(Process).to receive(:kill).with('SIGTERM', pid).ordered
|
||||||
|
|
||||||
|
expect(subject).to receive(:sleep).with(10).ordered
|
||||||
|
expect(Process).to receive(:kill).with('SIGKILL', pid).ordered
|
||||||
|
|
||||||
|
run
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when MAX_RSS is not exceeded' do
|
||||||
|
before do
|
||||||
|
stub_const("#{described_class}::MAX_RSS", 15.kilobytes)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'does nothing' do
|
||||||
|
expect(subject).not_to receive(:sleep)
|
||||||
|
|
||||||
|
run
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue