Improve resillency of monitor
- Retry connection when it fails - Properly shutdown daemon - Stop monitor if the Exception is raised - Properly guard exception handling
This commit is contained in:
parent
3683d2d251
commit
cb193cd6b5
4 changed files with 146 additions and 46 deletions
|
@ -46,7 +46,10 @@ module Gitlab
|
|||
|
||||
if thread
|
||||
thread.wakeup if thread.alive?
|
||||
thread.join unless Thread.current == thread
|
||||
begin
|
||||
thread.join unless Thread.current == thread
|
||||
rescue Exception # rubocop:disable Lint/RescueException
|
||||
end
|
||||
@thread = nil
|
||||
end
|
||||
end
|
||||
|
|
|
@ -6,6 +6,7 @@ module Gitlab
|
|||
|
||||
NOTIFICATION_CHANNEL = 'sidekiq:cancel:notifications'.freeze
|
||||
CANCEL_DEADLINE = 24.hours.seconds
|
||||
RECONNECT_TIME = 3.seconds
|
||||
|
||||
# We use exception derived from `Exception`
|
||||
# to consider this as an very low-level exception
|
||||
|
@ -33,7 +34,8 @@ module Gitlab
|
|||
action: 'run',
|
||||
queue: queue,
|
||||
jid: jid,
|
||||
canceled: true)
|
||||
canceled: true
|
||||
)
|
||||
raise CancelledError
|
||||
end
|
||||
|
||||
|
@ -44,32 +46,6 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
|
||||
def start_working
|
||||
Sidekiq.logger.info(
|
||||
class: self.class,
|
||||
action: 'start',
|
||||
message: 'Starting Monitor Daemon')
|
||||
|
||||
::Gitlab::Redis::SharedState.with do |redis|
|
||||
redis.subscribe(NOTIFICATION_CHANNEL) do |on|
|
||||
on.message do |channel, message|
|
||||
process_message(message)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
Sidekiq.logger.warn(
|
||||
class: self.class,
|
||||
action: 'stop',
|
||||
message: 'Stopping Monitor Daemon')
|
||||
rescue Exception => e # rubocop:disable Lint/RescueException
|
||||
Sidekiq.logger.warn(
|
||||
class: self.class,
|
||||
action: 'exception',
|
||||
message: e.message)
|
||||
raise e
|
||||
end
|
||||
|
||||
def self.cancel_job(jid)
|
||||
payload = {
|
||||
action: 'cancel',
|
||||
|
@ -84,12 +60,56 @@ module Gitlab
|
|||
|
||||
private
|
||||
|
||||
def start_working
|
||||
Sidekiq.logger.info(
|
||||
class: self.class,
|
||||
action: 'start',
|
||||
message: 'Starting Monitor Daemon'
|
||||
)
|
||||
|
||||
while enabled?
|
||||
process_messages
|
||||
sleep(RECONNECT_TIME)
|
||||
end
|
||||
|
||||
ensure
|
||||
Sidekiq.logger.warn(
|
||||
class: self.class,
|
||||
action: 'stop',
|
||||
message: 'Stopping Monitor Daemon'
|
||||
)
|
||||
end
|
||||
|
||||
def stop_working
|
||||
thread.raise(Interrupt) if thread.alive?
|
||||
end
|
||||
|
||||
def process_messages
|
||||
::Gitlab::Redis::SharedState.with do |redis|
|
||||
redis.subscribe(NOTIFICATION_CHANNEL) do |on|
|
||||
on.message do |channel, message|
|
||||
process_message(message)
|
||||
end
|
||||
end
|
||||
end
|
||||
rescue Exception => e # rubocop:disable Lint/RescueException
|
||||
Sidekiq.logger.warn(
|
||||
class: self.class,
|
||||
action: 'exception',
|
||||
message: e.message
|
||||
)
|
||||
|
||||
# we re-raise system exceptions
|
||||
raise e unless e.is_a?(StandardError)
|
||||
end
|
||||
|
||||
def process_message(message)
|
||||
Sidekiq.logger.info(
|
||||
class: self.class,
|
||||
channel: NOTIFICATION_CHANNEL,
|
||||
message: 'Received payload on channel',
|
||||
payload: message)
|
||||
payload: message
|
||||
)
|
||||
|
||||
message = safe_parse(message)
|
||||
return unless message
|
||||
|
@ -115,14 +135,16 @@ module Gitlab
|
|||
|
||||
Thread.new do
|
||||
# try to find a thread, but with guaranteed
|
||||
# handle that this thread corresponds to actually running job
|
||||
# that handle for thread corresponds to actually
|
||||
# running job
|
||||
find_thread_with_lock(jid) do |thread|
|
||||
Sidekiq.logger.warn(
|
||||
class: self.class,
|
||||
action: 'cancel',
|
||||
message: 'Canceling thread with CancelledError',
|
||||
jid: jid,
|
||||
thread_id: thread.object_id)
|
||||
thread_id: thread.object_id
|
||||
)
|
||||
|
||||
thread&.raise(CancelledError)
|
||||
end
|
||||
|
|
|
@ -34,12 +34,12 @@ describe Gitlab::Daemon do
|
|||
end
|
||||
end
|
||||
|
||||
describe 'when Daemon is enabled' do
|
||||
context 'when Daemon is enabled' do
|
||||
before do
|
||||
allow(subject).to receive(:enabled?).and_return(true)
|
||||
end
|
||||
|
||||
describe 'when Daemon is stopped' do
|
||||
context 'when Daemon is stopped' do
|
||||
describe '#start' do
|
||||
it 'starts the Daemon' do
|
||||
expect { subject.start.join }.to change { subject.thread? }.from(false).to(true)
|
||||
|
@ -57,14 +57,14 @@ describe Gitlab::Daemon do
|
|||
end
|
||||
end
|
||||
|
||||
describe 'when Daemon is running' do
|
||||
context 'when Daemon is running' do
|
||||
before do
|
||||
subject.start.join
|
||||
subject.start
|
||||
end
|
||||
|
||||
describe '#start' do
|
||||
it "doesn't start running Daemon" do
|
||||
expect { subject.start.join }.not_to change { subject.thread? }
|
||||
expect { subject.start.join }.not_to change { subject.thread }
|
||||
|
||||
expect(subject).to have_received(:start_working).once
|
||||
end
|
||||
|
@ -76,11 +76,29 @@ describe Gitlab::Daemon do
|
|||
|
||||
expect(subject).to have_received(:stop_working)
|
||||
end
|
||||
|
||||
context 'when stop_working raises exception' do
|
||||
before do
|
||||
allow(subject).to receive(:start_working) do
|
||||
sleep(1000)
|
||||
end
|
||||
end
|
||||
|
||||
it 'shutdowns Daemon' do
|
||||
expect(subject).to receive(:stop_working) do
|
||||
subject.thread.raise(Interrupt)
|
||||
end
|
||||
|
||||
expect(subject.thread).to be_alive
|
||||
expect { subject.stop }.not_to raise_error
|
||||
expect(subject.thread).to be_nil
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe 'when Daemon is disabled' do
|
||||
context 'when Daemon is disabled' do
|
||||
before do
|
||||
allow(subject).to receive(:enabled?).and_return(false)
|
||||
end
|
||||
|
|
|
@ -31,19 +31,26 @@ describe Gitlab::SidekiqMonitor do
|
|||
end
|
||||
|
||||
it 'raises exception' do
|
||||
expect { monitor.within_job(jid, 'queue') }.to raise_error(described_class::CancelledError)
|
||||
expect { monitor.within_job(jid, 'queue') }.to raise_error(
|
||||
described_class::CancelledError)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#start_working' do
|
||||
subject { monitor.start_working }
|
||||
subject { monitor.send(:start_working) }
|
||||
|
||||
before do
|
||||
# we want to run at most once cycle
|
||||
# we toggle `enabled?` flag after the first call
|
||||
stub_const('Gitlab::SidekiqMonitor::RECONNECT_TIME', 0)
|
||||
allow(monitor).to receive(:enabled?).and_return(true, false)
|
||||
|
||||
allow(Sidekiq.logger).to receive(:info)
|
||||
allow(Sidekiq.logger).to receive(:warn)
|
||||
end
|
||||
|
||||
context 'when structured logging is used' do
|
||||
before do
|
||||
allow_any_instance_of(::Redis).to receive(:subscribe)
|
||||
end
|
||||
|
||||
it 'logs start message' do
|
||||
expect(Sidekiq.logger).to receive(:info)
|
||||
.with(
|
||||
|
@ -51,6 +58,8 @@ describe Gitlab::SidekiqMonitor do
|
|||
action: 'start',
|
||||
message: 'Starting Monitor Daemon')
|
||||
|
||||
expect(::Gitlab::Redis::SharedState).to receive(:with)
|
||||
|
||||
subject
|
||||
end
|
||||
|
||||
|
@ -61,10 +70,25 @@ describe Gitlab::SidekiqMonitor do
|
|||
action: 'stop',
|
||||
message: 'Stopping Monitor Daemon')
|
||||
|
||||
expect(::Gitlab::Redis::SharedState).to receive(:with)
|
||||
|
||||
subject
|
||||
end
|
||||
|
||||
it 'logs exception message' do
|
||||
it 'logs StandardError message' do
|
||||
expect(Sidekiq.logger).to receive(:warn)
|
||||
.with(
|
||||
class: described_class,
|
||||
action: 'exception',
|
||||
message: 'My Exception')
|
||||
|
||||
expect(::Gitlab::Redis::SharedState).to receive(:with)
|
||||
.and_raise(StandardError, 'My Exception')
|
||||
|
||||
expect { subject }.not_to raise_error
|
||||
end
|
||||
|
||||
it 'logs and raises Exception message' do
|
||||
expect(Sidekiq.logger).to receive(:warn)
|
||||
.with(
|
||||
class: described_class,
|
||||
|
@ -78,6 +102,20 @@ describe Gitlab::SidekiqMonitor do
|
|||
end
|
||||
end
|
||||
|
||||
context 'when StandardError is raised' do
|
||||
it 'does retry connection' do
|
||||
expect(::Gitlab::Redis::SharedState).to receive(:with)
|
||||
.and_raise(StandardError, 'My Exception')
|
||||
|
||||
expect(::Gitlab::Redis::SharedState).to receive(:with)
|
||||
|
||||
# we expect to run `process_messages` twice
|
||||
expect(monitor).to receive(:enabled?).and_return(true, true, false)
|
||||
|
||||
subject
|
||||
end
|
||||
end
|
||||
|
||||
context 'when message is published' do
|
||||
let(:subscribed) { double }
|
||||
|
||||
|
@ -128,6 +166,19 @@ describe Gitlab::SidekiqMonitor do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#stop' do
|
||||
let!(:monitor_thread) { monitor.start }
|
||||
|
||||
it 'does stop the thread' do
|
||||
expect(monitor_thread).to be_alive
|
||||
|
||||
expect { monitor.stop }.not_to raise_error
|
||||
|
||||
expect(monitor_thread).not_to be_alive
|
||||
expect { monitor_thread.value }.to raise_error(Interrupt)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#process_job_cancel' do
|
||||
subject { monitor.send(:process_job_cancel, jid) }
|
||||
|
||||
|
@ -156,6 +207,11 @@ describe Gitlab::SidekiqMonitor do
|
|||
monitor.jobs_thread[jid] = thread
|
||||
end
|
||||
|
||||
after do
|
||||
thread.kill
|
||||
rescue
|
||||
end
|
||||
|
||||
it 'does log cancellation message' do
|
||||
expect(Sidekiq.logger).to receive(:warn)
|
||||
.with(
|
||||
|
@ -175,8 +231,9 @@ describe Gitlab::SidekiqMonitor do
|
|||
|
||||
subject.join
|
||||
|
||||
expect(thread).not_to be_alive
|
||||
expect { thread.value }.to raise_error(described_class::CancelledError)
|
||||
# we wait for the thread to be cancelled
|
||||
# by `process_job_cancel`
|
||||
expect { thread.join(5) }.to raise_error(described_class::CancelledError)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue