Add ExclusiveLease guards for RepositoryCheck::{DispatchWorker,BatchWorker}

We saw in production that DispatchWorker was running about twice an hour,
which would schedule twice as many jobs as it should.

For some reason, BatchWorker was running 1000 times per hour, possibly
due to Sidekiq RSS kills that caused these jobs to restart.

Adding an ExclusiveLease prevents these jobs from running more
than they should.

Relates to https://gitlab.com/gitlab-com/infrastructure/issues/4526
This commit is contained in:
Stan Hu 2018-07-06 09:05:58 -07:00
parent a291bcdf0d
commit b33661d6ec
4 changed files with 45 additions and 4 deletions

View file

@ -4,9 +4,11 @@ module RepositoryCheck
class BatchWorker
include ApplicationWorker
include RepositoryCheckQueue
include ExclusiveLeaseGuard
RUN_TIME = 3600
BATCH_SIZE = 10_000
LEASE_TIMEOUT = 1.hour
attr_reader :shard_name
@ -16,6 +18,20 @@ module RepositoryCheck
return unless Gitlab::CurrentSettings.repository_checks_enabled
return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)
try_obtain_lease do
perform_repository_checks
end
end
def lease_timeout
LEASE_TIMEOUT
end
def lease_key
"repository_check_batch_worker:#{shard_name}"
end
def perform_repository_checks
start = Time.now
# This loop will break after a little more than one hour ('a little
@ -26,7 +42,7 @@ module RepositoryCheck
project_ids.each do |project_id|
break if Time.now - start >= RUN_TIME
next unless try_obtain_lease(project_id)
next unless try_obtain_lease_for_project(project_id)
SingleRepositoryWorker.new.perform(project_id)
end
@ -60,7 +76,7 @@ module RepositoryCheck
Project.where(repository_storage: shard_name)
end
def try_obtain_lease(id)
def try_obtain_lease_for_project(id)
# Use a 24-hour timeout because on servers/projects where 'git fsck' is
# super slow we definitely do not want to run it twice in parallel.
Gitlab::ExclusiveLease.new(

View file

@ -3,13 +3,22 @@ module RepositoryCheck
include ApplicationWorker
include CronjobQueue
include ::EachShardWorker
include ExclusiveLeaseGuard
LEASE_TIMEOUT = 1.hour
def perform
return unless Gitlab::CurrentSettings.repository_checks_enabled
each_eligible_shard do |shard_name|
RepositoryCheck::BatchWorker.perform_async(shard_name)
try_obtain_lease do
each_eligible_shard do |shard_name|
RepositoryCheck::BatchWorker.perform_async(shard_name)
end
end
end
def lease_timeout
LEASE_TIMEOUT
end
end
end

View file

@ -62,4 +62,12 @@ describe RepositoryCheck::BatchWorker do
expect(subject.perform(shard_name)).to eq([])
end
it 'does not run if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(subject).not_to receive(:perform_repository_checks)
subject.perform(shard_name)
end
end

View file

@ -11,6 +11,14 @@ describe RepositoryCheck::DispatchWorker do
subject.perform
end
it 'does nothing if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(RepositoryCheck::BatchWorker).not_to receive(:perform_async)
subject.perform
end
it 'dispatches work to RepositoryCheck::BatchWorker' do
expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once)