Add ExclusiveLease guards for RepositoryCheck::{DispatchWorker,BatchWorker}

We saw in production that DispatchWorker was running about twice an hour,
which would schedule twice as many jobs as it should.

For some reason, BatchWorker was running 1000 times per hour, possibly
due to Sidekiq RSS kills that caused these jobs to restart.

Adding an ExclusiveLease prevents these jobs from running more
than they should.

Relates to https://gitlab.com/gitlab-com/infrastructure/issues/4526
This commit is contained in:
Stan Hu 2018-07-06 09:05:58 -07:00
parent a291bcdf0d
commit b33661d6ec
4 changed files with 45 additions and 4 deletions

View File

@ -4,9 +4,11 @@ module RepositoryCheck
class BatchWorker class BatchWorker
include ApplicationWorker include ApplicationWorker
include RepositoryCheckQueue include RepositoryCheckQueue
include ExclusiveLeaseGuard
RUN_TIME = 3600 RUN_TIME = 3600
BATCH_SIZE = 10_000 BATCH_SIZE = 10_000
LEASE_TIMEOUT = 1.hour
attr_reader :shard_name attr_reader :shard_name
@ -16,6 +18,20 @@ module RepositoryCheck
return unless Gitlab::CurrentSettings.repository_checks_enabled return unless Gitlab::CurrentSettings.repository_checks_enabled
return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name) return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)
try_obtain_lease do
perform_repository_checks
end
end
def lease_timeout
LEASE_TIMEOUT
end
def lease_key
"repository_check_batch_worker:#{shard_name}"
end
def perform_repository_checks
start = Time.now start = Time.now
# This loop will break after a little more than one hour ('a little # This loop will break after a little more than one hour ('a little
@ -26,7 +42,7 @@ module RepositoryCheck
project_ids.each do |project_id| project_ids.each do |project_id|
break if Time.now - start >= RUN_TIME break if Time.now - start >= RUN_TIME
next unless try_obtain_lease(project_id) next unless try_obtain_lease_for_project(project_id)
SingleRepositoryWorker.new.perform(project_id) SingleRepositoryWorker.new.perform(project_id)
end end
@ -60,7 +76,7 @@ module RepositoryCheck
Project.where(repository_storage: shard_name) Project.where(repository_storage: shard_name)
end end
def try_obtain_lease(id) def try_obtain_lease_for_project(id)
# Use a 24-hour timeout because on servers/projects where 'git fsck' is # Use a 24-hour timeout because on servers/projects where 'git fsck' is
# super slow we definitely do not want to run it twice in parallel. # super slow we definitely do not want to run it twice in parallel.
Gitlab::ExclusiveLease.new( Gitlab::ExclusiveLease.new(

View File

@ -3,13 +3,22 @@ module RepositoryCheck
include ApplicationWorker include ApplicationWorker
include CronjobQueue include CronjobQueue
include ::EachShardWorker include ::EachShardWorker
include ExclusiveLeaseGuard
LEASE_TIMEOUT = 1.hour
def perform def perform
return unless Gitlab::CurrentSettings.repository_checks_enabled return unless Gitlab::CurrentSettings.repository_checks_enabled
each_eligible_shard do |shard_name| try_obtain_lease do
RepositoryCheck::BatchWorker.perform_async(shard_name) each_eligible_shard do |shard_name|
RepositoryCheck::BatchWorker.perform_async(shard_name)
end
end end
end end
def lease_timeout
LEASE_TIMEOUT
end
end end
end end

View File

@ -62,4 +62,12 @@ describe RepositoryCheck::BatchWorker do
expect(subject.perform(shard_name)).to eq([]) expect(subject.perform(shard_name)).to eq([])
end end
it 'does not run if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(subject).not_to receive(:perform_repository_checks)
subject.perform(shard_name)
end
end end

View File

@ -11,6 +11,14 @@ describe RepositoryCheck::DispatchWorker do
subject.perform subject.perform
end end
it 'does nothing if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(RepositoryCheck::BatchWorker).not_to receive(:perform_async)
subject.perform
end
it 'dispatches work to RepositoryCheck::BatchWorker' do it 'dispatches work to RepositoryCheck::BatchWorker' do
expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once) expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once)