Add ExclusiveLease guards for RepositoryCheck::{DispatchWorker,BatchWorker}
We saw in production that DispatchWorker was running about twice an hour, scheduling twice as many jobs as it should. For some reason, BatchWorker was running 1000 times per hour, possibly because Sidekiq RSS kills caused these jobs to restart. Adding an ExclusiveLease guard prevents these jobs from running more often than they should. Relates to https://gitlab.com/gitlab-com/infrastructure/issues/4526
This commit is contained in:
parent
a291bcdf0d
commit
b33661d6ec
4 changed files with 45 additions and 4 deletions
|
@ -4,9 +4,11 @@ module RepositoryCheck
|
||||||
class BatchWorker
|
class BatchWorker
|
||||||
include ApplicationWorker
|
include ApplicationWorker
|
||||||
include RepositoryCheckQueue
|
include RepositoryCheckQueue
|
||||||
|
include ExclusiveLeaseGuard
|
||||||
|
|
||||||
RUN_TIME = 3600
|
RUN_TIME = 3600
|
||||||
BATCH_SIZE = 10_000
|
BATCH_SIZE = 10_000
|
||||||
|
LEASE_TIMEOUT = 1.hour
|
||||||
|
|
||||||
attr_reader :shard_name
|
attr_reader :shard_name
|
||||||
|
|
||||||
|
@ -16,6 +18,20 @@ module RepositoryCheck
|
||||||
return unless Gitlab::CurrentSettings.repository_checks_enabled
|
return unless Gitlab::CurrentSettings.repository_checks_enabled
|
||||||
return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)
|
return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)
|
||||||
|
|
||||||
|
try_obtain_lease do
|
||||||
|
perform_repository_checks
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def lease_timeout
|
||||||
|
LEASE_TIMEOUT
|
||||||
|
end
|
||||||
|
|
||||||
|
def lease_key
|
||||||
|
"repository_check_batch_worker:#{shard_name}"
|
||||||
|
end
|
||||||
|
|
||||||
|
def perform_repository_checks
|
||||||
start = Time.now
|
start = Time.now
|
||||||
|
|
||||||
# This loop will break after a little more than one hour ('a little
|
# This loop will break after a little more than one hour ('a little
|
||||||
|
@ -26,7 +42,7 @@ module RepositoryCheck
|
||||||
project_ids.each do |project_id|
|
project_ids.each do |project_id|
|
||||||
break if Time.now - start >= RUN_TIME
|
break if Time.now - start >= RUN_TIME
|
||||||
|
|
||||||
next unless try_obtain_lease(project_id)
|
next unless try_obtain_lease_for_project(project_id)
|
||||||
|
|
||||||
SingleRepositoryWorker.new.perform(project_id)
|
SingleRepositoryWorker.new.perform(project_id)
|
||||||
end
|
end
|
||||||
|
@ -60,7 +76,7 @@ module RepositoryCheck
|
||||||
Project.where(repository_storage: shard_name)
|
Project.where(repository_storage: shard_name)
|
||||||
end
|
end
|
||||||
|
|
||||||
def try_obtain_lease(id)
|
def try_obtain_lease_for_project(id)
|
||||||
# Use a 24-hour timeout because on servers/projects where 'git fsck' is
|
# Use a 24-hour timeout because on servers/projects where 'git fsck' is
|
||||||
# super slow we definitely do not want to run it twice in parallel.
|
# super slow we definitely do not want to run it twice in parallel.
|
||||||
Gitlab::ExclusiveLease.new(
|
Gitlab::ExclusiveLease.new(
|
||||||
|
|
|
@ -3,13 +3,22 @@ module RepositoryCheck
|
||||||
include ApplicationWorker
|
include ApplicationWorker
|
||||||
include CronjobQueue
|
include CronjobQueue
|
||||||
include ::EachShardWorker
|
include ::EachShardWorker
|
||||||
|
include ExclusiveLeaseGuard
|
||||||
|
|
||||||
|
LEASE_TIMEOUT = 1.hour
|
||||||
|
|
||||||
def perform
|
def perform
|
||||||
return unless Gitlab::CurrentSettings.repository_checks_enabled
|
return unless Gitlab::CurrentSettings.repository_checks_enabled
|
||||||
|
|
||||||
each_eligible_shard do |shard_name|
|
try_obtain_lease do
|
||||||
RepositoryCheck::BatchWorker.perform_async(shard_name)
|
each_eligible_shard do |shard_name|
|
||||||
|
RepositoryCheck::BatchWorker.perform_async(shard_name)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def lease_timeout
|
||||||
|
LEASE_TIMEOUT
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -62,4 +62,12 @@ describe RepositoryCheck::BatchWorker do
|
||||||
|
|
||||||
expect(subject.perform(shard_name)).to eq([])
|
expect(subject.perform(shard_name)).to eq([])
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'does not run if the exclusive lease is taken' do
|
||||||
|
allow(subject).to receive(:try_obtain_lease).and_return(false)
|
||||||
|
|
||||||
|
expect(subject).not_to receive(:perform_repository_checks)
|
||||||
|
|
||||||
|
subject.perform(shard_name)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -11,6 +11,14 @@ describe RepositoryCheck::DispatchWorker do
|
||||||
subject.perform
|
subject.perform
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'does nothing if the exclusive lease is taken' do
|
||||||
|
allow(subject).to receive(:try_obtain_lease).and_return(false)
|
||||||
|
|
||||||
|
expect(RepositoryCheck::BatchWorker).not_to receive(:perform_async)
|
||||||
|
|
||||||
|
subject.perform
|
||||||
|
end
|
||||||
|
|
||||||
it 'dispatches work to RepositoryCheck::BatchWorker' do
|
it 'dispatches work to RepositoryCheck::BatchWorker' do
|
||||||
expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once)
|
expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue