# frozen_string_literal: true
module RepositoryCheck
  # Cron-scheduled Sidekiq worker that sweeps projects whose repositories are
  # due for an integrity check and hands each one to SingleRepositoryWorker,
  # guarded by a per-project exclusive lease so a repository is never checked
  # twice concurrently.
  class BatchWorker
    include ApplicationWorker
    include CronjobQueue

    # Soft time budget for one sweep, in seconds (one hour).
    RUN_TIME = 3600

    # Maximum number of project IDs fetched per run.
    BATCH_SIZE = 10_000

    def perform
      return unless Gitlab::CurrentSettings.repository_checks_enabled

      start = Time.now

      # This loop will break after a little more than one hour ('a little
      # more' because `git fsck` may take a few minutes), or if it runs out of
      # projects to check. By default sidekiq-cron will start a new
      # RepositoryCheckWorker each hour so that as long as there are repositories to
      # check, only one (or two) will be checked at a time.
      project_ids.each do |project_id|
        break if Time.now - start >= RUN_TIME

        next unless try_obtain_lease(project_id)

        SingleRepositoryWorker.new.perform(project_id)
      end
    end

    private

    # Project.find_each does not support WHERE clauses and
    # Project.find_in_batches does not support ordering. So we just build an
    # array of ID's. This is OK because we do it only once an hour, because
    # getting ID's from Postgres is not terribly slow, and because no user
    # has to sit and wait for this query to finish.
    #
    # Never-checked projects take priority; any remaining capacity in the
    # batch is filled with the longest-unchecked projects.
    def project_ids(batch_size = BATCH_SIZE)
      project_ids = never_checked_project_ids(batch_size)

      remaining_capacity = batch_size - project_ids.count

      if remaining_capacity > 0
        project_ids + old_checked_project_ids(remaining_capacity)
      else
        project_ids
      end
    end

    # IDs of projects that have never had a repository check. The 24-hour
    # created_at cutoff skips very new projects (presumably to avoid racing
    # repository creation — confirm against the import/creation flow).
    def never_checked_project_ids(batch_size)
      Project.where(last_repository_check_at: nil)
        .where('created_at < ?', 24.hours.ago)
        .limit(batch_size).pluck(:id)
    end

    # IDs of projects whose last check is more than a month old, oldest
    # check first, so the staleness of checks is bounded over time.
    def old_checked_project_ids(batch_size)
      Project.where.not(last_repository_check_at: nil)
        .where('last_repository_check_at < ?', 1.month.ago)
        .reorder(last_repository_check_at: :asc)
        .limit(batch_size).pluck(:id)
    end

    # Returns a truthy lease token when this worker may check project `id`,
    # or falsy when another check already holds the lease.
    def try_obtain_lease(id)
      # Use a 24-hour timeout because on servers/projects where 'git fsck' is
      # super slow we definitely do not want to run it twice in parallel.
      Gitlab::ExclusiveLease.new(
        "project_repository_check:#{id}",
        timeout: 24.hours
      ).try_obtain
    end
  end
end