Merge branch '27523-make-stuck-build-detection-more-performant' into 'master'
Make stuck builds detection more performant Closes #27523 See merge request !9025
This commit is contained in:
commit
34f3d8999a
7 changed files with 198 additions and 82 deletions
|
@ -1,19 +0,0 @@
|
||||||
class StuckCiBuildsWorker
|
|
||||||
include Sidekiq::Worker
|
|
||||||
include CronjobQueue
|
|
||||||
|
|
||||||
BUILD_STUCK_TIMEOUT = 1.day
|
|
||||||
|
|
||||||
def perform
|
|
||||||
Rails.logger.info 'Cleaning stuck builds'
|
|
||||||
|
|
||||||
builds = Ci::Build.joins(:project).running_or_pending.where('ci_builds.updated_at < ?', BUILD_STUCK_TIMEOUT.ago)
|
|
||||||
builds.find_each(batch_size: 50).each do |build|
|
|
||||||
Rails.logger.debug "Dropping stuck #{build.status} build #{build.id} for runner #{build.runner_id}"
|
|
||||||
build.drop
|
|
||||||
end
|
|
||||||
|
|
||||||
# Update builds that failed to drop
|
|
||||||
builds.update_all(status: 'failed')
|
|
||||||
end
|
|
||||||
end
|
|
59
app/workers/stuck_ci_jobs_worker.rb
Normal file
59
app/workers/stuck_ci_jobs_worker.rb
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
class StuckCiJobsWorker
|
||||||
|
include Sidekiq::Worker
|
||||||
|
include CronjobQueue
|
||||||
|
|
||||||
|
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'.freeze
|
||||||
|
|
||||||
|
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
|
||||||
|
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
|
||||||
|
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
|
||||||
|
|
||||||
|
def perform
|
||||||
|
return unless try_obtain_lease
|
||||||
|
|
||||||
|
Rails.logger.info "#{self.class}: Cleaning stuck builds"
|
||||||
|
|
||||||
|
drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT
|
||||||
|
drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT
|
||||||
|
drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT
|
||||||
|
|
||||||
|
remove_lease
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def try_obtain_lease
|
||||||
|
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
|
||||||
|
end
|
||||||
|
|
||||||
|
def remove_lease
|
||||||
|
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
|
||||||
|
end
|
||||||
|
|
||||||
|
def drop(status, timeout)
|
||||||
|
search(status, timeout) do |build|
|
||||||
|
drop_build :outdated, build, status, timeout
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def drop_stuck(status, timeout)
|
||||||
|
search(status, timeout) do |build|
|
||||||
|
return unless build.stuck?
|
||||||
|
drop_build :stuck, build, status, timeout
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def search(status, timeout)
|
||||||
|
builds = Ci::Build.where(status: status).where('ci_builds.updated_at < ?', timeout.ago)
|
||||||
|
builds.joins(:project).includes(:tags, :runner, project: :namespace).find_each(batch_size: 50).each do |build|
|
||||||
|
yield(build)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def drop_build(type, build, status, timeout)
|
||||||
|
Rails.logger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout})"
|
||||||
|
Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
|
||||||
|
b.drop
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
title: Make stuck builds detection more performant
|
||||||
|
merge_request: 9025
|
||||||
|
author:
|
|
@ -177,9 +177,9 @@ production: &base
|
||||||
# Periodically executed jobs, to self-heal Gitlab, do external synchronizations, etc.
|
# Periodically executed jobs, to self-heal Gitlab, do external synchronizations, etc.
|
||||||
# Please read here for more information: https://github.com/ondrejbartas/sidekiq-cron#adding-cron-job
|
# Please read here for more information: https://github.com/ondrejbartas/sidekiq-cron#adding-cron-job
|
||||||
cron_jobs:
|
cron_jobs:
|
||||||
# Flag stuck CI builds as failed
|
# Flag stuck CI jobs as failed
|
||||||
stuck_ci_builds_worker:
|
stuck_ci_jobs_worker:
|
||||||
cron: "0 0 * * *"
|
cron: "0 * * * *"
|
||||||
# Remove expired build artifacts
|
# Remove expired build artifacts
|
||||||
expire_build_artifacts_worker:
|
expire_build_artifacts_worker:
|
||||||
cron: "50 * * * *"
|
cron: "50 * * * *"
|
||||||
|
|
|
@ -308,9 +308,9 @@ Settings.gravatar['host'] = Settings.host_without_www(Settings.gravatar[
|
||||||
# Cron Jobs
|
# Cron Jobs
|
||||||
#
|
#
|
||||||
Settings['cron_jobs'] ||= Settingslogic.new({})
|
Settings['cron_jobs'] ||= Settingslogic.new({})
|
||||||
Settings.cron_jobs['stuck_ci_builds_worker'] ||= Settingslogic.new({})
|
Settings.cron_jobs['stuck_ci_jobs_worker'] ||= Settingslogic.new({})
|
||||||
Settings.cron_jobs['stuck_ci_builds_worker']['cron'] ||= '0 0 * * *'
|
Settings.cron_jobs['stuck_ci_jobs_worker']['cron'] ||= '0 * * * *'
|
||||||
Settings.cron_jobs['stuck_ci_builds_worker']['job_class'] = 'StuckCiBuildsWorker'
|
Settings.cron_jobs['stuck_ci_jobs_worker']['job_class'] = 'StuckCiJobsWorker'
|
||||||
Settings.cron_jobs['expire_build_artifacts_worker'] ||= Settingslogic.new({})
|
Settings.cron_jobs['expire_build_artifacts_worker'] ||= Settingslogic.new({})
|
||||||
Settings.cron_jobs['expire_build_artifacts_worker']['cron'] ||= '50 * * * *'
|
Settings.cron_jobs['expire_build_artifacts_worker']['cron'] ||= '50 * * * *'
|
||||||
Settings.cron_jobs['expire_build_artifacts_worker']['job_class'] = 'ExpireBuildArtifactsWorker'
|
Settings.cron_jobs['expire_build_artifacts_worker']['job_class'] = 'ExpireBuildArtifactsWorker'
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
require "spec_helper"
|
|
||||||
|
|
||||||
describe StuckCiBuildsWorker do
|
|
||||||
let!(:build) { create :ci_build }
|
|
||||||
let(:worker) { described_class.new }
|
|
||||||
|
|
||||||
subject do
|
|
||||||
build.reload
|
|
||||||
build.status
|
|
||||||
end
|
|
||||||
|
|
||||||
%w(pending running).each do |status|
|
|
||||||
context "#{status} build" do
|
|
||||||
before do
|
|
||||||
build.update!(status: status)
|
|
||||||
end
|
|
||||||
|
|
||||||
it 'gets dropped if it was updated over 2 days ago' do
|
|
||||||
build.update!(updated_at: 2.days.ago)
|
|
||||||
worker.perform
|
|
||||||
is_expected.to eq('failed')
|
|
||||||
end
|
|
||||||
|
|
||||||
it "is still #{status}" do
|
|
||||||
build.update!(updated_at: 1.minute.ago)
|
|
||||||
worker.perform
|
|
||||||
is_expected.to eq(status)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
%w(success failed canceled).each do |status|
|
|
||||||
context "#{status} build" do
|
|
||||||
before do
|
|
||||||
build.update!(status: status)
|
|
||||||
end
|
|
||||||
|
|
||||||
it "is still #{status}" do
|
|
||||||
build.update!(updated_at: 2.days.ago)
|
|
||||||
worker.perform
|
|
||||||
is_expected.to eq(status)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
context "for deleted project" do
|
|
||||||
before do
|
|
||||||
build.update!(status: :running, updated_at: 2.days.ago)
|
|
||||||
build.project.update(pending_delete: true)
|
|
||||||
end
|
|
||||||
|
|
||||||
it "does not drop build" do
|
|
||||||
expect_any_instance_of(Ci::Build).not_to receive(:drop)
|
|
||||||
worker.perform
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
129
spec/workers/stuck_ci_jobs_worker_spec.rb
Normal file
129
spec/workers/stuck_ci_jobs_worker_spec.rb
Normal file
|
@ -0,0 +1,129 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe StuckCiJobsWorker do
|
||||||
|
let!(:runner) { create :ci_runner }
|
||||||
|
let!(:job) { create :ci_build, runner: runner }
|
||||||
|
let(:worker) { described_class.new }
|
||||||
|
let(:exclusive_lease_uuid) { SecureRandom.uuid }
|
||||||
|
|
||||||
|
subject do
|
||||||
|
job.reload
|
||||||
|
job.status
|
||||||
|
end
|
||||||
|
|
||||||
|
before do
|
||||||
|
job.update!(status: status, updated_at: updated_at)
|
||||||
|
allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(exclusive_lease_uuid)
|
||||||
|
end
|
||||||
|
|
||||||
|
shared_examples 'job is dropped' do
|
||||||
|
it 'changes status' do
|
||||||
|
worker.perform
|
||||||
|
is_expected.to eq('failed')
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
shared_examples 'job is unchanged' do
|
||||||
|
it "doesn't change status" do
|
||||||
|
worker.perform
|
||||||
|
is_expected.to eq(status)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job is pending' do
|
||||||
|
let(:status) { 'pending' }
|
||||||
|
|
||||||
|
context 'when job is not stuck' do
|
||||||
|
before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false) }
|
||||||
|
|
||||||
|
context 'when job was not updated for more than 1 day ago' do
|
||||||
|
let(:updated_at) { 2.days.ago }
|
||||||
|
it_behaves_like 'job is dropped'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job was updated in less than 1 day ago' do
|
||||||
|
let(:updated_at) { 6.hours.ago }
|
||||||
|
it_behaves_like 'job is unchanged'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job was not updated for more than 1 hour ago' do
|
||||||
|
let(:updated_at) { 2.hours.ago }
|
||||||
|
it_behaves_like 'job is unchanged'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job is stuck' do
|
||||||
|
before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true) }
|
||||||
|
|
||||||
|
context 'when job was not updated for more than 1 hour ago' do
|
||||||
|
let(:updated_at) { 2.hours.ago }
|
||||||
|
it_behaves_like 'job is dropped'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job was updated in less than 1 hour ago' do
|
||||||
|
let(:updated_at) { 30.minutes.ago }
|
||||||
|
it_behaves_like 'job is unchanged'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job is running' do
|
||||||
|
let(:status) { 'running' }
|
||||||
|
|
||||||
|
context 'when job was not updated for more than 1 hour ago' do
|
||||||
|
let(:updated_at) { 2.hours.ago }
|
||||||
|
it_behaves_like 'job is dropped'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when job was updated in less than 1 hour ago' do
|
||||||
|
let(:updated_at) { 30.minutes.ago }
|
||||||
|
it_behaves_like 'job is unchanged'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
%w(success skipped failed canceled).each do |status|
|
||||||
|
context "when job is #{status}" do
|
||||||
|
let(:status) { status }
|
||||||
|
let(:updated_at) { 2.days.ago }
|
||||||
|
it_behaves_like 'job is unchanged'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'for deleted project' do
|
||||||
|
let(:status) { 'running' }
|
||||||
|
let(:updated_at) { 2.days.ago }
|
||||||
|
|
||||||
|
before { job.project.update(pending_delete: true) }
|
||||||
|
|
||||||
|
it 'does not drop job' do
|
||||||
|
expect_any_instance_of(Ci::Build).not_to receive(:drop)
|
||||||
|
worker.perform
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
describe 'exclusive lease' do
|
||||||
|
let(:status) { 'running' }
|
||||||
|
let(:updated_at) { 2.days.ago }
|
||||||
|
let(:worker2) { described_class.new }
|
||||||
|
|
||||||
|
it 'is guard by exclusive lease when executed concurrently' do
|
||||||
|
expect(worker).to receive(:drop).at_least(:once)
|
||||||
|
expect(worker2).not_to receive(:drop)
|
||||||
|
worker.perform
|
||||||
|
allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(false)
|
||||||
|
worker2.perform
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'can be executed in sequence' do
|
||||||
|
expect(worker).to receive(:drop).at_least(:once)
|
||||||
|
expect(worker2).to receive(:drop).at_least(:once)
|
||||||
|
worker.perform
|
||||||
|
worker2.perform
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'cancels exclusive lease after worker perform' do
|
||||||
|
expect(Gitlab::ExclusiveLease).to receive(:cancel).with(described_class::EXCLUSIVE_LEASE_KEY, exclusive_lease_uuid)
|
||||||
|
worker.perform
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in a new issue