diff --git a/app/workers/stuck_ci_builds_worker.rb b/app/workers/stuck_ci_builds_worker.rb deleted file mode 100644 index b70df5a1afa..00000000000 --- a/app/workers/stuck_ci_builds_worker.rb +++ /dev/null @@ -1,19 +0,0 @@ -class StuckCiBuildsWorker - include Sidekiq::Worker - include CronjobQueue - - BUILD_STUCK_TIMEOUT = 1.day - - def perform - Rails.logger.info 'Cleaning stuck builds' - - builds = Ci::Build.joins(:project).running_or_pending.where('ci_builds.updated_at < ?', BUILD_STUCK_TIMEOUT.ago) - builds.find_each(batch_size: 50).each do |build| - Rails.logger.debug "Dropping stuck #{build.status} build #{build.id} for runner #{build.runner_id}" - build.drop - end - - # Update builds that failed to drop - builds.update_all(status: 'failed') - end -end diff --git a/app/workers/stuck_ci_jobs_worker.rb b/app/workers/stuck_ci_jobs_worker.rb new file mode 100644 index 00000000000..ae8c980c9e4 --- /dev/null +++ b/app/workers/stuck_ci_jobs_worker.rb @@ -0,0 +1,59 @@ +class StuckCiJobsWorker + include Sidekiq::Worker + include CronjobQueue + + EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'.freeze + + BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour + BUILD_PENDING_OUTDATED_TIMEOUT = 1.day + BUILD_PENDING_STUCK_TIMEOUT = 1.hour + + def perform + return unless try_obtain_lease + + Rails.logger.info "#{self.class}: Cleaning stuck builds" + + drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT + drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT + drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT + + remove_lease + end + + private + + def try_obtain_lease + @uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain + end + + def remove_lease + Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid) + end + + def drop(status, timeout) + search(status, timeout) do |build| + drop_build :outdated, build, status, timeout + end + end + + def drop_stuck(status, timeout) + search(status, timeout) do |build| + return unless build.stuck? + drop_build :stuck, build, status, timeout + end + end + + def search(status, timeout) + builds = Ci::Build.where(status: status).where('ci_builds.updated_at < ?', timeout.ago) + builds.joins(:project).includes(:tags, :runner, project: :namespace).find_each(batch_size: 50).each do |build| + yield(build) + end + end + + def drop_build(type, build, status, timeout) + Rails.logger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout})" + Gitlab::OptimisticLocking.retry_lock(build, 3) do |b| + b.drop + end + end +end diff --git a/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml b/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml new file mode 100644 index 00000000000..a4ef2b23aaa --- /dev/null +++ b/changelogs/unreleased/27523-make-stuck-build-detection-more-performant.yml @@ -0,0 +1,4 @@ +--- +title: Make stuck builds detection more performant +merge_request: 9025 +author: diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index a82ff605a70..8f99a4d541f 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -177,9 +177,9 @@ production: &base # Periodically executed jobs, to self-heal Gitlab, do external synchronizations, etc. # Please read here for more information: https://github.com/ondrejbartas/sidekiq-cron#adding-cron-job cron_jobs: - # Flag stuck CI builds as failed - stuck_ci_builds_worker: - cron: "0 0 * * *" + # Flag stuck CI jobs as failed + stuck_ci_jobs_worker: + cron: "0 * * * *" # Remove expired build artifacts expire_build_artifacts_worker: cron: "50 * * * *" diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index c64ae15fa92..5aeaa7eab81 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -308,9 +308,9 @@ Settings.gravatar['host'] = Settings.host_without_www(Settings.gravatar[ # Cron Jobs # Settings['cron_jobs'] ||= Settingslogic.new({}) -Settings.cron_jobs['stuck_ci_builds_worker'] ||= Settingslogic.new({}) -Settings.cron_jobs['stuck_ci_builds_worker']['cron'] ||= '0 0 * * *' -Settings.cron_jobs['stuck_ci_builds_worker']['job_class'] = 'StuckCiBuildsWorker' +Settings.cron_jobs['stuck_ci_jobs_worker'] ||= Settingslogic.new({}) +Settings.cron_jobs['stuck_ci_jobs_worker']['cron'] ||= '0 * * * *' +Settings.cron_jobs['stuck_ci_jobs_worker']['job_class'] = 'StuckCiJobsWorker' Settings.cron_jobs['expire_build_artifacts_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['expire_build_artifacts_worker']['cron'] ||= '50 * * * *' Settings.cron_jobs['expire_build_artifacts_worker']['job_class'] = 'ExpireBuildArtifactsWorker' diff --git a/spec/workers/stuck_ci_builds_worker_spec.rb b/spec/workers/stuck_ci_builds_worker_spec.rb deleted file mode 100644 index 801fa31b45d..00000000000 --- a/spec/workers/stuck_ci_builds_worker_spec.rb +++ /dev/null @@ -1,57 +0,0 @@ -require "spec_helper" - -describe StuckCiBuildsWorker do - let!(:build) { create :ci_build } - let(:worker) { described_class.new } - - subject do - build.reload - build.status - end - - %w(pending running).each do |status| - context "#{status} build" do - before do - build.update!(status: status) - end - - it 'gets dropped if it was updated over 2 days ago' do - build.update!(updated_at: 2.days.ago) - worker.perform - is_expected.to eq('failed') - end - - it "is still #{status}" do - build.update!(updated_at: 1.minute.ago) - worker.perform - is_expected.to eq(status) - end - end - end - - %w(success failed canceled).each do |status| - context "#{status} build" do - before do - build.update!(status: status) - end - - it "is still #{status}" do - build.update!(updated_at: 2.days.ago) - worker.perform - is_expected.to eq(status) - end - end - end - - context "for deleted project" do - before do - build.update!(status: :running, updated_at: 2.days.ago) - build.project.update(pending_delete: true) - end - - it "does not drop build" do - expect_any_instance_of(Ci::Build).not_to receive(:drop) - worker.perform - end - end -end diff --git a/spec/workers/stuck_ci_jobs_worker_spec.rb b/spec/workers/stuck_ci_jobs_worker_spec.rb new file mode 100644 index 00000000000..8434b0c8e5b --- /dev/null +++ b/spec/workers/stuck_ci_jobs_worker_spec.rb @@ -0,0 +1,129 @@ +require 'spec_helper' + +describe StuckCiJobsWorker do + let!(:runner) { create :ci_runner } + let!(:job) { create :ci_build, runner: runner } + let(:worker) { described_class.new } + let(:exclusive_lease_uuid) { SecureRandom.uuid } + + subject do + job.reload + job.status + end + + before do + job.update!(status: status, updated_at: updated_at) + allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(exclusive_lease_uuid) + end + + shared_examples 'job is dropped' do + it 'changes status' do + worker.perform + is_expected.to eq('failed') + end + end + + shared_examples 'job is unchanged' do + it "doesn't change status" do + worker.perform + is_expected.to eq(status) + end + end + + context 'when job is pending' do + let(:status) { 'pending' } + + context 'when job is not stuck' do + before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false) } + + context 'when job was not updated for more than 1 day ago' do + let(:updated_at) { 2.days.ago } + it_behaves_like 'job is dropped' + end + + context 'when job was updated in less than 1 day ago' do + let(:updated_at) { 6.hours.ago } + it_behaves_like 'job is unchanged' + end + + context 'when job was not updated for more than 1 hour ago' do + let(:updated_at) { 2.hours.ago } + it_behaves_like 'job is unchanged' + end + end + + context 'when job is stuck' do + before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true) } + + context 'when job was not updated for more than 1 hour ago' do + let(:updated_at) { 2.hours.ago } + it_behaves_like 'job is dropped' + end + + context 'when job was updated in less than 1 hour ago' do + let(:updated_at) { 30.minutes.ago } + it_behaves_like 'job is unchanged' + end + end + end + + context 'when job is running' do + let(:status) { 'running' } + + context 'when job was not updated for more than 1 hour ago' do + let(:updated_at) { 2.hours.ago } + it_behaves_like 'job is dropped' + end + + context 'when job was updated in less than 1 hour ago' do + let(:updated_at) { 30.minutes.ago } + it_behaves_like 'job is unchanged' + end + end + + %w(success skipped failed canceled).each do |status| + context "when job is #{status}" do + let(:status) { status } + let(:updated_at) { 2.days.ago } + it_behaves_like 'job is unchanged' + end + end + + context 'for deleted project' do + let(:status) { 'running' } + let(:updated_at) { 2.days.ago } + + before { job.project.update(pending_delete: true) } + + it 'does not drop job' do + expect_any_instance_of(Ci::Build).not_to receive(:drop) + worker.perform + end + end + + describe 'exclusive lease' do + let(:status) { 'running' } + let(:updated_at) { 2.days.ago } + let(:worker2) { described_class.new } + + it 'is guard by exclusive lease when executed concurrently' do + expect(worker).to receive(:drop).at_least(:once) + expect(worker2).not_to receive(:drop) + worker.perform + allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(false) + worker2.perform + end + + it 'can be executed in sequence' do + expect(worker).to receive(:drop).at_least(:once) + expect(worker2).to receive(:drop).at_least(:once) + worker.perform + worker2.perform + end + + it 'cancels exclusive lease after worker perform' do + expect(Gitlab::ExclusiveLease).to receive(:cancel).with(described_class::EXCLUSIVE_LEASE_KEY, exclusive_lease_uuid) + worker.perform + end + end +end