From 6a18a411a30e9e7406ba9335ab502ec396add662 Mon Sep 17 00:00:00 2001 From: Shinya Maeda Date: Fri, 17 May 2019 19:10:44 +0700 Subject: [PATCH] Make pipeline schedule worker resilient Currently, pipeline schedule worker is unstable because it's sometimes killed by excessive memory consumption. In order to improve the performance, we add the following fixes: 1. next_run_at is always real_next_run, which means the value always takes into account of worker's cron schedule 1. Remove exlusive lock. This is already covered by real_next_run change. 1. Use RunPipelineScheduleWorker for avoiding memory killer. Memory consumption is spread to the multiple sidekiq worker. --- app/models/ci/pipeline_schedule.rb | 25 ++- app/services/ci/pipeline_schedule_service.rb | 13 ++ app/workers/pipeline_schedule_worker.rb | 41 +---- app/workers/run_pipeline_schedule_worker.rb | 26 +++- ...reventing-duplciate-pipeline-creations.yml | 5 + spec/factories/ci/pipeline_schedule.rb | 10 ++ .../projects/pipeline_schedules_spec.rb | 2 +- spec/models/ci/pipeline_schedule_spec.rb | 145 ++++++++++++------ .../ci/pipeline_schedule_service_spec.rb | 28 ++++ spec/workers/pipeline_schedule_worker_spec.rb | 57 +------ .../run_pipeline_schedule_worker_spec.rb | 32 +++- 11 files changed, 234 insertions(+), 150 deletions(-) create mode 100644 app/services/ci/pipeline_schedule_service.rb create mode 100644 changelogs/unreleased/set-real-next-run-at-for-preventing-duplciate-pipeline-creations.yml create mode 100644 spec/services/ci/pipeline_schedule_service_spec.rb diff --git a/app/models/ci/pipeline_schedule.rb b/app/models/ci/pipeline_schedule.rb index c0a0ca9acf6..c40ad39be61 100644 --- a/app/models/ci/pipeline_schedule.rb +++ b/app/models/ci/pipeline_schedule.rb @@ -27,9 +27,13 @@ module Ci scope :active, -> { where(active: true) } scope :inactive, -> { where(active: false) } + scope :runnable_schedules, -> { active.where("next_run_at < ?", Time.now) } + scope :preloaded, -> { preload(:owner, :project) } accepts_nested_attributes_for :variables, allow_destroy: true + alias_attribute :real_next_run, :next_run_at + def owned_by?(current_user) owner == current_user end @@ -46,8 +50,14 @@ module Ci update_attribute(:active, false) end + ## + # The `next_run_at` column is set to the actual execution date of `PipelineScheduleWorker`. + # This way, a schedule like `*/1 * * * *` won't be triggered in a short interval + # when PipelineScheduleWorker runs irregularly by Sidekiq Memory Killer. def set_next_run_at - self.next_run_at = Gitlab::Ci::CronParser.new(cron, cron_timezone).next_time_from(Time.now) + self.next_run_at = Gitlab::Ci::CronParser.new(Settings.cron_jobs['pipeline_schedule_worker']['cron'], + Time.zone.name) + .next_time_from(ideal_next_run_at) end def schedule_next_run! @@ -56,15 +66,14 @@ module Ci update_attribute(:next_run_at, nil) # update without validation end - def real_next_run( - worker_cron: Settings.cron_jobs['pipeline_schedule_worker']['cron'], - worker_time_zone: Time.zone.name) - Gitlab::Ci::CronParser.new(worker_cron, worker_time_zone) - .next_time_from(next_run_at) - end - def job_variables variables&.map(&:to_runner_variable) || [] end + + private + + def ideal_next_run_at + Gitlab::Ci::CronParser.new(cron, cron_timezone).next_time_from(Time.now) + end end end diff --git a/app/services/ci/pipeline_schedule_service.rb b/app/services/ci/pipeline_schedule_service.rb new file mode 100644 index 00000000000..387d0351490 --- /dev/null +++ b/app/services/ci/pipeline_schedule_service.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Ci + class PipelineScheduleService < BaseService + def execute(schedule) + # Ensure `next_run_at` is set properly before creating a pipeline. + # Otherwise, multiple pipelines could be created in a short interval. + schedule.schedule_next_run! + + RunPipelineScheduleWorker.perform_async(schedule.id, schedule.owner.id) + end + end +end diff --git a/app/workers/pipeline_schedule_worker.rb b/app/workers/pipeline_schedule_worker.rb index 8a9ee7808e4..9410fd1a786 100644 --- a/app/workers/pipeline_schedule_worker.rb +++ b/app/workers/pipeline_schedule_worker.rb @@ -3,47 +3,12 @@ class PipelineScheduleWorker include ApplicationWorker include CronjobQueue - include ::Gitlab::ExclusiveLeaseHelpers - EXCLUSIVE_LOCK_KEY = 'pipeline_schedules:run:lock' - LOCK_TIMEOUT = 50.minutes - - # rubocop: disable CodeReuse/ActiveRecord def perform - in_lock(EXCLUSIVE_LOCK_KEY, ttl: LOCK_TIMEOUT, retries: 1) do - Ci::PipelineSchedule.active.where("next_run_at < ?", Time.now) - .preload(:owner, :project).find_each do |schedule| - - schedule.schedule_next_run! - - Ci::CreatePipelineService.new(schedule.project, - schedule.owner, - ref: schedule.ref) - .execute!(:schedule, ignore_skip_ci: true, save_on_errors: true, schedule: schedule) - rescue => e - error(schedule, e) + Ci::PipelineSchedule.runnable_schedules.preloaded.find_in_batches do |schedules| + schedules.each do |schedule| + Ci::PipelineScheduleService.new(schedule.project, schedule.owner).execute(schedule) end end end - # rubocop: enable CodeReuse/ActiveRecord - - private - - def error(schedule, error) - failed_creation_counter.increment - - Rails.logger.error "Failed to create a scheduled pipeline. " \ - "schedule_id: #{schedule.id} message: #{error.message}" - - Gitlab::Sentry - .track_exception(error, - issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/41231', - extra: { schedule_id: schedule.id }) - end - - def failed_creation_counter - @failed_creation_counter ||= - Gitlab::Metrics.counter(:pipeline_schedule_creation_failed_total, - "Counter of failed attempts of pipeline schedule creation") - end end diff --git a/app/workers/run_pipeline_schedule_worker.rb b/app/workers/run_pipeline_schedule_worker.rb index f72331c003a..43e0b9db22f 100644 --- a/app/workers/run_pipeline_schedule_worker.rb +++ b/app/workers/run_pipeline_schedule_worker.rb @@ -21,6 +21,30 @@ class RunPipelineScheduleWorker Ci::CreatePipelineService.new(schedule.project, user, ref: schedule.ref) - .execute(:schedule, ignore_skip_ci: true, save_on_errors: false, schedule: schedule) + .execute!(:schedule, ignore_skip_ci: true, save_on_errors: false, schedule: schedule) + rescue Ci::CreatePipelineService::CreateError + # no-op. This is a user operation error such as corrupted .gitlab-ci.yml. + rescue => e + error(schedule, e) + end + + private + + def error(schedule, error) + failed_creation_counter.increment + + Rails.logger.error "Failed to create a scheduled pipeline. " \ + "schedule_id: #{schedule.id} message: #{error.message}" + + Gitlab::Sentry + .track_exception(error, + issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/41231', + extra: { schedule_id: schedule.id }) + end + + def failed_creation_counter + @failed_creation_counter ||= + Gitlab::Metrics.counter(:pipeline_schedule_creation_failed_total, + "Counter of failed attempts of pipeline schedule creation") end end diff --git a/changelogs/unreleased/set-real-next-run-at-for-preventing-duplciate-pipeline-creations.yml b/changelogs/unreleased/set-real-next-run-at-for-preventing-duplciate-pipeline-creations.yml new file mode 100644 index 00000000000..04eb035b157 --- /dev/null +++ b/changelogs/unreleased/set-real-next-run-at-for-preventing-duplciate-pipeline-creations.yml @@ -0,0 +1,5 @@ +--- +title: Make pipeline schedule worker resilient +merge_request: 28407 +author: +type: performance diff --git a/spec/factories/ci/pipeline_schedule.rb b/spec/factories/ci/pipeline_schedule.rb index b2b79807429..4b83ba2ac1b 100644 --- a/spec/factories/ci/pipeline_schedule.rb +++ b/spec/factories/ci/pipeline_schedule.rb @@ -7,6 +7,16 @@ FactoryBot.define do description "pipeline schedule" project + trait :every_minute do + cron '*/1 * * * *' + cron_timezone Gitlab::Ci::CronParser::VALID_SYNTAX_SAMPLE_TIME_ZONE + end + + trait :hourly do + cron '* */1 * * *' + cron_timezone Gitlab::Ci::CronParser::VALID_SYNTAX_SAMPLE_TIME_ZONE + end + trait :nightly do cron '0 1 * * *' cron_timezone Gitlab::Ci::CronParser::VALID_SYNTAX_SAMPLE_TIME_ZONE diff --git a/spec/features/projects/pipeline_schedules_spec.rb b/spec/features/projects/pipeline_schedules_spec.rb index b1a705f09ce..24041a51383 100644 --- a/spec/features/projects/pipeline_schedules_spec.rb +++ b/spec/features/projects/pipeline_schedules_spec.rb @@ -225,7 +225,7 @@ describe 'Pipeline Schedules', :js do context 'when active is true and next_run_at is NULL' do before do create(:ci_pipeline_schedule, project: project, owner: user).tap do |pipeline_schedule| - pipeline_schedule.update_attribute(:cron, nil) # Consequently next_run_at will be nil + pipeline_schedule.update_attribute(:next_run_at, nil) # Consequently next_run_at will be nil end end diff --git a/spec/models/ci/pipeline_schedule_spec.rb b/spec/models/ci/pipeline_schedule_spec.rb index 42d4769a921..6382be73ea7 100644 --- a/spec/models/ci/pipeline_schedule_spec.rb +++ b/spec/models/ci/pipeline_schedule_spec.rb @@ -48,32 +48,116 @@ describe Ci::PipelineSchedule do end end + describe '.runnable_schedules' do + subject { described_class.runnable_schedules } + + let!(:pipeline_schedule) do + Timecop.freeze(1.day.ago) do + create(:ci_pipeline_schedule, :hourly) + end + end + + it 'returns the runnable schedule' do + is_expected.to eq([pipeline_schedule]) + end + + context 'when there are no runnable schedules' do + let!(:pipeline_schedule) { } + + it 'returns an empty array' do + is_expected.to be_empty + end + end + end + + describe '.preloaded' do + subject { described_class.preloaded } + + before do + create_list(:ci_pipeline_schedule, 3) + end + + it 'preloads the associations' do + subject + + query = ActiveRecord::QueryRecorder.new { subject.each(&:project) } + + expect(query.count).to eq(2) + end + end + describe '#set_next_run_at' do - let!(:pipeline_schedule) { create(:ci_pipeline_schedule, :nightly) } + let(:pipeline_schedule) { create(:ci_pipeline_schedule, :nightly) } + let(:ideal_next_run_at) { pipeline_schedule.send(:ideal_next_run_at) } + + let(:expected_next_run_at) do + Gitlab::Ci::CronParser.new(Settings.cron_jobs['pipeline_schedule_worker']['cron'], Time.zone.name) + .next_time_from(ideal_next_run_at) + end + + let(:cron_worker_next_run_at) do + Gitlab::Ci::CronParser.new(Settings.cron_jobs['pipeline_schedule_worker']['cron'], Time.zone.name) + .next_time_from(Time.now) + end context 'when creates new pipeline schedule' do - let(:expected_next_run_at) do - Gitlab::Ci::CronParser.new(pipeline_schedule.cron, pipeline_schedule.cron_timezone) - .next_time_from(Time.now) + it 'updates next_run_at automatically' do + expect(pipeline_schedule.next_run_at).to eq(expected_next_run_at) + end + end + + context 'when PipelineScheduleWorker runs at a specific interval' do + before do + allow(Settings).to receive(:cron_jobs) do + { + 'pipeline_schedule_worker' => { + 'cron' => '0 1 2 3 *' + } + } + end end - it 'updates next_run_at automatically' do - expect(described_class.last.next_run_at).to eq(expected_next_run_at) + it "updates next_run_at to the sidekiq worker's execution time" do + expect(pipeline_schedule.next_run_at.min).to eq(0) + expect(pipeline_schedule.next_run_at.hour).to eq(1) + expect(pipeline_schedule.next_run_at.day).to eq(2) + expect(pipeline_schedule.next_run_at.month).to eq(3) + end + end + + context 'when pipeline schedule runs every minute' do + let(:pipeline_schedule) { create(:ci_pipeline_schedule, :every_minute) } + + it "updates next_run_at to the sidekiq worker's execution time" do + expect(pipeline_schedule.next_run_at).to eq(cron_worker_next_run_at) + end + end + + context 'when there are two different pipeline schedules in different time zones' do + let(:pipeline_schedule_1) { create(:ci_pipeline_schedule, :weekly, cron_timezone: 'Eastern Time (US & Canada)') } + let(:pipeline_schedule_2) { create(:ci_pipeline_schedule, :weekly, cron_timezone: 'UTC') } + + it 'sets different next_run_at' do + expect(pipeline_schedule_1.next_run_at).not_to eq(pipeline_schedule_2.next_run_at) + end + end + + context 'when there are two different pipeline schedules in the same time zones' do + let(:pipeline_schedule_1) { create(:ci_pipeline_schedule, :weekly, cron_timezone: 'UTC') } + let(:pipeline_schedule_2) { create(:ci_pipeline_schedule, :weekly, cron_timezone: 'UTC') } + + it 'sets the sames next_run_at' do + expect(pipeline_schedule_1.next_run_at).to eq(pipeline_schedule_2.next_run_at) end end context 'when updates cron of exsisted pipeline schedule' do let(:new_cron) { '0 0 1 1 *' } - let(:expected_next_run_at) do - Gitlab::Ci::CronParser.new(new_cron, pipeline_schedule.cron_timezone) - .next_time_from(Time.now) - end - it 'updates next_run_at automatically' do pipeline_schedule.update!(cron: new_cron) - expect(described_class.last.next_run_at).to eq(expected_next_run_at) + expect(pipeline_schedule.next_run_at).to eq(expected_next_run_at) end end end @@ -83,10 +167,11 @@ describe Ci::PipelineSchedule do context 'when reschedules after 10 days from now' do let(:future_time) { 10.days.from_now } + let(:ideal_next_run_at) { pipeline_schedule.send(:ideal_next_run_at) } let(:expected_next_run_at) do - Gitlab::Ci::CronParser.new(pipeline_schedule.cron, pipeline_schedule.cron_timezone) - .next_time_from(future_time) + Gitlab::Ci::CronParser.new(Settings.cron_jobs['pipeline_schedule_worker']['cron'], Time.zone.name) + .next_time_from(ideal_next_run_at) end it 'points to proper next_run_at' do @@ -99,38 +184,6 @@ describe Ci::PipelineSchedule do end end - describe '#real_next_run' do - subject do - described_class.last.real_next_run(worker_cron: worker_cron, - worker_time_zone: worker_time_zone) - end - - context 'when GitLab time_zone is UTC' do - before do - allow(Time).to receive(:zone) - .and_return(ActiveSupport::TimeZone[worker_time_zone]) - end - - let(:worker_time_zone) { 'UTC' } - - context 'when cron_timezone is Eastern Time (US & Canada)' do - before do - create(:ci_pipeline_schedule, :nightly, - cron_timezone: 'Eastern Time (US & Canada)') - end - - let(:worker_cron) { '0 1 2 3 *' } - - it 'returns the next time worker executes' do - expect(subject.min).to eq(0) - expect(subject.hour).to eq(1) - expect(subject.day).to eq(2) - expect(subject.month).to eq(3) - end - end - end - end - describe '#job_variables' do let!(:pipeline_schedule) { create(:ci_pipeline_schedule) } diff --git a/spec/services/ci/pipeline_schedule_service_spec.rb b/spec/services/ci/pipeline_schedule_service_spec.rb new file mode 100644 index 00000000000..f2ac53cb25a --- /dev/null +++ b/spec/services/ci/pipeline_schedule_service_spec.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe Ci::PipelineScheduleService do + let(:project) { create(:project) } + let(:user) { create(:user) } + let(:service) { described_class.new(project, user) } + + describe '#execute' do + subject { service.execute(schedule) } + + let(:schedule) { create(:ci_pipeline_schedule, project: project, owner: user) } + + it 'schedules next run' do + expect(schedule).to receive(:schedule_next_run!) + + subject + end + + it 'runs RunPipelineScheduleWorker' do + expect(RunPipelineScheduleWorker) + .to receive(:perform_async).with(schedule.id, schedule.owner.id) + + subject + end + end +end diff --git a/spec/workers/pipeline_schedule_worker_spec.rb b/spec/workers/pipeline_schedule_worker_spec.rb index 8c604b13297..9326db34209 100644 --- a/spec/workers/pipeline_schedule_worker_spec.rb +++ b/spec/workers/pipeline_schedule_worker_spec.rb @@ -41,16 +41,6 @@ describe PipelineScheduleWorker do it_behaves_like 'successful scheduling' - context 'when exclusive lease has already been taken by the other instance' do - before do - stub_exclusive_lease_taken(described_class::EXCLUSIVE_LOCK_KEY, timeout: described_class::LOCK_TIMEOUT) - end - - it 'raises an error and does not start creating pipelines' do - expect { subject }.to raise_error(Gitlab::ExclusiveLeaseHelpers::FailedToObtainLockError) - end - end - context 'when the latest commit contains [ci skip]' do before do allow_any_instance_of(Ci::Pipeline) @@ -77,47 +67,19 @@ describe PipelineScheduleWorker do stub_ci_pipeline_yaml_file(YAML.dump(rspec: { variables: 'rspec' } )) end - it 'creates a failed pipeline with the reason' do - expect { subject }.to change { project.ci_pipelines.count }.by(1) - expect(Ci::Pipeline.last).to be_config_error - expect(Ci::Pipeline.last.yaml_errors).not_to be_nil + it 'does not creates a new pipeline' do + expect { subject }.not_to change { project.ci_pipelines.count } end end end context 'when the schedule is not runnable by the user' do - before do - expect(Gitlab::Sentry) - .to receive(:track_exception) - .with(Ci::CreatePipelineService::CreateError, - issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/41231', - extra: { schedule_id: pipeline_schedule.id } ).once - end - it 'does not deactivate the schedule' do subject expect(pipeline_schedule.reload.active).to be_truthy end - it 'increments Prometheus counter' do - expect(Gitlab::Metrics) - .to receive(:counter) - .with(:pipeline_schedule_creation_failed_total, "Counter of failed attempts of pipeline schedule creation") - .and_call_original - - subject - end - - it 'logging a pipeline error' do - expect(Rails.logger) - .to receive(:error) - .with(a_string_matching("Insufficient permissions to create a new pipeline")) - .and_call_original - - subject - end - it 'does not create a pipeline' do expect { subject }.not_to change { project.ci_pipelines.count } end @@ -131,21 +93,6 @@ describe PipelineScheduleWorker do before do stub_ci_pipeline_yaml_file(nil) project.add_maintainer(user) - - expect(Gitlab::Sentry) - .to receive(:track_exception) - .with(Ci::CreatePipelineService::CreateError, - issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/41231', - extra: { schedule_id: pipeline_schedule.id } ).once - end - - it 'logging a pipeline error' do - expect(Rails.logger) - .to receive(:error) - .with(a_string_matching("Missing .gitlab-ci.yml file")) - .and_call_original - - subject end it 'does not create a pipeline' do diff --git a/spec/workers/run_pipeline_schedule_worker_spec.rb b/spec/workers/run_pipeline_schedule_worker_spec.rb index 690af22f4dc..7414470f8e7 100644 --- a/spec/workers/run_pipeline_schedule_worker_spec.rb +++ b/spec/workers/run_pipeline_schedule_worker_spec.rb @@ -32,7 +32,37 @@ describe RunPipelineScheduleWorker do it 'calls the Service' do expect(Ci::CreatePipelineService).to receive(:new).with(project, user, ref: pipeline_schedule.ref).and_return(create_pipeline_service) - expect(create_pipeline_service).to receive(:execute).with(:schedule, ignore_skip_ci: true, save_on_errors: false, schedule: pipeline_schedule) + expect(create_pipeline_service).to receive(:execute!).with(:schedule, ignore_skip_ci: true, save_on_errors: false, schedule: pipeline_schedule) + + worker.perform(pipeline_schedule.id, user.id) + end + end + + context 'when database statement timeout happens' do + before do + allow(Ci::CreatePipelineService).to receive(:new) { raise ActiveRecord::StatementInvalid } + + expect(Gitlab::Sentry) + .to receive(:track_exception) + .with(ActiveRecord::StatementInvalid, + issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/41231', + extra: { schedule_id: pipeline_schedule.id } ).once + end + + it 'increments Prometheus counter' do + expect(Gitlab::Metrics) + .to receive(:counter) + .with(:pipeline_schedule_creation_failed_total, "Counter of failed attempts of pipeline schedule creation") + .and_call_original + + worker.perform(pipeline_schedule.id, user.id) + end + + it 'logging a pipeline error' do + expect(Rails.logger) + .to receive(:error) + .with(a_string_matching('ActiveRecord::StatementInvalid')) + .and_call_original worker.perform(pipeline_schedule.id, user.id) end