diff --git a/app/services/ci/register_job_service.rb b/app/services/ci/register_job_service.rb index e09b445636f..d46dcff34a1 100644 --- a/app/services/ci/register_job_service.rb +++ b/app/services/ci/register_job_service.rb @@ -4,6 +4,9 @@ module Ci class RegisterJobService attr_reader :runner + JOB_QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30].freeze + JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze + Result = Struct.new(:build, :valid?) def initialize(runner) @@ -104,10 +107,22 @@ module Ci end def register_success(job) - job_queue_duration_seconds.observe({ shared_runner: @runner.shared? }, Time.now - job.created_at) + labels = { shared_runner: runner.shared?, + jobs_running_for_project: jobs_running_for_project(job) } + + job_queue_duration_seconds.observe(labels, Time.now - job.queued_at) attempt_counter.increment end + def jobs_running_for_project(job) + return '+Inf' unless runner.shared? + + # excluding currently started job + running_jobs_count = job.project.builds.running.where(runner: Ci::Runner.shared) + .limit(JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET + 1).count - 1 + running_jobs_count < JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET ? running_jobs_count : "#{JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET}+" + end + def failed_attempt_counter @failed_attempt_counter ||= Gitlab::Metrics.counter(:job_register_attempts_failed_total, "Counts the times a runner tries to register a job") end @@ -117,7 +132,7 @@ module Ci end def job_queue_duration_seconds - @job_queue_duration_seconds ||= Gitlab::Metrics.histogram(:job_queue_duration_seconds, 'Request handling execution time') + @job_queue_duration_seconds ||= Gitlab::Metrics.histogram(:job_queue_duration_seconds, 'Request handling execution time', {}, JOB_QUEUE_DURATION_SECONDS_BUCKETS) end end end diff --git a/changelogs/unreleased/improve-jobs-queuing-time-metric.yml b/changelogs/unreleased/improve-jobs-queuing-time-metric.yml new file mode 100644 index 00000000000..cee8b8523fd --- /dev/null +++ b/changelogs/unreleased/improve-jobs-queuing-time-metric.yml @@ -0,0 +1,5 @@ +--- +title: Partition job_queue_duration_seconds with jobs_running_for_project +merge_request: 17730 +author: +type: changed diff --git a/spec/factories/ci/builds.rb b/spec/factories/ci/builds.rb index f6ba3a581ca..30235a3a876 100644 --- a/spec/factories/ci/builds.rb +++ b/spec/factories/ci/builds.rb @@ -62,6 +62,7 @@ FactoryBot.define do end trait :pending do + queued_at 'Di 29. Okt 09:50:59 CET 2013' status 'pending' end diff --git a/spec/services/ci/register_job_service_spec.rb b/spec/services/ci/register_job_service_spec.rb index 97a563c1ce1..aa7cc268dd7 100644 --- a/spec/services/ci/register_job_service_spec.rb +++ b/spec/services/ci/register_job_service_spec.rb @@ -370,10 +370,89 @@ module Ci it_behaves_like 'validation is not active' end end + end - def execute(runner) - described_class.new(runner).execute.build + describe '#register_success' do + let!(:current_time) { Time.new(2018, 4, 5, 14, 0, 0) } + let!(:attempt_counter) { double('Gitlab::Metrics::NullMetric') } + let!(:job_queue_duration_seconds) { double('Gitlab::Metrics::NullMetric') } + + before do + allow(Time).to receive(:now).and_return(current_time) + + # Stub defaults for any metrics other than the ones we're testing + allow(Gitlab::Metrics).to receive(:counter) + .with(any_args) + .and_return(Gitlab::Metrics::NullMetric.instance) + allow(Gitlab::Metrics).to receive(:histogram) + .with(any_args) + .and_return(Gitlab::Metrics::NullMetric.instance) + + # Stub tested metrics + allow(Gitlab::Metrics).to receive(:counter) + .with(:job_register_attempts_total, anything) + .and_return(attempt_counter) + allow(Gitlab::Metrics).to receive(:histogram) + .with(:job_queue_duration_seconds, anything, anything, anything) + .and_return(job_queue_duration_seconds) + + project.update(shared_runners_enabled: true) + pending_job.update(created_at: current_time - 3600, queued_at: current_time - 1800) end + + shared_examples 'metrics collector' do + it 'increments attempt counter' do + allow(job_queue_duration_seconds).to receive(:observe) + expect(attempt_counter).to receive(:increment) + + execute(runner) + end + + it 'counts job queuing time histogram with expected labels' do + allow(attempt_counter).to receive(:increment) + expect(job_queue_duration_seconds).to receive(:observe) + .with({ shared_runner: expected_shared_runner, + jobs_running_for_project: expected_jobs_running_for_project_first_job }, 1800) + + execute(runner) + end + + context 'when project already has running jobs' do + let!(:build2) { create( :ci_build, :running, pipeline: pipeline, runner: shared_runner) } + let!(:build3) { create( :ci_build, :running, pipeline: pipeline, runner: shared_runner) } + + it 'counts job queuing time histogram with expected labels' do + allow(attempt_counter).to receive(:increment) + expect(job_queue_duration_seconds).to receive(:observe) + .with({ shared_runner: expected_shared_runner, + jobs_running_for_project: expected_jobs_running_for_project_third_job }, 1800) + + execute(runner) + end + end + end + + context 'when shared runner is used' do + let(:runner) { shared_runner } + let(:expected_shared_runner) { true } + let(:expected_jobs_running_for_project_first_job) { 0 } + let(:expected_jobs_running_for_project_third_job) { 2 } + + it_behaves_like 'metrics collector' + end + + context 'when specific runner is used' do + let(:runner) { specific_runner } + let(:expected_shared_runner) { false } + let(:expected_jobs_running_for_project_first_job) { '+Inf' } + let(:expected_jobs_running_for_project_third_job) { '+Inf' } + + it_behaves_like 'metrics collector' + end + end + + def execute(runner) + described_class.new(runner).execute.build end end end