From 1f9edb7c4a393a6ebe78e6f98e46515ad655cece Mon Sep 17 00:00:00 2001 From: Aleksei Lipniagov Date: Fri, 2 Aug 2019 09:04:32 +0000 Subject: [PATCH] Call `GC::Profiler.clear` only in one place Previously, both InfluxSampler and RubySampler were relying on the `GC::Profiler.total_time` data which is the sum over the list of captured GC events. Also, both samplers asynchronously called `GC::Profiler.clear` which led to incorrect metric data because each sampler has the wrong assumption it is the only object who calls `GC::Profiler.clear` and thus could rely on the gathered results between such calls. We should ensure that `GC::Profiler.total_time` is called only in one place making it possible to rely on accumulated data between such wipes. Also, we need to track the amount of profiler reports we lost. --- ...63571-fix-gc-profiler-data-being-wiped.yml | 5 ++++ config/initializers/peek.rb | 7 +++++- lib/gitlab/metrics/samplers/influx_sampler.rb | 22 ------------------ lib/gitlab/metrics/samplers/ruby_sampler.rb | 23 ++++++++++++++----- .../metrics/samplers/influx_sampler_spec.rb | 20 ---------------- .../metrics/samplers/ruby_sampler_spec.rb | 20 ++++++++++++---- 6 files changed, 44 insertions(+), 53 deletions(-) create mode 100644 changelogs/unreleased/63571-fix-gc-profiler-data-being-wiped.yml diff --git a/changelogs/unreleased/63571-fix-gc-profiler-data-being-wiped.yml b/changelogs/unreleased/63571-fix-gc-profiler-data-being-wiped.yml new file mode 100644 index 00000000000..7943d9573f3 --- /dev/null +++ b/changelogs/unreleased/63571-fix-gc-profiler-data-being-wiped.yml @@ -0,0 +1,5 @@ +--- +title: Fix GC::Profiler metrics fetching +merge_request: 31331 +author: +type: fixed diff --git a/config/initializers/peek.rb b/config/initializers/peek.rb index 4a2b7931f30..b6c7f1ff4fc 100644 --- a/config/initializers/peek.rb +++ b/config/initializers/peek.rb @@ -9,5 +9,10 @@ Peek.into Peek::Views::ActiveRecord Peek.into Peek::Views::Gitaly Peek.into Peek::Views::RedisDetailed Peek.into Peek::Views::Rugged -Peek.into Peek::Views::GC + +# `Peek::Views::GC` is currently disabled in production, as it runs with every request +# even if PerformanceBar is inactive and clears `GC::Profiler` reports we need for metrics. +# Check https://gitlab.com/gitlab-org/gitlab-ce/issues/65455 +Peek.into Peek::Views::GC if Rails.env.development? + Peek.into Peek::Views::Tracing if Labkit::Tracing.tracing_url_enabled? diff --git a/lib/gitlab/metrics/samplers/influx_sampler.rb b/lib/gitlab/metrics/samplers/influx_sampler.rb index 5138b37f83e..1eae0a7bf45 100644 --- a/lib/gitlab/metrics/samplers/influx_sampler.rb +++ b/lib/gitlab/metrics/samplers/influx_sampler.rb @@ -15,19 +15,14 @@ module Gitlab @last_step = nil @metrics = [] - - @last_minor_gc = Delta.new(GC.stat[:minor_gc_count]) - @last_major_gc = Delta.new(GC.stat[:major_gc_count]) end def sample sample_memory_usage sample_file_descriptors - sample_gc flush ensure - GC::Profiler.clear @metrics.clear end @@ -43,23 +38,6 @@ module Gitlab add_metric('file_descriptors', value: System.file_descriptor_count) end - def sample_gc - time = GC::Profiler.total_time * 1000.0 - stats = GC.stat.merge(total_time: time) - - # We want the difference of GC runs compared to the last sample, not the - # total amount since the process started. - stats[:minor_gc_count] = - @last_minor_gc.compared_with(stats[:minor_gc_count]) - - stats[:major_gc_count] = - @last_major_gc.compared_with(stats[:major_gc_count]) - - stats[:count] = stats[:minor_gc_count] + stats[:major_gc_count] - - add_metric('gc_statistics', stats) - end - def add_metric(series, values, tags = {}) prefix = sidekiq? ? 'sidekiq_' : 'rails_' diff --git a/lib/gitlab/metrics/samplers/ruby_sampler.rb b/lib/gitlab/metrics/samplers/ruby_sampler.rb index 1e200db0baf..3bfa3da35e0 100644 --- a/lib/gitlab/metrics/samplers/ruby_sampler.rb +++ b/lib/gitlab/metrics/samplers/ruby_sampler.rb @@ -6,7 +6,11 @@ module Gitlab module Metrics module Samplers class RubySampler < BaseSampler + GC_REPORT_BUCKETS = [0.001, 0.002, 0.005, 0.01, 0.05, 0.1, 0.5].freeze + def initialize(interval) + GC::Profiler.clear + metrics[:process_start_time_seconds].set(labels, Time.now.to_i) super @@ -37,7 +41,7 @@ module Gitlab process_resident_memory_bytes: ::Gitlab::Metrics.gauge(with_prefix(:process, :resident_memory_bytes), 'Memory used', labels), process_start_time_seconds: ::Gitlab::Metrics.gauge(with_prefix(:process, :start_time_seconds), 'Process start time seconds'), sampler_duration: ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels), - total_time: ::Gitlab::Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels) + gc_duration_seconds: ::Gitlab::Metrics.histogram(with_prefix(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS) } GC.stat.keys.each do |key| @@ -57,20 +61,27 @@ module Gitlab sample_gc metrics[:sampler_duration].increment(labels, System.monotonic_time - start_time) - ensure - GC::Profiler.clear end private def sample_gc - # Collect generic GC stats. + # Observe all GC samples + sample_gc_reports.each do |report| + metrics[:gc_duration_seconds].observe(labels, report[:GC_TIME]) + end + + # Collect generic GC stats GC.stat.each do |key, value| metrics[key].set(labels, value) end + end - # Collect the GC time since last sample in float seconds. - metrics[:total_time].increment(labels, GC::Profiler.total_time) + def sample_gc_reports + GC::Profiler.enable + GC::Profiler.raw_data + ensure + GC::Profiler.clear end def set_memory_usage_metrics diff --git a/spec/lib/gitlab/metrics/samplers/influx_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/influx_sampler_spec.rb index 81954fcf8c5..2923048f742 100644 --- a/spec/lib/gitlab/metrics/samplers/influx_sampler_spec.rb +++ b/spec/lib/gitlab/metrics/samplers/influx_sampler_spec.rb @@ -17,18 +17,10 @@ describe Gitlab::Metrics::Samplers::InfluxSampler do it 'samples various statistics' do expect(sampler).to receive(:sample_memory_usage) expect(sampler).to receive(:sample_file_descriptors) - expect(sampler).to receive(:sample_gc) expect(sampler).to receive(:flush) sampler.sample end - - it 'clears any GC profiles' do - expect(sampler).to receive(:flush) - expect(GC::Profiler).to receive(:clear) - - sampler.sample - end end describe '#flush' do @@ -67,18 +59,6 @@ describe Gitlab::Metrics::Samplers::InfluxSampler do end end - describe '#sample_gc' do - it 'adds a metric containing garbage collection statistics' do - expect(GC::Profiler).to receive(:total_time).and_return(0.24) - - expect(sampler).to receive(:add_metric) - .with(/gc_statistics/, an_instance_of(Hash)) - .and_call_original - - sampler.sample_gc - end - end - describe '#add_metric' do it 'prefixes the series name for a Rails process' do expect(sampler).to receive(:sidekiq?).and_return(false) diff --git a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb index 4d93b70e6e3..5005a5d9ebc 100644 --- a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb +++ b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb @@ -59,17 +59,29 @@ describe Gitlab::Metrics::Samplers::RubySampler do end it 'clears any GC profiles' do - expect(GC::Profiler).to receive(:clear) + expect(GC::Profiler).to receive(:clear).at_least(:once) sampler.sample end end describe '#sample_gc' do - it 'adds a metric containing garbage collection time statistics' do - expect(GC::Profiler).to receive(:total_time).and_return(0.24) + let!(:sampler) { described_class.new(5) } - expect(sampler.metrics[:total_time]).to receive(:increment).with({}, 0.24) + let(:gc_reports) { [{ GC_TIME: 0.1 }, { GC_TIME: 0.2 }, { GC_TIME: 0.3 }] } + + it 're-enables GC::Profiler if needed' do + expect(GC::Profiler).to receive(:enable) + + sampler.sample + end + + it 'observes GC cycles time' do + expect(sampler).to receive(:sample_gc_reports).and_return(gc_reports) + + expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.1).ordered + expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.2).ordered + expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.3).ordered sampler.sample end