Call GC::Profiler.clear only in one place

Previously, both InfluxSampler and RubySampler were relying on the
`GC::Profiler.total_time` data which is the sum over the list
of captured GC events. Also, both samplers asynchronously called
`GC::Profiler.clear` which led to incorrect metric data because
each sampler has the wrong assumption it is the only object who calls
`GC::Profiler.clear` and thus could rely on the gathered results between
such calls.

We should ensure that `GC::Profiler.total_time` is called only in one
place making it possible to rely on accumulated data between such wipes.

Also, we need to track the amount of profiler reports we lost.
This commit is contained in:
Aleksei Lipniagov 2019-08-02 09:04:32 +00:00 committed by Kamil Trzciński
parent ebdd3a233e
commit 1f9edb7c4a
6 changed files with 44 additions and 53 deletions

View file

@ -0,0 +1,5 @@
---
title: Fix GC::Profiler metrics fetching
merge_request: 31331
author:
type: fixed

View file

@ -9,5 +9,10 @@ Peek.into Peek::Views::ActiveRecord
Peek.into Peek::Views::Gitaly
Peek.into Peek::Views::RedisDetailed
Peek.into Peek::Views::Rugged
Peek.into Peek::Views::GC
# `Peek::Views::GC` is currently disabled in production, as it runs with every request
# even if PerformanceBar is inactive and clears `GC::Profiler` reports we need for metrics.
# Check https://gitlab.com/gitlab-org/gitlab-ce/issues/65455
Peek.into Peek::Views::GC if Rails.env.development?
Peek.into Peek::Views::Tracing if Labkit::Tracing.tracing_url_enabled?

View file

@ -15,19 +15,14 @@ module Gitlab
@last_step = nil
@metrics = []
@last_minor_gc = Delta.new(GC.stat[:minor_gc_count])
@last_major_gc = Delta.new(GC.stat[:major_gc_count])
end
def sample
sample_memory_usage
sample_file_descriptors
sample_gc
flush
ensure
GC::Profiler.clear
@metrics.clear
end
@ -43,23 +38,6 @@ module Gitlab
add_metric('file_descriptors', value: System.file_descriptor_count)
end
def sample_gc
time = GC::Profiler.total_time * 1000.0
stats = GC.stat.merge(total_time: time)
# We want the difference of GC runs compared to the last sample, not the
# total amount since the process started.
stats[:minor_gc_count] =
@last_minor_gc.compared_with(stats[:minor_gc_count])
stats[:major_gc_count] =
@last_major_gc.compared_with(stats[:major_gc_count])
stats[:count] = stats[:minor_gc_count] + stats[:major_gc_count]
add_metric('gc_statistics', stats)
end
def add_metric(series, values, tags = {})
prefix = sidekiq? ? 'sidekiq_' : 'rails_'

View file

@ -6,7 +6,11 @@ module Gitlab
module Metrics
module Samplers
class RubySampler < BaseSampler
GC_REPORT_BUCKETS = [0.001, 0.002, 0.005, 0.01, 0.05, 0.1, 0.5].freeze
def initialize(interval)
GC::Profiler.clear
metrics[:process_start_time_seconds].set(labels, Time.now.to_i)
super
@ -37,7 +41,7 @@ module Gitlab
process_resident_memory_bytes: ::Gitlab::Metrics.gauge(with_prefix(:process, :resident_memory_bytes), 'Memory used', labels),
process_start_time_seconds: ::Gitlab::Metrics.gauge(with_prefix(:process, :start_time_seconds), 'Process start time seconds'),
sampler_duration: ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels),
total_time: ::Gitlab::Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels)
gc_duration_seconds: ::Gitlab::Metrics.histogram(with_prefix(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS)
}
GC.stat.keys.each do |key|
@ -57,20 +61,27 @@ module Gitlab
sample_gc
metrics[:sampler_duration].increment(labels, System.monotonic_time - start_time)
ensure
GC::Profiler.clear
end
private
def sample_gc
# Collect generic GC stats.
# Observe all GC samples
sample_gc_reports.each do |report|
metrics[:gc_duration_seconds].observe(labels, report[:GC_TIME])
end
# Collect generic GC stats
GC.stat.each do |key, value|
metrics[key].set(labels, value)
end
end
# Collect the GC time since last sample in float seconds.
metrics[:total_time].increment(labels, GC::Profiler.total_time)
def sample_gc_reports
GC::Profiler.enable
GC::Profiler.raw_data
ensure
GC::Profiler.clear
end
def set_memory_usage_metrics

View file

@ -17,18 +17,10 @@ describe Gitlab::Metrics::Samplers::InfluxSampler do
it 'samples various statistics' do
expect(sampler).to receive(:sample_memory_usage)
expect(sampler).to receive(:sample_file_descriptors)
expect(sampler).to receive(:sample_gc)
expect(sampler).to receive(:flush)
sampler.sample
end
it 'clears any GC profiles' do
expect(sampler).to receive(:flush)
expect(GC::Profiler).to receive(:clear)
sampler.sample
end
end
describe '#flush' do
@ -67,18 +59,6 @@ describe Gitlab::Metrics::Samplers::InfluxSampler do
end
end
describe '#sample_gc' do
it 'adds a metric containing garbage collection statistics' do
expect(GC::Profiler).to receive(:total_time).and_return(0.24)
expect(sampler).to receive(:add_metric)
.with(/gc_statistics/, an_instance_of(Hash))
.and_call_original
sampler.sample_gc
end
end
describe '#add_metric' do
it 'prefixes the series name for a Rails process' do
expect(sampler).to receive(:sidekiq?).and_return(false)

View file

@ -59,17 +59,29 @@ describe Gitlab::Metrics::Samplers::RubySampler do
end
it 'clears any GC profiles' do
expect(GC::Profiler).to receive(:clear)
expect(GC::Profiler).to receive(:clear).at_least(:once)
sampler.sample
end
end
describe '#sample_gc' do
it 'adds a metric containing garbage collection time statistics' do
expect(GC::Profiler).to receive(:total_time).and_return(0.24)
let!(:sampler) { described_class.new(5) }
expect(sampler.metrics[:total_time]).to receive(:increment).with({}, 0.24)
let(:gc_reports) { [{ GC_TIME: 0.1 }, { GC_TIME: 0.2 }, { GC_TIME: 0.3 }] }
it 're-enables GC::Profiler if needed' do
expect(GC::Profiler).to receive(:enable)
sampler.sample
end
it 'observes GC cycles time' do
expect(sampler).to receive(:sample_gc_reports).and_return(gc_reports)
expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.1).ordered
expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.2).ordered
expect(sampler.metrics[:gc_duration_seconds]).to receive(:observe).with({}, 0.3).ordered
sampler.sample
end