From 78a9991543cdcc0cbd7ecfaab979c8e2c98b7a75 Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Wed, 20 Jun 2018 16:42:38 +0200 Subject: [PATCH] Cleanup ruby sampler metrics * Use a simple counter for sampler duration instead of a histogram. * Use a counter to collect GC time. * Remove unused objects metric. * Cleanup metric names to match Prometheus conventions. * Prefix generic GC stats with `gc_stat`. * Include worker label on memory and file descriptor metrics. --- changelogs/unreleased/bjk-48176_ruby_gc.yml | 5 ++++ .../monitoring/prometheus/gitlab_metrics.md | 14 ++++++++++ lib/gitlab/metrics/samplers/ruby_sampler.rb | 26 ++++++++++--------- .../metrics/samplers/ruby_sampler_spec.rb | 2 +- 4 files changed, 34 insertions(+), 13 deletions(-) create mode 100644 changelogs/unreleased/bjk-48176_ruby_gc.yml diff --git a/changelogs/unreleased/bjk-48176_ruby_gc.yml b/changelogs/unreleased/bjk-48176_ruby_gc.yml new file mode 100644 index 00000000000..45c6338df81 --- /dev/null +++ b/changelogs/unreleased/bjk-48176_ruby_gc.yml @@ -0,0 +1,5 @@ +--- +title: Cleanup Prometheus ruby metrics +merge_request: 20039 +author: Ben Kochie +type: fixed diff --git a/doc/administration/monitoring/prometheus/gitlab_metrics.md b/doc/administration/monitoring/prometheus/gitlab_metrics.md index 411a0fae93f..cea6764df41 100644 --- a/doc/administration/monitoring/prometheus/gitlab_metrics.md +++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md @@ -49,6 +49,20 @@ The following metrics are available: | filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken or not | | circuitbreaker_storage_check_duration_seconds | Histogram | 10.3 | Time a single storage probe took | +### Ruby metrics + +Some basic Ruby runtime metrics are available: + +| Metric | Type | Since | Description | +|:-------------------------------------- |:--------- |:----- |:----------- | +| ruby_gc_duration_seconds_total | Counter | 11.1 | Time spent by Ruby in GC | +| ruby_gc_stat_... | Gauge | 11.1 | Various metrics from [GC.stat] | +| ruby_file_descriptors | Gauge | 11.1 | File descriptors per process | +| ruby_memory_bytes | Gauge | 11.1 | Memory usage by process | +| ruby_sampler_duration_seconds_total | Counter | 11.1 | Time spent collecting stats | + +[GC.stat]: https://ruby-doc.org/core-2.3.0/GC.html#method-c-stat + ## Metrics shared directory GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services. diff --git a/lib/gitlab/metrics/samplers/ruby_sampler.rb b/lib/gitlab/metrics/samplers/ruby_sampler.rb index a39b3bc158c..7b2b3bedf04 100644 --- a/lib/gitlab/metrics/samplers/ruby_sampler.rb +++ b/lib/gitlab/metrics/samplers/ruby_sampler.rb @@ -22,27 +22,27 @@ module Gitlab def init_metrics metrics = {} - metrics[:sampler_duration] = Metrics.histogram(with_prefix(:sampler_duration, :seconds), 'Sampler time', { worker: nil }) - metrics[:total_time] = Metrics.gauge(with_prefix(:gc, :time_total), 'Total GC time', labels, :livesum) + metrics[:sampler_duration] = Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels) + metrics[:total_time] = Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels) GC.stat.keys.each do |key| - metrics[key] = Metrics.gauge(with_prefix(:gc, key), to_doc_string(key), labels, :livesum) + metrics[key] = Metrics.gauge(with_prefix(:gc_stat, key), to_doc_string(key), labels, :livesum) end - metrics[:objects_total] = Metrics.gauge(with_prefix(:objects, :total), 'Objects total', labels.merge(class: nil), :livesum) - metrics[:memory_usage] = Metrics.gauge(with_prefix(:memory, :usage_total), 'Memory used total', labels, :livesum) - metrics[:file_descriptors] = Metrics.gauge(with_prefix(:file, :descriptors_total), 'File descriptors total', labels, :livesum) + metrics[:memory_usage] = Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used', labels, :livesum) + metrics[:file_descriptors] = Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels, :livesum) metrics end def sample start_time = System.monotonic_time + + metrics[:memory_usage].set(labels.merge(worker_label), System.memory_usage) + metrics[:file_descriptors].set(labels.merge(worker_label), System.file_descriptor_count) + sample_gc - metrics[:memory_usage].set(labels, System.memory_usage) - metrics[:file_descriptors].set(labels, System.file_descriptor_count) - - metrics[:sampler_duration].observe(labels.merge(worker_label), System.monotonic_time - start_time) + metrics[:sampler_duration].increment(labels, System.monotonic_time - start_time) ensure GC::Profiler.clear end @@ -50,11 +50,13 @@ module Gitlab private def sample_gc - metrics[:total_time].set(labels, GC::Profiler.total_time * 1000) - + # Collect generic GC stats. GC.stat.each do |key, value| metrics[key].set(labels, value) end + + # Collect the GC time since last sample in float seconds. + metrics[:total_time].increment(labels, GC::Profiler.total_time) end def worker_label diff --git a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb index 091645ee86f..7972ff253fe 100644 --- a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb +++ b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb @@ -45,7 +45,7 @@ describe Gitlab::Metrics::Samplers::RubySampler do it 'adds a metric containing garbage collection time statistics' do expect(GC::Profiler).to receive(:total_time).and_return(0.24) - expect(sampler.metrics[:total_time]).to receive(:set).with({}, 240) + expect(sampler.metrics[:total_time]).to receive(:increment).with({}, 0.24) sampler.sample end