Clean up Ruby sampler metrics

* Use a simple counter for sampler duration instead of a histogram.
* Use a counter to collect GC time.
* Remove unused objects metric.
* Clean up metric names to match Prometheus conventions.
* Prefix generic GC stats with `gc_stat`.
* Include worker label on memory and file descriptor metrics.
This commit is contained in:
Ben Kochie 2018-06-20 16:42:38 +02:00
parent 348ad22d7a
commit 78a9991543
Failed to extract signature
4 changed files with 34 additions and 13 deletions

View file

@ -0,0 +1,5 @@
---
title: Cleanup Prometheus ruby metrics
merge_request: 20039
author: Ben Kochie
type: fixed

View file

@ -49,6 +49,20 @@ The following metrics are available:
| filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken |
| circuitbreaker_storage_check_duration_seconds | Histogram | 10.3 | Time a single storage probe took |
### Ruby metrics
Some basic Ruby runtime metrics are available:
| Metric | Type | Since | Description |
|:-------------------------------------- |:--------- |:----- |:----------- |
| ruby_gc_duration_seconds_total | Counter | 11.1 | Time spent by Ruby in GC |
| ruby_gc_stat_... | Gauge | 11.1 | Various metrics from [GC.stat] |
| ruby_file_descriptors | Gauge | 11.1 | File descriptors per process |
| ruby_memory_bytes | Gauge | 11.1 | Memory usage by process |
| ruby_sampler_duration_seconds_total | Counter | 11.1 | Time spent collecting stats |
[GC.stat]: https://ruby-doc.org/core-2.3.0/GC.html#method-c-stat
## Metrics shared directory
GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services.

View file

@ -22,27 +22,27 @@ module Gitlab
# Registers the sampler's metrics and returns them in a hash keyed by symbol.
#
# NOTE(review): this listing looks like a rendered diff with the +/- markers stripped, so
# superseded and replacement lines sit side by side and several keys are assigned twice.
# Read literally as Ruby, the later assignment to a key is the one left in the hash.
def init_metrics
metrics = {}
# Sampler duration and GC time: first registered as a histogram and a gauge, then as
# counters named `duration_seconds_total` (the commit message describes moving to counters).
metrics[:sampler_duration] = Metrics.histogram(with_prefix(:sampler_duration, :seconds), 'Sampler time', { worker: nil })
metrics[:total_time] = Metrics.gauge(with_prefix(:gc, :time_total), 'Total GC time', labels, :livesum)
metrics[:sampler_duration] = Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels)
metrics[:total_time] = Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels)
# One gauge per key reported by GC.stat; the second assignment uses the :gc_stat prefix
# in place of :gc.
GC.stat.keys.each do |key|
metrics[key] = Metrics.gauge(with_prefix(:gc, key), to_doc_string(key), labels, :livesum)
metrics[key] = Metrics.gauge(with_prefix(:gc_stat, key), to_doc_string(key), labels, :livesum)
end
# Per-class object gauge; presumably the "unused objects metric" the commit message
# says is removed (confirm against the real diff).
metrics[:objects_total] = Metrics.gauge(with_prefix(:objects, :total), 'Objects total', labels.merge(class: nil), :livesum)
# Memory and file descriptor gauges: first with the `usage_total` / `descriptors_total`
# names, then with the `bytes` / `descriptors` names.
metrics[:memory_usage] = Metrics.gauge(with_prefix(:memory, :usage_total), 'Memory used total', labels, :livesum)
metrics[:file_descriptors] = Metrics.gauge(with_prefix(:file, :descriptors_total), 'File descriptors total', labels, :livesum)
metrics[:memory_usage] = Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used', labels, :livesum)
metrics[:file_descriptors] = Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels, :livesum)
metrics
end
# Takes one sample: records memory usage and the file descriptor count, collects GC
# metrics via sample_gc, and records the time elapsed since start_time.
# GC::Profiler.clear sits in the `ensure` clause, so it runs even if sampling raises.
#
# NOTE(review): this listing looks like a rendered diff with the +/- markers stripped, so
# pre- and post-change lines are both present and each metric is written twice.
def sample
start_time = System.monotonic_time
# Memory / file descriptor writes that merge in worker_label. The commit message says the
# worker label is included on these metrics, which suggests these are the added lines
# and the `labels`-only writes after sample_gc are the removed ones (confirm).
metrics[:memory_usage].set(labels.merge(worker_label), System.memory_usage)
metrics[:file_descriptors].set(labels.merge(worker_label), System.file_descriptor_count)
sample_gc
metrics[:memory_usage].set(labels, System.memory_usage)
metrics[:file_descriptors].set(labels, System.file_descriptor_count)
# Elapsed sampling time: `observe` appears to be the superseded histogram-style call and
# `increment` the counter-style replacement described in the commit message.
metrics[:sampler_duration].observe(labels.merge(worker_label), System.monotonic_time - start_time)
metrics[:sampler_duration].increment(labels, System.monotonic_time - start_time)
ensure
GC::Profiler.clear
end
@ -50,11 +50,13 @@ module Gitlab
private
# Records GC metrics: one value per GC.stat key, plus the GC::Profiler.total_time figure.
#
# NOTE(review): this listing looks like a rendered diff with the +/- markers stripped. The
# `set(... * 1000)` line appears to be the superseded form and the `increment` at the
# bottom its replacement (confirm against the real diff).
def sample_gc
metrics[:total_time].set(labels, GC::Profiler.total_time * 1000)
# Collect generic GC stats.
GC.stat.each do |key, value|
metrics[key].set(labels, value)
end
# Collect the GC time since last sample in float seconds.
metrics[:total_time].increment(labels, GC::Profiler.total_time)
end
def worker_label

View file

@ -45,7 +45,7 @@ describe Gitlab::Metrics::Samplers::RubySampler do
# Stubs GC::Profiler.total_time to 0.24 and checks what sampler.sample writes to the
# :total_time metric.
# NOTE(review): this listing looks like a rendered diff with the +/- markers stripped. The
# `set` expectation (240) appears to be the superseded line and the `increment`
# expectation (0.24) its replacement (confirm).
it 'adds a metric containing garbage collection time statistics' do
expect(GC::Profiler).to receive(:total_time).and_return(0.24)
expect(sampler.metrics[:total_time]).to receive(:set).with({}, 240)
expect(sampler.metrics[:total_time]).to receive(:increment).with({}, 0.24)
sampler.sample
end