From 9eeedfccbcbaa98265597a964cdf895f27fcf68b Mon Sep 17 00:00:00 2001 From: Ryan Cobb Date: Mon, 20 May 2019 13:36:59 -0600 Subject: [PATCH 1/5] Adds ruby and unicorn instrumentation This adds ruby and unicorn instrumentation. This was originally intended in 11.11 but due to performance concerns it was reverted. This new commit foregoes the sys-proctable gem was causing performance issues previously. --- .../monitoring/prometheus/gitlab_metrics.md | 13 ++++--- lib/gitlab/metrics/samplers/ruby_sampler.rb | 31 ++++++++++++----- .../metrics/samplers/unicorn_sampler.rb | 34 +++++++++++++------ lib/gitlab/metrics/system.rb | 26 ++++++++++++++ .../metrics/samplers/ruby_sampler_spec.rb | 30 ++++++++++++++-- .../metrics/samplers/unicorn_sampler_spec.rb | 25 +++++++++++--- spec/lib/gitlab/metrics/system_spec.rb | 24 +++++++++++++ 7 files changed, 153 insertions(+), 30 deletions(-) diff --git a/doc/administration/monitoring/prometheus/gitlab_metrics.md b/doc/administration/monitoring/prometheus/gitlab_metrics.md index c243dd9edbb..48bd709a2b7 100644 --- a/doc/administration/monitoring/prometheus/gitlab_metrics.md +++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md @@ -43,10 +43,11 @@ The following metrics are available: | redis_ping_latency_seconds | Gauge | 9.4 | Round trip time of the redis ping | | user_session_logins_total | Counter | 9.4 | Counter of how many users have logged in | | upload_file_does_not_exist | Counter | 10.7 in EE, 11.5 in CE | Number of times an upload record could not find its file | -| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | -| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | -| unicorn_active_connections | Gauge | 11.0 | The number of active Unicorn connections (workers) | -| unicorn_queued_connections | Gauge | 11.0 | The number of queued Unicorn connections | +| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | +| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | +| unicorn_active_connections | Gauge | 11.0 | The number of active Unicorn connections (workers) | +| unicorn_queued_connections | Gauge | 11.0 | The number of queued Unicorn connections | +| unicorn_workers | Gauge | 11.11 | The number of Unicorn workers | ## Sidekiq Metrics available for Geo **[PREMIUM]** @@ -100,6 +101,10 @@ Some basic Ruby runtime metrics are available: | ruby_file_descriptors | Gauge | 11.1 | File descriptors per process | | ruby_memory_bytes | Gauge | 11.1 | Memory usage by process | | ruby_sampler_duration_seconds_total | Counter | 11.1 | Time spent collecting stats | +| ruby_process_cpu_seconds_total | Gauge | 11.11 | Total amount of CPU time per process | +| ruby_process_max_fds | Gauge | 11.11 | Maximum number of open file descriptors per process | +| ruby_process_resident_memory_bytes | Gauge | 11.11 | Memory usage by process, measured in bytes | +| ruby_process_start_time_seconds | Gauge | 11.11 | The elapsed time between system boot and the process started, measured in seconds | [GC.stat]: https://ruby-doc.org/core-2.3.0/GC.html#method-c-stat diff --git a/lib/gitlab/metrics/samplers/ruby_sampler.rb b/lib/gitlab/metrics/samplers/ruby_sampler.rb index 18a69321905..4d9c43f37e7 100644 --- a/lib/gitlab/metrics/samplers/ruby_sampler.rb +++ b/lib/gitlab/metrics/samplers/ruby_sampler.rb @@ -23,25 +23,32 @@ module Gitlab end def init_metrics - metrics = {} - metrics[:sampler_duration] = ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels) - metrics[:total_time] = ::Gitlab::Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels) + metrics = { + file_descriptors: ::Gitlab::Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels, :livesum), + memory_bytes: ::Gitlab::Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used', labels, :livesum), + process_cpu_seconds_total: ::Gitlab::Metrics.gauge(with_prefix(:process, :cpu_seconds_total), 'Process CPU seconds total'), + process_max_fds: ::Gitlab::Metrics.gauge(with_prefix(:process, :max_fds), 'Process max fds'), + process_resident_memory_bytes: ::Gitlab::Metrics.gauge(with_prefix(:process, :resident_memory_bytes), 'Memory used', labels, :livesum), + process_start_time_seconds: ::Gitlab::Metrics.gauge(with_prefix(:process, :start_time_seconds), 'Process start time seconds'), + sampler_duration: ::Gitlab::Metrics.counter(with_prefix(:sampler, :duration_seconds_total), 'Sampler time', labels), + total_time: ::Gitlab::Metrics.counter(with_prefix(:gc, :duration_seconds_total), 'Total GC time', labels) + } + GC.stat.keys.each do |key| metrics[key] = ::Gitlab::Metrics.gauge(with_prefix(:gc_stat, key), to_doc_string(key), labels, :livesum) end - metrics[:memory_usage] = ::Gitlab::Metrics.gauge(with_prefix(:memory, :bytes), 'Memory used', labels, :livesum) - metrics[:file_descriptors] = ::Gitlab::Metrics.gauge(with_prefix(:file, :descriptors), 'File descriptors used', labels, :livesum) - metrics end def sample start_time = System.monotonic_time - metrics[:memory_usage].set(labels.merge(worker_label), System.memory_usage) metrics[:file_descriptors].set(labels.merge(worker_label), System.file_descriptor_count) - + metrics[:process_cpu_seconds_total].set(labels.merge(worker_label), ::Gitlab::Metrics::System.cpu_time) + metrics[:process_max_fds].set(labels.merge(worker_label), ::Gitlab::Metrics::System.max_open_file_descriptors) + metrics[:process_start_time_seconds].set(labels.merge(worker_label), ::Gitlab::Metrics::System.process_start_time) + set_memory_usage_metrics sample_gc metrics[:sampler_duration].increment(labels, System.monotonic_time - start_time) @@ -61,6 +68,14 @@ module Gitlab metrics[:total_time].increment(labels, GC::Profiler.total_time) end + def set_memory_usage_metrics + memory_usage = System.memory_usage + memory_labels = labels.merge(worker_label) + + metrics[:memory_bytes].set(memory_labels, memory_usage) + metrics[:process_resident_memory_bytes].set(memory_labels, memory_usage) + end + def worker_label return {} unless defined?(Unicorn::Worker) diff --git a/lib/gitlab/metrics/samplers/unicorn_sampler.rb b/lib/gitlab/metrics/samplers/unicorn_sampler.rb index bec64e864b3..1b7a4c79080 100644 --- a/lib/gitlab/metrics/samplers/unicorn_sampler.rb +++ b/lib/gitlab/metrics/samplers/unicorn_sampler.rb @@ -8,12 +8,16 @@ module Gitlab super(interval) end - def unicorn_active_connections - @unicorn_active_connections ||= ::Gitlab::Metrics.gauge(:unicorn_active_connections, 'Unicorn active connections', {}, :max) + def metrics + @metrics ||= init_metrics end - def unicorn_queued_connections - @unicorn_queued_connections ||= ::Gitlab::Metrics.gauge(:unicorn_queued_connections, 'Unicorn queued connections', {}, :max) + def init_metrics + { + unicorn_active_connections: ::Gitlab::Metrics.gauge(:unicorn_active_connections, 'Unicorn active connections', {}, :max), + unicorn_queued_connections: ::Gitlab::Metrics.gauge(:unicorn_queued_connections, 'Unicorn queued connections', {}, :max), + unicorn_workers: ::Gitlab::Metrics.gauge(:unicorn_workers, 'Unicorn workers') + } end def enabled? @@ -23,14 +27,13 @@ module Gitlab def sample Raindrops::Linux.tcp_listener_stats(tcp_listeners).each do |addr, stats| - unicorn_active_connections.set({ socket_type: 'tcp', socket_address: addr }, stats.active) - unicorn_queued_connections.set({ socket_type: 'tcp', socket_address: addr }, stats.queued) + set_unicorn_connection_metrics('tcp', addr, stats) + end + Raindrops::Linux.unix_listener_stats(unix_listeners).each do |addr, stats| + set_unicorn_connection_metrics('unix', addr, stats) end - Raindrops::Linux.unix_listener_stats(unix_listeners).each do |addr, stats| - unicorn_active_connections.set({ socket_type: 'unix', socket_address: addr }, stats.active) - unicorn_queued_connections.set({ socket_type: 'unix', socket_address: addr }, stats.queued) - end + metrics[:unicorn_workers].set({}, unicorn_workers_count) end private @@ -39,6 +42,13 @@ module Gitlab @tcp_listeners ||= Unicorn.listener_names.grep(%r{\A[^/]+:\d+\z}) end + def set_unicorn_connection_metrics(type, addr, stats) + labels = { socket_type: type, socket_address: addr } + + metrics[:unicorn_active_connections].set(labels, stats.active) + metrics[:unicorn_queued_connections].set(labels, stats.queued) + end + def unix_listeners @unix_listeners ||= Unicorn.listener_names - tcp_listeners end @@ -46,6 +56,10 @@ module Gitlab def unicorn_with_listeners? defined?(Unicorn) && Unicorn.listener_names.any? end + + def unicorn_workers_count + `pgrep -f '[u]nicorn_rails worker.+ #{Rails.root.to_s}'`.split.count + end end end end diff --git a/lib/gitlab/metrics/system.rb b/lib/gitlab/metrics/system.rb index 426496855e3..d3cbd200584 100644 --- a/lib/gitlab/metrics/system.rb +++ b/lib/gitlab/metrics/system.rb @@ -23,6 +23,20 @@ module Gitlab def self.file_descriptor_count Dir.glob('/proc/self/fd/*').length end + + def self.max_open_file_descriptors + match = File.read('/proc/self/limits').match(/Max open files\s*(\d+)/) + + return unless match && match[1] + + match[1].to_i + end + + def self.process_start_time + fields = File.read('/proc/self/stat').split + + ( fields[21].to_i || 0 ) / clk_tck + end else def self.memory_usage 0.0 @@ -31,6 +45,14 @@ module Gitlab def self.file_descriptor_count 0 end + + def self.max_open_file_descriptors + 0 + end + + def self.process_start_time + 0 + end end # THREAD_CPUTIME is not supported on OS X @@ -59,6 +81,10 @@ module Gitlab def self.monotonic_time Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second) end + + def self.clk_tck + @clk_tck ||= `genconf CLK_TCK`.to_i + end end end end diff --git a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb index 7972ff253fe..aaf8c9fa2a0 100644 --- a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb +++ b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb @@ -10,17 +10,20 @@ describe Gitlab::Metrics::Samplers::RubySampler do describe '#sample' do it 'samples various statistics' do - expect(Gitlab::Metrics::System).to receive(:memory_usage) + expect(Gitlab::Metrics::System).to receive(:cpu_time) expect(Gitlab::Metrics::System).to receive(:file_descriptor_count) + expect(Gitlab::Metrics::System).to receive(:memory_usage) + expect(Gitlab::Metrics::System).to receive(:process_start_time) + expect(Gitlab::Metrics::System).to receive(:max_open_file_descriptors) expect(sampler).to receive(:sample_gc) sampler.sample end - it 'adds a metric containing the memory usage' do + it 'adds a metric containing the process resident memory bytes' do expect(Gitlab::Metrics::System).to receive(:memory_usage).and_return(9000) - expect(sampler.metrics[:memory_usage]).to receive(:set).with({}, 9000) + expect(sampler.metrics[:process_resident_memory_bytes]).to receive(:set).with({}, 9000) sampler.sample end @@ -34,6 +37,27 @@ describe Gitlab::Metrics::Samplers::RubySampler do sampler.sample end + it 'adds a metric containing the process total cpu time' do + expect(Gitlab::Metrics::System).to receive(:cpu_time).and_return(0.51) + expect(sampler.metrics[:process_cpu_seconds_total]).to receive(:set).with({}, 0.51) + + sampler.sample + end + + it 'adds a metric containing the process start time' do + expect(Gitlab::Metrics::System).to receive(:process_start_time).and_return(12345) + expect(sampler.metrics[:process_start_time_seconds]).to receive(:set).with({}, 12345) + + sampler.sample + end + + it 'adds a metric containing the process max file descriptors' do + expect(Gitlab::Metrics::System).to receive(:max_open_file_descriptors).and_return(1024) + expect(sampler.metrics[:process_max_fds]).to receive(:set).with({}, 1024) + + sampler.sample + end + it 'clears any GC profiles' do expect(GC::Profiler).to receive(:clear) diff --git a/spec/lib/gitlab/metrics/samplers/unicorn_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/unicorn_sampler_spec.rb index 4b03f3c2532..090e456644f 100644 --- a/spec/lib/gitlab/metrics/samplers/unicorn_sampler_spec.rb +++ b/spec/lib/gitlab/metrics/samplers/unicorn_sampler_spec.rb @@ -39,8 +39,8 @@ describe Gitlab::Metrics::Samplers::UnicornSampler do it 'updates metrics type unix and with addr' do labels = { socket_type: 'unix', socket_address: socket_address } - expect(subject).to receive_message_chain(:unicorn_active_connections, :set).with(labels, 'active') - expect(subject).to receive_message_chain(:unicorn_queued_connections, :set).with(labels, 'queued') + expect(subject.metrics[:unicorn_active_connections]).to receive(:set).with(labels, 'active') + expect(subject.metrics[:unicorn_queued_connections]).to receive(:set).with(labels, 'queued') subject.sample end @@ -50,7 +50,6 @@ describe Gitlab::Metrics::Samplers::UnicornSampler do context 'unicorn listens on tcp sockets' do let(:tcp_socket_address) { '0.0.0.0:8080' } let(:tcp_sockets) { [tcp_socket_address] } - before do allow(unicorn).to receive(:listener_names).and_return(tcp_sockets) end @@ -71,13 +70,29 @@ describe Gitlab::Metrics::Samplers::UnicornSampler do it 'updates metrics type unix and with addr' do labels = { socket_type: 'tcp', socket_address: tcp_socket_address } - expect(subject).to receive_message_chain(:unicorn_active_connections, :set).with(labels, 'active') - expect(subject).to receive_message_chain(:unicorn_queued_connections, :set).with(labels, 'queued') + expect(subject.metrics[:unicorn_active_connections]).to receive(:set).with(labels, 'active') + expect(subject.metrics[:unicorn_queued_connections]).to receive(:set).with(labels, 'queued') subject.sample end end end + + context 'additional metrics' do + let(:unicorn_workers) { 2 } + + before do + allow(unicorn).to receive(:listener_names).and_return([""]) + allow(::Gitlab::Metrics::System).to receive(:cpu_time).and_return(3.14) + allow(subject).to receive(:unicorn_workers_count).and_return(unicorn_workers) + end + + it "sets additional metrics" do + expect(subject.metrics[:unicorn_workers]).to receive(:set).with({}, unicorn_workers) + + subject.sample + end + end end describe '#start' do diff --git a/spec/lib/gitlab/metrics/system_spec.rb b/spec/lib/gitlab/metrics/system_spec.rb index 14afcdf5daa..b0603d96eb2 100644 --- a/spec/lib/gitlab/metrics/system_spec.rb +++ b/spec/lib/gitlab/metrics/system_spec.rb @@ -13,6 +13,18 @@ describe Gitlab::Metrics::System do expect(described_class.file_descriptor_count).to be > 0 end end + + describe '.max_open_file_descriptors' do + it 'returns the max allowed open file descriptors' do + expect(described_class.max_open_file_descriptors).to be > 0 + end + end + + describe '.process_start_time' do + it 'returns the process start time' do + expect(described_class.process_start_time).to be > 0 + end + end else describe '.memory_usage' do it 'returns 0.0' do @@ -25,6 +37,18 @@ describe Gitlab::Metrics::System do expect(described_class.file_descriptor_count).to eq(0) end end + + describe '.max_open_file_descriptors' do + it 'returns 0' do + expect(described_class.max_open_file_descriptors).to eq(0) + end + end + + describe 'process_start_time' do + it 'returns 0' do + expect(described_class.process_start_time).to eq(0) + end + end end describe '.cpu_time' do From 4fae62b9efa985e4cf6a09cb6bd4b9cf665e6b32 Mon Sep 17 00:00:00 2001 From: Ryan Cobb Date: Mon, 20 May 2019 14:11:46 -0600 Subject: [PATCH 2/5] Fix typo in system.rb --- lib/gitlab/metrics/system.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gitlab/metrics/system.rb b/lib/gitlab/metrics/system.rb index d3cbd200584..81bd799bd1e 100644 --- a/lib/gitlab/metrics/system.rb +++ b/lib/gitlab/metrics/system.rb @@ -83,7 +83,7 @@ module Gitlab end def self.clk_tck - @clk_tck ||= `genconf CLK_TCK`.to_i + @clk_tck ||= `getconf CLK_TCK`.to_i end end end From ddfff1e4045b24a0951ec50892d1e48be497c44f Mon Sep 17 00:00:00 2001 From: Ryan Cobb Date: Wed, 29 May 2019 10:59:23 -0600 Subject: [PATCH 3/5] Update documentation to indicate new metrics for 12.0 --- .../monitoring/prometheus/gitlab_metrics.md | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/administration/monitoring/prometheus/gitlab_metrics.md b/doc/administration/monitoring/prometheus/gitlab_metrics.md index 46a0ac980e1..f461a2b6a4c 100644 --- a/doc/administration/monitoring/prometheus/gitlab_metrics.md +++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md @@ -43,11 +43,11 @@ The following metrics are available: | redis_ping_latency_seconds | Gauge | 9.4 | Round trip time of the redis ping | | user_session_logins_total | Counter | 9.4 | Counter of how many users have logged in | | upload_file_does_not_exist | Counter | 10.7 in EE, 11.5 in CE | Number of times an upload record could not find its file | -| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | -| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | -| unicorn_active_connections | Gauge | 11.0 | The number of active Unicorn connections (workers) | -| unicorn_queued_connections | Gauge | 11.0 | The number of queued Unicorn connections | -| unicorn_workers | Gauge | 11.11 | The number of Unicorn workers | +| failed_login_captcha_total | Gauge | 11.0 | Counter of failed CAPTCHA attempts during login | +| successful_login_captcha_total | Gauge | 11.0 | Counter of successful CAPTCHA attempts during login | +| unicorn_active_connections | Gauge | 11.0 | The number of active Unicorn connections (workers) | +| unicorn_queued_connections | Gauge | 11.0 | The number of queued Unicorn connections | +| unicorn_workers | Gauge | 12.0 | The number of Unicorn workers | ## Sidekiq Metrics available for Geo **[PREMIUM]** @@ -87,8 +87,8 @@ the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`. | geo_wikis_checksum_mismatch_count | Gauge | 10.7 | Number of wikis that checksum mismatch on secondary | url | geo_repositories_checked_count | Gauge | 11.1 | Number of repositories that have been checked via `git fsck` | url | geo_repositories_checked_failed_count | Gauge | 11.1 | Number of repositories that have a failure from `git fsck` | url -| geo_repositories_retrying_verification_count | Gauge | 11.2 | Number of repositories verification failures that Geo is actively trying to correct on secondary | url -| geo_wikis_retrying_verification_count | Gauge | 11.2 | Number of wikis verification failures that Geo is actively trying to correct on secondary | url +| geo_repositories_retrying_verification_count | Gauge | 11.2 | Number of repositories verification failures that Geo is actively trying to correct on secondary | url +| geo_wikis_retrying_verification_count | Gauge | 11.2 | Number of wikis verification failures that Geo is actively trying to correct on secondary | url ### Ruby metrics @@ -101,10 +101,10 @@ Some basic Ruby runtime metrics are available: | ruby_file_descriptors | Gauge | 11.1 | File descriptors per process | | ruby_memory_bytes | Gauge | 11.1 | Memory usage by process | | ruby_sampler_duration_seconds_total | Counter | 11.1 | Time spent collecting stats | -| ruby_process_cpu_seconds_total | Gauge | 11.11 | Total amount of CPU time per process | -| ruby_process_max_fds | Gauge | 11.11 | Maximum number of open file descriptors per process | -| ruby_process_resident_memory_bytes | Gauge | 11.11 | Memory usage by process, measured in bytes | -| ruby_process_start_time_seconds | Gauge | 11.11 | The elapsed time between system boot and the process started, measured in seconds | +| ruby_process_cpu_seconds_total | Gauge | 12.0 | Total amount of CPU time per process | +| ruby_process_max_fds | Gauge | 12.0 | Maximum number of open file descriptors per process | +| ruby_process_resident_memory_bytes | Gauge | 12.0 | Memory usage by process, measured in bytes | +| ruby_process_start_time_seconds | Gauge | 12.0 | The elapsed time between system boot and the process started, measured in seconds | [GC.stat]: https://ruby-doc.org/core-2.3.0/GC.html#method-c-stat From e9ae881c9a2934beaefbc687c0de21fbc77b0ef2 Mon Sep 17 00:00:00 2001 From: Ryan Cobb Date: Mon, 3 Jun 2019 11:18:54 -0600 Subject: [PATCH 4/5] Remove unnecessary super call in unicorn sampler --- lib/gitlab/metrics/samplers/unicorn_sampler.rb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/gitlab/metrics/samplers/unicorn_sampler.rb b/lib/gitlab/metrics/samplers/unicorn_sampler.rb index 1b7a4c79080..9af7e0afed4 100644 --- a/lib/gitlab/metrics/samplers/unicorn_sampler.rb +++ b/lib/gitlab/metrics/samplers/unicorn_sampler.rb @@ -4,10 +4,6 @@ module Gitlab module Metrics module Samplers class UnicornSampler < BaseSampler - def initialize(interval) - super(interval) - end - def metrics @metrics ||= init_metrics end From 4c6b1fc23c72dcde5456865103945ce7f228bfef Mon Sep 17 00:00:00 2001 From: Ryan Cobb Date: Tue, 4 Jun 2019 10:04:36 -0600 Subject: [PATCH 5/5] Add comment to clarify system proc stat field --- lib/gitlab/metrics/system.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/gitlab/metrics/system.rb b/lib/gitlab/metrics/system.rb index 81bd799bd1e..33c0de91c11 100644 --- a/lib/gitlab/metrics/system.rb +++ b/lib/gitlab/metrics/system.rb @@ -35,6 +35,8 @@ module Gitlab def self.process_start_time fields = File.read('/proc/self/stat').split + # fields[21] is linux proc stat field "(22) starttime". + # The value is expressed in clock ticks, divide by clock ticks for seconds. ( fields[21].to_i || 0 ) / clk_tck end else