# Controller concern that hides an endpoint behind the instance-wide
# health-check access token.
#
# Including it registers a `before_action` that renders the static 404 page
# unless the request carries a token equal to
# `current_application_settings.health_check_access_token`. The token is read
# from the `token` request parameter or, when that is blank, from the `TOKEN`
# request header.
module RequiresHealthToken
  extend ActiveSupport::Concern

  included do
    before_action :validate_health_check_access!
  end

  private

  # Filter body: answer with 404 (not 401/403) when the token does not match.
  def validate_health_check_access!
    render_404 unless token_valid?
  end

  # @return [Boolean] true only when a non-blank String token was supplied and
  #   it matches the configured token.
  def token_valid?
    supplied = params[:token].presence || request.headers['TOKEN']

    # Request parameters are untrusted: `token[]=x` / `token[a]=b` arrive as an
    # Array / nested parameters rather than a String. Anything that is not a
    # plain String can never match and must not reach the comparison, which
    # would raise and turn the intended 404 into a 500.
    return false unless supplied.is_a?(String) && supplied.present?

    # Settings are only consulted once a candidate token exists.
    expected = current_application_settings.health_check_access_token

    # With no token configured nothing can match; bail out instead of handing
    # nil to the digest-based comparison.
    return false unless expected.is_a?(String) && expected.present?

    ActiveSupport::SecurityUtils.variable_size_secure_compare(supplied, expected)
  end

  # Renders the static `public/404` page without a layout.
  def render_404
    render file: Rails.root.join('public', '404'), layout: false, status: '404'
  end
end
# Token-protected endpoints reporting application health:
#
# * `readiness` / `liveness` - JSON keyed by check name; HTTP 200 when every
#   result succeeded, 503 otherwise.
# * `metrics` - one line per metric in the Prometheus text exposition format.
#
# Access control comes from RequiresHealthToken.
class HealthController < ActionController::Base
  protect_from_forgery with: :exception
  include RequiresHealthToken

  CHECKS = [
    Gitlab::HealthChecks::DbCheck,
    Gitlab::HealthChecks::RedisCheck,
    Gitlab::HealthChecks::FsShardsCheck
  ].freeze

  # Characters that must be backslash-escaped inside a Prometheus label value.
  LABEL_VALUE_ESCAPES = { '\\' => '\\\\', '"' => '\\"', "\n" => '\\n' }.freeze

  def readiness
    render_check_results(CHECKS.map { |check| [check.name, check.readiness] })
  end

  def liveness
    render_check_results(CHECKS.map { |check| [check.name, check.liveness] })
  end

  def metrics
    lines = CHECKS.flat_map(&:metrics).map { |metric| metric_to_prom_line(metric) }

    # The text exposition format requires every line, including the last one,
    # to be terminated by a line feed.
    body = lines.map { |line| "#{line}\n" }.join

    render text: body, content_type: 'text/plain; version=0.0.4'
  end

  private

  # Formats one metric as `name value` or `name{key="value",...} value`.
  #
  # @param metric [#name, #value, #labels] labels may be nil or a Hash
  # @return [String] a single line without the trailing newline
  def metric_to_prom_line(metric)
    labels = (metric.labels || {}).map do |key, value|
      %(#{key}="#{escape_label_value(value)}")
    end.join(',')

    if labels.empty?
      "#{metric.name} #{metric.value}"
    else
      "#{metric.name}{#{labels}} #{metric.value}"
    end
  end

  # Escapes backslash, double quote and newline so that an unusual label value
  # cannot break the line it is embedded in.
  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/, LABEL_VALUE_ESCAPES)
  end

  # Renders the outcome of a set of checks as JSON.
  #
  # @param results [Array<Array>] pairs of `[check_name, result]`, where
  #   `result` is either a single Gitlab::HealthChecks::Result or a collection
  #   of them (one per shard, for instance)
  def render_check_results(results)
    flattened = results.flat_map do |name, result|
      # Result is a Struct, so Array(result) would split it into its fields;
      # the explicit type check is required here.
      if result.is_a?(Gitlab::HealthChecks::Result)
        [[name, result]]
      else
        result.map { |single| [name, single] }
      end
    end

    success = flattened.all? { |_name, result| result.success }

    # NOTE: `to_h` keeps only the last entry per check name, so a check that
    # returned several results shows just one of them in the body. The HTTP
    # status above still reflects every result.
    payload = flattened.map do |name, result|
      info = { status: result.success ? 'ok' : 'failed' }
      info[:message] = result.message if result.message
      info[:labels] = result.labels if result.labels
      [name, info]
    end

    render json: payload.to_h, status: success ? :ok : :service_unavailable
  end
end
module Gitlab
  module HealthChecks
    # Behaviour shared by all health checks. The checks in this change use it
    # via `extend` (directly, or through SimpleAbstractCheck), so these become
    # class-level methods of the check.
    module BaseAbstractCheck
      # @return [String] the extending class's name without its namespace, in
      #   snake_case (relies on ActiveSupport's String extensions)
      def name
        super.demodulize.underscore
      end

      # @return [String] `name` without a trailing `_check`, capitalized
      def human_name
        name.sub(/_check\z/, '').capitalize
      end

      # Must be provided by the concrete check.
      #
      # @raise [NotImplementedError] always, in this default implementation
      def readiness
        raise NotImplementedError
      end

      # Default liveness: always successful.
      def liveness
        HealthChecks::Result.new(true)
      end

      # Default metrics: none.
      def metrics
        []
      end

      protected

      # Builds a Metric; any keyword arguments become its labels hash.
      def metric(name, value, **labels)
        Metric.new(name, value, labels)
      end

      # Calls `proc`, then yields its result and the elapsed time in seconds.
      #
      # The monotonic clock is used rather than Time.now, so a wall-clock
      # adjustment (NTP step, manual change) during the call cannot produce a
      # negative or wildly wrong latency.
      #
      # @param proc [#call] the operation to time
      # @yieldparam result [Object] whatever `proc.call` returned
      # @yieldparam elapsed [Float] seconds spent in `proc.call`
      # @return [Object] the value of the block
      def with_timing(proc)
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        result = proc.call
        yield result, Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
      end

      # Runs the block under a timeout of `seconds`.
      #
      # @return [Object, Timeout::Error] the block's value, or the
      #   Timeout::Error instance itself (returned, not raised) so callers can
      #   report a timeout as an ordinary result
      def catch_timeout(seconds, &block)
        Timeout.timeout(seconds.to_i, &block)
      rescue Timeout::Error => ex
        ex
      end
    end
  end
end
module Gitlab
  module HealthChecks
    # Checks every configured repository storage ("shard") by stat-ing its
    # path, writing a probe file into it and reading that file back.
    #
    # External commands are run through `timeout 1` so that a hung filesystem
    # cannot block the check; when the binaries are missing (Errno::ENOENT)
    # each step falls back to plain Ruby file operations.
    class FsShardsCheck
      extend BaseAbstractCheck

      class << self
        # @return [Array<HealthChecks::Result>] one result per storage,
        #   labelled with `shard: storage_name`
        def readiness
          repository_storages.map do |storage_name|
            begin
              probe_path = tmp_file_path(storage_name)

              if !storage_stat_test(storage_name)
                HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
              elsif !storage_write_test(probe_path)
                HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
              elsif !storage_read_test(probe_path)
                HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
              else
                HealthChecks::Result.new(true, nil, shard: storage_name)
              end
            rescue RuntimeError => ex
              message = "unexpected error #{ex} when checking storage #{storage_name}"
              Rails.logger.error(message)
              HealthChecks::Result.new(false, message, shard: storage_name)
            ensure
              # probe_path is nil when building the path itself failed.
              delete_test_file(probe_path) if probe_path
            end
          end
        end

        # @return [Array<Metric>] success and latency metrics for the stat,
        #   write and read operations of every storage
        def metrics
          repository_storages.flat_map do |storage_name|
            probe_path = tmp_file_path(storage_name)

            begin
              [
                operation_metrics(:filesystem_accessible, :filesystem_access_latency, -> { storage_stat_test(storage_name) }, shard: storage_name),
                operation_metrics(:filesystem_writable, :filesystem_write_latency, -> { storage_write_test(probe_path) }, shard: storage_name),
                operation_metrics(:filesystem_readable, :filesystem_read_latency, -> { storage_read_test(probe_path) }, shard: storage_name)
              ].flatten
            ensure
              # The write test leaves a file behind; without this every scrape
              # would leak one probe file per storage.
              delete_test_file(probe_path)
            end
          end
        end

        private

        # Payload written to and compared against the probe file.
        RANDOM_STRING = SecureRandom.hex(1000).freeze

        # Times `operation` and reports a latency metric plus a 1/0 success
        # metric. On a RuntimeError only the success metric (0) is reported.
        def operation_metrics(ok_metric, latency_metric, operation, **labels)
          with_timing operation do |result, elapsed|
            [
              metric(latency_metric, elapsed, **labels),
              metric(ok_metric, result ? 1 : 0, **labels)
            ]
          end
        rescue RuntimeError => ex
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        # NOTE(review): memoized on the class, so later changes to the
        # application settings are presumably not picked up by a running
        # process - confirm this is intended.
        def repository_storages
          @repository_storage ||= Gitlab::CurrentSettings.current_application_settings.repository_storages
        end

        def storages_paths
          @storage_paths ||= Gitlab.config.repositories.storages
        end

        # Prefixes a command with `timeout 1`.
        def with_timeout(args)
          %w[timeout 1] + args
        end

        # Generates a unique file name inside the storage's path. Nothing is
        # created on disk here; the block merely returns the generated path.
        def tmp_file_path(storage_name)
          Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
        end

        def path(storage_name)
          storages_paths&.dig(storage_name, 'path')
        end

        def storage_stat_test(storage_name)
          stat_path = File.join(path(storage_name), '.')

          begin
            _, status = Gitlab::Popen.popen(with_timeout(%W{ stat #{stat_path} }))
            status == 0
          rescue Errno::ENOENT
            # `timeout`/`stat` not available: check from Ruby instead.
            File.exist?(stat_path) && File::Stat.new(stat_path).readable?
          end
        end

        def storage_write_test(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ tee #{tmp_path} })) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          # `timeout`/`tee` not available: write from Ruby; any error while
          # doing so counts as "not writable".
          begin
            File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.length
          rescue StandardError
            false
          end
        end

        def storage_read_test(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ diff #{tmp_path} - })) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          # `timeout`/`diff` not available: compare from Ruby; any error while
          # doing so counts as "not readable".
          begin
            File.read(tmp_path) == RANDOM_STRING
          rescue StandardError
            false
          end
        end

        # Best-effort removal of the probe file; failures are ignored.
        def delete_test_file(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ rm -f #{tmp_path} }))
          status == 0
        rescue Errno::ENOENT
          begin
            File.delete(tmp_path)
            true
          rescue StandardError
            false
          end
        end
      end
    end
  end
end
module Gitlab
  module HealthChecks
    # Template for checks that consist of one probe with one expected answer.
    # An extending class supplies `metric_prefix`, `is_successful?` and
    # `check`; readiness and metrics are derived from those.
    module SimpleAbstractCheck
      include BaseAbstractCheck

      # Runs the probe once and turns its outcome into a Result.
      def readiness
        outcome = check

        return HealthChecks::Result.new(true) if is_successful?(outcome)

        failure_message =
          if outcome.is_a?(Timeout::Error)
            "#{human_name} check timed out"
          else
            "unexpected #{human_name} check result: #{outcome}"
          end

        HealthChecks::Result.new(false, failure_message)
      end

      # Runs the probe once and reports timeout, success and latency metrics.
      def metrics
        with_timing(method(:check)) do |outcome, duration|
          unless is_successful?(outcome)
            Rails.logger.error("#{human_name} check returned unexpected result #{outcome}")
          end

          [
            metric("#{metric_prefix}_timeout", outcome.is_a?(Timeout::Error) ? 1 : 0),
            metric("#{metric_prefix}_success", is_successful?(outcome) ? 1 : 0),
            metric("#{metric_prefix}_latency", duration)
          ]
        end
      end

      private

      # Prefix of the metric names reported by `metrics`.
      def metric_prefix
        raise NotImplementedError
      end

      # Whether `result` (the value returned by `check`) means success.
      def is_successful?(result)
        raise NotImplementedError
      end

      # Performs the probe and returns its raw outcome.
      def check
        raise NotImplementedError
      end
    end
  end
end
require 'spec_helper'

describe HealthController do
  include StubENV

  let(:token) { current_application_settings.health_check_access_token }
  let(:json_response) { JSON.parse(response.body) }

  before do
    stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
  end

  # Sends the access token in the TOKEN request header.
  shared_context 'token header is set' do
    before do
      request.headers['TOKEN'] = token
    end
  end

  # Without a token the endpoint must answer with 404.
  shared_examples 'hidden without token' do |action|
    context 'without authorization token' do
      it 'returns proper response' do
        get action

        expect(response.status).to eq(404)
      end
    end
  end

  describe '#readiness' do
    context 'authorization token provided' do
      include_context 'token header is set'

      it 'returns proper response' do
        get :readiness

        expect(json_response['db_check']['status']).to eq('ok')
        expect(json_response['redis_check']['status']).to eq('ok')
        expect(json_response['fs_shards_check']['status']).to eq('ok')
        expect(json_response['fs_shards_check']['labels']['shard']).to eq('default')
      end
    end

    include_examples 'hidden without token', :readiness
  end

  describe '#liveness' do
    context 'authorization token provided' do
      include_context 'token header is set'

      it 'returns proper response' do
        get :liveness

        expect(json_response['db_check']['status']).to eq('ok')
        expect(json_response['redis_check']['status']).to eq('ok')
        expect(json_response['fs_shards_check']['status']).to eq('ok')
      end
    end

    include_examples 'hidden without token', :liveness
  end

  describe '#metrics' do
    context 'authorization token provided' do
      include_context 'token header is set'

      it 'returns DB ping metrics' do
        get :metrics

        expect(response.body).to match(/^db_ping_timeout 0$/)
        expect(response.body).to match(/^db_ping_success 1$/)
        expect(response.body).to match(/^db_ping_latency [0-9\.]+$/)
      end

      it 'returns Redis ping metrics' do
        get :metrics

        expect(response.body).to match(/^redis_ping_timeout 0$/)
        expect(response.body).to match(/^redis_ping_success 1$/)
        expect(response.body).to match(/^redis_ping_latency [0-9\.]+$/)
      end

      it 'returns file system check metrics' do
        get :metrics

        expect(response.body).to match(/^filesystem_access_latency{shard="default"} [0-9\.]+$/)
        expect(response.body).to match(/^filesystem_accessible{shard="default"} 1$/)
        expect(response.body).to match(/^filesystem_write_latency{shard="default"} [0-9\.]+$/)
        expect(response.body).to match(/^filesystem_writable{shard="default"} 1$/)
        expect(response.body).to match(/^filesystem_read_latency{shard="default"} [0-9\.]+$/)
        expect(response.body).to match(/^filesystem_readable{shard="default"} 1$/)
      end
    end

    include_examples 'hidden without token', :metrics
  end
end
require 'spec_helper'

describe Gitlab::HealthChecks::FsShardsCheck do
  let(:metric_class) { Gitlab::HealthChecks::Metric }
  let(:result_class) { Gitlab::HealthChecks::Result }
  let(:repository_storages) { [:default] }
  let(:tmp_dir) { Dir.mktmpdir }

  let(:storages_paths) do
    { default: { path: tmp_dir } }.with_indifferent_access
  end

  before do
    allow(described_class).to receive(:repository_storages) { repository_storages }
    allow(described_class).to receive(:storages_paths) { storages_paths }
  end

  after do
    FileUtils.remove_entry_secure(tmp_dir) if Dir.exist?(tmp_dir)
  end

  # Points the only storage at a directory that does not exist.
  shared_context 'storage path is missing' do
    let(:storages_paths) do
      { default: { path: 'tmp/this/path/doesnt/exist' } }.with_indifferent_access
    end
  end

  # Makes sure the temporary storage directory is readable and writable.
  shared_context 'storage path is accessible' do
    before do
      FileUtils.chmod_R(0755, tmp_dir)
    end
  end

  # A positive latency is reported for each operation regardless of outcome.
  shared_examples 'latency metrics' do
    [:filesystem_access_latency, :filesystem_read_latency, :filesystem_write_latency].each do |latency_name|
      it { is_expected.to include(have_attributes(name: latency_name, value: be > 0, labels: { shard: :default })) }
    end
  end

  shared_examples 'filesystem checks' do
    describe '#readiness' do
      subject { described_class.readiness }

      context 'storage points to not existing folder' do
        include_context 'storage path is missing'

        it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) }
      end

      context 'storage points to directory that has both read and write rights' do
        include_context 'storage path is accessible'

        it { is_expected.to include(result_class.new(true, nil, shard: :default)) }

        it 'cleans up files used for testing' do
          expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original

          subject

          expect(Dir.entries(tmp_dir).count).to eq(2)
        end

        context 'read test fails' do
          before do
            allow(described_class).to receive(:storage_read_test).with(any_args).and_return(false)
          end

          it { is_expected.to include(result_class.new(false, 'cannot read from storage', shard: :default)) }
        end

        context 'write test fails' do
          before do
            allow(described_class).to receive(:storage_write_test).with(any_args).and_return(false)
          end

          it { is_expected.to include(result_class.new(false, 'cannot write to storage', shard: :default)) }
        end
      end
    end

    describe '#metrics' do
      subject { described_class.metrics }

      context 'storage points to not existing folder' do
        include_context 'storage path is missing'

        it { is_expected.to include(metric_class.new(:filesystem_accessible, 0, shard: :default)) }
        it { is_expected.to include(metric_class.new(:filesystem_readable, 0, shard: :default)) }
        it { is_expected.to include(metric_class.new(:filesystem_writable, 0, shard: :default)) }

        include_examples 'latency metrics'
      end

      context 'storage points to directory that has both read and write rights' do
        include_context 'storage path is accessible'

        it { is_expected.to include(metric_class.new(:filesystem_accessible, 1, shard: :default)) }
        it { is_expected.to include(metric_class.new(:filesystem_readable, 1, shard: :default)) }
        it { is_expected.to include(metric_class.new(:filesystem_writable, 1, shard: :default)) }

        include_examples 'latency metrics'
      end
    end
  end

  context 'when popen always finds required binaries' do
    before do
      allow(Gitlab::Popen).to receive(:popen).and_wrap_original do |method, *args, &block|
        begin
          method.call(*args, &block)
        rescue RuntimeError
          raise 'expected not to happen'
        end
      end
    end

    it_behaves_like 'filesystem checks'
  end

  context 'when popen never finds required binaries' do
    before do
      allow(Gitlab::Popen).to receive(:popen).and_raise(Errno::ENOENT)
    end

    it_behaves_like 'filesystem checks'
  end
end
# Shared examples for checks built on SimpleAbstractCheck.
#
# metrics_prefix - prefix of the metric names the check reports
# check_name     - the check's human readable name, as used in messages
# success_result - value of the private `check` method that means success
shared_context 'simple_check' do |metrics_prefix, check_name, success_result|
  describe '#metrics' do
    subject { described_class.metrics }

    context 'Check is passing' do
      before do
        allow(described_class).to receive(:check).and_return success_result
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 1)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end

    context 'Check is misbehaving' do
      before do
        allow(described_class).to receive(:check).and_return 'error!'
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end

    context 'Check is timeouting' do
      before do
        allow(described_class).to receive(:check).and_return Timeout::Error.new
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 1)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end
  end

  describe '#readiness' do
    subject { described_class.readiness }

    context 'Check returns ok' do
      before do
        allow(described_class).to receive(:check).and_return success_result
      end

      it { is_expected.to have_attributes(success: true) }
    end

    context 'Check is misbehaving' do
      before do
        allow(described_class).to receive(:check).and_return 'error!'
      end

      it { is_expected.to have_attributes(success: false, message: "unexpected #{check_name} check result: error!") }
    end

    context 'Check is timeouting' do
      before do
        allow(described_class).to receive(:check).and_return Timeout::Error.new
      end

      it { is_expected.to have_attributes(success: false, message: "#{check_name} check timed out") }
    end
  end

  describe '#liveness' do
    # Exercise liveness itself; the subject previously called readiness, so
    # this group never covered the method it is named after.
    subject { described_class.liveness }

    it { is_expected.to eq(Gitlab::HealthChecks::Result.new(true)) }
  end
end