Gitaly metrics check for read/writeability

Prior to this change, the health checks only tested writeability of the
NFS shards. Given we're moving away from those, this patch extends the
Gitaly checks to verify both readability and writeability.
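
As a quick sketch of the new surface (illustrative only; 'default' is an
assumed storage name from the instance configuration), Gitaly::Server now
answers the read/write probes directly:

server = Gitaly::Server.new('default')

server.readable?       # true when the storage reports it can be read
server.writeable?      # true when the storage reports it can be written
server.read_writeable? # readable? && writeable?, which the health metrics use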

Some dashboards may break, as these metrics will disappear over time once
Prometheus no longer receives the data. Observability of the circuit
breaker is reduced, but it's not expected to be enabled, and the circuit
breaker itself is being removed soon anyway.

Closes https://gitlab.com/gitlab-org/gitaly/issues/1218
author Zeger-Jan van de Weg 2018-06-20 10:21:59 +02:00
parent 292cf66890
commit 65840591cd
15 changed files with 81 additions and 393 deletions

View File

@@ -1 +1 @@
0.107.0
0.108.0

View File

@@ -418,7 +418,7 @@ group :ed25519 do
end
# Gitaly GRPC client
gem 'gitaly-proto', '~> 0.102.0', require: 'gitaly'
gem 'gitaly-proto', '~> 0.103.0', require: 'gitaly'
gem 'grpc', '~> 1.11.0'
# Locked until https://github.com/google/protobuf/issues/4210 is closed

View File

@@ -282,7 +282,7 @@ GEM
gettext_i18n_rails (>= 0.7.1)
po_to_json (>= 1.0.0)
rails (>= 3.2.0)
gitaly-proto (0.102.0)
gitaly-proto (0.103.0)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
@@ -1041,7 +1041,7 @@ DEPENDENCIES
gettext (~> 3.2.2)
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.102.0)
gitaly-proto (~> 0.103.0)
github-linguist (~> 5.3.3)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)

View File

@@ -285,7 +285,7 @@ GEM
gettext_i18n_rails (>= 0.7.1)
po_to_json (>= 1.0.0)
rails (>= 3.2.0)
gitaly-proto (0.102.0)
gitaly-proto (0.103.0)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
@@ -1051,7 +1051,7 @@ DEPENDENCIES
gettext (~> 3.2.2)
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.102.0)
gitaly-proto (~> 0.103.0)
github-linguist (~> 5.3.3)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)

View File

@@ -8,7 +8,6 @@ class HealthController < ActionController::Base
Gitlab::HealthChecks::Redis::CacheCheck,
Gitlab::HealthChecks::Redis::QueuesCheck,
Gitlab::HealthChecks::Redis::SharedStateCheck,
Gitlab::HealthChecks::FsShardsCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
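
FsShardsCheck is gone from the controller's check list above, so readiness
payloads lose their fs_shards_check entry while keeping gitaly_check. Per
the controller spec further down, a healthy instance now answers roughly
like this (a sketch; only the keys asserted in the spec are shown, values
assumed):

{
  "cache_check"        => { "status" => "ok" },
  "queues_check"       => { "status" => "ok" },
  "shared_state_check" => { "status" => "ok" },
  "gitaly_check"       => { "status" => "ok" } # replaces the old fs_shards_check entry
}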

View File

@@ -6,7 +6,8 @@ class MetricsService
Gitlab::HealthChecks::Redis::RedisCheck,
Gitlab::HealthChecks::Redis::CacheCheck,
Gitlab::HealthChecks::Redis::QueuesCheck,
Gitlab::HealthChecks::Redis::SharedStateCheck
Gitlab::HealthChecks::Redis::SharedStateCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
def prometheus_metrics_text
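
Adding GitalyCheck here is what makes the gitaly_* series appear on the
metrics endpoint: every check in the list responds to .metrics, and each
metric (name, labels, value) is rendered in the Prometheus text exposition
format that the controller spec below asserts. A rough sketch of that
rendering (not the actual MetricsService code, just the shape of the data;
values illustrative):

Gitlab::HealthChecks::GitalyCheck.metrics.map do |metric|
  labels = metric.labels.map { |key, value| %(#{key}="#{value}") }.join(',')
  "#{metric.name}{#{labels}} #{metric.value}"
end
# => ["gitaly_health_check_success{shard=\"default\"} 1",
#     "gitaly_health_check_latency_seconds{shard=\"default\"} 0.005"]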

View File

@@ -0,0 +1,5 @@
---
title: Gitaly metrics check for read/writeability
merge_request: 20022
author:
type: other

View File

@@ -22,6 +22,18 @@ module Gitaly
server_version == Gitlab::GitalyClient.expected_server_version
end
def read_writeable?
readable? && writeable?
end
def readable?
storage_status&.readable
end
def writeable?
storage_status&.writeable
end
def address
Gitlab::GitalyClient.address(@storage)
rescue RuntimeError => e
@@ -30,13 +42,17 @@ module Gitaly
private
def storage_status
@storage_status ||= info.storage_statuses.find { |s| s.storage_name == storage }
end
def info
@info ||=
begin
Gitlab::GitalyClient::ServerService.new(@storage).info
rescue GRPC::Unavailable, GRPC::GRPC::DeadlineExceeded
# This will show the server as being out of date
Gitaly::ServerInfoResponse.new(git_version: '', server_version: '')
Gitaly::ServerInfoResponse.new(git_version: '', server_version: '', storage_statuses: [])
end
end
end
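
When the Gitaly node can't be reached, the rescue above substitutes an
empty ServerInfoResponse (now with an explicit empty storage_statuses
list), so storage_status finds nothing and the safe-navigation calls
degrade to a falsy result instead of raising. Roughly, assuming the node
behind 'default' is down:

server = Gitaly::Server.new('default')

server.readable?       # => nil, no matching storage status in the empty response
server.writeable?      # => nil
server.read_writeable? # => falsy, so the health check reports the shard as failing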

View File

@@ -1,169 +0,0 @@
module Gitlab
module HealthChecks
# Gitaly migration: https://gitlab.com/gitlab-org/gitaly/issues/1218
class FsShardsCheck
extend BaseAbstractCheck
RANDOM_STRING = SecureRandom.hex(1000).freeze
COMMAND_TIMEOUT = '1'.freeze
TIMEOUT_EXECUTABLE = 'timeout'.freeze
class << self
def readiness
repository_storages.map do |storage_name|
begin
if !storage_circuitbreaker_test(storage_name)
HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
elsif !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
else
with_temp_file(storage_name) do |tmp_file_path|
if !storage_write_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
elsif !storage_read_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
else
HealthChecks::Result.new(true, nil, shard: storage_name)
end
end
end
rescue RuntimeError => ex
message = "unexpected error #{ex} when checking storage #{storage_name}"
Rails.logger.error(message)
HealthChecks::Result.new(false, message, shard: storage_name)
end
end
end
def metrics
repository_storages.flat_map do |storage_name|
[
storage_stat_metrics(storage_name),
storage_write_metrics(storage_name),
storage_read_metrics(storage_name),
storage_circuitbreaker_metrics(storage_name)
].flatten
end
end
private
def operation_metrics(ok_metric, latency_metric, **labels)
result, elapsed = yield
[
metric(latency_metric, elapsed, **labels),
metric(ok_metric, result ? 1 : 0, **labels)
]
rescue RuntimeError => ex
Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
[metric(ok_metric, 0, **labels)]
end
def repository_storages
storages_paths.keys
end
def storages_paths
Gitlab.config.repositories.storages
end
def exec_with_timeout(cmd_args, *args, &block)
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
end
def with_temp_file(storage_name)
temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
yield temp_file_path
ensure
delete_test_file(temp_file_path)
end
def storage_path(storage_name)
Gitlab::GitalyClient::StorageSettings.allow_disk_access do
storages_paths[storage_name]&.legacy_disk_path
end
end
# All below test methods use shell commands to perform actions on storage volumes.
# In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
# we can rely on shell commands to be terminated once `timeout` kills them.
#
# However we also fallback to pure Ruby file operations in case a specific shell command is missing
# so we are still able to perform healthchecks and gather metrics from such system.
def delete_test_file(tmp_path)
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
status.zero?
rescue Errno::ENOENT
File.delete(tmp_path) rescue Errno::ENOENT
end
def storage_stat_test(storage_name)
stat_path = File.join(storage_path(storage_name), '.')
begin
_, status = exec_with_timeout(%W{ stat #{stat_path} })
status.zero?
rescue Errno::ENOENT
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
end
end
def storage_write_test(tmp_path)
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
stdin.write(RANDOM_STRING)
end
status.zero?
rescue Errno::ENOENT
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
written_bytes == RANDOM_STRING.length
end
def storage_read_test(tmp_path)
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
stdin.write(RANDOM_STRING)
end
status.zero?
rescue Errno::ENOENT
file_contents = File.read(tmp_path) rescue Errno::ENOENT
file_contents == RANDOM_STRING
end
def storage_circuitbreaker_test(storage_name)
Gitlab::Git::Storage::CircuitBreaker.build(storage_name).perform { "OK" }
rescue Gitlab::Git::Storage::Inaccessible
nil
end
def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) }
end
end
def storage_write_metrics(storage_name)
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
with_timing { storage_write_test(tmp_file_path) }
end
end
end
def storage_read_metrics(storage_name)
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
storage_write_test(tmp_file_path) # writes data used by read test
with_timing { storage_read_test(tmp_file_path) }
end
end
end
def storage_circuitbreaker_metrics(storage_name)
operation_metrics(:filesystem_circuitbreaker,
:filesystem_circuitbreaker_latency_seconds,
shard: storage_name) do
with_timing { storage_circuitbreaker_test(storage_name) }
end
end
end
end
end
end

View File

@@ -13,14 +13,14 @@ module Gitlab
end
def metrics
repository_storages.flat_map do |storage_name|
result, elapsed = with_timing { check(storage_name) }
labels = { shard: storage_name }
Gitaly::Server.all.flat_map do |server|
result, elapsed = with_timing { server.read_writeable? }
labels = { shard: server.storage }
[
metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels),
metric("#{metric_prefix}_success", result ? 1 : 0, **labels),
metric("#{metric_prefix}_latency_seconds", elapsed, **labels)
].flatten
]
end
end
@@ -36,10 +36,6 @@ module Gitlab
METRIC_PREFIX
end
def successful?(result)
result[:success]
end
def repository_storages
storages.keys
end
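
Each configured storage thus yields one success/latency pair, with success
driven by read_writeable? on its Gitaly::Server. Illustratively (values
assumed; 'broken' mirrors the storage name used in the specs):

Gitlab::HealthChecks::GitalyCheck.metrics
# => one metric pair per shard, e.g.
#    gitaly_health_check_success{shard="default"} 1          (read_writeable? true)
#    gitaly_health_check_latency_seconds{shard="default"} 0.004
#    gitaly_health_check_success{shard="broken"} 0           (Gitaly unreachable)
#    gitaly_health_check_latency_seconds{shard="broken"} 0.001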

View File

@@ -69,8 +69,7 @@ describe HealthController do
expect(json_response['cache_check']['status']).to eq('ok')
expect(json_response['queues_check']['status']).to eq('ok')
expect(json_response['shared_state_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['labels']['shard']).to eq('default')
expect(json_response['gitaly_check']['status']).to eq('ok')
end
end
@@ -122,7 +121,6 @@ describe HealthController do
expect(json_response['cache_check']['status']).to eq('ok')
expect(json_response['queues_check']['status']).to eq('ok')
expect(json_response['shared_state_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
end
end

View File

@@ -59,6 +59,13 @@ describe MetricsController do
expect(response.body).to match(/^redis_shared_state_ping_latency_seconds [0-9\.]+$/)
end
it 'returns Gitaly metrics' do
get :index
expect(response.body).to match(/^gitaly_health_check_success{shard="default"} 1$/)
expect(response.body).to match(/^gitaly_health_check_latency_seconds{shard="default"} [0-9\.]+$/)
end
context 'prometheus metrics are disabled' do
before do
allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(false)

View File

@@ -1,6 +1,8 @@
require 'spec_helper'
describe Gitaly::Server do
let(:server) { described_class.new('default') }
describe '.all' do
let(:storages) { Gitlab.config.repositories.storages }
@@ -17,6 +19,38 @@ describe Gitaly::Server do
it { is_expected.to respond_to(:up_to_date?) }
it { is_expected.to respond_to(:address) }
describe 'readable?' do
context 'when the storage is readable' do
it 'returns true' do
expect(server).to be_readable
end
end
context 'when the storage is not readable' do
let(:server) { described_class.new('broken') }
it 'returns false' do
expect(server).not_to be_readable
end
end
end
describe 'writeable?' do
context 'when the storage is writeable' do
it 'returns true' do
expect(server).to be_writeable
end
end
context 'when the storage is not writeable' do
let(:server) { described_class.new('broken') }
it 'returns false' do
expect(server).not_to be_writeable
end
end
end
describe 'request memoization' do
context 'when requesting multiple properties', :request_store do
it 'uses memoization for the info request' do

View File

@@ -1,200 +0,0 @@
require 'spec_helper'
describe Gitlab::HealthChecks::FsShardsCheck do
def command_exists?(command)
_, status = Gitlab::Popen.popen(%W{ #{command} 1 echo })
status.zero?
rescue Errno::ENOENT
false
end
def timeout_command
@timeout_command ||=
if command_exists?('timeout')
'timeout'
elsif command_exists?('gtimeout')
'gtimeout'
else
''
end
end
let(:metric_class) { Gitlab::HealthChecks::Metric }
let(:result_class) { Gitlab::HealthChecks::Result }
let(:repository_storages) { ['default'] }
let(:tmp_dir) { Dir.mktmpdir }
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => tmp_dir)
}.with_indifferent_access
end
before do
allow(described_class).to receive(:repository_storages) { repository_storages }
allow(described_class).to receive(:storages_paths) { storages_paths }
stub_const('Gitlab::HealthChecks::FsShardsCheck::TIMEOUT_EXECUTABLE', timeout_command)
end
after do
FileUtils.remove_entry_secure(tmp_dir) if Dir.exist?(tmp_dir)
end
shared_examples 'filesystem checks' do
describe '#readiness' do
subject { described_class.readiness }
context 'storage has a tripped circuitbreaker', :broken_storage do
let(:repository_storages) { ['broken'] }
let(:storages_paths) do
Gitlab.config.repositories.storages
end
it { is_expected.to include(result_class.new(false, 'circuitbreaker tripped', shard: 'broken')) }
end
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => 'tmp/this/path/doesnt/exist')
}.with_indifferent_access
end
before do
allow(described_class).to receive(:storage_circuitbreaker_test) { true }
end
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: 'default')) }
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it { is_expected.to include(result_class.new(true, nil, shard: 'default')) }
it 'cleans up files used for testing' do
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
expect { subject }.not_to change(Dir.entries(tmp_dir), :count)
end
context 'read test fails' do
before do
allow(described_class).to receive(:storage_read_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot read from storage', shard: 'default')) }
end
context 'write test fails' do
before do
allow(described_class).to receive(:storage_write_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot write to storage', shard: 'default')) }
end
end
end
describe '#metrics' do
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => 'tmp/this/path/doesnt/exist')
}.with_indifferent_access
end
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end
it 'cleans up files used for metrics' do
expect { described_class.metrics }.not_to change(Dir.entries(tmp_dir), :count)
end
end
end
end
context 'when timeout kills fs checks' do
before do
stub_const('Gitlab::HealthChecks::FsShardsCheck::COMMAND_TIMEOUT', '1')
allow(described_class).to receive(:exec_with_timeout).and_wrap_original { |m| m.call(%w(sleep 60)) }
FileUtils.chmod_R(0755, tmp_dir)
end
describe '#readiness' do
subject { described_class.readiness }
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: 'default')) }
end
describe '#metrics' do
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
end
end
end
context 'when popen always finds required binaries' do
before do
allow(described_class).to receive(:exec_with_timeout).and_wrap_original do |method, *args, &block|
begin
method.call(*args, &block)
rescue RuntimeError, Errno::ENOENT
raise 'expected not to happen'
end
end
stub_const('Gitlab::HealthChecks::FsShardsCheck::COMMAND_TIMEOUT', '10')
end
it_behaves_like 'filesystem checks'
end
context 'when popen never finds required binaries' do
before do
allow(Gitlab::Popen).to receive(:popen).and_raise(Errno::ENOENT)
end
it_behaves_like 'filesystem checks'
end
end

View File

@@ -30,13 +30,14 @@ describe Gitlab::HealthChecks::GitalyCheck do
describe '#metrics' do
subject { described_class.metrics }
let(:server) { double(storage: 'default', read_writeable?: up) }
before do
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
allow(Gitaly::Server).to receive(:new).and_return(server)
end
context 'Gitaly server is up' do
let(:gitaly_check) { double(check: { success: true }) }
let(:up) { true }
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: 'default' }))
@@ -46,7 +47,7 @@ end
end
context 'Gitaly server is down' do
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
let(:up) { false }
it 'provides metrics' do
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 0))