Add a gRPC health check to ensure Gitaly is up
This will enable Geo to skip shards that not operational. Relates to gitlab-org/gitlab-ee#4329
This commit is contained in:
parent
44728e0527
commit
8671261309
|
@ -8,7 +8,8 @@ class HealthController < ActionController::Base
|
||||||
Gitlab::HealthChecks::Redis::CacheCheck,
|
Gitlab::HealthChecks::Redis::CacheCheck,
|
||||||
Gitlab::HealthChecks::Redis::QueuesCheck,
|
Gitlab::HealthChecks::Redis::QueuesCheck,
|
||||||
Gitlab::HealthChecks::Redis::SharedStateCheck,
|
Gitlab::HealthChecks::Redis::SharedStateCheck,
|
||||||
Gitlab::HealthChecks::FsShardsCheck
|
Gitlab::HealthChecks::FsShardsCheck,
|
||||||
|
Gitlab::HealthChecks::GitalyCheck
|
||||||
].freeze
|
].freeze
|
||||||
|
|
||||||
def readiness
|
def readiness
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
title: Add a gRPC health check to ensure Gitaly is up
|
||||||
|
merge_request:
|
||||||
|
author:
|
||||||
|
type: added
|
|
@ -1,6 +1,8 @@
|
||||||
require 'base64'
|
require 'base64'
|
||||||
|
|
||||||
require 'gitaly'
|
require 'gitaly'
|
||||||
|
require 'grpc/health/v1/health_pb'
|
||||||
|
require 'grpc/health/v1/health_services_pb'
|
||||||
|
|
||||||
module Gitlab
|
module Gitlab
|
||||||
module GitalyClient
|
module GitalyClient
|
||||||
|
@ -69,14 +71,27 @@ module Gitlab
|
||||||
@stubs ||= {}
|
@stubs ||= {}
|
||||||
@stubs[storage] ||= {}
|
@stubs[storage] ||= {}
|
||||||
@stubs[storage][name] ||= begin
|
@stubs[storage][name] ||= begin
|
||||||
klass = Gitaly.const_get(name.to_s.camelcase.to_sym).const_get(:Stub)
|
klass = stub_class(name)
|
||||||
addr = address(storage)
|
addr = stub_address(storage)
|
||||||
addr = addr.sub(%r{^tcp://}, '') if URI(addr).scheme == 'tcp'
|
|
||||||
klass.new(addr, :this_channel_is_insecure)
|
klass.new(addr, :this_channel_is_insecure)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.stub_class(name)
|
||||||
|
if name == :health_check
|
||||||
|
Grpc::Health::V1::Health::Stub
|
||||||
|
else
|
||||||
|
Gitaly.const_get(name.to_s.camelcase.to_sym).const_get(:Stub)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.stub_address(storage)
|
||||||
|
addr = address(storage)
|
||||||
|
addr = addr.sub(%r{^tcp://}, '') if URI(addr).scheme == 'tcp'
|
||||||
|
addr
|
||||||
|
end
|
||||||
|
|
||||||
def self.clear_stubs!
|
def self.clear_stubs!
|
||||||
MUTEX.synchronize do
|
MUTEX.synchronize do
|
||||||
@stubs = nil
|
@stubs = nil
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
module Gitlab
|
||||||
|
module GitalyClient
|
||||||
|
class HealthCheckService
|
||||||
|
def initialize(storage)
|
||||||
|
@storage = storage
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sends a gRPC health ping to the Gitaly server for the storage shard.
|
||||||
|
def check
|
||||||
|
request = Grpc::Health::V1::HealthCheckRequest.new
|
||||||
|
response = GitalyClient.call(@storage, :health_check, :check, request, timeout: GitalyClient.fast_timeout)
|
||||||
|
|
||||||
|
{ success: response&.status == :SERVING }
|
||||||
|
rescue GRPC::BadStatus => e
|
||||||
|
{ success: false, message: e.to_s }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,53 @@
|
||||||
|
module Gitlab
|
||||||
|
module HealthChecks
|
||||||
|
class GitalyCheck
|
||||||
|
extend BaseAbstractCheck
|
||||||
|
|
||||||
|
METRIC_PREFIX = 'gitaly_health_check'.freeze
|
||||||
|
|
||||||
|
class << self
|
||||||
|
def readiness
|
||||||
|
repository_storages.map do |storage_name|
|
||||||
|
check(storage_name)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def metrics
|
||||||
|
repository_storages.flat_map do |storage_name|
|
||||||
|
result, elapsed = with_timing { check(storage_name) }
|
||||||
|
labels = { shard: storage_name }
|
||||||
|
|
||||||
|
[
|
||||||
|
metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels),
|
||||||
|
metric("#{metric_prefix}_latency_seconds", elapsed, **labels)
|
||||||
|
].flatten
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def check(storage_name)
|
||||||
|
serv = Gitlab::GitalyClient::HealthCheckService.new(storage_name)
|
||||||
|
result = serv.check
|
||||||
|
HealthChecks::Result.new(result[:success], result[:message], shard: storage_name)
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def metric_prefix
|
||||||
|
METRIC_PREFIX
|
||||||
|
end
|
||||||
|
|
||||||
|
def successful?(result)
|
||||||
|
result[:success]
|
||||||
|
end
|
||||||
|
|
||||||
|
def repository_storages
|
||||||
|
storages.keys
|
||||||
|
end
|
||||||
|
|
||||||
|
def storages
|
||||||
|
Gitlab.config.repositories.storages
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,41 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Gitlab::GitalyClient::HealthCheckService do
|
||||||
|
let(:project) { create(:project) }
|
||||||
|
let(:storage_name) { project.repository_storage }
|
||||||
|
|
||||||
|
subject { described_class.new(storage_name) }
|
||||||
|
|
||||||
|
describe '#check' do
|
||||||
|
it 'successfully sends a health check request' do
|
||||||
|
expect(Gitlab::GitalyClient).to receive(:call).with(
|
||||||
|
storage_name,
|
||||||
|
:health_check,
|
||||||
|
:check,
|
||||||
|
instance_of(Grpc::Health::V1::HealthCheckRequest),
|
||||||
|
timeout: Gitlab::GitalyClient.fast_timeout).and_call_original
|
||||||
|
|
||||||
|
expect(subject.check).to eq({ success: true })
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'receives an unsuccessful health check request' do
|
||||||
|
expect_any_instance_of(Grpc::Health::V1::Health::Stub)
|
||||||
|
.to receive(:check)
|
||||||
|
.and_return(double(status: false))
|
||||||
|
|
||||||
|
expect(subject.check).to eq({ success: false })
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'gracefully handles gRPC error' do
|
||||||
|
expect(Gitlab::GitalyClient).to receive(:call).with(
|
||||||
|
storage_name,
|
||||||
|
:health_check,
|
||||||
|
:check,
|
||||||
|
instance_of(Grpc::Health::V1::HealthCheckRequest),
|
||||||
|
timeout: Gitlab::GitalyClient.fast_timeout)
|
||||||
|
.and_raise(GRPC::Unavailable.new('Connection refused'))
|
||||||
|
|
||||||
|
expect(subject.check).to eq({ success: false, message: '14:Connection refused' })
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -3,6 +3,31 @@ require 'spec_helper'
|
||||||
# We stub Gitaly in `spec/support/gitaly.rb` for other tests. We don't want
|
# We stub Gitaly in `spec/support/gitaly.rb` for other tests. We don't want
|
||||||
# those stubs while testing the GitalyClient itself.
|
# those stubs while testing the GitalyClient itself.
|
||||||
describe Gitlab::GitalyClient, skip_gitaly_mock: true do
|
describe Gitlab::GitalyClient, skip_gitaly_mock: true do
|
||||||
|
describe '.stub_class' do
|
||||||
|
it 'returns the gRPC health check stub' do
|
||||||
|
expect(described_class.stub_class(:health_check)).to eq(::Grpc::Health::V1::Health::Stub)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'returns a Gitaly stub' do
|
||||||
|
expect(described_class.stub_class(:ref_service)).to eq(::Gitaly::RefService::Stub)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
describe '.stub_address' do
|
||||||
|
it 'returns the same result after being called multiple times' do
|
||||||
|
address = 'localhost:9876'
|
||||||
|
prefixed_address = "tcp://#{address}"
|
||||||
|
|
||||||
|
allow(Gitlab.config.repositories).to receive(:storages).and_return({
|
||||||
|
'default' => { 'gitaly_address' => prefixed_address }
|
||||||
|
})
|
||||||
|
|
||||||
|
2.times do
|
||||||
|
expect(described_class.stub_address('default')).to eq('localhost:9876')
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
describe '.stub' do
|
describe '.stub' do
|
||||||
# Notice that this is referring to gRPC "stubs", not rspec stubs
|
# Notice that this is referring to gRPC "stubs", not rspec stubs
|
||||||
before do
|
before do
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Gitlab::HealthChecks::GitalyCheck do
|
||||||
|
let(:result_class) { Gitlab::HealthChecks::Result }
|
||||||
|
let(:repository_storages) { ['default'] }
|
||||||
|
|
||||||
|
before do
|
||||||
|
allow(described_class).to receive(:repository_storages) { repository_storages }
|
||||||
|
end
|
||||||
|
|
||||||
|
describe '#readiness' do
|
||||||
|
subject { described_class.readiness }
|
||||||
|
|
||||||
|
before do
|
||||||
|
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'Gitaly server is up' do
|
||||||
|
let(:gitaly_check) { double(check: { success: true }) }
|
||||||
|
|
||||||
|
it { is_expected.to eq([result_class.new(true, nil, shard: 'default')]) }
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'Gitaly server is down' do
|
||||||
|
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
|
||||||
|
|
||||||
|
it { is_expected.to eq([result_class.new(false, 'Connection refused', shard: 'default')]) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
describe '#metrics' do
|
||||||
|
subject { described_class.metrics }
|
||||||
|
|
||||||
|
before do
|
||||||
|
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'Gitaly server is up' do
|
||||||
|
let(:gitaly_check) { double(check: { success: true }) }
|
||||||
|
|
||||||
|
it 'provides metrics' do
|
||||||
|
expect(subject).to all(have_attributes(labels: { shard: 'default' }))
|
||||||
|
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 1))
|
||||||
|
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_latency_seconds', value: be >= 0))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'Gitaly server is down' do
|
||||||
|
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
|
||||||
|
|
||||||
|
it 'provides metrics' do
|
||||||
|
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 0))
|
||||||
|
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_latency_seconds', value: be >= 0))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue