Add a gRPC health check to ensure Gitaly is up

This will enable Geo to skip shards that not operational.

Relates to gitlab-org/gitlab-ee#4329
This commit is contained in:
Stan Hu 2018-01-21 15:34:09 -08:00
parent 44728e0527
commit 8671261309
8 changed files with 220 additions and 4 deletions

View File

@ -8,7 +8,8 @@ class HealthController < ActionController::Base
Gitlab::HealthChecks::Redis::CacheCheck,
Gitlab::HealthChecks::Redis::QueuesCheck,
Gitlab::HealthChecks::Redis::SharedStateCheck,
Gitlab::HealthChecks::FsShardsCheck
Gitlab::HealthChecks::FsShardsCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
def readiness

View File

@ -0,0 +1,5 @@
---
title: Add a gRPC health check to ensure Gitaly is up
merge_request:
author:
type: added

View File

@ -1,6 +1,8 @@
require 'base64'
require 'gitaly'
require 'grpc/health/v1/health_pb'
require 'grpc/health/v1/health_services_pb'
module Gitlab
module GitalyClient
@ -69,14 +71,27 @@ module Gitlab
@stubs ||= {}
@stubs[storage] ||= {}
@stubs[storage][name] ||= begin
klass = Gitaly.const_get(name.to_s.camelcase.to_sym).const_get(:Stub)
addr = address(storage)
addr = addr.sub(%r{^tcp://}, '') if URI(addr).scheme == 'tcp'
klass = stub_class(name)
addr = stub_address(storage)
klass.new(addr, :this_channel_is_insecure)
end
end
end
def self.stub_class(name)
if name == :health_check
Grpc::Health::V1::Health::Stub
else
Gitaly.const_get(name.to_s.camelcase.to_sym).const_get(:Stub)
end
end
def self.stub_address(storage)
addr = address(storage)
addr = addr.sub(%r{^tcp://}, '') if URI(addr).scheme == 'tcp'
addr
end
def self.clear_stubs!
MUTEX.synchronize do
@stubs = nil

View File

@ -0,0 +1,19 @@
module Gitlab
module GitalyClient
class HealthCheckService
def initialize(storage)
@storage = storage
end
# Sends a gRPC health ping to the Gitaly server for the storage shard.
def check
request = Grpc::Health::V1::HealthCheckRequest.new
response = GitalyClient.call(@storage, :health_check, :check, request, timeout: GitalyClient.fast_timeout)
{ success: response&.status == :SERVING }
rescue GRPC::BadStatus => e
{ success: false, message: e.to_s }
end
end
end
end

View File

@ -0,0 +1,53 @@
module Gitlab
module HealthChecks
class GitalyCheck
extend BaseAbstractCheck
METRIC_PREFIX = 'gitaly_health_check'.freeze
class << self
def readiness
repository_storages.map do |storage_name|
check(storage_name)
end
end
def metrics
repository_storages.flat_map do |storage_name|
result, elapsed = with_timing { check(storage_name) }
labels = { shard: storage_name }
[
metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels),
metric("#{metric_prefix}_latency_seconds", elapsed, **labels)
].flatten
end
end
def check(storage_name)
serv = Gitlab::GitalyClient::HealthCheckService.new(storage_name)
result = serv.check
HealthChecks::Result.new(result[:success], result[:message], shard: storage_name)
end
private
def metric_prefix
METRIC_PREFIX
end
def successful?(result)
result[:success]
end
def repository_storages
storages.keys
end
def storages
Gitlab.config.repositories.storages
end
end
end
end
end

View File

@ -0,0 +1,41 @@
require 'spec_helper'
describe Gitlab::GitalyClient::HealthCheckService do
let(:project) { create(:project) }
let(:storage_name) { project.repository_storage }
subject { described_class.new(storage_name) }
describe '#check' do
it 'successfully sends a health check request' do
expect(Gitlab::GitalyClient).to receive(:call).with(
storage_name,
:health_check,
:check,
instance_of(Grpc::Health::V1::HealthCheckRequest),
timeout: Gitlab::GitalyClient.fast_timeout).and_call_original
expect(subject.check).to eq({ success: true })
end
it 'receives an unsuccessful health check request' do
expect_any_instance_of(Grpc::Health::V1::Health::Stub)
.to receive(:check)
.and_return(double(status: false))
expect(subject.check).to eq({ success: false })
end
it 'gracefully handles gRPC error' do
expect(Gitlab::GitalyClient).to receive(:call).with(
storage_name,
:health_check,
:check,
instance_of(Grpc::Health::V1::HealthCheckRequest),
timeout: Gitlab::GitalyClient.fast_timeout)
.and_raise(GRPC::Unavailable.new('Connection refused'))
expect(subject.check).to eq({ success: false, message: '14:Connection refused' })
end
end
end

View File

@ -3,6 +3,31 @@ require 'spec_helper'
# We stub Gitaly in `spec/support/gitaly.rb` for other tests. We don't want
# those stubs while testing the GitalyClient itself.
describe Gitlab::GitalyClient, skip_gitaly_mock: true do
describe '.stub_class' do
it 'returns the gRPC health check stub' do
expect(described_class.stub_class(:health_check)).to eq(::Grpc::Health::V1::Health::Stub)
end
it 'returns a Gitaly stub' do
expect(described_class.stub_class(:ref_service)).to eq(::Gitaly::RefService::Stub)
end
end
describe '.stub_address' do
it 'returns the same result after being called multiple times' do
address = 'localhost:9876'
prefixed_address = "tcp://#{address}"
allow(Gitlab.config.repositories).to receive(:storages).and_return({
'default' => { 'gitaly_address' => prefixed_address }
})
2.times do
expect(described_class.stub_address('default')).to eq('localhost:9876')
end
end
end
describe '.stub' do
# Notice that this is referring to gRPC "stubs", not rspec stubs
before do

View File

@ -0,0 +1,57 @@
require 'spec_helper'
describe Gitlab::HealthChecks::GitalyCheck do
let(:result_class) { Gitlab::HealthChecks::Result }
let(:repository_storages) { ['default'] }
before do
allow(described_class).to receive(:repository_storages) { repository_storages }
end
describe '#readiness' do
subject { described_class.readiness }
before do
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
end
context 'Gitaly server is up' do
let(:gitaly_check) { double(check: { success: true }) }
it { is_expected.to eq([result_class.new(true, nil, shard: 'default')]) }
end
context 'Gitaly server is down' do
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
it { is_expected.to eq([result_class.new(false, 'Connection refused', shard: 'default')]) }
end
end
describe '#metrics' do
subject { described_class.metrics }
before do
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
end
context 'Gitaly server is up' do
let(:gitaly_check) { double(check: { success: true }) }
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: 'default' }))
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 1))
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_latency_seconds', value: be >= 0))
end
end
context 'Gitaly server is down' do
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
it 'provides metrics' do
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 0))
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_latency_seconds', value: be >= 0))
end
end
end
end