Implement backoff for the circuitbreaker
The circuitbreaker now has 2 failure modes:
- Backing off: This will raise the `Gitlab::Git::Storage::Failing` exception. Access to the shard is blocked temporarily.
- Circuit broken: This will raise the `Gitlab::Git::Storage::CircuitOpen` exception. Access to the shard will be blocked until the failures are reset.
This commit is contained in:
parent
1881d4f8ec
commit
430e767139
|
@ -16,17 +16,16 @@ module StorageHealthHelper
|
|||
def message_for_circuit_breaker(circuit_breaker)
|
||||
maximum_failures = circuit_breaker.failure_count_threshold
|
||||
current_failures = circuit_breaker.failure_count
|
||||
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
|
||||
|
||||
translation_params = { number_of_failures: current_failures,
|
||||
maximum_failures: maximum_failures,
|
||||
number_of_seconds: circuit_breaker.failure_wait_time }
|
||||
|
||||
if permanently_broken
|
||||
if circuit_breaker.circuit_broken?
|
||||
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
|
||||
"retry automatically. Reset storage information when the problem is "\
|
||||
"resolved.") % translation_params
|
||||
elsif circuit_breaker.circuit_broken?
|
||||
elsif circuit_breaker.backing_off?
|
||||
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
|
||||
"block access for %{number_of_seconds} seconds.") % translation_params
|
||||
else
|
||||
|
|
|
@ -12,6 +12,7 @@ module Gitlab
|
|||
|
||||
CircuitOpen = Class.new(Inaccessible)
|
||||
Misconfiguration = Class.new(Inaccessible)
|
||||
Failing = Class.new(Inaccessible)
|
||||
|
||||
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
|
||||
|
||||
|
|
|
@ -64,12 +64,20 @@ module Gitlab
|
|||
def circuit_broken?
|
||||
return false if no_failures?
|
||||
|
||||
recent_failure = last_failure > failure_wait_time.seconds.ago
|
||||
too_many_failures = failure_count > failure_count_threshold
|
||||
|
||||
recent_failure || too_many_failures
|
||||
failure_count > failure_count_threshold
|
||||
end
|
||||
|
||||
def backing_off?
|
||||
return false if no_failures?
|
||||
|
||||
recent_failure = last_failure > failure_wait_time.seconds.ago
|
||||
too_many_failures = failure_count > backoff_threshold
|
||||
|
||||
recent_failure && too_many_failures
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def failure_info
|
||||
@failure_info ||= get_failure_info
|
||||
end
|
||||
|
@ -94,7 +102,11 @@ module Gitlab
|
|||
|
||||
def check_storage_accessible!
|
||||
if circuit_broken?
|
||||
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
|
||||
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
|
||||
end
|
||||
|
||||
if backing_off?
|
||||
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
|
||||
end
|
||||
|
||||
unless storage_available?
|
||||
|
@ -131,12 +143,6 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
|
||||
def cache_key
|
||||
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def get_failure_info
|
||||
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
|
||||
redis.hmget(cache_key, :last_failure, :failure_count)
|
||||
|
@ -146,6 +152,10 @@ module Gitlab
|
|||
|
||||
FailureInfo.new(last_failure, failure_count.to_i)
|
||||
end
|
||||
|
||||
def cache_key
|
||||
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -18,6 +18,14 @@ module Gitlab
|
|||
application_settings.circuitbreaker_storage_timeout
|
||||
end
|
||||
|
||||
def access_retries
|
||||
application_settings.circuitbreaker_access_retries
|
||||
end
|
||||
|
||||
def backoff_threshold
|
||||
application_settings.circuitbreaker_backoff_threshold
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def application_settings
|
||||
|
|
|
@ -25,6 +25,10 @@ module Gitlab
|
|||
!!@error
|
||||
end
|
||||
|
||||
def backing_off?
|
||||
false
|
||||
end
|
||||
|
||||
def last_failure
|
||||
circuit_broken? ? Time.now : nil
|
||||
end
|
||||
|
|
|
@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
|
||||
circuitbreaker_failure_wait_time: 1,
|
||||
circuitbreaker_failure_reset_time: 2,
|
||||
circuitbreaker_storage_timeout: 3)
|
||||
circuitbreaker_storage_timeout: 3,
|
||||
circuitbreaker_access_retries: 4,
|
||||
circuitbreaker_backoff_threshold: 5)
|
||||
end
|
||||
|
||||
describe '#failure_count_threshold' do
|
||||
|
@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
expect(circuit_breaker.storage_timeout).to eq(3)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#access_retries' do
|
||||
it 'reads the value from settings' do
|
||||
expect(circuit_breaker.access_retries).to eq(4)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#backoff_threshold' do
|
||||
it 'reads the value from settings' do
|
||||
expect(circuit_breaker.backoff_threshold).to eq(5)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#perform' do
|
||||
it 'raises an exception with retry time when the circuit is open' do
|
||||
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
|
||||
it 'raises the correct exception when the circuit is open' do
|
||||
set_in_redis(:last_failure, 1.day.ago.to_f)
|
||||
set_in_redis(:failure_count, 999)
|
||||
|
||||
expect { |b| circuit_breaker.perform(&b) }
|
||||
.to raise_error(Gitlab::Git::Storage::CircuitOpen)
|
||||
.to raise_error do |exception|
|
||||
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
|
||||
expect(exception.retry_after).to eq(1800)
|
||||
end
|
||||
end
|
||||
|
||||
it 'raises the correct exception when backing off' do
|
||||
Timecop.freeze do
|
||||
set_in_redis(:last_failure, 1.second.ago.to_f)
|
||||
set_in_redis(:failure_count, 90)
|
||||
|
||||
expect { |b| circuit_breaker.perform(&b) }
|
||||
.to raise_error do |exception|
|
||||
expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
|
||||
expect(exception.retry_after).to eq(30)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
it 'yields the block' do
|
||||
|
@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
|
||||
it 'checks if the storage is available' do
|
||||
expect(circuit_breaker).to receive(:check_storage_accessible!)
|
||||
.and_call_original
|
||||
|
||||
circuit_breaker.perform { 'hello world' }
|
||||
end
|
||||
|
@ -137,6 +169,25 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
.to raise_error(Rugged::OSError)
|
||||
end
|
||||
|
||||
it 'tracks that the storage was accessible' do
|
||||
set_in_redis(:failure_count, 10)
|
||||
set_in_redis(:last_failure, Time.now.to_f)
|
||||
|
||||
circuit_breaker.perform { '' }
|
||||
|
||||
expect(value_from_redis(:failure_count).to_i).to eq(0)
|
||||
expect(value_from_redis(:last_failure)).to be_empty
|
||||
expect(circuit_breaker.failure_count).to eq(0)
|
||||
expect(circuit_breaker.last_failure).to be_nil
|
||||
end
|
||||
|
||||
it 'only accessibility check once' do
|
||||
expect(Gitlab::Git::Storage::ForkedStorageCheck)
|
||||
.to receive(:storage_available?).once.and_call_original
|
||||
|
||||
2.times { circuit_breaker.perform { '' } }
|
||||
end
|
||||
|
||||
context 'with the feature disabled' do
|
||||
it 'returns the block without checking accessibility' do
|
||||
stub_feature_flags(git_storage_circuit_breaker: false)
|
||||
|
@ -148,6 +199,31 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
expect(result).to eq('hello')
|
||||
end
|
||||
end
|
||||
|
||||
context 'the storage is not available' do
|
||||
let(:storage_name) { 'broken' }
|
||||
|
||||
it 'raises the correct exception' do
|
||||
expect(circuit_breaker).to receive(:track_storage_inaccessible)
|
||||
|
||||
expect { circuit_breaker.perform { '' } }
|
||||
.to raise_error do |exception|
|
||||
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
|
||||
expect(exception.retry_after).to eq(30)
|
||||
end
|
||||
end
|
||||
|
||||
it 'tracks that the storage was inaccessible' do
|
||||
Timecop.freeze do
|
||||
expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
|
||||
|
||||
expect(value_from_redis(:failure_count).to_i).to eq(1)
|
||||
expect(value_from_redis(:last_failure)).not_to be_empty
|
||||
expect(circuit_breaker.failure_count).to eq(1)
|
||||
expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#circuit_broken?' do
|
||||
|
@ -158,183 +234,40 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
expect(circuit_breaker.circuit_broken?).to be_falsey
|
||||
end
|
||||
|
||||
it 'is broken when there was a recent failure' do
|
||||
Timecop.freeze do
|
||||
set_in_redis(:last_failure, 1.second.ago.to_f)
|
||||
set_in_redis(:failure_count, 1)
|
||||
|
||||
expect(circuit_breaker.circuit_broken?).to be_truthy
|
||||
end
|
||||
end
|
||||
|
||||
it 'is broken when there are too many failures' do
|
||||
set_in_redis(:last_failure, 1.day.ago.to_f)
|
||||
set_in_redis(:failure_count, 200)
|
||||
|
||||
expect(circuit_breaker.circuit_broken?).to be_truthy
|
||||
end
|
||||
end
|
||||
|
||||
describe '#backing_off?' do
|
||||
it 'is true when there was a recent failure' do
|
||||
Timecop.freeze do
|
||||
set_in_redis(:last_failure, 1.second.ago.to_f)
|
||||
set_in_redis(:failure_count, 90)
|
||||
|
||||
expect(circuit_breaker.backing_off?).to be_truthy
|
||||
end
|
||||
end
|
||||
|
||||
context 'the `failure_wait_time` is set to 0' do
|
||||
before do
|
||||
stub_application_setting(circuitbreaker_failure_wait_time: 0)
|
||||
end
|
||||
|
||||
it 'is working even when there is a recent failure' do
|
||||
it 'is working even when there are failures' do
|
||||
Timecop.freeze do
|
||||
set_in_redis(:last_failure, 0.seconds.ago.to_f)
|
||||
set_in_redis(:failure_count, 1)
|
||||
set_in_redis(:failure_count, 90)
|
||||
|
||||
expect(circuit_breaker.circuit_broken?).to be_falsey
|
||||
expect(circuit_breaker.backing_off?).to be_falsey
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "storage_available?" do
|
||||
context 'the storage is available' do
|
||||
it 'tracks that the storage was accessible an raises the error' do
|
||||
expect(circuit_breaker).to receive(:track_storage_accessible)
|
||||
|
||||
circuit_breaker.storage_available?
|
||||
end
|
||||
|
||||
it 'only performs the check once' do
|
||||
expect(Gitlab::Git::Storage::ForkedStorageCheck)
|
||||
.to receive(:storage_available?).once.and_call_original
|
||||
|
||||
2.times { circuit_breaker.storage_available? }
|
||||
end
|
||||
end
|
||||
|
||||
context 'storage is not available' do
|
||||
let(:storage_name) { 'broken' }
|
||||
|
||||
it 'tracks that the storage was inaccessible' do
|
||||
expect(circuit_breaker).to receive(:track_storage_inaccessible)
|
||||
|
||||
circuit_breaker.storage_available?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#check_storage_accessible!' do
|
||||
it 'raises an exception with retry time when the circuit is open' do
|
||||
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
|
||||
|
||||
expect { circuit_breaker.check_storage_accessible! }
|
||||
.to raise_error do |exception|
|
||||
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
|
||||
expect(exception.retry_after).to eq(30)
|
||||
end
|
||||
end
|
||||
|
||||
context 'the storage is not available' do
|
||||
let(:storage_name) { 'broken' }
|
||||
|
||||
it 'raises an error' do
|
||||
expect(circuit_breaker).to receive(:track_storage_inaccessible)
|
||||
|
||||
expect { circuit_breaker.check_storage_accessible! }
|
||||
.to raise_error do |exception|
|
||||
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
|
||||
expect(exception.retry_after).to eq(30)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#track_storage_inaccessible' do
|
||||
around do |example|
|
||||
Timecop.freeze { example.run }
|
||||
end
|
||||
|
||||
it 'records the failure time in redis' do
|
||||
circuit_breaker.track_storage_inaccessible
|
||||
|
||||
failure_time = value_from_redis(:last_failure)
|
||||
|
||||
expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
|
||||
end
|
||||
|
||||
it 'sets the failure time on the breaker without reloading' do
|
||||
circuit_breaker.track_storage_inaccessible
|
||||
|
||||
expect(circuit_breaker).not_to receive(:get_failure_info)
|
||||
expect(circuit_breaker.last_failure).to eq(Time.now)
|
||||
end
|
||||
|
||||
it 'increments the failure count in redis' do
|
||||
set_in_redis(:failure_count, 10)
|
||||
|
||||
circuit_breaker.track_storage_inaccessible
|
||||
|
||||
expect(value_from_redis(:failure_count).to_i).to be(11)
|
||||
end
|
||||
|
||||
it 'increments the failure count on the breaker without reloading' do
|
||||
set_in_redis(:failure_count, 10)
|
||||
|
||||
circuit_breaker.track_storage_inaccessible
|
||||
|
||||
expect(circuit_breaker).not_to receive(:get_failure_info)
|
||||
expect(circuit_breaker.failure_count).to eq(11)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#track_storage_accessible' do
|
||||
it 'sets the failure count to zero in redis' do
|
||||
set_in_redis(:failure_count, 10)
|
||||
|
||||
circuit_breaker.track_storage_accessible
|
||||
|
||||
expect(value_from_redis(:failure_count).to_i).to be(0)
|
||||
end
|
||||
|
||||
it 'sets the failure count to zero on the breaker without reloading' do
|
||||
set_in_redis(:failure_count, 10)
|
||||
|
||||
circuit_breaker.track_storage_accessible
|
||||
|
||||
expect(circuit_breaker).not_to receive(:get_failure_info)
|
||||
expect(circuit_breaker.failure_count).to eq(0)
|
||||
end
|
||||
|
||||
it 'removes the last failure time from redis' do
|
||||
set_in_redis(:last_failure, Time.now.to_i)
|
||||
|
||||
circuit_breaker.track_storage_accessible
|
||||
|
||||
expect(circuit_breaker).not_to receive(:get_failure_info)
|
||||
expect(circuit_breaker.last_failure).to be_nil
|
||||
end
|
||||
|
||||
it 'removes the last failure time from the breaker without reloading' do
|
||||
set_in_redis(:last_failure, Time.now.to_i)
|
||||
|
||||
circuit_breaker.track_storage_accessible
|
||||
|
||||
expect(value_from_redis(:last_failure)).to be_empty
|
||||
end
|
||||
|
||||
it 'wont connect to redis when there are no failures' do
|
||||
expect(Gitlab::Git::Storage.redis).to receive(:with).once
|
||||
.and_call_original
|
||||
expect(circuit_breaker).to receive(:track_storage_accessible)
|
||||
.and_call_original
|
||||
|
||||
circuit_breaker.track_storage_accessible
|
||||
end
|
||||
end
|
||||
|
||||
describe '#no_failures?' do
|
||||
it 'is false when a failure was tracked' do
|
||||
set_in_redis(:last_failure, Time.now.to_i)
|
||||
set_in_redis(:failure_count, 1)
|
||||
|
||||
expect(circuit_breaker.no_failures?).to be_falsey
|
||||
end
|
||||
end
|
||||
|
||||
describe '#last_failure' do
|
||||
it 'returns the last failure time' do
|
||||
time = Time.parse("2017-05-26 17:52:30")
|
||||
|
@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
|
|||
expect(circuit_breaker.failure_count).to eq(7)
|
||||
end
|
||||
end
|
||||
|
||||
describe '#cache_key' do
|
||||
it 'includes storage and host' do
|
||||
expect(circuit_breaker.cache_key).to eq(cache_key)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
|
|||
ours = described_class.public_instance_methods
|
||||
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
|
||||
|
||||
# These methods are not part of the public API, but are public to allow the
|
||||
# CircuitBreaker specs to operate. They should be made private over time.
|
||||
exceptions = %i[
|
||||
cache_key
|
||||
check_storage_accessible!
|
||||
no_failures?
|
||||
storage_available?
|
||||
track_storage_accessible
|
||||
track_storage_inaccessible
|
||||
]
|
||||
|
||||
expect(theirs - ours).to contain_exactly(*exceptions)
|
||||
expect(theirs - ours).to be_empty
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue