Move the circuitbreaker check out in a separate process

Moving the check out of the general requests, makes sure we don't have any slowdown in the regular requests. To keep the process performing this checks small, the check is still performed inside a unicorn. But that is called from a process running on the same server. Because the checks are now done outside normal request, we can have a simpler failure strategy: The check is now performed in the background every `circuitbreaker_check_interval`. Failures are logged in redis. The failures are reset when the check succeeds. Per check we will try `circuitbreaker_access_retries` times within `circuitbreaker_storage_timeout` seconds. When the number of failures exceeds `circuitbreaker_failure_count_threshold`, we will block access to the storage. After `failure_reset_time` of no checks, we will clear the stored failures. This could happen when the process that performs the checks is not running.
2017-11-13 16:52:07 +01:00 · 2017-11-13 16:52:07 +01:00 · f1ae1e39ce
parent 12d33b883a
commit f1ae1e39ce
45 changed files with 983 additions and 362 deletions
--- a/app/controllers/admin/health_check_controller.rb
+++ b/app/controllers/admin/health_check_controller.rb
@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController
  end

  def reset_storage_health
-    Gitlab::Git::Storage::CircuitBreaker.reset_all!
+    Gitlab::Git::Storage::FailureInfo.reset_all!
    redirect_to admin_health_check_path,
                notice: _('Git storage health information has been reset')
  end
--- a/app/controllers/health_controller.rb
+++ b/app/controllers/health_controller.rb
@ -1,5 +1,5 @@
 class HealthController < ActionController::Base
-  protect_from_forgery with: :exception
+  protect_from_forgery with: :exception, except: :storage_check
  include RequiresWhitelistedMonitoringClient

  CHECKS = [
@ -23,6 +23,15 @@ class HealthController < ActionController::Base
    render_check_results(results)
  end

+  def storage_check
+    results = Gitlab::Git::Storage::Checker.check_all
+
+    render json: {
+             check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
+             results: results
+           }
+  end
+
  private

  def render_check_results(results)
--- a/app/helpers/application_settings_helper.rb
+++ b/app/helpers/application_settings_helper.rb
@ -124,17 +124,6 @@ module ApplicationSettingsHelper
    _('The number of attempts GitLab will make to access a storage.')
  end

-  def circuitbreaker_backoff_threshold_help_text
-    _("The number of failures after which GitLab will start temporarily "\
-      "disabling access to a storage shard on a host")
-  end
-
-  def circuitbreaker_failure_wait_time_help_text
-    _("When access to a storage fails. GitLab will prevent access to the "\
-      "storage for the time specified here. This allows the filesystem to "\
-      "recover. Repositories on failing shards are temporarly unavailable")
-  end
-
  def circuitbreaker_failure_reset_time_help_text
    _("The time in seconds GitLab will keep failure information. When no "\
      "failures occur during this time, information about the mount is reset.")
@ -145,6 +134,11 @@ module ApplicationSettingsHelper
      "timeout error will be raised.")
  end

+  def circuitbreaker_check_interval_help_text
+    _("The time in seconds between storage checks. When a previous check did "\
+      "complete yet, GitLab will skip a check.")
+  end
+
  def visible_attributes
    [
      :admin_notification_email,
@ -154,10 +148,9 @@ module ApplicationSettingsHelper
      :akismet_enabled,
      :auto_devops_enabled,
      :circuitbreaker_access_retries,
-      :circuitbreaker_backoff_threshold,
+      :circuitbreaker_check_interval,
      :circuitbreaker_failure_count_threshold,
      :circuitbreaker_failure_reset_time,
-      :circuitbreaker_failure_wait_time,
      :circuitbreaker_storage_timeout,
      :clientside_sentry_dsn,
      :clientside_sentry_enabled,
--- a/app/helpers/storage_health_helper.rb
+++ b/app/helpers/storage_health_helper.rb
@ -18,16 +18,12 @@ module StorageHealthHelper
    current_failures = circuit_breaker.failure_count

    translation_params = { number_of_failures: current_failures,
-                           maximum_failures: maximum_failures,
-                           number_of_seconds: circuit_breaker.failure_wait_time }
+                           maximum_failures: maximum_failures }

    if circuit_breaker.circuit_broken?
      s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
         "retry automatically. Reset storage information when the problem is "\
         "resolved.") % translation_params
-    elsif circuit_breaker.backing_off?
-      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
-        "block access for %{number_of_seconds} seconds.") % translation_params
    else
      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
        "allow access on the next attempt.") % translation_params
--- a/app/models/application_setting.rb
+++ b/app/models/application_setting.rb
@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base
            presence: true,
            numericality: { greater_than_or_equal_to: 0 }

-  validates :circuitbreaker_backoff_threshold,
-            :circuitbreaker_failure_count_threshold,
-            :circuitbreaker_failure_wait_time,
+  validates :circuitbreaker_failure_count_threshold,
            :circuitbreaker_failure_reset_time,
            :circuitbreaker_storage_timeout,
+            :circuitbreaker_check_interval,
            presence: true,
            numericality: { only_integer: true, greater_than_or_equal_to: 0 }

@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base
            presence: true,
            numericality: { only_integer: true, greater_than_or_equal_to: 1 }

-  validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
-    if value.to_i >= record.circuitbreaker_failure_count_threshold
-      record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
-                                "lower than the failure count threshold"))
-    end
-  end
-
  validates :gitaly_timeout_default,
            presence: true,
            numericality: { only_integer: true, greater_than_or_equal_to: 0 }
--- a/app/views/admin/application_settings/_form.html.haml
+++ b/app/views/admin/application_settings/_form.html.haml
@ -545,6 +545,12 @@

  %fieldset
    %legend Git Storage Circuitbreaker settings
+    .form-group
+      = f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2'
+      .col-sm-10
+        = f.number_field :circuitbreaker_check_interval, class: 'form-control'
+        .help-block
+          = circuitbreaker_check_interval_help_text
    .form-group
      = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
      .col-sm-10
@ -557,18 +563,6 @@
        = f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
        .help-block
          = circuitbreaker_storage_timeout_help_text
-    .form-group
-      = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
-      .col-sm-10
-        = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
-        .help-block
-          = circuitbreaker_backoff_threshold_help_text
-    .form-group
-      = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
-      .col-sm-10
-        = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
-        .help-block
-          = circuitbreaker_failure_wait_time_help_text
    .form-group
      = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
      .col-sm-10
--- a/bin/storage_check
+++ b/bin/storage_check
@ -0,0 +1,11 @@
+#!/usr/bin/env ruby
+
+require 'optparse'
+require 'net/http'
+require 'json'
+require 'socket'
+require 'logger'
+
+require_relative '../lib/gitlab/storage_check'
+
+Gitlab::StorageCheck::CLI.start!(ARGV)
--- a/changelogs/unreleased/bvl-circuitbreaker-process.yml
+++ b/changelogs/unreleased/bvl-circuitbreaker-process.yml
@ -0,0 +1,5 @@
+---
+title: Monitor NFS shards for circuitbreaker in a separate process
+merge_request: 15426
+author:
+type: changed
--- a/config/routes.rb
+++ b/config/routes.rb
@ -42,6 +42,7 @@ Rails.application.routes.draw do
  scope path: '-' do
    get 'liveness' => 'health#liveness'
    get 'readiness' => 'health#readiness'
+    post 'storage_check' => 'health#storage_check'
    resources :metrics, only: [:index]
    mount Peek::Railtie => '/peek'

--- a/db/migrate/20171123094802_add_circuitbreaker_check_interval_to_application_settings.rb
+++ b/db/migrate/20171123094802_add_circuitbreaker_check_interval_to_application_settings.rb
@ -0,0 +1,20 @@
+class AddCircuitbreakerCheckIntervalToApplicationSettings < ActiveRecord::Migration
+  include Gitlab::Database::MigrationHelpers
+
+  # Set this constant to true if this migration requires downtime.
+  DOWNTIME = false
+
+  disable_ddl_transaction!
+
+  def up
+    add_column_with_default :application_settings,
+                            :circuitbreaker_check_interval,
+                            :integer,
+                            default: 1
+  end
+
+  def down
+    remove_column :application_settings,
+                  :circuitbreaker_check_interval
+  end
+end
--- a/db/post_migrate/20171123101020_update_circuitbreaker_defaults.rb
+++ b/db/post_migrate/20171123101020_update_circuitbreaker_defaults.rb
@ -0,0 +1,34 @@
+# See http://doc.gitlab.com/ce/development/migration_style_guide.html
+# for more information on how to write migrations for GitLab.
+
+class UpdateCircuitbreakerDefaults < ActiveRecord::Migration
+  include Gitlab::Database::MigrationHelpers
+
+  DOWNTIME = false
+
+  class ApplicationSetting < ActiveRecord::Base; end
+
+  def up
+    change_column_default :application_settings,
+                          :circuitbreaker_failure_count_threshold,
+                          3
+    change_column_default :application_settings,
+                          :circuitbreaker_storage_timeout,
+                          15
+
+    ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 3,
+                                  circuitbreaker_storage_timeout: 15)
+  end
+
+  def down
+    change_column_default :application_settings,
+                          :circuitbreaker_failure_count_threshold,
+                          160
+    change_column_default :application_settings,
+                          :circuitbreaker_storage_timeout,
+                          30
+
+    ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 160,
+                                  circuitbreaker_storage_timeout: 30)
+  end
+end
--- a/db/post_migrate/20171123101046_remove_old_circuitbreaker_config.rb
+++ b/db/post_migrate/20171123101046_remove_old_circuitbreaker_config.rb
@ -0,0 +1,26 @@
+# See http://doc.gitlab.com/ce/development/migration_style_guide.html
+# for more information on how to write migrations for GitLab.
+
+class RemoveOldCircuitbreakerConfig < ActiveRecord::Migration
+  include Gitlab::Database::MigrationHelpers
+
+  DOWNTIME = false
+
+  def up
+    remove_column :application_settings,
+                  :circuitbreaker_backoff_threshold
+    remove_column :application_settings,
+                  :circuitbreaker_failure_wait_time
+  end
+
+  def down
+    add_column :application_settings,
+               :circuitbreaker_backoff_threshold,
+               :integer,
+               default: 80
+    add_column :application_settings,
+               :circuitbreaker_failure_wait_time,
+               :integer,
+               default: 30
+  end
+end
--- a/db/schema.rb
+++ b/db/schema.rb
@ -135,12 +135,10 @@ ActiveRecord::Schema.define(version: 20171205190711) do
    t.boolean "hashed_storage_enabled", default: false, null: false
    t.boolean "project_export_enabled", default: true, null: false
    t.boolean "auto_devops_enabled", default: false, null: false
-    t.integer "circuitbreaker_failure_count_threshold", default: 160
-    t.integer "circuitbreaker_failure_wait_time", default: 30
+    t.integer "circuitbreaker_failure_count_threshold", default: 3
    t.integer "circuitbreaker_failure_reset_time", default: 1800
-    t.integer "circuitbreaker_storage_timeout", default: 30
+    t.integer "circuitbreaker_storage_timeout", default: 15
    t.integer "circuitbreaker_access_retries", default: 3
-    t.integer "circuitbreaker_backoff_threshold", default: 80
    t.boolean "throttle_unauthenticated_enabled", default: false, null: false
    t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
    t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
@ -150,6 +148,7 @@ ActiveRecord::Schema.define(version: 20171205190711) do
    t.boolean "throttle_authenticated_web_enabled", default: false, null: false
    t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
    t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
+    t.integer "circuitbreaker_check_interval", default: 1, null: false
    t.boolean "password_authentication_enabled_for_web"
    t.boolean "password_authentication_enabled_for_git", default: true
    t.integer "gitaly_timeout_default", default: 55, null: false
--- a/doc/api/settings.md
+++ b/doc/api/settings.md
@ -70,10 +70,9 @@ PUT /application/settings
 | `akismet_api_key`                        | string           | no                                            | API key for akismet spam protection                                                                                                                                                                                                                                                                                                                                                                                                                       |
 | `akismet_enabled`                        | boolean          | no                                            | Enable or disable akismet spam protection                                                                                                                                                                                                                                                                                                                                                                                                                 |
 | `circuitbreaker_access_retries           | integer          | no                                            | The number of attempts GitLab will make to access a storage.                                                                                                                                                                                                                                                                                                                                                                                              |
-| `circuitbreaker_backoff_threshold        | integer          | no                                            | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host.                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| `circuitbreaker_check_interval`          | integer          | no                                            | Number of seconds in between storage checks.                                                                                                                                                                                                                                                                                                                                                                                                              |
 | `circuitbreaker_failure_count_threshold` | integer          | no                                            | The number of failures of after which GitLab will completely prevent access to the storage.                                                                                                                                                                                                                                                                                                                                                               |
 | `circuitbreaker_failure_reset_time`      | integer          | no                                            | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset.                                                                                                                                                                                                                                                                                                                  |
-| `circuitbreaker_failure_wait_time`       | integer          | no                                            | Time in seconds GitLab will block access to a failing storage to allow it to recover.                                                                                                                                                                                                                                                                                                                                                                     |
 | `circuitbreaker_storage_timeout`         | integer          | no                                            | Seconds to wait for a storage access attempt                                                                                                                                                                                                                                                                                                                                                                                                              |
 | `clientside_sentry_dsn`                  | string           | no                                            | Required if `clientside_sentry_dsn` is enabled                                                                                                                                                                                                                                                                                                                                                                                                            |
 | `clientside_sentry_enabled`              | boolean          | no                                            | Enable Sentry error reporting for the client side                                                                                                                                                                                                                                                                                                                                                                                                         |
--- a/lib/api/circuit_breakers.rb
+++ b/lib/api/circuit_breakers.rb
@ -41,7 +41,7 @@ module API
            detail 'This feature was introduced in GitLab 9.5'
          end
          delete do
-            Gitlab::Git::Storage::CircuitBreaker.reset_all!
+            Gitlab::Git::Storage::FailureInfo.reset_all!
          end
        end
      end
--- a/lib/gitlab/git/storage/checker.rb
+++ b/lib/gitlab/git/storage/checker.rb
@ -0,0 +1,98 @@
+module Gitlab
+  module Git
+    module Storage
+      class Checker
+        include CircuitBreakerSettings
+
+        attr_reader :storage_path, :storage, :hostname, :logger
+
+        def self.check_all(logger = Rails.logger)
+          threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
+            Thread.new do
+              Thread.current[:result] = new(storage_name, logger).check_with_lease
+            end
+          end
+
+          threads.map do |thread|
+            thread.join
+            thread[:result]
+          end
+        end
+
+        def initialize(storage, logger = Rails.logger)
+          @storage = storage
+          config = Gitlab.config.repositories.storages[@storage]
+          @storage_path = config['path']
+          @logger = logger
+
+          @hostname = Gitlab::Environment.hostname
+        end
+
+        def check_with_lease
+          lease_key = "storage_check:#{cache_key}"
+          lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
+          result = { storage: storage, success: nil }
+
+          if uuid = lease.try_obtain
+            result[:success] = check
+
+            Gitlab::ExclusiveLease.cancel(lease_key, uuid)
+          else
+            logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
+          end
+
+          result
+        end
+
+        def check
+          if Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
+            track_storage_accessible
+            true
+          else
+            track_storage_inaccessible
+            logger.error("#{hostname}: #{storage}: Not accessible.")
+            false
+          end
+        end
+
+        private
+
+        def track_storage_inaccessible
+          first_failure = current_failure_info.first_failure || Time.now
+          last_failure = Time.now
+
+          Gitlab::Git::Storage.redis.with do |redis|
+            redis.pipelined do
+              redis.hset(cache_key, :first_failure, first_failure.to_i)
+              redis.hset(cache_key, :last_failure, last_failure.to_i)
+              redis.hincrby(cache_key, :failure_count, 1)
+              redis.expire(cache_key, failure_reset_time)
+              maintain_known_keys(redis)
+            end
+          end
+        end
+
+        def track_storage_accessible
+          Gitlab::Git::Storage.redis.with do |redis|
+            redis.pipelined do
+              redis.hset(cache_key, :first_failure, nil)
+              redis.hset(cache_key, :last_failure, nil)
+              redis.hset(cache_key, :failure_count, 0)
+              maintain_known_keys(redis)
+            end
+          end
+        end
+
+        def maintain_known_keys(redis)
+          expire_time = Time.now.to_i + failure_reset_time
+          redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
+          redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
+        end
+
+        def current_failure_info
+          FailureInfo.load(cache_key)
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/git/storage/circuit_breaker.rb
+++ b/lib/gitlab/git/storage/circuit_breaker.rb
@ -4,22 +4,11 @@ module Gitlab
      class CircuitBreaker
        include CircuitBreakerSettings

-        FailureInfo = Struct.new(:last_failure, :failure_count)
-
        attr_reader :storage,
-                    :hostname,
-                    :storage_path
+                    :hostname

-        delegate :last_failure, :failure_count, to: :failure_info
-
-        def self.reset_all!
-          Gitlab::Git::Storage.redis.with do |redis|
-            all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
-            redis.del(*all_storage_keys) unless all_storage_keys.empty?
-          end
-
-          RequestStore.delete(:circuitbreaker_cache)
-        end
+        delegate :last_failure, :failure_count, :no_failures?,
+                 to: :failure_info

        def self.for_storage(storage)
          cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
@ -46,9 +35,6 @@ module Gitlab
        def initialize(storage, hostname)
          @storage = storage
          @hostname = hostname
-
-          config = Gitlab.config.repositories.storages[@storage]
-          @storage_path = config['path']
        end

        def perform
@ -65,15 +51,6 @@ module Gitlab
          failure_count > failure_count_threshold
        end

-        def backing_off?
-          return false if no_failures?
-
-          recent_failure = last_failure > failure_wait_time.seconds.ago
-          too_many_failures = failure_count > backoff_threshold
-
-          recent_failure && too_many_failures
-        end
-
        private

        # The circuitbreaker can be enabled for the entire fleet using a Feature
@ -86,88 +63,13 @@ module Gitlab
        end

        def failure_info
-          @failure_info ||= get_failure_info
-        end
-
-        # Memoizing the `storage_available` call means we only do it once per
-        # request when the storage is available.
-        #
-        # When the storage appears not available, and the memoized value is `false`
-        # we might want to try again.
-        def storage_available?
-          return @storage_available if @storage_available
-
-          if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
-                                    .storage_available?(storage_path, storage_timeout, access_retries)
-            track_storage_accessible
-          else
-            track_storage_inaccessible
-          end
-
-          @storage_available
+          @failure_info ||= FailureInfo.load(cache_key)
        end

        def check_storage_accessible!
          if circuit_broken?
            raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
          end
-
-          if backing_off?
-            raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
-          end
-
-          unless storage_available?
-            raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
-          end
-        end
-
-        def no_failures?
-          last_failure.blank? && failure_count == 0
-        end
-
-        def track_storage_inaccessible
-          @failure_info = FailureInfo.new(Time.now, failure_count + 1)
-
-          Gitlab::Git::Storage.redis.with do |redis|
-            redis.pipelined do
-              redis.hset(cache_key, :last_failure, last_failure.to_i)
-              redis.hincrby(cache_key, :failure_count, 1)
-              redis.expire(cache_key, failure_reset_time)
-              maintain_known_keys(redis)
-            end
-          end
-        end
-
-        def track_storage_accessible
-          @failure_info = FailureInfo.new(nil, 0)
-
-          Gitlab::Git::Storage.redis.with do |redis|
-            redis.pipelined do
-              redis.hset(cache_key, :last_failure, nil)
-              redis.hset(cache_key, :failure_count, 0)
-              maintain_known_keys(redis)
-            end
-          end
-        end
-
-        def maintain_known_keys(redis)
-          expire_time = Time.now.to_i + failure_reset_time
-          redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
-          redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
-        end
-
-        def get_failure_info
-          last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
-            redis.hmget(cache_key, :last_failure, :failure_count)
-          end
-
-          last_failure = Time.at(last_failure.to_i) if last_failure.present?
-
-          FailureInfo.new(last_failure, failure_count.to_i)
-        end
-
-        def cache_key
-          @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
        end
      end
    end
--- a/lib/gitlab/git/storage/circuit_breaker_settings.rb
+++ b/lib/gitlab/git/storage/circuit_breaker_settings.rb
@ -6,10 +6,6 @@ module Gitlab
          application_settings.circuitbreaker_failure_count_threshold
        end

-        def failure_wait_time
-          application_settings.circuitbreaker_failure_wait_time
-        end
-
        def failure_reset_time
          application_settings.circuitbreaker_failure_reset_time
        end
@ -22,8 +18,12 @@ module Gitlab
          application_settings.circuitbreaker_access_retries
        end

-        def backoff_threshold
-          application_settings.circuitbreaker_backoff_threshold
+        def check_interval
+          application_settings.circuitbreaker_check_interval
+        end
+
+        def cache_key
+          @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
        end

        private
--- a/lib/gitlab/git/storage/failure_info.rb
+++ b/lib/gitlab/git/storage/failure_info.rb
@ -0,0 +1,39 @@
+module Gitlab
+  module Git
+    module Storage
+      class FailureInfo
+        attr_accessor :first_failure, :last_failure, :failure_count
+
+        def self.reset_all!
+          Gitlab::Git::Storage.redis.with do |redis|
+            all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
+            redis.del(*all_storage_keys) unless all_storage_keys.empty?
+          end
+
+          RequestStore.delete(:circuitbreaker_cache)
+        end
+
+        def self.load(cache_key)
+          first_failure, last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
+            redis.hmget(cache_key, :first_failure, :last_failure, :failure_count)
+          end
+
+          last_failure = Time.at(last_failure.to_i) if last_failure.present?
+          first_failure = Time.at(first_failure.to_i) if first_failure.present?
+
+          new(first_failure, last_failure, failure_count.to_i)
+        end
+
+        def initialize(first_failure, last_failure, failure_count)
+          @first_failure = first_failure
+          @last_failure = last_failure
+          @failure_count = failure_count
+        end
+
+        def no_failures?
+          first_failure.blank? && last_failure.blank? && failure_count == 0
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/git/storage/null_circuit_breaker.rb
+++ b/lib/gitlab/git/storage/null_circuit_breaker.rb
@ -11,6 +11,9 @@ module Gitlab
        # These will always have nil values
        attr_reader :storage_path

+        delegate :last_failure, :failure_count, :no_failures?,
+                 to: :failure_info
+
        def initialize(storage, hostname, error: nil)
          @storage = storage
          @hostname = hostname
@ -29,16 +32,17 @@ module Gitlab
          false
        end

-        def last_failure
-          circuit_broken? ? Time.now : nil
-        end
-
-        def failure_count
-          circuit_broken? ? failure_count_threshold : 0
-        end
-
        def failure_info
-          Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(last_failure, failure_count)
+          @failure_info ||=
+            if circuit_broken?
+              Gitlab::Git::Storage::FailureInfo.new(Time.now,
+                                                    Time.now,
+                                                    failure_count_threshold)
+            else
+              Gitlab::Git::Storage::FailureInfo.new(nil,
+                                                    nil,
+                                                    0)
+            end
        end
      end
    end
--- a/lib/gitlab/storage_check.rb
+++ b/lib/gitlab/storage_check.rb
@ -0,0 +1,11 @@
+require_relative 'storage_check/cli'
+require_relative 'storage_check/gitlab_caller'
+require_relative 'storage_check/option_parser'
+require_relative 'storage_check/response'
+
+module Gitlab
+  module StorageCheck
+    ENDPOINT = '/-/storage_check'.freeze
+    Options = Struct.new(:target, :token, :interval, :dryrun)
+  end
+end
--- a/lib/gitlab/storage_check/cli.rb
+++ b/lib/gitlab/storage_check/cli.rb
@ -0,0 +1,69 @@
+module Gitlab
+  module StorageCheck
+    class CLI
+      def self.start!(args)
+        runner = new(Gitlab::StorageCheck::OptionParser.parse!(args))
+        runner.start_loop
+      end
+
+      attr_reader :logger, :options
+
+      def initialize(options)
+        @options = options
+        @logger = Logger.new(STDOUT)
+      end
+
+      def start_loop
+        logger.info "Checking #{options.target} every #{options.interval} seconds"
+
+        if options.dryrun
+          logger.info "Dryrun, exiting..."
+          return
+        end
+
+        begin
+          loop do
+            response = GitlabCaller.new(options).call!
+            log_response(response)
+            update_settings(response)
+
+            sleep options.interval
+          end
+        rescue Interrupt
+          logger.info "Ending storage-check"
+        end
+      end
+
+      def update_settings(response)
+        previous_interval = options.interval
+
+        if response.valid?
+          options.interval = response.check_interval || previous_interval
+        end
+
+        if previous_interval != options.interval
+          logger.info "Interval changed: #{options.interval} seconds"
+        end
+      end
+
+      def log_response(response)
+        unless response.valid?
+          return logger.error("Invalid response checking nfs storage: #{response.http_response.inspect}")
+        end
+
+        if response.responsive_shards.any?
+          logger.debug("Responsive shards: #{response.responsive_shards.join(', ')}")
+        end
+
+        warnings = []
+        if response.skipped_shards.any?
+          warnings << "Skipped shards: #{response.skipped_shards.join(', ')}"
+        end
+        if response.failing_shards.any?
+          warnings << "Failing shards: #{response.failing_shards.join(', ')}"
+        end
+        logger.warn(warnings.join(' - ')) if warnings.any?
+      end
+    end
+  end
+end
--- a/lib/gitlab/storage_check/gitlab_caller.rb
+++ b/lib/gitlab/storage_check/gitlab_caller.rb
@ -0,0 +1,39 @@
+require 'excon'
+
+module Gitlab
+  module StorageCheck
+    class GitlabCaller
+      def initialize(options)
+        @options = options
+      end
+
+      def call!
+        Gitlab::StorageCheck::Response.new(get_response)
+      rescue Errno::ECONNREFUSED, Excon::Error
+        # Server not ready, treated as invalid response.
+        Gitlab::StorageCheck::Response.new(nil)
+      end
+
+      def get_response
+        scheme, *other_parts = URI.split(@options.target)
+        socket_path = if  scheme == 'unix'
+                        other_parts.compact.join
+                      end
+
+        connection = Excon.new(@options.target, socket: socket_path)
+        connection.post(path: Gitlab::StorageCheck::ENDPOINT,
+                        headers: headers)
+      end
+
+      def headers
+        @headers ||= begin
+                       headers = {}
+                       headers['Content-Type'] = headers['Accept'] = 'application/json'
+                       headers['TOKEN'] = @options.token if @options.token
+
+                       headers
+                     end
+      end
+    end
+  end
+end
--- a/lib/gitlab/storage_check/option_parser.rb
+++ b/lib/gitlab/storage_check/option_parser.rb
@ -0,0 +1,39 @@
+module Gitlab
+  module StorageCheck
+    class OptionParser
+      def self.parse!(args)
+        # Start out with some defaults
+        options = Gitlab::StorageCheck::Options.new(nil, nil, 1, false)
+
+        parser = ::OptionParser.new do |opts|
+          opts.banner = "Usage: bin/storage_check [options]"
+
+          opts.on('-t=string', '--target string', 'URL or socket to trigger storage check') do |value|
+            options.target = value
+          end
+
+          opts.on('-T=string', '--token string', 'Health token to use') { |value| options.token = value }
+
+          opts.on('-i=n', '--interval n', ::OptionParser::DecimalInteger, 'Seconds between checks') do |value|
+            options.interval = value
+          end
+
+          opts.on('-d', '--dryrun', "Output what will be performed, but don't start the process") do |value|
+            options.dryrun = value
+          end
+        end
+        parser.parse!(args)
+
+        unless options.target
+          raise ::OptionParser::InvalidArgument.new('Provide a URI to provide checks')
+        end
+
+        if URI.parse(options.target).scheme.nil?
+          raise ::OptionParser::InvalidArgument.new('Add the scheme to the target, `unix://`, `https://` or `http://` are supported')
+        end
+
+        options
+      end
+    end
+  end
+end
--- a/lib/gitlab/storage_check/response.rb
+++ b/lib/gitlab/storage_check/response.rb
@ -0,0 +1,77 @@
+require 'json'
+
+module Gitlab
+  module StorageCheck
+    class Response
+      attr_reader :http_response
+
+      def initialize(http_response)
+        @http_response = http_response
+      end
+
+      def valid?
+        @http_response && (200...299).cover?(@http_response.status) &&
+          @http_response.headers['Content-Type'].include?('application/json') &&
+          parsed_response
+      end
+
+      def check_interval
+        return nil unless parsed_response
+
+        parsed_response['check_interval']
+      end
+
+      def responsive_shards
+        divided_results[:responsive_shards]
+      end
+
+      def skipped_shards
+        divided_results[:skipped_shards]
+      end
+
+      def failing_shards
+        divided_results[:failing_shards]
+      end
+
+      private
+
+      def results
+        return [] unless parsed_response
+
+        parsed_response['results']
+      end
+
+      def divided_results
+        return @divided_results if @divided_results
+
+        @divided_results = {}
+        @divided_results[:responsive_shards] = []
+        @divided_results[:skipped_shards] = []
+        @divided_results[:failing_shards] = []
+
+        results.each do |info|
+          name = info['storage']
+
+          case info['success']
+          when true
+            @divided_results[:responsive_shards] << name
+          when false
+            @divided_results[:failing_shards] << name
+          else
+            @divided_results[:skipped_shards] << name
+          end
+        end
+
+        @divided_results
+      end
+
+      def parsed_response
+        return @parsed_response if defined?(@parsed_response)
+
+        @parsed_response = JSON.parse(@http_response.body)
+      rescue JSON::JSONError
+        @parsed_response = nil
+      end
+    end
+  end
+end
--- a/spec/bin/storage_check_spec.rb
+++ b/spec/bin/storage_check_spec.rb
@ -0,0 +1,13 @@
+require 'spec_helper'
+
+describe 'bin/storage_check' do
+  it 'is executable' do
+    command = %w[bin/storage_check -t unix://the/path/to/a/unix-socket.sock -i 10 -d]
+    expected_output = 'Checking unix://the/path/to/a/unix-socket.sock every 10 seconds'
+
+    output, status = Gitlab::Popen.popen(command, Rails.root.to_s)
+
+    expect(status).to eq(0)
+    expect(output).to include(expected_output)
+  end
+end
--- a/spec/controllers/admin/health_check_controller_spec.rb
+++ b/spec/controllers/admin/health_check_controller_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Admin::HealthCheckController, broken_storage: true do
+describe Admin::HealthCheckController do
  let(:admin) { create(:admin) }

  before do
@ -17,7 +17,7 @@ describe Admin::HealthCheckController, broken_storage: true do

  describe 'POST reset_storage_health' do
    it 'resets all storage health information' do
-      expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
+      expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)

      post :reset_storage_health
    end
--- a/spec/controllers/health_controller_spec.rb
+++ b/spec/controllers/health_controller_spec.rb
@ -14,6 +14,48 @@ describe HealthController do
    stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
  end

+  describe '#storage_check' do
+    before do
+      allow(Gitlab::RequestContext).to receive(:client_ip).and_return(whitelisted_ip)
+    end
+
+    subject { post :storage_check }
+
+    it 'checks all the configured storages' do
+      expect(Gitlab::Git::Storage::Checker).to receive(:check_all).and_call_original
+
+      subject
+    end
+
+    it 'returns the check interval' do
+      stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'true')
+      stub_application_setting(circuitbreaker_check_interval: 10)
+
+      subject
+
+      expect(json_response['check_interval']).to eq(10)
+    end
+
+    context 'with failing storages', :broken_storage do
+      before do
+        stub_storage_settings(
+          broken: { path: 'tmp/tests/non-existent-repositories' }
+        )
+      end
+
+      it 'includes the failure information' do
+        subject
+
+        expected_results = [
+          { 'storage' => 'broken', 'success' => false },
+          { 'storage' => 'default', 'success' => true }
+        ]
+
+        expect(json_response['results']).to eq(expected_results)
+      end
+    end
+  end
+
  describe '#readiness' do
    shared_context 'endpoint responding with readiness data' do
      let(:request_params) { {} }
--- a/spec/features/admin/admin_health_check_spec.rb
+++ b/spec/features/admin/admin_health_check_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-feature "Admin Health Check", :feature, :broken_storage do
+feature "Admin Health Check", :feature do
  include StubENV

  before do
@ -36,6 +36,7 @@ feature "Admin Health Check", :feature, :broken_storage do

  context 'when services are up' do
    before do
+      stub_storage_settings({}) # Hide the broken storage
      visit admin_health_check_path
    end

@ -56,10 +57,8 @@ feature "Admin Health Check", :feature, :broken_storage do
    end
  end

-  context 'with repository storage failures' do
+  context 'with repository storage failures', :broken_storage do
    before do
-      # Track a failure
-      Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil
      visit admin_health_check_path
    end

@ -67,9 +66,10 @@ feature "Admin Health Check", :feature, :broken_storage do
      hostname = Gitlab::Environment.hostname
      maximum_failures = Gitlab::CurrentSettings.current_application_settings
                           .circuitbreaker_failure_count_threshold
+      number_of_failures = maximum_failures + 1

-      expect(page).to have_content('broken: failed storage access attempt on host:')
-      expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.")
+      expect(page).to have_content("broken: #{number_of_failures} failed storage access attempts:")
+      expect(page).to have_content("#{hostname}: #{number_of_failures} of #{maximum_failures} failures.")
    end

    it 'allows resetting storage failures' do
--- a/spec/lib/gitlab/git/storage/checker_spec.rb
+++ b/spec/lib/gitlab/git/storage/checker_spec.rb
@ -0,0 +1,132 @@
+require 'spec_helper'
+
+describe Gitlab::Git::Storage::Checker, :clean_gitlab_redis_shared_state do
+  let(:storage_name) { 'default' }
+  let(:hostname) { Gitlab::Environment.hostname }
+  let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
+
+  subject(:checker) { described_class.new(storage_name) }
+
+  def value_from_redis(name)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmget(cache_key, name)
+    end.first
+  end
+
+  def set_in_redis(name, value)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmset(cache_key, name, value)
+    end.first
+  end
+
+  describe '.check_all' do
+    it 'calls a check for each storage' do
+      fake_checker_default = double
+      fake_checker_broken = double
+      fake_logger = fake_logger
+
+      expect(described_class).to receive(:new).with('default', fake_logger) { fake_checker_default }
+      expect(described_class).to receive(:new).with('broken', fake_logger) { fake_checker_broken }
+      expect(fake_checker_default).to receive(:check_with_lease)
+      expect(fake_checker_broken).to receive(:check_with_lease)
+
+      described_class.check_all(fake_logger)
+    end
+
+    context 'with broken storage', :broken_storage do
+      it 'returns the results' do
+        expected_result = [
+          { storage: 'default', success: true },
+          { storage: 'broken', success: false }
+        ]
+
+        expect(described_class.check_all).to eq(expected_result)
+      end
+    end
+  end
+
+  describe '#initialize' do
+    it 'assigns the settings' do
+      expect(checker.hostname).to eq(hostname)
+      expect(checker.storage).to eq('default')
+      expect(checker.storage_path).to eq(TestEnv.repos_path)
+    end
+  end
+
+  describe '#check_with_lease' do
+    it 'only allows one check at a time' do
+      expect(checker).to receive(:check).once { sleep 1 }
+
+      thread = Thread.new { checker.check_with_lease }
+      checker.check_with_lease
+      thread.join
+    end
+
+    it 'returns a result hash' do
+      expect(checker.check_with_lease).to eq(storage: 'default', success: true)
+    end
+  end
+
+  describe '#check' do
+    it 'tracks that the storage was accessible' do
+      set_in_redis(:failure_count, 10)
+      set_in_redis(:last_failure, Time.now.to_f)
+
+      checker.check
+
+      expect(value_from_redis(:failure_count).to_i).to eq(0)
+      expect(value_from_redis(:last_failure)).to be_empty
+      expect(value_from_redis(:first_failure)).to be_empty
+    end
+
+    it 'calls the check with the correct arguments' do
+      stub_application_setting(circuitbreaker_storage_timeout: 30,
+                               circuitbreaker_access_retries: 3)
+
+      expect(Gitlab::Git::Storage::ForkedStorageCheck)
+        .to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
+              .and_call_original
+
+      checker.check
+    end
+
+    it 'returns `true`' do
+      expect(checker.check).to eq(true)
+    end
+
+    it 'maintains known storage keys' do
+      Timecop.freeze do
+        # Insert an old key to expire
+        old_entry = Time.now.to_i - 3.days.to_i
+        Gitlab::Git::Storage.redis.with do |redis|
+          redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed')
+        end
+
+        checker.check
+
+        known_keys = Gitlab::Git::Storage.redis.with do |redis|
+          redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
+        end
+
+        expect(known_keys).to contain_exactly(cache_key)
+      end
+    end
+
+    context 'the storage is not available', :broken_storage do
+      let(:storage_name) { 'broken' }
+
+      it 'tracks that the storage was inaccessible' do
+        Timecop.freeze do
+          expect { checker.check }.to change { value_from_redis(:failure_count).to_i }.by(1)
+
+          expect(value_from_redis(:last_failure)).not_to be_empty
+          expect(value_from_redis(:first_failure)).not_to be_empty
+        end
+      end
+
+      it 'returns `false`' do
+        expect(checker.check).to eq(false)
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
+++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
@ -1,11 +1,18 @@
 require 'spec_helper'

-describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do
+describe Gitlab::Git::Storage::CircuitBreaker, :broken_storage do
  let(:storage_name) { 'default' }
  let(:circuit_breaker) { described_class.new(storage_name, hostname) }
  let(:hostname) { Gitlab::Environment.hostname }
  let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }

+  def set_in_redis(name, value)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
+      redis.hmset(cache_key, name, value)
+    end.first
+  end
+
  before do
    # Override test-settings for the circuitbreaker with something more realistic
    # for these specs.
@ -19,36 +26,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
                         )
  end

-  def value_from_redis(name)
-    Gitlab::Git::Storage.redis.with do |redis|
-      redis.hmget(cache_key, name)
-    end.first
-  end
-
-  def set_in_redis(name, value)
-    Gitlab::Git::Storage.redis.with do |redis|
-      redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
-      redis.hmset(cache_key, name, value)
-    end.first
-  end
-
-  describe '.reset_all!' do
-    it 'clears all entries form redis' do
-      set_in_redis(:failure_count, 10)
-
-      described_class.reset_all!
-
-      key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
-
-      expect(key_exists).to be_falsey
-    end
-
-    it 'does not break when there are no keys in redis' do
-      expect { described_class.reset_all! }.not_to raise_error
-    end
-  end
-
-  describe '.for_storage' do
+  describe '.for_storage', :request_store do
    it 'only builds a single circuitbreaker per storage' do
      expect(described_class).to receive(:new).once.and_call_original

@ -71,7 +49,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
    it 'assigns the settings' do
      expect(circuit_breaker.hostname).to eq(hostname)
      expect(circuit_breaker.storage).to eq('default')
-      expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
    end
  end

@ -91,9 +68,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
      end
    end

-    describe '#failure_wait_time' do
+    describe '#check_interval' do
      it 'reads the value from settings' do
-        expect(circuit_breaker.failure_wait_time).to eq(1)
+        expect(circuit_breaker.check_interval).to eq(1)
      end
    end

@ -114,12 +91,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
        expect(circuit_breaker.access_retries).to eq(4)
      end
    end
-
-    describe '#backoff_threshold' do
-      it 'reads the value from settings' do
-        expect(circuit_breaker.backoff_threshold).to eq(5)
-      end
-    end
  end

  describe '#perform' do
@ -134,19 +105,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
      end
    end

-    it 'raises the correct exception when backing off' do
-      Timecop.freeze do
-        set_in_redis(:last_failure, 1.second.ago.to_f)
-        set_in_redis(:failure_count, 90)
-
-        expect { |b| circuit_breaker.perform(&b) }
-          .to raise_error do |exception|
-          expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
-          expect(exception.retry_after).to eq(30)
-        end
-      end
-    end
-
    it 'yields the block' do
      expect { |b| circuit_breaker.perform(&b) }
        .to yield_control
@ -170,54 +128,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
        .to raise_error(Rugged::OSError)
    end

-    it 'tracks that the storage was accessible' do
-      set_in_redis(:failure_count, 10)
-      set_in_redis(:last_failure, Time.now.to_f)
-
-      circuit_breaker.perform { '' }
-
-      expect(value_from_redis(:failure_count).to_i).to eq(0)
-      expect(value_from_redis(:last_failure)).to be_empty
-      expect(circuit_breaker.failure_count).to eq(0)
-      expect(circuit_breaker.last_failure).to be_nil
-    end
-
-    it 'maintains known storage keys' do
-      Timecop.freeze do
-        # Insert an old key to expire
-        old_entry = Time.now.to_i - 3.days.to_i
-        Gitlab::Git::Storage.redis.with do |redis|
-          redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed')
-        end
-
-        circuit_breaker.perform { '' }
-
-        known_keys = Gitlab::Git::Storage.redis.with do |redis|
-          redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
-        end
-
-        expect(known_keys).to contain_exactly(cache_key)
-      end
-    end
-
-    it 'only performs the accessibility check once' do
-      expect(Gitlab::Git::Storage::ForkedStorageCheck)
-        .to receive(:storage_available?).once.and_call_original
-
-      2.times { circuit_breaker.perform { '' } }
-    end
-
-    it 'calls the check with the correct arguments' do
-      stub_application_setting(circuitbreaker_storage_timeout: 30,
-                               circuitbreaker_access_retries: 3)
-
-      expect(Gitlab::Git::Storage::ForkedStorageCheck)
-        .to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
-              .and_call_original
-
-      circuit_breaker.perform { '' }
-    end
-
    context 'with the feature disabled' do
      before do
        stub_feature_flags(git_storage_circuit_breaker: false)
@ -240,31 +150,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
        expect(result).to eq('hello')
      end
    end
-
-    context 'the storage is not available' do
-      let(:storage_name) { 'broken' }
-
-      it 'raises the correct exception' do
-        expect(circuit_breaker).to receive(:track_storage_inaccessible)
-
-        expect { circuit_breaker.perform { '' } }
-          .to raise_error do |exception|
-          expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
-          expect(exception.retry_after).to eq(30)
-        end
-      end
-
-      it 'tracks that the storage was inaccessible' do
-        Timecop.freeze do
-          expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
-
-          expect(value_from_redis(:failure_count).to_i).to eq(1)
-          expect(value_from_redis(:last_failure)).not_to be_empty
-          expect(circuit_breaker.failure_count).to eq(1)
-          expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
-        end
-      end
-    end
  end

  describe '#circuit_broken?' do
@ -283,32 +168,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
    end
  end

-  describe '#backing_off?' do
-    it 'is true when there was a recent failure' do
-      Timecop.freeze do
-        set_in_redis(:last_failure, 1.second.ago.to_f)
-        set_in_redis(:failure_count, 90)
-
-        expect(circuit_breaker.backing_off?).to be_truthy
-      end
-    end
-
-    context 'the `failure_wait_time` is set to 0' do
-      before do
-        stub_application_setting(circuitbreaker_failure_wait_time: 0)
-      end
-
-      it 'is working even when there are failures' do
-        Timecop.freeze do
-          set_in_redis(:last_failure, 0.seconds.ago.to_f)
-          set_in_redis(:failure_count, 90)
-
-          expect(circuit_breaker.backing_off?).to be_falsey
-        end
-      end
-    end
-  end
-
  describe '#last_failure' do
    it 'returns the last failure time' do
      time = Time.parse("2017-05-26 17:52:30")
--- a/spec/lib/gitlab/git/storage/failure_info_spec.rb
+++ b/spec/lib/gitlab/git/storage/failure_info_spec.rb
@ -0,0 +1,70 @@
+require 'spec_helper'
+
+describe Gitlab::Git::Storage::FailureInfo, :broken_storage do
+  let(:storage_name) { 'default' }
+  let(:hostname) { Gitlab::Environment.hostname }
+  let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
+
+  def value_from_redis(name)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmget(cache_key, name)
+    end.first
+  end
+
+  def set_in_redis(name, value)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
+      redis.hmset(cache_key, name, value)
+    end.first
+  end
+
+  describe '.reset_all!' do
+    it 'clears all entries form redis' do
+      set_in_redis(:failure_count, 10)
+
+      described_class.reset_all!
+
+      key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
+
+      expect(key_exists).to be_falsey
+    end
+
+    it 'does not break when there are no keys in redis' do
+      expect { described_class.reset_all! }.not_to raise_error
+    end
+  end
+
+  describe '.load' do
+    it 'loads failure information for a storage on a host' do
+      first_failure = Time.parse("2017-11-14 17:52:30")
+      last_failure = Time.parse("2017-11-14 18:54:37")
+      failure_count = 11
+
+      set_in_redis(:first_failure, first_failure.to_i)
+      set_in_redis(:last_failure, last_failure.to_i)
+      set_in_redis(:failure_count, failure_count.to_i)
+
+      info = described_class.load(cache_key)
+
+      expect(info.first_failure).to eq(first_failure)
+      expect(info.last_failure).to eq(last_failure)
+      expect(info.failure_count).to eq(failure_count)
+    end
+  end
+
+  describe '#no_failures?' do
+    it 'is true when there are no failures' do
+      info = described_class.new(nil, nil, 0)
+
+      expect(info.no_failures?).to be_truthy
+    end
+
+    it 'is false when there are failures' do
+      info = described_class.new(Time.parse("2017-11-14 17:52:30"),
+                                 Time.parse("2017-11-14 18:54:37"),
+                                 20)
+
+      expect(info.no_failures?).to be_falsy
+    end
+  end
+end
--- a/spec/lib/gitlab/git/storage/health_spec.rb
+++ b/spec/lib/gitlab/git/storage/health_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do
+describe Gitlab::Git::Storage::Health, broken_storage: true do
  let(:host1_key) { 'storage_accessible:broken:web01' }
  let(:host2_key) { 'storage_accessible:default:kiq01' }

--- a/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb
+++ b/spec/lib/gitlab/git/storage/null_circuit_breaker_spec.rb
@ -27,7 +27,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
    end

    describe '#failure_info' do
-      it { Timecop.freeze { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(Time.now, breaker.failure_count_threshold)) } }
+      it { expect(breaker.failure_info.no_failures?).to be_falsy }
    end
  end

@ -49,7 +49,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
    end

    describe '#failure_info' do
-      it { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(nil, 0)) }
+      it { expect(breaker.failure_info.no_failures?).to be_truthy }
    end
  end

--- a/spec/lib/gitlab/storage_check/cli_spec.rb
+++ b/spec/lib/gitlab/storage_check/cli_spec.rb
@ -0,0 +1,19 @@
+require 'spec_helper'
+
+describe Gitlab::StorageCheck::CLI do
+  let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, 1, false) }
+  subject(:runner) { described_class.new(options) }
+
+  describe '#update_settings' do
+    it 'updates the interval when changed in a valid response and logs the change' do
+      fake_response = double
+      expect(fake_response).to receive(:valid?).and_return(true)
+      expect(fake_response).to receive(:check_interval).and_return(42)
+      expect(runner.logger).to receive(:info)
+
+      runner.update_settings(fake_response)
+
+      expect(options.interval).to eq(42)
+    end
+  end
+end
--- a/spec/lib/gitlab/storage_check/gitlab_caller_spec.rb
+++ b/spec/lib/gitlab/storage_check/gitlab_caller_spec.rb
@ -0,0 +1,46 @@
+require 'spec_helper'
+
+describe Gitlab::StorageCheck::GitlabCaller do
+  let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, nil, false) }
+  subject(:gitlab_caller) { described_class.new(options) }
+
+  describe '#call!' do
+    context 'when a socket is given' do
+      it 'calls a socket' do
+        fake_connection = double
+        expect(fake_connection).to receive(:post)
+        expect(Excon).to receive(:new).with('unix://tmp/socket.sock', socket: "tmp/socket.sock") { fake_connection }
+
+        gitlab_caller.call!
+      end
+    end
+
+    context 'when a host is given' do
+      let(:options) { Gitlab::StorageCheck::Options.new('http://localhost:8080', nil, nil, false) }
+
+      it 'it calls a http response' do
+        fake_connection = double
+        expect(Excon).to receive(:new).with('http://localhost:8080', socket: nil) { fake_connection }
+        expect(fake_connection).to receive(:post)
+
+        gitlab_caller.call!
+      end
+    end
+  end
+
+  describe '#headers' do
+    it 'Adds the JSON header' do
+      headers = gitlab_caller.headers
+
+      expect(headers['Content-Type']).to eq('application/json')
+    end
+
+    context 'when a token was provided' do
+      let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', 'atoken', nil, false) }
+
+      it 'adds it to the headers' do
+        expect(gitlab_caller.headers['TOKEN']).to eq('atoken')
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/storage_check/option_parser_spec.rb
+++ b/spec/lib/gitlab/storage_check/option_parser_spec.rb
@ -0,0 +1,31 @@
+require 'spec_helper'
+
+describe Gitlab::StorageCheck::OptionParser do
+  describe '.parse!' do
+    it 'assigns all options' do
+      args = %w(--target unix://tmp/hello/world.sock --token thetoken --interval 42)
+
+      options = described_class.parse!(args)
+
+      expect(options.token).to eq('thetoken')
+      expect(options.interval).to eq(42)
+      expect(options.target).to eq('unix://tmp/hello/world.sock')
+    end
+
+    it 'requires the interval to be a number' do
+      args = %w(--target unix://tmp/hello/world.sock --interval fortytwo)
+
+      expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
+    end
+
+    it 'raises an error if the scheme is not included' do
+      args = %w(--target tmp/hello/world.sock)
+
+      expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
+    end
+
+    it 'raises an error if both socket and host are missing' do
+      expect { described_class.parse!([]) }.to raise_error(OptionParser::InvalidArgument)
+    end
+  end
+end
--- a/spec/lib/gitlab/storage_check/response_spec.rb
+++ b/spec/lib/gitlab/storage_check/response_spec.rb
@ -0,0 +1,54 @@
+require 'spec_helper'
+
+describe Gitlab::StorageCheck::Response do
+  let(:fake_json) do
+    {
+      check_interval: 42,
+      results: [
+        { storage: 'working', success: true },
+        { storage: 'skipped', success: nil },
+        { storage: 'failing', success: false }
+      ]
+    }.to_json
+  end
+
+  let(:fake_http_response) do
+    fake_response = instance_double("Excon::Response - Status check")
+    allow(fake_response).to receive(:status).and_return(200)
+    allow(fake_response).to receive(:body).and_return(fake_json)
+    allow(fake_response).to receive(:headers).and_return('Content-Type' => 'application/json')
+
+    fake_response
+  end
+  let(:response) { described_class.new(fake_http_response) }
+
+  describe '#valid?' do
+    it 'is valid for a success response with parseable JSON' do
+      expect(response).to be_valid
+    end
+  end
+
+  describe '#check_interval' do
+    it 'returns the result from the JSON' do
+      expect(response.check_interval).to eq(42)
+    end
+  end
+
+  describe '#responsive_shards' do
+    it 'contains the names of working shards' do
+      expect(response.responsive_shards).to contain_exactly('working')
+    end
+  end
+
+  describe '#skipped_shards' do
+    it 'contains the names of skipped shards' do
+      expect(response.skipped_shards).to contain_exactly('skipped')
+    end
+  end
+
+  describe '#failing_shards' do
+    it 'contains the name of failing shards' do
+      expect(response.failing_shards).to contain_exactly('failing')
+    end
+  end
+end
--- a/spec/models/application_setting_spec.rb
+++ b/spec/models/application_setting_spec.rb
@ -115,9 +115,8 @@ describe ApplicationSetting do
    end

    context 'circuitbreaker settings' do
-      [:circuitbreaker_backoff_threshold,
-       :circuitbreaker_failure_count_threshold,
-       :circuitbreaker_failure_wait_time,
+      [:circuitbreaker_failure_count_threshold,
+       :circuitbreaker_check_interval,
       :circuitbreaker_failure_reset_time,
       :circuitbreaker_storage_timeout].each do |field|
        it "Validates #{field} as number" do
@ -126,16 +125,6 @@ describe ApplicationSetting do
                           .is_greater_than_or_equal_to(0)
        end
      end
-
-      it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do
-        setting.circuitbreaker_failure_count_threshold = 10
-        setting.circuitbreaker_backoff_threshold = 15
-        failure_message = "The circuitbreaker backoff threshold should be lower "\
-                          "than the failure count threshold"
-
-        expect(setting).not_to be_valid
-        expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message)
-      end
    end

    context 'repository storages' do
--- a/spec/models/repository_spec.rb
+++ b/spec/models/repository_spec.rb
@ -29,7 +29,9 @@ describe Repository do
  def expect_to_raise_storage_error
    expect { yield }.to raise_error do |exception|
      storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable]
-      expect(exception.class).to be_in(storage_exceptions)
+      known_exception = storage_exceptions.select { |e| exception.is_a?(e) }
+
+      expect(known_exception).not_to be_nil
    end
  end

@ -634,9 +636,7 @@ describe Repository do
  end

  describe '#fetch_ref' do
-    # Setting the var here, sidesteps the stub that makes gitaly raise an error
-    # before the actual test call
-    set(:broken_repository) { create(:project, :broken_storage).repository }
+    let(:broken_repository) { create(:project, :broken_storage).repository }

    describe 'when storage is broken', :broken_storage  do
      it 'should raise a storage error' do
--- a/spec/requests/api/circuit_breakers_spec.rb
+++ b/spec/requests/api/circuit_breakers_spec.rb
@ -47,7 +47,7 @@ describe API::CircuitBreakers do

  describe 'DELETE circuit_breakers/repository_storage' do
    it 'clears all circuit_breakers' do
-      expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
+      expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)

      delete api('/circuit_breakers/repository_storage', admin)

--- a/spec/requests/api/settings_spec.rb
+++ b/spec/requests/api/settings_spec.rb
@ -54,7 +54,7 @@ describe API::Settings, 'Settings' do
          dsa_key_restriction: 2048,
          ecdsa_key_restriction: 384,
          ed25519_key_restriction: 256,
-          circuitbreaker_failure_wait_time: 2
+          circuitbreaker_check_interval: 2

        expect(response).to have_gitlab_http_status(200)
        expect(json_response['default_projects_limit']).to eq(3)
@ -75,7 +75,7 @@ describe API::Settings, 'Settings' do
        expect(json_response['dsa_key_restriction']).to eq(2048)
        expect(json_response['ecdsa_key_restriction']).to eq(384)
        expect(json_response['ed25519_key_restriction']).to eq(256)
-        expect(json_response['circuitbreaker_failure_wait_time']).to eq(2)
+        expect(json_response['circuitbreaker_check_interval']).to eq(2)
      end
    end

--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@ -121,18 +121,6 @@ RSpec.configure do |config|
    reset_delivered_emails!
  end

-  # Stub the `ForkedStorageCheck.storage_available?` method unless
-  # `:broken_storage` metadata is defined
-  #
-  # This check can be slow and is unnecessary in a test environment where we
-  # know the storage is available, because we create it at runtime
-  config.before(:example) do |example|
-    unless example.metadata[:broken_storage]
-      allow(Gitlab::Git::Storage::ForkedStorageCheck)
-        .to receive(:storage_available?).and_return(true)
-    end
-  end
-
  config.around(:each, :use_clean_rails_memory_store_caching) do |example|
    caching_store = Rails.cache
    Rails.cache = ActiveSupport::Cache::MemoryStore.new
--- a/spec/support/stored_repositories.rb
+++ b/spec/support/stored_repositories.rb
@ -12,6 +12,25 @@ RSpec.configure do |config|
      raise GRPC::Unavailable.new('Gitaly broken in this spec')
    end

-    Gitlab::Git::Storage::CircuitBreaker.reset_all!
+    # Track the maximum number of failures
+    first_failure = Time.parse("2017-11-14 17:52:30")
+    last_failure = Time.parse("2017-11-14 18:54:37")
+    failure_count = Gitlab::CurrentSettings
+                      .current_application_settings
+                      .circuitbreaker_failure_count_threshold + 1
+    cache_key = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}broken:#{Gitlab::Environment.hostname}"
+
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.pipelined do
+        redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
+        redis.hset(cache_key, :first_failure, first_failure.to_i)
+        redis.hset(cache_key, :last_failure, last_failure.to_i)
+        redis.hset(cache_key, :failure_count, failure_count.to_i)
+      end
+    end
+  end
+
+  config.after(:each, :broken_storage) do
+    Gitlab::Git::Storage.redis.with(&:flushall)
  end
 end
--- a/spec/support/stub_configuration.rb
+++ b/spec/support/stub_configuration.rb
@ -43,6 +43,8 @@ module StubConfiguration
  end

  def stub_storage_settings(messages)
+    messages.deep_stringify_keys!
+
    # Default storage is always required
    messages['default'] ||= Gitlab.config.repositories.storages.default
    messages.each do |storage_name, storage_settings|