Add a Circuitbreaker for storage paths

2017-05-17 18:17:15 +02:00 · 2017-05-17 18:17:15 +02:00 · 3598e60bf2
commit 3598e60bf2
parent 990feb9f2b
40 changed files with 1421 additions and 37 deletions
--- a/app/controllers/admin/health_check_controller.rb
+++ b/app/controllers/admin/health_check_controller.rb
@ -1,5 +1,12 @@
 class Admin::HealthCheckController < Admin::ApplicationController
  def show
    @errors = HealthCheck::Utils.process_checks(['standard'])
+    @failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages
+  end
+
+  def reset_storage_health
+    Gitlab::Git::Storage::CircuitBreaker.reset_all!
+    redirect_to admin_health_check_path,
+                notice: _('Git storage health information has been reset')
  end
 end
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@ -52,6 +52,15 @@ class ApplicationController < ActionController::Base
    head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window
  end

+  rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable do |exception|
+    Raven.capture_exception(exception) if sentry_enabled?
+    log_exception(exception)
+
+    headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after)
+
+    render_503
+  end
+
  def redirect_back_or_default(default: root_path, options: {})
    redirect_to request.referer.present? ? :back : default, options
  end
@ -152,6 +161,19 @@ class ApplicationController < ActionController::Base
    head :unprocessable_entity
  end

+  def render_503
+    respond_to do |format|
+      format.html do
+        render(
+          file: Rails.root.join("public", "503"),
+          layout: false,
+          status: :service_unavailable
+        )
+      end
+      format.any { head :service_unavailable }
+    end
+  end
+
  def no_cache_headers
    response.headers["Cache-Control"] = "no-cache, no-store, max-age=0, must-revalidate"
    response.headers["Pragma"] = "no-cache"
--- a/app/helpers/storage_health_helper.rb
+++ b/app/helpers/storage_health_helper.rb
@ -0,0 +1,37 @@
+module StorageHealthHelper
+  def failing_storage_health_message(storage_health)
+    storage_name = content_tag(:strong, h(storage_health.storage_name))
+    host_names = h(storage_health.failing_on_hosts.to_sentence)
+    translation_params = { storage_name: storage_name,
+                           host_names: host_names,
+                           failed_attempts: storage_health.total_failures }
+
+    translation = n_('%{storage_name}: failed storage access attempt on host:',
+                     '%{storage_name}: %{failed_attempts} failed storage access attempts:',
+                     storage_health.total_failures) % translation_params
+
+    translation.html_safe
+  end
+
+  def message_for_circuit_breaker(circuit_breaker)
+    maximum_failures = circuit_breaker.failure_count_threshold
+    current_failures = circuit_breaker.failure_count
+    permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
+
+    translation_params = { number_of_failures: current_failures,
+                           maximum_failures: maximum_failures,
+                           number_of_seconds: circuit_breaker.failure_wait_time }
+
+    if permanently_broken
+      s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
+         "retry automatically. Reset storage information when the problem is "\
+         "resolved.") % translation_params
+    elsif circuit_breaker.circuit_broken?
+      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
+        "block access for %{number_of_seconds} seconds.") % translation_params
+    else
+      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
+        "allow access on the next attempt.") % translation_params
+    end
+  end
+end
--- a/app/models/repository.rb
+++ b/app/models/repository.rb
@ -133,12 +133,13 @@ class Repository
    ref ||= root_ref

    args = %W(
-      #{Gitlab.config.git.bin_path} log #{ref} --pretty=%H --skip #{offset}
+      log #{ref} --pretty=%H --skip #{offset}
      --max-count #{limit} --grep=#{query} --regexp-ignore-case
    )
    args = args.concat(%W(-- #{path})) if path.present?

-    git_log_results = Gitlab::Popen.popen(args, path_to_repo).first.lines
+    git_log_results = run_git(args).first.lines
+
    git_log_results.map { |c| commit(c.chomp) }.compact
  end

@ -622,8 +623,8 @@ class Repository
    key = path.blank? ? "last_commit_id_for_path:#{sha}" : "last_commit_id_for_path:#{sha}:#{Digest::SHA1.hexdigest(path)}"

    cache.fetch(key) do
-      args = %W(#{Gitlab.config.git.bin_path} rev-list --max-count=1 #{sha} -- #{path})
-      Gitlab::Popen.popen(args, path_to_repo).first.strip
+      args = %W(rev-list --max-count=1 #{sha} -- #{path})
+      run_git(args).first.strip
    end
  end

@ -678,8 +679,8 @@ class Repository
  end

  def refs_contains_sha(ref_type, sha)
-    args = %W(#{Gitlab.config.git.bin_path} #{ref_type} --contains #{sha})
-    names = Gitlab::Popen.popen(args, path_to_repo).first
+    args = %W(#{ref_type} --contains #{sha})
+    names = run_git(args).first

    if names.respond_to?(:split)
      names = names.split("\n").map(&:strip)
@ -957,15 +958,17 @@ class Repository
    return [] if empty_repo? || query.blank?

    offset = 2
-    args = %W(#{Gitlab.config.git.bin_path} grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref})
-    Gitlab::Popen.popen(args, path_to_repo).first.scrub.split(/^--$/)
+    args = %W(grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref})
+
+    run_git(args).first.scrub.split(/^--$/)
  end

  def search_files_by_name(query, ref)
    return [] if empty_repo? || query.blank?

-    args = %W(#{Gitlab.config.git.bin_path} ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)})
-    Gitlab::Popen.popen(args, path_to_repo).first.lines.map(&:strip)
+    args = %W(ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)})
+
+    run_git(args).first.lines.map(&:strip)
  end

  def with_repo_branch_commit(start_repository, start_branch_name)
@ -1010,8 +1013,8 @@ class Repository
  end

  def fetch_ref(source_path, source_ref, target_ref)
-    args = %W(#{Gitlab.config.git.bin_path} fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref})
-    Gitlab::Popen.popen(args, path_to_repo)
+    args = %W(fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref})
+    run_git(args)
  end

  def create_ref(ref, ref_path)
@ -1092,6 +1095,12 @@ class Repository

  private

+  def run_git(args)
+    circuit_breaker.perform do
+      Gitlab::Popen.popen([Gitlab.config.git.bin_path, *args], path_to_repo)
+    end
+  end
+
  def blob_data_at(sha, path)
    blob = blob_at(sha, path)
    return unless blob
@ -1101,7 +1110,9 @@ class Repository
  end

  def refs_directory_exists?
-    File.exist?(File.join(path_to_repo, 'refs'))
+    circuit_breaker.perform do
+      File.exist?(File.join(path_to_repo, 'refs'))
+    end
  end

  def cache
@ -1145,4 +1156,8 @@ class Repository
  def initialize_raw_repository
    Gitlab::Git::Repository.new(project.repository_storage, disk_path + '.git')
  end
+
+  def circuit_breaker
+    @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(project.repository_storage)
+  end
 end
--- a/app/views/admin/health_check/_failing_storages.html.haml
+++ b/app/views/admin/health_check/_failing_storages.html.haml
@ -0,0 +1,15 @@
+- if failing_storages.any?
+  = _('There are problems accessing Git storage: ')
+  %ul
+    - failing_storages.each do |storage_health|
+      %li
+        = failing_storage_health_message(storage_health)
+        %ul
+          - storage_health.failing_circuit_breakers.each do |circuit_breaker|
+            %li
+              #{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)}
+
+  = _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.")
+  .prepend-top-10
+    = button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path,
+      method: :post, class: 'btn btn-default'
--- a/app/views/admin/health_check/show.html.haml
+++ b/app/views/admin/health_check/show.html.haml
@ -1,22 +1,22 @@
 - @no_container = true
- page_title "Health Check"
+- page_title _('Health Check')
+- no_errors = @errors.blank? && @failing_storage_statuses.blank?
 = render 'admin/monitoring/head'

 %div{ class: container_class }
-  %h3.page-title
-    Health Check
+  %h3.page-title= page_title
  .bs-callout.clearfix
    .pull-left
      %p
-    Access token is
+    #{ s_('HealthCheck|Access token is') }
    %code#health-check-token= current_application_settings.health_check_access_token
    .prepend-top-10
-      = button_to "Reset health check access token", reset_health_check_token_admin_application_settings_path,
+      = button_to _("Reset health check access token"), reset_health_check_token_admin_application_settings_path,
        method: :put, class: 'btn btn-default',
-        data: { confirm: 'Are you sure you want to reset the health check token?' }
+        data: { confirm: _('Are you sure you want to reset the health check token?') }
  %p.light
-    Health information can be retrieved from the following endpoints. More information is available
-    = link_to 'here', help_page_path('user/admin_area/monitoring/health_check')
+    #{ _('Health information can be retrieved from the following endpoints. More information is available') }
+    = link_to s_('More information is available|here'), help_page_path('user/admin_area/monitoring/health_check')
    %ul
      %li
        %code= readiness_url(token: current_application_settings.health_check_access_token)
@ -29,14 +29,15 @@
  .panel.panel-default
    .panel-heading
      Current Status:
-      - if @errors.blank?
+      - if no_errors
        = icon('circle', class: 'cgreen')
-        Healthy
+        #{ s_('HealthCheck|Healthy') }
      - else
        = icon('warning', class: 'cred')
-        Unhealthy
+        #{ s_('HealthCheck|Unhealthy') }
    .panel-body
-      - if @errors.blank?
-        No Health Problems Detected
+      - if no_errors
+        #{ s_('HealthCheck|No Health Problems Detected') }
      - else
        = @errors
+        = render partial: 'failing_storages', object: @failing_storage_statuses
--- a/changelogs/unreleased/bvl-nfs-circuitbreaker.yml
+++ b/changelogs/unreleased/bvl-nfs-circuitbreaker.yml
@ -0,0 +1,4 @@
+---
+title: Block access to failing repository storage
+merge_request: 11449
+author:
--- a/config/gitlab.yml.example
+++ b/config/gitlab.yml.example
@ -506,6 +506,11 @@ production: &base
        path: /home/git/repositories/
        gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port)
        # gitaly_token: 'special token' # Optional: override global gitaly.token for this storage.
+        failure_count_threshold: 10 # number of failures before stopping attempts
+        failure_wait_time: 30 # Seconds after an access failure before allowing access again
+        failure_reset_time: 1800 # Time in seconds to expire failures
+        storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
+

  ## Backup settings
  backup:
@ -638,6 +643,10 @@ test:
      default:
        path: tmp/tests/repositories/
        gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
+      broken:
+        path: tmp/tests/non-existent-repositories
+        gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
+
  gitaly:
    enabled: true
    token: secret
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@ -222,6 +222,7 @@ Settings.gitlab['default_branch_protection'] ||= 2
 Settings.gitlab['default_can_create_group'] = true if Settings.gitlab['default_can_create_group'].nil?
 Settings.gitlab['host']       ||= ENV['GITLAB_HOST'] || 'localhost'
 Settings.gitlab['ssh_host']   ||= Settings.gitlab.host
+Settings.gitlab['hostname']   ||= ENV['HOSTNAME'] || Socket.gethostname
 Settings.gitlab['https']        = false if Settings.gitlab['https'].nil?
 Settings.gitlab['port']       ||= ENV['GITLAB_PORT'] || (Settings.gitlab.https ? 443 : 80)
 Settings.gitlab['relative_url_root'] ||= ENV['RAILS_RELATIVE_URL_ROOT'] || ''
@ -433,6 +434,17 @@ end
 Settings.repositories.storages.values.each do |storage|
  # Expand relative paths
  storage['path'] = Settings.absolute(storage['path'])
+  # Set failure defaults
+  storage['failure_count_threshold'] ||= 10
+  storage['failure_wait_time'] ||= 30
+  storage['failure_reset_time'] ||= 1800
+  storage['storage_timeout'] ||= 5
+  # Set turn strings into numbers
+  storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i
+  storage['failure_wait_time'] = storage['failure_wait_time'].to_i
+  storage['failure_reset_time'] = storage['failure_reset_time'].to_i
+  # We might want to have a timeout shorter than 1 second.
+  storage['storage_timeout'] = storage['storage_timeout'].to_f
 end

 #
--- a/config/initializers/6_validations.rb
+++ b/config/initializers/6_validations.rb
@ -7,6 +7,13 @@ def find_parent_path(name, path)
  Gitlab.config.repositories.storages.detect do |n, rs|
    name != n && Pathname.new(rs['path']).realpath == parent
  end
+rescue Errno::EIO, Errno::ENOENT => e
+  warning = "WARNING: couldn't verify #{path} (#{name}). "\
+            "If this is an external storage, it might be offline."
+  message = "#{warning}\n#{e.message}"
+  Rails.logger.error("#{message}\n\t" + e.backtrace.join("\n\t"))
+
+  nil
 end

 def storage_validation_error(message)
@ -29,6 +36,15 @@ def validate_storages_config
    if !repository_storage.is_a?(Hash) || repository_storage['path'].nil?
      storage_validation_error("#{name} is not a valid storage, because it has no `path` key. Refer to gitlab.yml.example for an updated example")
    end
+
+    %w(failure_count_threshold failure_wait_time failure_reset_time storage_timeout).each do |setting|
+      # Falling back to the defaults is fine!
+      next if repository_storage[setting].nil?
+
+      unless repository_storage[setting].to_f > 0
+        storage_validation_error("#{setting}, for storage `#{name}` needs to be greater than 0")
+      end
+    end
  end
 end

--- a/config/routes/admin.rb
+++ b/config/routes/admin.rb
@ -67,7 +67,9 @@ namespace :admin do
  end

  resource :logs, only: [:show]
-  resource :health_check, controller: 'health_check', only: [:show]
+  resource :health_check, controller: 'health_check', only: [:show] do
+    post :reset_storage_health
+  end
  resource :background_jobs, controller: 'background_jobs', only: [:show]
  resource :system_info, controller: 'system_info', only: [:show]
  resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ }
--- a/doc/administration/img/failing_storage.png
+++ b/doc/administration/img/failing_storage.png
--- a/doc/administration/repository_storage_paths.md
+++ b/doc/administration/repository_storage_paths.md
@ -60,7 +60,7 @@ respectively.
          path: /mnt/cephfs/repositories
    ```

-1. [Restart GitLab] for the changes to take effect.
+1. [Restart GitLab][restart-gitlab] for the changes to take effect.

 >**Note:**
 The [`gitlab_shell: repos_path` entry][repospath] in `gitlab.yml` will be
@ -97,9 +97,80 @@ be stored via the **Application Settings** in the Admin area.
 Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be
 randomly placed on one of the selected paths.

+## Handling failing repository storage
+
+> [Introduced][ce-11449] in GitLab 9.5.
+
+When GitLab detects access to the repositories storage fails repeatedly, it can
+gracefully prevent attempts to access the storage. This might be useful when
+the repositories are stored somewhere on the network.
+
+The configuration could look as follows:
+
+**For Omnibus installations**
+
+1. Edit `/etc/gitlab/gitlab.rb`:
+
+    ```ruby
+    git_data_dirs({
+      "default" => {
+        "path" => "/mnt/nfs-01/git-data",
+        "failure_count_threshold" => 10,
+        "failure_wait_time" => 30,
+        "failure_reset_time" => 1800,
+        "storage_timeout" => 5
+       }
+    })
+    ```
+
+1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect.
+
+---
+
+**For installations from source**
+
+1. Edit `config/gitlab.yml`:
+
+    ```yaml
+    repositories:
+      storages: # You must have at least a `default` storage path.
+        default:
+          path: /home/git/repositories/
+          failure_count_threshold: 10 # number of failures before stopping attempts
+          failure_wait_time: 30 # Seconds after last access failure before trying again
+          failure_reset_time: 1800 # Time in seconds to expire failures
+          storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
+    ```
+
+1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect.
+
+
+**`failure_count_threshold`:** The number of failures of after which GitLab will
+completely prevent access to the storage. The number of failures can be reset in
+the admin interface: `https://gitlab.example.com/admin/health_check` or using the
+[api](../api/repository_storage_health.md) to allow access to the storage again.
+
+**`failure_wait_time`:** When access to a storage fails. GitLab will prevent
+access to the storage for the time specified here. This allows the filesystem to
+recover without.
+
+**`failure_reset_time`:** The time in seconds GitLab will keep failure
+information. When no failures occur during this time, information about the
+mount is reset.
+
+**`storage_timeout`:** The time in seconds GitLab will try to access storage.
+After this time a timeout error will be raised.
+
+When storage failures occur, this will be visible in the admin interface like this:
+
+![failing storage](img/failing_storage.png)
+
+To allow access to all storages, click the `Reset git storage health information` button.
+
 [ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578
-[restart gitlab]: restart_gitlab.md#installations-from-source
-[reconfigure gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
+[restart-gitlab]: restart_gitlab.md#installations-from-source
+[reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
 [backups]: ../raketasks/backup_restore.md
 [raketask]: https://gitlab.com/gitlab-org/gitlab-ce/blob/033e5423a2594e08a7ebcd2379bd2331f4c39032/lib/backup/repository.rb#L54-56
 [repospath]: https://gitlab.com/gitlab-org/gitlab-ce/blob/8-9-stable/config/gitlab.yml.example#L457
+[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
--- a/doc/api/repository_storage_health.md
+++ b/doc/api/repository_storage_health.md
@ -0,0 +1,74 @@
+# Circuitbreaker API
+
+> [Introduced][ce-11449] in GitLab 9.5.
+
+The Circuitbreaker API is only accessible to administrators. All requests by
+guests will respond with `401 Unauthorized`, and all requests by normal users
+will respond with `403 Forbidden`.
+
+## Repository Storages
+
+### Get all storage information
+
+Returns of all currently configured storages and their health information.
+
+```
+GET /circuit_breakers/repository_storage
+```
+
+```bash
+curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
+```
+
+```json
+[
+  {
+    "storage_name": "default",
+    "failing_on_hosts": [],
+    "total_failures": 0
+  },
+  {
+    "storage_name": "broken",
+    "failing_on_hosts": [
+      "web01", "worker01"
+    ],
+    "total_failures": 1
+  }
+]
+```
+
+### Get failing storages
+
+This returns a list of all currently failing storages.
+
+```
+GET /circuit_breakers/repository_storage/failing
+```
+
+```bash
+curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing
+```
+
+```json
+[
+    {
+        "storage_name":"broken",
+        "failing_on_hosts":["web01", "worker01"],
+        "total_failures":2
+    }
+]
+```
+
+## Reset failing storage information
+
+Use this remove all failing storage information and allow access to the storage again.
+
+```
+DELETE /circuit_breakers/repository_storage
+```
+
+```bash
+curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
+```
+
+[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
--- a/lib/api/api.rb
+++ b/lib/api/api.rb
@ -95,6 +95,7 @@ module API
    mount ::API::Boards
    mount ::API::Branches
    mount ::API::BroadcastMessages
+    mount ::API::CircuitBreakers
    mount ::API::Commits
    mount ::API::CommitStatuses
    mount ::API::DeployKeys
--- a/lib/api/circuit_breakers.rb
+++ b/lib/api/circuit_breakers.rb
@ -0,0 +1,50 @@
+module API
+  class CircuitBreakers < Grape::API
+    before { authenticated_as_admin! }
+
+    resource :circuit_breakers do
+      params do
+        requires :type,
+                 type: String,
+                 desc: "The type of circuitbreaker",
+                 values: ['repository_storage']
+      end
+      resource ':type' do
+        namespace '', requirements: { type: 'repository_storage' } do
+          helpers do
+            def failing_storage_health
+              @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
+            end
+
+            def storage_health
+              @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
+            end
+          end
+
+          desc 'Get all failing git storages' do
+            detail 'This feature was introduced in GitLab 9.5'
+            success Entities::RepositoryStorageHealth
+          end
+          get do
+            present storage_health, with: Entities::RepositoryStorageHealth
+          end
+
+          desc 'Get all failing git storages' do
+            detail 'This feature was introduced in GitLab 9.5'
+            success Entities::RepositoryStorageHealth
+          end
+          get 'failing' do
+            present failing_storage_health, with: Entities::RepositoryStorageHealth
+          end
+
+          desc 'Reset all storage failures and open circuitbreaker' do
+            detail 'This feature was introduced in GitLab 9.5'
+          end
+          delete do
+            Gitlab::Git::Storage::CircuitBreaker.reset_all!
+          end
+        end
+      end
+    end
+  end
+end
--- a/lib/api/entities.rb
+++ b/lib/api/entities.rb
@ -954,5 +954,11 @@ module API
      expose :ip_address
      expose :submitted, as: :akismet_submitted
    end
+
+    class RepositoryStorageHealth < Grape::Entity
+      expose :storage_name
+      expose :failing_on_hosts
+      expose :total_failures
+    end
  end
 end
--- a/lib/gitlab/git/repository.rb
+++ b/lib/gitlab/git/repository.rb
@ -64,11 +64,17 @@ module Gitlab
      end

      def rugged
-        @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories)
+        @rugged ||= circuit_breaker.perform do
+          Rugged::Repository.new(path, alternates: alternate_object_directories)
+        end
      rescue Rugged::RepositoryError, Rugged::OSError
        raise NoRepository.new('no repository for such path')
      end

+      def circuit_breaker
+        @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
+      end
+
      # Returns an Array of branch names
      # sorted by name ASC
      def branch_names
--- a/lib/gitlab/git/storage.rb
+++ b/lib/gitlab/git/storage.rb
@ -0,0 +1,22 @@
+module Gitlab
+  module Git
+    module Storage
+      class Inaccessible < StandardError
+        attr_reader :retry_after
+
+        def initialize(message = nil, retry_after = nil)
+          super(message)
+          @retry_after = retry_after
+        end
+      end
+
+      CircuitOpen = Class.new(Inaccessible)
+
+      REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
+
+      def self.redis
+        Gitlab::Redis::SharedState
+      end
+    end
+  end
+end
--- a/lib/gitlab/git/storage/circuit_breaker.rb
+++ b/lib/gitlab/git/storage/circuit_breaker.rb
@ -0,0 +1,142 @@
+module Gitlab
+  module Git
+    module Storage
+      class CircuitBreaker
+        attr_reader :storage,
+                    :hostname,
+                    :storage_path,
+                    :failure_count_threshold,
+                    :failure_wait_time,
+                    :failure_reset_time,
+                    :storage_timeout
+
+        def self.reset_all!
+          pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*"
+
+          Gitlab::Git::Storage.redis.with do |redis|
+            all_storage_keys = redis.scan_each(match: pattern).to_a
+            redis.del(*all_storage_keys) unless all_storage_keys.empty?
+          end
+
+          RequestStore.delete(:circuitbreaker_cache)
+        end
+
+        def self.for_storage(storage)
+          cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
+            Hash.new do |hash, storage_name|
+              hash[storage_name] = new(storage_name)
+            end
+          end
+
+          cached_circuitbreakers[storage]
+        end
+
+        def initialize(storage, hostname = Gitlab.config.gitlab.hostname)
+          @storage = storage
+          @hostname = hostname
+
+          config = Gitlab.config.repositories.storages[@storage]
+          @storage_path = config['path']
+          @failure_count_threshold = config['failure_count_threshold']
+          @failure_wait_time = config['failure_wait_time']
+          @failure_reset_time = config['failure_reset_time']
+          @storage_timeout = config['storage_timeout']
+        end
+
+        def perform
+          return yield unless Feature.enabled?('git_storage_circuit_breaker')
+
+          if circuit_broken?
+            raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} open", failure_wait_time)
+          end
+
+          check_storage_accessible!
+
+          yield
+        end
+
+        def circuit_broken?
+          return false if no_failures?
+
+          recent_failure = last_failure > failure_wait_time.seconds.ago
+          too_many_failures = failure_count > failure_count_threshold
+
+          recent_failure || too_many_failures
+        end
+
+        # Memoizing the `storage_available` call means we only do it once per
+        # request when the storage is available.
+        #
+        # When the storage appears not available, and the memoized value is `false`
+        # we might want to try again.
+        def storage_available?
+          @storage_available ||= Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout)
+        end
+
+        def check_storage_accessible!
+          if storage_available?
+            track_storage_accessible
+          else
+            track_storage_inaccessible
+            raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
+          end
+        end
+
+        def no_failures?
+          last_failure.blank? && failure_count == 0
+        end
+
+        def track_storage_inaccessible
+          @failure_info = [Time.now, failure_count + 1]
+
+          Gitlab::Git::Storage.redis.with do |redis|
+            redis.pipelined do
+              redis.hset(cache_key, :last_failure, last_failure.to_i)
+              redis.hincrby(cache_key, :failure_count, 1)
+              redis.expire(cache_key, failure_reset_time)
+            end
+          end
+        end
+
+        def track_storage_accessible
+          return if no_failures?
+
+          @failure_info = [nil, 0]
+
+          Gitlab::Git::Storage.redis.with do |redis|
+            redis.pipelined do
+              redis.hset(cache_key, :last_failure, nil)
+              redis.hset(cache_key, :failure_count, 0)
+            end
+          end
+        end
+
+        def last_failure
+          failure_info.first
+        end
+
+        def failure_count
+          failure_info.last
+        end
+
+        def failure_info
+          @failure_info ||= get_failure_info
+        end
+
+        def get_failure_info
+          last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
+            redis.hmget(cache_key, :last_failure, :failure_count)
+          end
+
+          last_failure = Time.at(last_failure.to_i) if last_failure.present?
+
+          [last_failure, failure_count.to_i]
+        end
+
+        def cache_key
+          @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/git/storage/forked_storage_check.rb
+++ b/lib/gitlab/git/storage/forked_storage_check.rb
@ -0,0 +1,59 @@
+module Gitlab
+  module Git
+    module Storage
+      module ForkedStorageCheck
+        extend self
+
+        def storage_available?(path, timeout_seconds = 5)
+          status = timeout_check(path, timeout_seconds)
+
+          status.success?
+        end
+
+        def timeout_check(path, timeout_seconds)
+          filesystem_check_pid = check_filesystem_in_fork(path)
+
+          deadline = timeout_seconds.seconds.from_now.utc
+          wait_time = 0.01
+          status = nil
+
+          while status.nil?
+            if deadline > Time.now.utc
+              sleep(wait_time)
+              _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG)
+            else
+              Process.kill('KILL', filesystem_check_pid)
+              # Blocking wait, so we are sure the process is gone before continuing
+              _pid, status = Process.wait2(filesystem_check_pid)
+            end
+          end
+
+          status
+        end
+
+        # This call forks out into a process, that process will then be replaced
+        # With an `exec` call, since we fork out into a shell, we can create a
+        # child process without needing an ActiveRecord-connection.
+        #
+        # Inside the shell, we use `& wait` to fork another child. We do this
+        # to prevent leaving a zombie process when the parent gets killed by the
+        # timeout.
+        #
+        # https://stackoverflow.com/questions/27892975/what-causes-activerecord-breaking-postgres-connection-after-forking
+        # https://stackoverflow.com/questions/22012943/activerecordstatementinvalid-runtimeerror-the-connection-cannot-be-reused-in
+        def check_filesystem_in_fork(path)
+          fork do
+            STDOUT.reopen('/dev/null')
+            STDERR.reopen('/dev/null')
+
+            exec("(#{test_script(path)}) & wait %1")
+          end
+        end
+
+        def test_script(path)
+          "testpath=\"$(realpath #{Shellwords.escape(path)})\" && stat $testpath"
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/git/storage/health.rb
+++ b/lib/gitlab/git/storage/health.rb
@ -0,0 +1,101 @@
+module Gitlab
+  module Git
+    module Storage
+      class Health
+        attr_reader :storage_name, :info
+
+        def self.pattern_for_storage(storage_name)
+          "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*"
+        end
+
+        def self.for_all_storages
+          storage_names = Gitlab.config.repositories.storages.keys
+          results_per_storage = nil
+
+          Gitlab::Git::Storage.redis.with do |redis|
+            keys_per_storage = all_keys_for_storages(storage_names, redis)
+
+            # We need to make sure all keys are actually loaded as an array.
+            # Otherwise when using the enumerator of the `scan_each` within a
+            # second pipeline, it will be assumed unloaded, wich would make the
+            # result unusable inside the pipeline.
+            loaded_keys_per_storage = keys_per_storage.inject({}) do |loaded_keys, (storage_name, keys)|
+              loaded_keys[storage_name] = keys.to_a
+              loaded_keys
+            end
+
+            results_per_storage = load_for_keys(loaded_keys_per_storage, redis)
+          end
+
+          results_per_storage.map do |name, info|
+            info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
+            new(name, info)
+          end
+        end
+
+        def self.all_keys_for_storages(storage_names, redis)
+          keys_per_storage = nil
+
+          redis.pipelined do
+            keys_per_storage = storage_names.inject({}) do |result, storage_name|
+              key = pattern_for_storage(storage_name)
+
+              result.merge(storage_name => redis.scan_each(match: key))
+            end
+          end
+
+          keys_per_storage
+        end
+
+        def self.load_for_keys(keys_per_storage, redis)
+          info_for_keys = nil
+
+          redis.pipelined do
+            info_for_keys = keys_per_storage.inject({}) do |result, (storage_name, keys)|
+              info_for_storage = keys.map do |key|
+                { name: key, failure_count: redis.hget(key, :failure_count) }
+              end
+
+              result.merge(storage_name => info_for_storage)
+            end
+          end
+
+          info_for_keys
+        end
+
+        def self.for_failing_storages
+          for_all_storages.select(&:failing?)
+        end
+
+        def initialize(storage_name, info)
+          @storage_name = storage_name
+          @info = info
+        end
+
+        def failing_info
+          @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 }
+        end
+
+        def failing?
+          failing_info.any?
+        end
+
+        def failing_on_hosts
+          @failing_on_hosts ||= failing_info.map do |info_for_host|
+            info_for_host[:name].split(':').last
+          end
+        end
+
+        def failing_circuit_breakers
+          @failing_circuit_breakers ||= failing_on_hosts.map do |hostname|
+            CircuitBreaker.new(storage_name, hostname)
+          end
+        end
+
+        def total_failures
+          @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] }
+        end
+      end
+    end
+  end
+end
--- a/lib/gitlab/health_checks/fs_shards_check.rb
+++ b/lib/gitlab/health_checks/fs_shards_check.rb
@ -10,7 +10,9 @@ module Gitlab
        def readiness
          repository_storages.map do |storage_name|
            begin
-              if !storage_stat_test(storage_name)
+              if !storage_circuitbreaker_test(storage_name)
+                HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
+              elsif !storage_stat_test(storage_name)
                HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
              else
                with_temp_file(storage_name) do |tmp_file_path|
@ -36,7 +38,8 @@ module Gitlab
            [
              storage_stat_metrics(storage_name),
              storage_write_metrics(storage_name),
-              storage_read_metrics(storage_name)
+              storage_read_metrics(storage_name),
+              storage_circuitbreaker_metrics(storage_name)
            ].flatten
          end
        end
@ -121,6 +124,12 @@ module Gitlab
          file_contents == RANDOM_STRING
        end

+        def storage_circuitbreaker_test(storage_name)
+          Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" }
+        rescue Gitlab::Git::Storage::Inaccessible
+          nil
+        end
+
        def storage_stat_metrics(storage_name)
          operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
            with_timing { storage_stat_test(storage_name) }
@ -143,6 +152,14 @@ module Gitlab
            end
          end
        end
+
+        def storage_circuitbreaker_metrics(storage_name)
+          operation_metrics(:filesystem_circuitbreaker,
+                            :filesystem_circuitbreaker_latency_seconds,
+                            shard: storage_name) do
+            with_timing { storage_circuitbreaker_test(storage_name) }
+          end
+        end
      end
    end
  end
--- a/spec/controllers/admin/health_check_controller_spec.rb
+++ b/spec/controllers/admin/health_check_controller_spec.rb
@ -0,0 +1,25 @@
+require 'spec_helper'
+
+describe Admin::HealthCheckController, broken_storage: true do
+  let(:admin) { create(:admin) }
+
+  before do
+    sign_in(admin)
+  end
+
+  describe 'GET show' do
+    it 'loads the git storage health information' do
+      get :show
+
+      expect(assigns[:failing_storage_statuses]).not_to be_nil
+    end
+  end
+
+  describe 'POST reset_storage_health' do
+    it 'resets all storage health information' do
+      expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
+
+      post :reset_storage_health
+    end
+  end
+end
--- a/spec/controllers/application_controller_spec.rb
+++ b/spec/controllers/application_controller_spec.rb
@ -108,6 +108,30 @@ describe ApplicationController do
    end
  end

+  describe 'rescue from Gitlab::Git::Storage::Inaccessible' do
+    controller(described_class) do
+      def index
+        raise Gitlab::Git::Storage::Inaccessible.new('broken', 100)
+      end
+    end
+
+    it 'renders a 503 when storage is not available' do
+      sign_in(create(:user))
+
+      get :index
+
+      expect(response.status).to eq(503)
+    end
+
+    it 'renders includes a Retry-After header' do
+      sign_in(create(:user))
+
+      get :index
+
+      expect(response.headers['Retry-After']).to eq(100)
+    end
+  end
+
  describe 'response format' do
    controller(described_class) do
      def index
--- a/spec/controllers/projects_controller_spec.rb
+++ b/spec/controllers/projects_controller_spec.rb
@ -107,6 +107,20 @@ describe ProjectsController do
      end
    end

+    context 'when the storage is not available', broken_storage: true do
+      let(:project) { create(:project, :broken_storage) }
+      before do
+        project.add_developer(user)
+        sign_in(user)
+      end
+
+      it 'renders a 503' do
+        get :show, namespace_id: project.namespace, id: project
+
+        expect(response).to have_http_status(503)
+      end
+    end
+
    context "project with empty repo" do
      let(:empty_project) { create(:project_empty_repo, :public) }

--- a/spec/factories/projects.rb
+++ b/spec/factories/projects.rb
@ -54,6 +54,12 @@ FactoryGirl.define do
      avatar { File.open(Rails.root.join('spec/fixtures/dk.png')) }
    end

+    trait :broken_storage do
+      after(:create) do |project|
+        project.update_column(:repository_storage, 'broken')
+      end
+    end
+
    # Test repository - https://gitlab.com/gitlab-org/gitlab-test
    trait :repository do
      path { 'gitlabhq' }
--- a/spec/features/admin/admin_health_check_spec.rb
+++ b/spec/features/admin/admin_health_check_spec.rb
@ -1,6 +1,6 @@
 require 'spec_helper'

-feature "Admin Health Check" do
+feature "Admin Health Check", feature: true, broken_storage: true do
  include StubENV

  before do
@ -55,4 +55,26 @@ feature "Admin Health Check" do
      expect(page).to have_content('The server is on fire')
    end
  end
+
+  context 'with repository storage failures' do
+    before do
+      # Track a failure
+      Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil
+      visit admin_health_check_path
+    end
+
+    it 'shows storage failure information' do
+      hostname = Gitlab.config.gitlab.hostname
+
+      expect(page).to have_content('broken: failed storage access attempt on host:')
+      expect(page).to have_content("#{hostname}: 1 of 10 failures.")
+    end
+
+    it 'allows resetting storage failures' do
+      click_button 'Reset git storage health information'
+
+      expect(page).to have_content('Git storage health information has been reset')
+      expect(page).not_to have_content('failed storage access attempt')
+    end
+  end
 end
--- a/spec/helpers/storage_health_helper_spec.rb
+++ b/spec/helpers/storage_health_helper_spec.rb
@ -0,0 +1,20 @@
+require 'spec_helper'
+
+describe StorageHealthHelper do
+  describe '#failing_storage_health_message' do
+    let(:health) do
+      Gitlab::Git::Storage::Health.new(
+        "<script>alert('storage name');)</script>",
+        []
+      )
+    end
+
+    it 'escapes storage names' do
+      escaped_storage_name = '&lt;script&gt;alert(&#39;storage name&#39;);)&lt;/script&gt;'
+
+      result = helper.failing_storage_health_message(health)
+
+      expect(result).to include(escaped_storage_name)
+    end
+  end
+end
--- a/spec/initializers/6_validations_spec.rb
+++ b/spec/initializers/6_validations_spec.rb
@ -23,6 +23,16 @@ describe '6_validations' do
      end
    end

+    context 'when one of the settings is incorrect' do
+      before do
+        mock_storages('foo' => { 'path' => 'tmp/tests/paths/a/b/c', 'failure_count_threshold' => 'not a number' })
+      end
+
+      it 'throws an error' do
+        expect { validate_storages_config }.to raise_error(/failure_count_threshold/)
+      end
+    end
+
    context 'with invalid storage names' do
      before do
        mock_storages('name with spaces' => { 'path' => 'tmp/tests/paths/a/b/c' })
@ -84,6 +94,17 @@ describe '6_validations' do
        expect { validate_storages_paths }.not_to raise_error
      end
    end
+
+    describe 'inaccessible storage' do
+      before do
+        mock_storages('foo' => { 'path' => 'tmp/tests/a/path/that/does/not/exist' })
+      end
+
+      it 'passes through with a warning' do
+        expect(Rails.logger).to receive(:error)
+        expect { validate_storages_paths }.not_to raise_error
+      end
+    end
  end

  def mock_storages(storages)
--- a/spec/initializers/settings_spec.rb
+++ b/spec/initializers/settings_spec.rb
@ -2,6 +2,17 @@ require 'spec_helper'
 require_relative '../../config/initializers/1_settings'

 describe Settings do
+  describe '#repositories' do
+    it 'assigns the default failure attributes' do
+      repository_settings = Gitlab.config.repositories.storages['broken']
+
+      expect(repository_settings['failure_count_threshold']).to eq(10)
+      expect(repository_settings['failure_wait_time']).to eq(30)
+      expect(repository_settings['failure_reset_time']).to eq(1800)
+      expect(repository_settings['storage_timeout']).to eq(5)
+    end
+  end
+
  describe '#host_without_www' do
    context 'URL with protocol' do
      it 'returns the host' do
--- a/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb
+++ b/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb
@ -48,8 +48,9 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
          described_class.load_in_batch_for_projects([project_without_status])
        end

-        it 'only connects to redis_cache twice' do
-          # Once to load, once to store in the cache
+        it 'only connects to redis twice' do
+          # Stub circuitbreaker so it doesn't count the redis connections in there
+          stub_circuit_breaker(project_without_status)
          expect(Gitlab::Redis::Cache).to receive(:with).exactly(2).and_call_original

          described_class.load_in_batch_for_projects([project_without_status])
@ -301,4 +302,13 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do
      end
    end
  end
+
+  def stub_circuit_breaker(project)
+    fake_circuitbreaker = double
+    allow(fake_circuitbreaker).to receive(:perform).and_yield
+    allow(project.repository.raw_repository)
+      .to receive(:circuit_breaker).and_return(fake_circuitbreaker)
+    allow(project.repository)
+      .to receive(:circuit_breaker).and_return(fake_circuitbreaker)
+  end
 end
--- a/spec/lib/gitlab/git/repository_spec.rb
+++ b/spec/lib/gitlab/git/repository_spec.rb
@ -55,6 +55,20 @@ describe Gitlab::Git::Repository, seed_helper: true do
  end

  describe "#rugged" do
+    describe 'when storage is broken', broken_storage: true  do
+      it 'raises a storage exception when storage is not available' do
+        broken_repo = described_class.new('broken', 'a/path.git')
+
+        expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Storage::Inaccessible)
+      end
+    end
+
+    it 'raises a no repository exception when there is no repo' do
+      broken_repo = described_class.new('default', 'a/path.git')
+
+      expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Repository::NoRepository)
+    end
+
    context 'with no Git env stored' do
      before do
        expect(Gitlab::Git::Env).to receive(:all).and_return({})
--- a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
+++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb
@ -0,0 +1,265 @@
+require 'spec_helper'
+
+describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do
+  let(:circuit_breaker) { described_class.new('default') }
+  let(:hostname) { Gitlab.config.gitlab.hostname }
+  let(:cache_key) { "storage_accessible:default:#{hostname}" }
+
+  def value_from_redis(name)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmget(cache_key, name)
+    end.first
+  end
+
+  def set_in_redis(name, value)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmset(cache_key, name, value)
+    end.first
+  end
+
+  describe '.reset_all!' do
+    it 'clears all entries form redis' do
+      set_in_redis(:failure_count, 10)
+
+      described_class.reset_all!
+
+      key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
+
+      expect(key_exists).to be_falsey
+    end
+  end
+
+  describe '.for_storage' do
+    it 'only builds a single circuitbreaker per storage' do
+      expect(described_class).to receive(:new).once.and_call_original
+
+      breaker = described_class.for_storage('default')
+
+      expect(breaker).to be_a(described_class)
+      expect(described_class.for_storage('default')).to eq(breaker)
+    end
+  end
+
+  describe '#initialize' do
+    it 'assigns the settings' do
+      expect(circuit_breaker.hostname).to eq(hostname)
+      expect(circuit_breaker.storage).to eq('default')
+      expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
+      expect(circuit_breaker.failure_count_threshold).to eq(10)
+      expect(circuit_breaker.failure_wait_time).to eq(30)
+      expect(circuit_breaker.failure_reset_time).to eq(1800)
+      expect(circuit_breaker.storage_timeout).to eq(5)
+    end
+  end
+
+  describe '#perform' do
+    it 'raises an exception with retry time when the circuit is open' do
+      allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
+
+      expect { |b| circuit_breaker.perform(&b) }
+        .to raise_error(Gitlab::Git::Storage::CircuitOpen)
+    end
+
+    it 'yields the block' do
+      expect { |b| circuit_breaker.perform(&b) }
+        .to yield_control
+    end
+
+    it 'checks if the storage is available' do
+      expect(circuit_breaker).to receive(:check_storage_accessible!)
+
+      circuit_breaker.perform { 'hello world' }
+    end
+
+    it 'returns the value of the block' do
+      result = circuit_breaker.perform { 'return value' }
+
+      expect(result).to eq('return value')
+    end
+
+    it 'raises possible errors' do
+      expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } }
+        .to raise_error(Rugged::OSError)
+    end
+
+    context 'with the feature disabled' do
+      it 'returns the block without checking accessibility' do
+        stub_feature_flags(git_storage_circuit_breaker: false)
+
+        expect(circuit_breaker).not_to receive(:circuit_broken?)
+
+        result = circuit_breaker.perform { 'hello' }
+
+        expect(result).to eq('hello')
+      end
+    end
+  end
+
+  describe '#circuit_broken?' do
+    it 'is closed when there is no last failure' do
+      set_in_redis(:last_failure, nil)
+      set_in_redis(:failure_count, 0)
+
+      expect(circuit_breaker.circuit_broken?).to be_falsey
+    end
+
+    it 'is open when there was a recent failure' do
+      Timecop.freeze do
+        set_in_redis(:last_failure, 1.second.ago.to_f)
+        set_in_redis(:failure_count, 1)
+
+        expect(circuit_breaker.circuit_broken?).to be_truthy
+      end
+    end
+
+    it 'is open when there are to many failures' do
+      set_in_redis(:last_failure, 1.day.ago.to_f)
+      set_in_redis(:failure_count, 200)
+
+      expect(circuit_breaker.circuit_broken?).to be_truthy
+    end
+  end
+
+  describe '#check_storage_accessible!' do
+    context 'when the storage is available' do
+      it 'tracks that the storage was accessible an raises the error' do
+        expect(circuit_breaker).to receive(:track_storage_accessible)
+
+        circuit_breaker.check_storage_accessible!
+      end
+    end
+
+    context 'when the storage is not available' do
+      let(:circuit_breaker) { described_class.new('broken') }
+
+      it 'tracks that the storage was unavailable and raises an error with retry time' do
+        expect(circuit_breaker).to receive(:track_storage_inaccessible)
+
+        expect { circuit_breaker.check_storage_accessible! }
+          .to raise_error do |exception|
+          expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
+          expect(exception.retry_after).to eq(30)
+        end
+      end
+    end
+  end
+
+  describe '#track_storage_inaccessible' do
+    around(:each) do |example|
+      Timecop.freeze
+
+      example.run
+
+      Timecop.return
+    end
+
+    it 'records the failure time in redis' do
+      circuit_breaker.track_storage_inaccessible
+
+      failure_time = value_from_redis(:last_failure)
+
+      expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
+    end
+
+    it 'sets the failure time on the breaker without reloading' do
+      circuit_breaker.track_storage_inaccessible
+
+      expect(circuit_breaker).not_to receive(:get_failure_info)
+      expect(circuit_breaker.last_failure).to eq(Time.now)
+    end
+
+    it 'increments the failure count in redis' do
+      set_in_redis(:failure_count, 10)
+
+      circuit_breaker.track_storage_inaccessible
+
+      expect(value_from_redis(:failure_count).to_i).to be(11)
+    end
+
+    it 'increments the failure count on the breaker without reloading' do
+      set_in_redis(:failure_count, 10)
+
+      circuit_breaker.track_storage_inaccessible
+
+      expect(circuit_breaker).not_to receive(:get_failure_info)
+      expect(circuit_breaker.failure_count).to eq(11)
+    end
+  end
+
+  describe '#track_storage_accessible' do
+    it 'sets the failure count to zero in redis' do
+      set_in_redis(:failure_count, 10)
+
+      circuit_breaker.track_storage_accessible
+
+      expect(value_from_redis(:failure_count).to_i).to be(0)
+    end
+
+    it 'sets the failure count to zero on the breaker without reloading' do
+      set_in_redis(:failure_count, 10)
+
+      circuit_breaker.track_storage_accessible
+
+      expect(circuit_breaker).not_to receive(:get_failure_info)
+      expect(circuit_breaker.failure_count).to eq(0)
+    end
+
+    it 'removes the last failure time from redis' do
+      set_in_redis(:last_failure, Time.now.to_i)
+
+      circuit_breaker.track_storage_accessible
+
+      expect(circuit_breaker).not_to receive(:get_failure_info)
+      expect(circuit_breaker.last_failure).to be_nil
+    end
+
+    it 'removes the last failure time from the breaker without reloading' do
+      set_in_redis(:last_failure, Time.now.to_i)
+
+      circuit_breaker.track_storage_accessible
+
+      expect(value_from_redis(:last_failure)).to be_empty
+    end
+
+    it 'wont connect to redis when there are no failures' do
+      expect(Gitlab::Git::Storage.redis).to receive(:with).once
+                                              .and_call_original
+      expect(circuit_breaker).to receive(:track_storage_accessible)
+                           .and_call_original
+
+      circuit_breaker.track_storage_accessible
+    end
+  end
+
+  describe '#no_failures?' do
+    it 'is false when a failure was tracked' do
+      set_in_redis(:last_failure, Time.now.to_i)
+      set_in_redis(:failure_count, 1)
+
+      expect(circuit_breaker.no_failures?).to be_falsey
+    end
+  end
+
+  describe '#last_failure' do
+    it 'returns the last failure time' do
+      time = Time.parse("2017-05-26 17:52:30")
+      set_in_redis(:last_failure, time.to_i)
+
+      expect(circuit_breaker.last_failure).to eq(time)
+    end
+  end
+
+  describe '#failure_count' do
+    it 'returns the failure count' do
+      set_in_redis(:failure_count, 7)
+
+      expect(circuit_breaker.failure_count).to eq(7)
+    end
+  end
+
+  describe '#cache_key' do
+    it 'includes storage and host' do
+      expect(circuit_breaker.cache_key).to eq(cache_key)
+    end
+  end
+end
--- a/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb
+++ b/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb
@ -0,0 +1,27 @@
+require 'spec_helper'
+
+describe Gitlab::Git::Storage::ForkedStorageCheck, skip_database_cleaner: true do
+  let(:existing_path) do
+    existing_path = TestEnv.repos_path
+    FileUtils.mkdir_p(existing_path)
+    existing_path
+  end
+
+  describe '.storage_accessible?' do
+    it 'detects when a storage is not available' do
+      expect(described_class.storage_available?('/non/existant/path')).to be_falsey
+    end
+
+    it 'detects when a storage is available' do
+      expect(described_class.storage_available?(existing_path)).to be_truthy
+    end
+
+    it 'returns false when the check takes to long' do
+      allow(described_class).to receive(:check_filesystem_in_fork) do
+        fork { sleep 10 }
+      end
+
+      expect(described_class.storage_available?(existing_path, 0.5)).to be_falsey
+    end
+  end
+end
--- a/spec/lib/gitlab/git/storage/health_spec.rb
+++ b/spec/lib/gitlab/git/storage/health_spec.rb
@ -0,0 +1,85 @@
+require 'spec_helper'
+
+describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do
+  let(:host1_key) { 'storage_accessible:broken:web01' }
+  let(:host2_key) { 'storage_accessible:default:kiq01' }
+
+  def set_in_redis(cache_key, value)
+    Gitlab::Git::Storage.redis.with do |redis|
+      redis.hmset(cache_key, :failure_count, value)
+    end.first
+  end
+
+  describe '.for_failing_storages' do
+    it 'only includes health status for failures' do
+      set_in_redis(host1_key, 10)
+      set_in_redis(host2_key, 0)
+
+      expect(described_class.for_failing_storages.map(&:storage_name))
+        .to contain_exactly('broken')
+    end
+  end
+
+  describe '.load_for_keys' do
+    let(:subject) do
+      results = Gitlab::Git::Storage.redis.with do |redis|
+        described_class.load_for_keys({ 'broken' => [host1_key] }, redis)
+      end
+
+      # Make sure the `Redis#future is loaded
+      results.inject({}) do |result, (name, info)|
+        info.each { |i| i[:failure_count] = i[:failure_count].value.to_i }
+
+        result[name] = info
+
+        result
+      end
+    end
+
+    it 'loads when there is no info in redis' do
+      expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 0 }])
+    end
+
+    it 'reads the correct values for a storage from redis' do
+      set_in_redis(host1_key, 5)
+      set_in_redis(host2_key, 7)
+
+      expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 5 }])
+    end
+  end
+
+  describe '.for_all_storages' do
+    it 'loads health status for all configured storages' do
+      healths = described_class.for_all_storages
+
+      expect(healths.map(&:storage_name)).to contain_exactly('default', 'broken')
+    end
+  end
+
+  describe '#failing_info' do
+    it 'only contains storages that have failures' do
+      health = described_class.new('broken', [{ name: host1_key, failure_count: 0 },
+                                              { name: host2_key, failure_count: 3 }])
+
+      expect(health.failing_info).to contain_exactly({ name: host2_key, failure_count: 3 })
+    end
+  end
+
+  describe '#total_failures' do
+    it 'sums up all the failures' do
+      health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
+                                              { name: host2_key, failure_count: 3 }])
+
+      expect(health.total_failures).to eq(5)
+    end
+  end
+
+  describe '#failing_on_hosts' do
+    it 'collects only the failing hostnames' do
+      health = described_class.new('broken', [{ name: host1_key, failure_count: 2 },
+                                              { name: host2_key, failure_count: 0 }])
+
+      expect(health.failing_on_hosts).to contain_exactly('web01')
+    end
+  end
+end
--- a/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb
+++ b/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb
@ -44,6 +44,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do
    describe '#readiness' do
      subject { described_class.readiness }

+      context 'storage has a tripped circuitbreaker' do
+        let(:repository_storages) { ['broken'] }
+        let(:storages_paths) do
+          Gitlab.config.repositories.storages
+        end
+
+        it { is_expected.to include(result_class.new(false, 'circuitbreaker tripped', shard: 'broken')) }
+      end
+
      context 'storage points to not existing folder' do
        let(:storages_paths) do
          {
@ -51,6 +60,10 @@ describe Gitlab::HealthChecks::FsShardsCheck do
          }.with_indifferent_access
        end

+        before do
+          allow(described_class).to receive(:storage_circuitbreaker_test) { true }
+        end
+
        it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) }
      end

@ -109,6 +122,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
+          expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
        end
      end

@ -127,6 +141,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
          expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
+          expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
        end

        it 'cleans up files used for metrics' do
--- a/spec/models/repository_spec.rb
+++ b/spec/models/repository_spec.rb
@ -1,11 +1,12 @@
 require 'spec_helper'

-describe Repository do
+describe Repository, models: true do
  include RepoHelpers
  TestBlob = Struct.new(:path)

  let(:project) { create(:project, :repository) }
  let(:repository) { project.repository }
+  let(:broken_repository) { create(:project, :broken_storage).repository }
  let(:user) { create(:user) }

  let(:commit_options) do
@ -27,12 +28,26 @@ describe Repository do
  let(:author_email) { 'user@example.org' }
  let(:author_name) { 'John Doe' }

+  def expect_to_raise_storage_error
+    expect { yield }.to raise_error do |exception|
+      expect(exception.class).to be_in([Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable])
+    end
+  end
+
  describe '#branch_names_contains' do
    subject { repository.branch_names_contains(sample_commit.id) }

    it { is_expected.to include('master') }
    it { is_expected.not_to include('feature') }
    it { is_expected.not_to include('fix') }
+
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error do
+          broken_repository.branch_names_contains(sample_commit.id)
+        end
+      end
+    end
  end

  describe '#tag_names_contains' do
@ -142,6 +157,14 @@ describe Repository do
    subject { repository.last_commit_for_path(sample_commit.id, '.gitignore').id }

    it { is_expected.to eq('c1acaa58bbcbc3eafe538cb8274ba387047b69f8') }
+
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error do
+          broken_repository.last_commit_for_path(sample_commit.id, '.gitignore').id
+        end
+      end
+    end
  end

  describe '#last_commit_id_for_path' do
@ -158,6 +181,14 @@ describe Repository do
      expect(cache).to receive(:fetch).with(key).and_return('c1acaa5')
      is_expected.to eq('c1acaa5')
    end
+
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error do
+          broken_repository.last_commit_id_for_path(sample_commit.id, '.gitignore')
+        end
+      end
+    end
  end

  describe '#commits' do
@ -196,6 +227,12 @@ describe Repository do

      expect(commit_ids).to include('5937ac0a7beb003549fc5fd26fc247adbce4a52e')
    end
+
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error { broken_repository.find_commits_by_message('s') }
+      end
+    end
  end

  describe '#blob_at' do
@ -521,6 +558,14 @@ describe Repository do
      expect(results).to match_array([])
    end

+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error do
+          broken_repository.search_files_by_content('feature', 'master')
+        end
+      end
+    end
+
    describe 'result' do
      subject { results.first }

@ -549,6 +594,22 @@ describe Repository do

      expect(results).to match_array([])
    end
+
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error { broken_repository.search_files_by_name('files', 'master') }
+      end
+    end
+  end
+
+  describe '#fetch_ref' do
+    describe 'when storage is broken', broken_storage: true  do
+      it 'should raise a storage error' do
+        path = broken_repository.path_to_repo
+
+        expect_to_raise_storage_error { broken_repository.fetch_ref(path, '1', '2') }
+      end
+    end
  end

  describe '#create_ref' do
@ -966,6 +1027,12 @@ describe Repository do

      expect(repository.exists?).to eq(false)
    end
+
+    context 'with broken storage', broken_storage: true do
+      it 'should raise a storage error' do
+        expect_to_raise_storage_error { broken_repository.exists? }
+      end
+    end
  end

  describe '#exists?' do
--- a/spec/requests/api/circuit_breakers_spec.rb
+++ b/spec/requests/api/circuit_breakers_spec.rb
@ -0,0 +1,57 @@
+require 'spec_helper'
+
+describe API::CircuitBreakers do
+  let(:user) { create(:user) }
+  let(:admin) { create(:admin) }
+
+  describe 'GET circuit_breakers/repository_storage' do
+    it 'returns a 401 for anonymous users' do
+      get api('/circuit_breakers/repository_storage')
+
+      expect(response).to have_http_status(401)
+    end
+
+    it 'returns a 403 for users' do
+      get api('/circuit_breakers/repository_storage', user)
+
+      expect(response).to have_http_status(403)
+    end
+
+    it 'returns an Array of storages' do
+      expect(Gitlab::Git::Storage::Health).to receive(:for_all_storages) do
+        [Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
+      end
+
+      get api('/circuit_breakers/repository_storage', admin)
+
+      expect(response).to have_http_status(200)
+      expect(json_response).to be_kind_of(Array)
+      expect(json_response.first['storage_name']).to eq('broken')
+      expect(json_response.first['failing_on_hosts']).to eq(['web01'])
+      expect(json_response.first['total_failures']).to eq(4)
+    end
+
+    describe 'GET circuit_breakers/repository_storage/failing' do
+      it 'returns an array of failing storages' do
+        expect(Gitlab::Git::Storage::Health).to receive(:for_failing_storages) do
+          [Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])]
+        end
+
+        get api('/circuit_breakers/repository_storage/failing', admin)
+
+        expect(response).to have_http_status(200)
+        expect(json_response).to be_kind_of(Array)
+      end
+    end
+  end
+
+  describe 'DELETE circuit_breakers/repository_storage' do
+    it 'clears all circuit_breakers' do
+      expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!)
+
+      delete api('/circuit_breakers/repository_storage', admin)
+
+      expect(response).to have_http_status(204)
+    end
+  end
+end
--- a/spec/support/stored_repositories.rb
+++ b/spec/support/stored_repositories.rb
@ -2,4 +2,16 @@ RSpec.configure do |config|
  config.before(:each, :repository) do
    TestEnv.clean_test_path
  end
+
+  config.before(:all, :broken_storage) do
+    FileUtils.rm_rf Gitlab.config.repositories.storages.broken['path']
+  end
+
+  config.before(:each, :broken_storage) do
+    allow(Gitlab::GitalyClient).to receive(:call) do
+      raise GRPC::Unavailable.new('Gitaly broken in this spec')
+    end
+
+    Gitlab::Git::Storage::CircuitBreaker.reset_all!
+  end
 end