2017-04-07 06:27:15 -04:00
|
|
|
module Gitlab
|
|
|
|
module HealthChecks
|
|
|
|
class FsShardsCheck
|
|
|
|
extend BaseAbstractCheck

# Payload used by the write and read probes. SecureRandom.hex(1000) produces
# 2000 hexadecimal characters; the value is computed once, when this constant
# is assigned.
RANDOM_STRING = SecureRandom.hex(1000).freeze

# Duration argument placed right after the timeout executable in every probe
# command (kept as a String because it is passed as a command-line word).
COMMAND_TIMEOUT = '1'.freeze

# Executable that exec_with_timeout puts in front of every probe command.
TIMEOUT_EXECUTABLE = 'timeout'.freeze
|
2017-04-07 06:27:15 -04:00
|
|
|
|
|
|
|
class << self
|
|
|
|
# Runs the readiness probes against every configured repository storage.
#
# For each storage the probes run in a fixed order (circuit breaker, stat,
# write, read) and stop at the first one that fails. Returns an Array with one
# HealthChecks::Result per storage. A RuntimeError raised while probing is
# logged and turned into a failed result for that storage.
def readiness
  repository_storages.map do |shard|
    # Builds the failed result for this shard with the given reason.
    unhealthy = ->(reason) { HealthChecks::Result.new(false, reason, shard: shard) }

    begin
      next unhealthy.call('circuitbreaker tripped') unless storage_circuitbreaker_test(shard)
      next unhealthy.call('cannot stat storage') unless storage_stat_test(shard)

      with_temp_file(shard) do |probe_path|
        # `next` here leaves only the with_temp_file block; its value becomes
        # the value of with_temp_file and therefore of the outer block.
        next unhealthy.call('cannot write to storage') unless storage_write_test(probe_path)
        next unhealthy.call('cannot read from storage') unless storage_read_test(probe_path)

        HealthChecks::Result.new(true, nil, shard: shard)
      end
    rescue RuntimeError => ex
      message = "unexpected error #{ex} when checking storage #{shard}"
      Rails.logger.error(message)
      unhealthy.call(message)
    end
  end
end
|
|
|
|
|
|
|
|
# Collects the stat, write, read and circuit-breaker metrics of every
# configured repository storage into one flat Array, storage by storage.
def metrics
  repository_storages.flat_map do |shard|
    stat_samples = storage_stat_metrics(shard)
    write_samples = storage_write_metrics(shard)
    read_samples = storage_read_metrics(shard)
    breaker_samples = storage_circuitbreaker_metrics(shard)

    [stat_samples, write_samples, read_samples, breaker_samples].flatten
  end
end
|
|
|
|
|
|
|
|
private
|
|
|
|
|
2017-07-25 08:19:09 -04:00
|
|
|
# Turns the outcome of one probe into metric samples.
#
# The block is called once and its value is destructured into a
# (result, elapsed) pair. Returns [latency sample, ok sample], where the ok
# sample is 1 for a truthy result and 0 otherwise. When the block (or building
# a sample) raises a RuntimeError, the error is logged and only an ok sample
# with value 0 is returned.
def operation_metrics(ok_metric, latency_metric, **labels)
  succeeded, seconds = yield

  latency_sample = metric(latency_metric, seconds, **labels)
  status_sample = metric(ok_metric, succeeded ? 1 : 0, **labels)

  [latency_sample, status_sample]
rescue RuntimeError => error
  Rails.logger.error("unexpected error #{error} when checking #{ok_metric}")
  [metric(ok_metric, 0, **labels)]
end
|
|
|
|
|
|
|
|
# Names of the configured repository storages, i.e. the keys of storages_paths.
def repository_storages
  configured_storages = storages_paths
  configured_storages.keys
end
|
|
|
|
|
|
|
|
# The storages section of the repositories configuration, read from
# Gitlab.config on every call.
def storages_paths
  repositories_settings = Gitlab.config.repositories
  repositories_settings.storages
end
|
|
|
|
|
2017-05-16 10:25:02 -04:00
|
|
|
# Runs cmd_args through Gitlab::Popen.popen with the timeout executable and its
# duration argument placed in front of the command. Any extra positional
# arguments and the block are forwarded to popen unchanged.
def exec_with_timeout(cmd_args, *args, &block)
  guarded_command = [TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT] + cmd_args
  Gitlab::Popen.popen(guarded_command, *args, &block)
end
|
|
|
|
|
2017-07-25 08:19:09 -04:00
|
|
|
def with_temp_file(storage_name)
|
2017-07-26 11:16:59 -04:00
|
|
|
temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
|
|
|
|
yield temp_file_path
|
|
|
|
ensure
|
|
|
|
delete_test_file(temp_file_path)
|
2017-04-07 06:27:15 -04:00
|
|
|
end
|
|
|
|
|
2017-07-25 18:28:13 -04:00
|
|
|
# Looks up the 'path' entry of the named storage in storages_paths.
# Returns nil when storages_paths is nil or the lookup finds nothing.
def storage_path(storage_name)
  configured_storages = storages_paths
  return nil if configured_storages.nil?

  configured_storages.dig(storage_name, 'path')
end
|
|
|
|
|
2017-07-27 09:44:13 -04:00
|
|
|
# All below test methods use shell commands to perform actions on storage volumes.
|
|
|
|
# In case a storage volume has connectivity problems that cause pure Ruby IO operations to wait indefinitely,
|
|
|
|
# we can rely on shell commands to be terminated once `timeout` kills them.
|
|
|
|
#
|
|
|
|
# However, we also fall back to pure Ruby file operations in case a specific shell command is missing,
|
|
|
|
# so we are still able to perform healthchecks and gather metrics from such system.
|
|
|
|
|
2017-07-25 18:28:13 -04:00
|
|
|
# Removes the probe file at tmp_path, preferring `rm -f` run under the timeout
# wrapper.
#
# Returns true when the file is gone afterwards and false when it could not be
# removed. When spawning the command raises Errno::ENOENT, the file is deleted
# with File.delete instead. Cleanup is best effort: the fallback never raises.
def delete_test_file(tmp_path)
  _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
  status.zero?
rescue Errno::ENOENT
  begin
    File.delete(tmp_path)
    true
  rescue Errno::ENOENT
    # Already gone, which is also what `rm -f` treats as success.
    true
  rescue StandardError
    # A `rescue` modifier cannot filter by exception class, so the former
    # one-liner silently swallowed every error; keep that best-effort
    # behaviour explicitly and report the failure as false.
    false
  end
end
|
|
|
|
|
2017-04-07 06:27:15 -04:00
|
|
|
# Checks that the root of the given storage can be stat'ed.
#
# Runs `stat <storage path>/.` under the timeout wrapper and returns whether it
# exited with status zero. When spawning the command raises Errno::ENOENT, the
# check is done in Ruby instead: the path must exist and be readable.
def storage_stat_test(storage_name)
  stat_path = File.join(storage_path(storage_name), '.')

  begin
    _output, exit_code = exec_with_timeout(['stat', stat_path])
    exit_code.zero?
  rescue Errno::ENOENT
    return false unless File.exist?(stat_path)

    File::Stat.new(stat_path).readable?
  end
end
|
|
|
|
|
|
|
|
# Writes RANDOM_STRING to tmp_path and reports whether that succeeded.
#
# The write is done by piping the payload into `tee` under the timeout wrapper;
# the result is whether the command exited with status zero. When spawning the
# command raises Errno::ENOENT, the payload is written with File.write instead
# and the result is whether the whole payload was written.
def storage_write_test(tmp_path)
  _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
    stdin.write(RANDOM_STRING)
  end
  status.zero?
rescue Errno::ENOENT
  begin
    # File.write returns the number of bytes written, so compare against the
    # payload's byte size rather than its character count.
    File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.bytesize
  rescue StandardError
    # Any failure to write means the storage is not writable. This used to rely
    # on a `rescue` modifier, which cannot filter by exception class.
    false
  end
end
|
|
|
|
|
|
|
|
# Reads tmp_path back and reports whether it holds exactly RANDOM_STRING.
#
# The comparison is done by `diff <tmp_path> -` under the timeout wrapper, with
# the expected payload piped to stdin; the result is whether the command exited
# with status zero. When spawning the command raises Errno::ENOENT, the file is
# read with File.read and compared in Ruby instead.
def storage_read_test(tmp_path)
  _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
    stdin.write(RANDOM_STRING)
  end
  status.zero?
rescue Errno::ENOENT
  begin
    File.read(tmp_path) == RANDOM_STRING
  rescue StandardError
    # Any failure to read means the storage is not readable. This used to rely
    # on a `rescue` modifier, which cannot filter by exception class and ended
    # up comparing an exception class with the payload.
    false
  end
end
|
|
|
|
|
2017-05-17 12:17:15 -04:00
|
|
|
# Runs a trivial block through the circuit breaker built for the given storage.
# Returns whatever perform returns for that block, or nil when
# Gitlab::Git::Storage::Inaccessible is raised.
def storage_circuitbreaker_test(storage_name)
  breaker = Gitlab::Git::Storage::CircuitBreaker.build(storage_name)
  breaker.perform { "OK" }
rescue Gitlab::Git::Storage::Inaccessible
  nil
end
|
|
|
|
|
2017-07-25 18:28:13 -04:00
|
|
|
# Metric samples for the stat probe of one storage, labelled with its shard name.
def storage_stat_metrics(storage_name)
  labels = { shard: storage_name }

  operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, **labels) do
    with_timing do
      storage_stat_test(storage_name)
    end
  end
end
|
|
|
|
|
|
|
|
# Metric samples for the write probe of one storage, labelled with its shard
# name. The probe writes to a temporary file obtained from with_temp_file.
def storage_write_metrics(storage_name)
  labels = { shard: storage_name }

  operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, **labels) do
    with_temp_file(storage_name) do |probe_path|
      with_timing do
        storage_write_test(probe_path)
      end
    end
  end
end
|
|
|
|
|
|
|
|
# Metric samples for the read probe of one storage, labelled with its shard
# name. The probe reads a temporary file obtained from with_temp_file.
def storage_read_metrics(storage_name)
  labels = { shard: storage_name }

  operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, **labels) do
    with_temp_file(storage_name) do |probe_path|
      # Put the payload in place first; this write happens outside the
      # with_timing block, which wraps only the read.
      storage_write_test(probe_path)

      with_timing do
        storage_read_test(probe_path)
      end
    end
  end
end
|
2017-05-17 12:17:15 -04:00
|
|
|
|
|
|
|
# Metric samples for the circuit-breaker probe of one storage, labelled with
# its shard name.
def storage_circuitbreaker_metrics(storage_name)
  labels = { shard: storage_name }

  operation_metrics(:filesystem_circuitbreaker, :filesystem_circuitbreaker_latency_seconds, **labels) do
    with_timing do
      storage_circuitbreaker_test(storage_name)
    end
  end
end
|
2017-04-07 06:27:15 -04:00
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|