Merge branch 'pawel/ensure_temp_files_are_deleted_in_fs_metrics-35457' into 'master'
Ensure test files are deleted after fs metrics gathering run Closes #35457 See merge request !13080
This commit is contained in:
commit
842bcfa777
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
title: Ensure filesystem metrics test files are deleted
|
||||||
|
merge_request:
|
||||||
|
author:
|
|
@ -6,7 +6,7 @@ Prometheus::Client.configure do |config|
|
||||||
config.initial_mmap_file_size = 4 * 1024
|
config.initial_mmap_file_size = 4 * 1024
|
||||||
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir']
|
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir']
|
||||||
|
|
||||||
if Rails.env.development? && Rails.env.test?
|
if Rails.env.development? || Rails.env.test?
|
||||||
config.multiprocess_files_dir ||= Rails.root.join('tmp/prometheus_multiproc_dir')
|
config.multiprocess_files_dir ||= Rails.root.join('tmp/prometheus_multiproc_dir')
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -27,10 +27,10 @@ module Gitlab
|
||||||
Metric.new(name, value, labels)
|
Metric.new(name, value, labels)
|
||||||
end
|
end
|
||||||
|
|
||||||
def with_timing(proc)
|
def with_timing
|
||||||
start = Time.now
|
start = Time.now
|
||||||
result = proc.call
|
result = yield
|
||||||
yield result, Time.now.to_f - start.to_f
|
[result, Time.now.to_f - start.to_f]
|
||||||
end
|
end
|
||||||
|
|
||||||
def catch_timeout(seconds, &block)
|
def catch_timeout(seconds, &block)
|
||||||
|
|
|
@ -10,47 +10,45 @@ module Gitlab
|
||||||
def readiness
|
def readiness
|
||||||
repository_storages.map do |storage_name|
|
repository_storages.map do |storage_name|
|
||||||
begin
|
begin
|
||||||
tmp_file_path = tmp_file_path(storage_name)
|
|
||||||
|
|
||||||
if !storage_stat_test(storage_name)
|
if !storage_stat_test(storage_name)
|
||||||
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
|
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
|
||||||
elsif !storage_write_test(tmp_file_path)
|
|
||||||
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
|
|
||||||
elsif !storage_read_test(tmp_file_path)
|
|
||||||
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
|
|
||||||
else
|
else
|
||||||
HealthChecks::Result.new(true, nil, shard: storage_name)
|
with_temp_file(storage_name) do |tmp_file_path|
|
||||||
|
if !storage_write_test(tmp_file_path)
|
||||||
|
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
|
||||||
|
elsif !storage_read_test(tmp_file_path)
|
||||||
|
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
|
||||||
|
else
|
||||||
|
HealthChecks::Result.new(true, nil, shard: storage_name)
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
rescue RuntimeError => ex
|
rescue RuntimeError => ex
|
||||||
message = "unexpected error #{ex} when checking storage #{storage_name}"
|
message = "unexpected error #{ex} when checking storage #{storage_name}"
|
||||||
Rails.logger.error(message)
|
Rails.logger.error(message)
|
||||||
HealthChecks::Result.new(false, message, shard: storage_name)
|
HealthChecks::Result.new(false, message, shard: storage_name)
|
||||||
ensure
|
|
||||||
delete_test_file(tmp_file_path)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def metrics
|
def metrics
|
||||||
repository_storages.flat_map do |storage_name|
|
repository_storages.flat_map do |storage_name|
|
||||||
tmp_file_path = tmp_file_path(storage_name)
|
|
||||||
[
|
[
|
||||||
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, -> { storage_stat_test(storage_name) }, shard: storage_name),
|
storage_stat_metrics(storage_name),
|
||||||
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, -> { storage_write_test(tmp_file_path) }, shard: storage_name),
|
storage_write_metrics(storage_name),
|
||||||
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, -> { storage_read_test(tmp_file_path) }, shard: storage_name)
|
storage_read_metrics(storage_name)
|
||||||
].flatten
|
].flatten
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def operation_metrics(ok_metric, latency_metric, operation, **labels)
|
def operation_metrics(ok_metric, latency_metric, **labels)
|
||||||
with_timing operation do |result, elapsed|
|
result, elapsed = yield
|
||||||
[
|
[
|
||||||
metric(latency_metric, elapsed, **labels),
|
metric(latency_metric, elapsed, **labels),
|
||||||
metric(ok_metric, result ? 1 : 0, **labels)
|
metric(ok_metric, result ? 1 : 0, **labels)
|
||||||
]
|
]
|
||||||
end
|
|
||||||
rescue RuntimeError => ex
|
rescue RuntimeError => ex
|
||||||
Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
|
Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
|
||||||
[metric(ok_metric, 0, **labels)]
|
[metric(ok_metric, 0, **labels)]
|
||||||
|
@ -68,19 +66,36 @@ module Gitlab
|
||||||
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
|
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
|
||||||
end
|
end
|
||||||
|
|
||||||
def tmp_file_path(storage_name)
|
def with_temp_file(storage_name)
|
||||||
Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
|
temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
|
||||||
|
yield temp_file_path
|
||||||
|
ensure
|
||||||
|
delete_test_file(temp_file_path)
|
||||||
end
|
end
|
||||||
|
|
||||||
def path(storage_name)
|
def storage_path(storage_name)
|
||||||
storages_paths&.dig(storage_name, 'path')
|
storages_paths&.dig(storage_name, 'path')
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# All below test methods use shell commands to perform actions on storage volumes.
|
||||||
|
# In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
|
||||||
|
# we can rely on shell commands to be terminated once `timeout` kills them.
|
||||||
|
#
|
||||||
|
# However we also fallback to pure Ruby file operations in case a specific shell command is missing
|
||||||
|
# so we are still able to perform healthchecks and gather metrics from such system.
|
||||||
|
|
||||||
|
def delete_test_file(tmp_path)
|
||||||
|
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
|
||||||
|
status.zero?
|
||||||
|
rescue Errno::ENOENT
|
||||||
|
File.delete(tmp_path) rescue Errno::ENOENT
|
||||||
|
end
|
||||||
|
|
||||||
def storage_stat_test(storage_name)
|
def storage_stat_test(storage_name)
|
||||||
stat_path = File.join(path(storage_name), '.')
|
stat_path = File.join(storage_path(storage_name), '.')
|
||||||
begin
|
begin
|
||||||
_, status = exec_with_timeout(%W{ stat #{stat_path} })
|
_, status = exec_with_timeout(%W{ stat #{stat_path} })
|
||||||
status == 0
|
status.zero?
|
||||||
rescue Errno::ENOENT
|
rescue Errno::ENOENT
|
||||||
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
|
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
|
||||||
end
|
end
|
||||||
|
@ -90,7 +105,7 @@ module Gitlab
|
||||||
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
|
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
|
||||||
stdin.write(RANDOM_STRING)
|
stdin.write(RANDOM_STRING)
|
||||||
end
|
end
|
||||||
status == 0
|
status.zero?
|
||||||
rescue Errno::ENOENT
|
rescue Errno::ENOENT
|
||||||
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
|
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
|
||||||
written_bytes == RANDOM_STRING.length
|
written_bytes == RANDOM_STRING.length
|
||||||
|
@ -100,17 +115,33 @@ module Gitlab
|
||||||
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
|
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
|
||||||
stdin.write(RANDOM_STRING)
|
stdin.write(RANDOM_STRING)
|
||||||
end
|
end
|
||||||
status == 0
|
status.zero?
|
||||||
rescue Errno::ENOENT
|
rescue Errno::ENOENT
|
||||||
file_contents = File.read(tmp_path) rescue Errno::ENOENT
|
file_contents = File.read(tmp_path) rescue Errno::ENOENT
|
||||||
file_contents == RANDOM_STRING
|
file_contents == RANDOM_STRING
|
||||||
end
|
end
|
||||||
|
|
||||||
def delete_test_file(tmp_path)
|
def storage_stat_metrics(storage_name)
|
||||||
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
|
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
|
||||||
status == 0
|
with_timing { storage_stat_test(storage_name) }
|
||||||
rescue Errno::ENOENT
|
end
|
||||||
File.delete(tmp_path) rescue Errno::ENOENT
|
end
|
||||||
|
|
||||||
|
def storage_write_metrics(storage_name)
|
||||||
|
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
|
||||||
|
with_temp_file(storage_name) do |tmp_file_path|
|
||||||
|
with_timing { storage_write_test(tmp_file_path) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def storage_read_metrics(storage_name)
|
||||||
|
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
|
||||||
|
with_temp_file(storage_name) do |tmp_file_path|
|
||||||
|
storage_write_test(tmp_file_path) # writes data used by read test
|
||||||
|
with_timing { storage_read_test(tmp_file_path) }
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -15,14 +15,13 @@ module Gitlab
|
||||||
end
|
end
|
||||||
|
|
||||||
def metrics
|
def metrics
|
||||||
with_timing method(:check) do |result, elapsed|
|
result, elapsed = with_timing(&method(:check))
|
||||||
Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
|
Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
|
||||||
[
|
[
|
||||||
metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
|
metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
|
||||||
metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
|
metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
|
||||||
metric("#{metric_prefix}_latency_seconds", elapsed)
|
metric("#{metric_prefix}_latency_seconds", elapsed)
|
||||||
]
|
]
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
|
@ -3,7 +3,7 @@ require 'spec_helper'
|
||||||
describe Gitlab::HealthChecks::FsShardsCheck do
|
describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
def command_exists?(command)
|
def command_exists?(command)
|
||||||
_, status = Gitlab::Popen.popen(%W{ #{command} 1 echo })
|
_, status = Gitlab::Popen.popen(%W{ #{command} 1 echo })
|
||||||
status == 0
|
status.zero?
|
||||||
rescue Errno::ENOENT
|
rescue Errno::ENOENT
|
||||||
false
|
false
|
||||||
end
|
end
|
||||||
|
@ -64,9 +64,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
it 'cleans up files used for testing' do
|
it 'cleans up files used for testing' do
|
||||||
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
|
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
|
||||||
|
|
||||||
subject
|
expect { subject }.not_to change(Dir.entries(tmp_dir), :count)
|
||||||
|
|
||||||
expect(Dir.entries(tmp_dir).count).to eq(2)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'read test fails' do
|
context 'read test fails' do
|
||||||
|
@ -88,8 +86,6 @@ describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '#metrics' do
|
describe '#metrics' do
|
||||||
subject { described_class.metrics }
|
|
||||||
|
|
||||||
context 'storage points to not existing folder' do
|
context 'storage points to not existing folder' do
|
||||||
let(:storages_paths) do
|
let(:storages_paths) do
|
||||||
{
|
{
|
||||||
|
@ -104,14 +100,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'provides metrics' do
|
it 'provides metrics' do
|
||||||
expect(subject).to all(have_attributes(labels: { shard: :default }))
|
metrics = described_class.metrics
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
|
|
||||||
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
expect(metrics).to all(have_attributes(labels: { shard: :default }))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -121,15 +118,19 @@ describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'provides metrics' do
|
it 'provides metrics' do
|
||||||
expect(subject).to all(have_attributes(labels: { shard: :default }))
|
metrics = described_class.metrics
|
||||||
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
|
expect(metrics).to all(have_attributes(labels: { shard: :default }))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
||||||
|
end
|
||||||
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
it 'cleans up files used for metrics' do
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
expect { described_class.metrics }.not_to change(Dir.entries(tmp_dir), :count)
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -150,18 +151,16 @@ describe Gitlab::HealthChecks::FsShardsCheck do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '#metrics' do
|
describe '#metrics' do
|
||||||
subject { described_class.metrics }
|
|
||||||
|
|
||||||
it 'provides metrics' do
|
it 'provides metrics' do
|
||||||
expect(subject).to all(have_attributes(labels: { shard: :default }))
|
metrics = described_class.metrics
|
||||||
|
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
|
expect(metrics).to all(have_attributes(labels: { shard: :default }))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
|
||||||
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
|
||||||
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue