diff --git a/changelogs/unreleased/pawel-ensure_temp_files_are_deleted_in_fs_metrics-35457.yml b/changelogs/unreleased/pawel-ensure_temp_files_are_deleted_in_fs_metrics-35457.yml new file mode 100644 index 00000000000..1186dc59dc7 --- /dev/null +++ b/changelogs/unreleased/pawel-ensure_temp_files_are_deleted_in_fs_metrics-35457.yml @@ -0,0 +1,4 @@ +--- +title: Ensure filesystem metrics test files are deleted +merge_request: +author: diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb index 987324a86c9..a2f8421f5d7 100644 --- a/config/initializers/7_prometheus_metrics.rb +++ b/config/initializers/7_prometheus_metrics.rb @@ -6,7 +6,7 @@ Prometheus::Client.configure do |config| config.initial_mmap_file_size = 4 * 1024 config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] - if Rails.env.development? && Rails.env.test? + if Rails.env.development? || Rails.env.test? config.multiprocess_files_dir ||= Rails.root.join('tmp/prometheus_multiproc_dir') end end diff --git a/lib/gitlab/health_checks/base_abstract_check.rb b/lib/gitlab/health_checks/base_abstract_check.rb index 7de6d4d9367..8b365dab185 100644 --- a/lib/gitlab/health_checks/base_abstract_check.rb +++ b/lib/gitlab/health_checks/base_abstract_check.rb @@ -27,10 +27,10 @@ module Gitlab Metric.new(name, value, labels) end - def with_timing(proc) + def with_timing start = Time.now - result = proc.call - yield result, Time.now.to_f - start.to_f + result = yield + [result, Time.now.to_f - start.to_f] end def catch_timeout(seconds, &block) diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb index bebde857b16..9e91c135956 100644 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -10,47 +10,45 @@ module Gitlab def readiness repository_storages.map do |storage_name| begin - tmp_file_path = tmp_file_path(storage_name) - if !storage_stat_test(storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) - elsif !storage_write_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) - elsif !storage_read_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) else - HealthChecks::Result.new(true, nil, shard: storage_name) + with_temp_file(storage_name) do |tmp_file_path| + if !storage_write_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) + elsif !storage_read_test(tmp_file_path) + HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) + else + HealthChecks::Result.new(true, nil, shard: storage_name) + end + end end rescue RuntimeError => ex message = "unexpected error #{ex} when checking storage #{storage_name}" Rails.logger.error(message) HealthChecks::Result.new(false, message, shard: storage_name) - ensure - delete_test_file(tmp_file_path) end end end def metrics repository_storages.flat_map do |storage_name| - tmp_file_path = tmp_file_path(storage_name) [ - operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, -> { storage_stat_test(storage_name) }, shard: storage_name), - operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, -> { storage_write_test(tmp_file_path) }, shard: storage_name), - operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, -> { storage_read_test(tmp_file_path) }, shard: storage_name) + storage_stat_metrics(storage_name), + storage_write_metrics(storage_name), + storage_read_metrics(storage_name) ].flatten end end private - def operation_metrics(ok_metric, latency_metric, operation, **labels) - with_timing operation do |result, elapsed| - [ - metric(latency_metric, elapsed, **labels), - metric(ok_metric, result ? 1 : 0, **labels) - ] - end + def operation_metrics(ok_metric, latency_metric, **labels) + result, elapsed = yield + [ + metric(latency_metric, elapsed, **labels), + metric(ok_metric, result ? 1 : 0, **labels) + ] rescue RuntimeError => ex Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}") [metric(ok_metric, 0, **labels)] @@ -68,19 +66,36 @@ module Gitlab Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block) end - def tmp_file_path(storage_name) - Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path } + def with_temp_file(storage_name) + temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path } + yield temp_file_path + ensure + delete_test_file(temp_file_path) end - def path(storage_name) + def storage_path(storage_name) storages_paths&.dig(storage_name, 'path') end + # All below test methods use shell commands to perform actions on storage volumes. + # In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely, + # we can rely on shell commands to be terminated once `timeout` kills them. + # + # However we also fallback to pure Ruby file operations in case a specific shell command is missing + # so we are still able to perform healthchecks and gather metrics from such system. + + def delete_test_file(tmp_path) + _, status = exec_with_timeout(%W{ rm -f #{tmp_path} }) + status.zero? + rescue Errno::ENOENT + File.delete(tmp_path) rescue Errno::ENOENT + end + def storage_stat_test(storage_name) - stat_path = File.join(path(storage_name), '.') + stat_path = File.join(storage_path(storage_name), '.') begin _, status = exec_with_timeout(%W{ stat #{stat_path} }) - status == 0 + status.zero? rescue Errno::ENOENT File.exist?(stat_path) && File::Stat.new(stat_path).readable? end @@ -90,7 +105,7 @@ module Gitlab _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin| stdin.write(RANDOM_STRING) end - status == 0 + status.zero? rescue Errno::ENOENT written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT written_bytes == RANDOM_STRING.length @@ -100,17 +115,33 @@ module Gitlab _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin| stdin.write(RANDOM_STRING) end - status == 0 + status.zero? rescue Errno::ENOENT file_contents = File.read(tmp_path) rescue Errno::ENOENT file_contents == RANDOM_STRING end - def delete_test_file(tmp_path) - _, status = exec_with_timeout(%W{ rm -f #{tmp_path} }) - status == 0 - rescue Errno::ENOENT - File.delete(tmp_path) rescue Errno::ENOENT + def storage_stat_metrics(storage_name) + operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do + with_timing { storage_stat_test(storage_name) } + end + end + + def storage_write_metrics(storage_name) + operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do + with_temp_file(storage_name) do |tmp_file_path| + with_timing { storage_write_test(tmp_file_path) } + end + end + end + + def storage_read_metrics(storage_name) + operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do + with_temp_file(storage_name) do |tmp_file_path| + storage_write_test(tmp_file_path) # writes data used by read test + with_timing { storage_read_test(tmp_file_path) } + end + end end end end diff --git a/lib/gitlab/health_checks/simple_abstract_check.rb b/lib/gitlab/health_checks/simple_abstract_check.rb index 3dcb28a193c..f5026171ba4 100644 --- a/lib/gitlab/health_checks/simple_abstract_check.rb +++ b/lib/gitlab/health_checks/simple_abstract_check.rb @@ -15,14 +15,13 @@ module Gitlab end def metrics - with_timing method(:check) do |result, elapsed| - Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result) - [ - metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0), - metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0), - metric("#{metric_prefix}_latency_seconds", elapsed) - ] - end + result, elapsed = with_timing(&method(:check)) + Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result) + [ + metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0), + metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0), + metric("#{metric_prefix}_latency_seconds", elapsed) + ] end private diff --git a/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb b/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb index 3de73a9ff65..8abc4320c59 100644 --- a/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb +++ b/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb @@ -3,7 +3,7 @@ require 'spec_helper' describe Gitlab::HealthChecks::FsShardsCheck do def command_exists?(command) _, status = Gitlab::Popen.popen(%W{ #{command} 1 echo }) - status == 0 + status.zero? rescue Errno::ENOENT false end @@ -64,9 +64,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do it 'cleans up files used for testing' do expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original - subject - - expect(Dir.entries(tmp_dir).count).to eq(2) + expect { subject }.not_to change(Dir.entries(tmp_dir), :count) end context 'read test fails' do @@ -88,8 +86,6 @@ describe Gitlab::HealthChecks::FsShardsCheck do end describe '#metrics' do - subject { described_class.metrics } - context 'storage points to not existing folder' do let(:storages_paths) do { @@ -104,14 +100,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do end it 'provides metrics' do - expect(subject).to all(have_attributes(labels: { shard: :default })) - expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0)) + metrics = described_class.metrics - expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + expect(metrics).to all(have_attributes(labels: { shard: :default })) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) end end @@ -121,15 +118,19 @@ describe Gitlab::HealthChecks::FsShardsCheck do end it 'provides metrics' do - expect(subject).to all(have_attributes(labels: { shard: :default })) + metrics = described_class.metrics - expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 1)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 1)) + expect(metrics).to all(have_attributes(labels: { shard: :default })) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 1)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 1)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + end - expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + it 'cleans up files used for metrics' do + expect { described_class.metrics }.not_to change(Dir.entries(tmp_dir), :count) end end end @@ -150,18 +151,16 @@ describe Gitlab::HealthChecks::FsShardsCheck do end describe '#metrics' do - subject { described_class.metrics } - it 'provides metrics' do - expect(subject).to all(have_attributes(labels: { shard: :default })) + metrics = described_class.metrics - expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0)) - - expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) - expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + expect(metrics).to all(have_attributes(labels: { shard: :default })) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) end end end