[cherry-pick] '4862-verify-file-checksums'
See merge request gitlab-org/gitlab-ee!4753
This commit is contained in:
parent
2387ef2b4a
commit
edbcde8877
16 changed files with 423 additions and 11 deletions
|
@ -24,4 +24,8 @@ class LfsObject < ActiveRecord::Base
|
|||
.where(lfs_objects_projects: { id: nil })
|
||||
.destroy_all
|
||||
end
|
||||
|
||||
def self.calculate_oid(path)
|
||||
Digest::SHA256.file(path).hexdigest
|
||||
end
|
||||
end
|
||||
|
|
|
@ -78,34 +78,41 @@ Example output:
|
|||
|
||||
## Uploaded Files Integrity
|
||||
|
||||
The uploads check Rake task will loop through all uploads in the database
|
||||
and run two checks to determine the integrity of each file:
|
||||
Various types of file can be uploaded to a GitLab installation by users.
|
||||
Checksums are generated and stored in the database upon upload, and integrity
|
||||
checks using those checksums can be run. These checks also detect missing files.
|
||||
|
||||
1. Check if the file exist on the file system.
|
||||
1. Check if the checksum of the file on the file system matches the checksum in the database.
|
||||
Currently, integrity checks are supported for the following types of file:
|
||||
|
||||
* LFS objects
|
||||
* User uploads
|
||||
|
||||
**Omnibus Installation**
|
||||
|
||||
```
|
||||
sudo gitlab-rake gitlab:lfs:check
|
||||
sudo gitlab-rake gitlab:uploads:check
|
||||
```
|
||||
|
||||
**Source Installation**
|
||||
|
||||
```bash
|
||||
sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
|
||||
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
|
||||
```
|
||||
|
||||
This task also accepts some environment variables which you can use to override
|
||||
These tasks also accept some environment variables which you can use to override
|
||||
certain values:
|
||||
|
||||
Variable | Type | Description
|
||||
-------- | ---- | -----------
|
||||
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
|
||||
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
|
||||
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
|
||||
Variable | Type | Description
|
||||
--------- | ------- | -----------
|
||||
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
|
||||
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
|
||||
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
|
||||
`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
|
||||
|
||||
```bash
|
||||
sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
|
||||
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
|
||||
```
|
||||
|
||||
|
|
64
lib/gitlab/verify/batch_verifier.rb
Normal file
64
lib/gitlab/verify/batch_verifier.rb
Normal file
|
@ -0,0 +1,64 @@
|
|||
module Gitlab
|
||||
module Verify
|
||||
class BatchVerifier
|
||||
attr_reader :batch_size, :start, :finish
|
||||
|
||||
def initialize(batch_size:, start: nil, finish: nil)
|
||||
@batch_size = batch_size
|
||||
@start = start
|
||||
@finish = finish
|
||||
end
|
||||
|
||||
# Yields a Range of IDs and a Hash of failed verifications (object => error)
|
||||
def run_batches(&blk)
|
||||
relation.in_batches(of: batch_size, start: start, finish: finish) do |relation| # rubocop: disable Cop/InBatches
|
||||
range = relation.first.id..relation.last.id
|
||||
failures = run_batch(relation)
|
||||
|
||||
yield(range, failures)
|
||||
end
|
||||
end
|
||||
|
||||
def name
|
||||
raise NotImplementedError.new
|
||||
end
|
||||
|
||||
def describe(_object)
|
||||
raise NotImplementedError.new
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def run_batch(relation)
|
||||
relation.map { |upload| verify(upload) }.compact.to_h
|
||||
end
|
||||
|
||||
def verify(object)
|
||||
expected = expected_checksum(object)
|
||||
actual = actual_checksum(object)
|
||||
|
||||
raise 'Checksum missing' unless expected.present?
|
||||
raise 'Checksum mismatch' unless expected == actual
|
||||
|
||||
nil
|
||||
rescue => err
|
||||
[object, err]
|
||||
end
|
||||
|
||||
# This should return an ActiveRecord::Relation suitable for calling #in_batches on
|
||||
def relation
|
||||
raise NotImplementedError.new
|
||||
end
|
||||
|
||||
# The checksum we expect the object to have
|
||||
def expected_checksum(_object)
|
||||
raise NotImplementedError.new
|
||||
end
|
||||
|
||||
# The freshly-recalculated checksum of the object
|
||||
def actual_checksum(_object)
|
||||
raise NotImplementedError.new
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
27
lib/gitlab/verify/lfs_objects.rb
Normal file
27
lib/gitlab/verify/lfs_objects.rb
Normal file
|
@ -0,0 +1,27 @@
|
|||
module Gitlab
|
||||
module Verify
|
||||
class LfsObjects < BatchVerifier
|
||||
def name
|
||||
'LFS objects'
|
||||
end
|
||||
|
||||
def describe(object)
|
||||
"LFS object: #{object.oid}"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def relation
|
||||
LfsObject.with_files_stored_locally
|
||||
end
|
||||
|
||||
def expected_checksum(lfs_object)
|
||||
lfs_object.oid
|
||||
end
|
||||
|
||||
def actual_checksum(lfs_object)
|
||||
LfsObject.calculate_oid(lfs_object.file.path)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
53
lib/gitlab/verify/rake_task.rb
Normal file
53
lib/gitlab/verify/rake_task.rb
Normal file
|
@ -0,0 +1,53 @@
|
|||
module Gitlab
|
||||
module Verify
|
||||
class RakeTask
|
||||
def self.run!(verify_kls)
|
||||
verifier = verify_kls.new(
|
||||
batch_size: ENV.fetch('BATCH', 200).to_i,
|
||||
start: ENV['ID_FROM'],
|
||||
finish: ENV['ID_TO']
|
||||
)
|
||||
|
||||
verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE'])
|
||||
|
||||
new(verifier, verbose).run!
|
||||
end
|
||||
|
||||
attr_reader :verifier, :output
|
||||
|
||||
def initialize(verifier, verbose)
|
||||
@verifier = verifier
|
||||
@verbose = verbose
|
||||
end
|
||||
|
||||
def run!
|
||||
say "Checking integrity of #{verifier.name}"
|
||||
|
||||
verifier.run_batches { |*args| run_batch(*args) }
|
||||
|
||||
say 'Done!'
|
||||
end
|
||||
|
||||
def verbose?
|
||||
!!@verbose
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def say(text)
|
||||
puts(text) # rubocop:disable Rails/Output
|
||||
end
|
||||
|
||||
def run_batch(range, failures)
|
||||
status_color = failures.empty? ? :green : :red
|
||||
say "- #{range}: Failures: #{failures.count}".color(status_color)
|
||||
|
||||
return unless verbose?
|
||||
|
||||
failures.each do |object, error|
|
||||
say " - #{verifier.describe(object)}: #{error.inspect}".color(:red)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
27
lib/gitlab/verify/uploads.rb
Normal file
27
lib/gitlab/verify/uploads.rb
Normal file
|
@ -0,0 +1,27 @@
|
|||
module Gitlab
|
||||
module Verify
|
||||
class Uploads < BatchVerifier
|
||||
def name
|
||||
'Uploads'
|
||||
end
|
||||
|
||||
def describe(object)
|
||||
"Upload: #{object.id}"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def relation
|
||||
Upload.with_files_stored_locally
|
||||
end
|
||||
|
||||
def expected_checksum(upload)
|
||||
upload.checksum
|
||||
end
|
||||
|
||||
def actual_checksum(upload)
|
||||
Upload.hexdigest(upload.absolute_path)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
8
lib/tasks/gitlab/lfs/check.rake
Normal file
8
lib/tasks/gitlab/lfs/check.rake
Normal file
|
@ -0,0 +1,8 @@
|
|||
namespace :gitlab do
|
||||
namespace :lfs do
|
||||
desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
|
||||
task check: :environment do
|
||||
Gitlab::Verify::RakeTask.run!(Gitlab::Verify::LfsObjects)
|
||||
end
|
||||
end
|
||||
end
|
|
@ -9,4 +9,14 @@ FactoryBot.define do
|
|||
trait :with_file do
|
||||
file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") }
|
||||
end
|
||||
|
||||
# The uniqueness constraint means we can't use the correct OID for all LFS
|
||||
# objects, so the test needs to decide which (if any) object gets it
|
||||
trait :correct_oid do
|
||||
oid 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75'
|
||||
end
|
||||
|
||||
trait :object_storage do
|
||||
file_store { LfsObjectUploader::Store::REMOTE }
|
||||
end
|
||||
end
|
||||
|
|
51
spec/lib/gitlab/verify/lfs_objects_spec.rb
Normal file
51
spec/lib/gitlab/verify/lfs_objects_spec.rb
Normal file
|
@ -0,0 +1,51 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Verify::LfsObjects do
|
||||
include GitlabVerifyHelpers
|
||||
|
||||
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
|
||||
let!(:objects) { create_list(:lfs_object, 3, :with_file) }
|
||||
end
|
||||
|
||||
describe '#run_batches' do
|
||||
let(:failures) { collect_failures }
|
||||
let(:failure) { failures[lfs_object] }
|
||||
|
||||
let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
|
||||
|
||||
it 'passes LFS objects with the correct file' do
|
||||
expect(failures).to eq({})
|
||||
end
|
||||
|
||||
it 'fails LFS objects with a missing file' do
|
||||
FileUtils.rm_f(lfs_object.file.path)
|
||||
|
||||
expect(failures.keys).to contain_exactly(lfs_object)
|
||||
expect(failure).to be_a(Errno::ENOENT)
|
||||
expect(failure.to_s).to include(lfs_object.file.path)
|
||||
end
|
||||
|
||||
it 'fails LFS objects with a mismatched oid' do
|
||||
File.truncate(lfs_object.file.path, 0)
|
||||
|
||||
expect(failures.keys).to contain_exactly(lfs_object)
|
||||
expect(failure.to_s).to include('Checksum mismatch')
|
||||
end
|
||||
|
||||
context 'with remote files' do
|
||||
before do
|
||||
stub_lfs_object_storage
|
||||
end
|
||||
|
||||
it 'skips LFS objects in object storage' do
|
||||
local_failure = create(:lfs_object)
|
||||
create(:lfs_object, :object_storage)
|
||||
|
||||
failures = {}
|
||||
described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) }
|
||||
|
||||
expect(failures.keys).to contain_exactly(local_failure)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
60
spec/lib/gitlab/verify/uploads_spec.rb
Normal file
60
spec/lib/gitlab/verify/uploads_spec.rb
Normal file
|
@ -0,0 +1,60 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Verify::Uploads do
|
||||
include GitlabVerifyHelpers
|
||||
|
||||
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
|
||||
let(:projects) { create_list(:project, 3, :with_avatar) }
|
||||
let!(:objects) { projects.flat_map(&:uploads) }
|
||||
end
|
||||
|
||||
describe '#run_batches' do
|
||||
let(:project) { create(:project, :with_avatar) }
|
||||
let(:failures) { collect_failures }
|
||||
let(:failure) { failures[upload] }
|
||||
|
||||
let!(:upload) { project.uploads.first }
|
||||
|
||||
it 'passes uploads with the correct file' do
|
||||
expect(failures).to eq({})
|
||||
end
|
||||
|
||||
it 'fails uploads with a missing file' do
|
||||
FileUtils.rm_f(upload.absolute_path)
|
||||
|
||||
expect(failures.keys).to contain_exactly(upload)
|
||||
expect(failure).to be_a(Errno::ENOENT)
|
||||
expect(failure.to_s).to include(upload.absolute_path)
|
||||
end
|
||||
|
||||
it 'fails uploads with a mismatched checksum' do
|
||||
upload.update!(checksum: 'something incorrect')
|
||||
|
||||
expect(failures.keys).to contain_exactly(upload)
|
||||
expect(failure.to_s).to include('Checksum mismatch')
|
||||
end
|
||||
|
||||
it 'fails uploads with a missing precalculated checksum' do
|
||||
upload.update!(checksum: '')
|
||||
|
||||
expect(failures.keys).to contain_exactly(upload)
|
||||
expect(failure.to_s).to include('Checksum missing')
|
||||
end
|
||||
|
||||
context 'with remote files' do
|
||||
before do
|
||||
stub_uploads_object_storage(AvatarUploader)
|
||||
end
|
||||
|
||||
it 'skips uploads in object storage' do
|
||||
local_failure = create(:upload)
|
||||
create(:upload, :object_storage)
|
||||
|
||||
failures = {}
|
||||
described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) }
|
||||
|
||||
expect(failures.keys).to contain_exactly(local_failure)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
45
spec/support/gitlab_verify.rb
Normal file
45
spec/support/gitlab_verify.rb
Normal file
|
@ -0,0 +1,45 @@
|
|||
RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
|
||||
describe 'batching' do
|
||||
let(:first_batch) { objects[0].id..objects[0].id }
|
||||
let(:second_batch) { objects[1].id..objects[1].id }
|
||||
let(:third_batch) { objects[2].id..objects[2].id }
|
||||
|
||||
it 'iterates through objects in batches' do
|
||||
expect(collect_ranges).to eq([first_batch, second_batch, third_batch])
|
||||
end
|
||||
|
||||
it 'allows the starting ID to be specified' do
|
||||
expect(collect_ranges(start: second_batch.first)).to eq([second_batch, third_batch])
|
||||
end
|
||||
|
||||
it 'allows the finishing ID to be specified' do
|
||||
expect(collect_ranges(finish: second_batch.last)).to eq([first_batch, second_batch])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
module GitlabVerifyHelpers
|
||||
def collect_ranges(args = {})
|
||||
verifier = described_class.new(args.merge(batch_size: 1))
|
||||
|
||||
collect_results(verifier).map { |range, _| range }
|
||||
end
|
||||
|
||||
def collect_failures
|
||||
verifier = described_class.new(batch_size: 1)
|
||||
|
||||
out = {}
|
||||
|
||||
collect_results(verifier).map { |_, failures| out.merge!(failures) }
|
||||
|
||||
out
|
||||
end
|
||||
|
||||
def collect_results(verifier)
|
||||
out = []
|
||||
|
||||
verifier.run_batches { |*args| out << args }
|
||||
|
||||
out
|
||||
end
|
||||
end
|
28
spec/tasks/gitlab/lfs/check_rake_spec.rb
Normal file
28
spec/tasks/gitlab/lfs/check_rake_spec.rb
Normal file
|
@ -0,0 +1,28 @@
|
|||
require 'rake_helper'
|
||||
|
||||
describe 'gitlab:lfs rake tasks' do
|
||||
describe 'check' do
|
||||
let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
|
||||
|
||||
before do
|
||||
Rake.application.rake_require('tasks/gitlab/lfs/check')
|
||||
stub_env('VERBOSE' => 'true')
|
||||
end
|
||||
|
||||
it 'outputs the integrity check for each batch' do
|
||||
expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout
|
||||
end
|
||||
|
||||
it 'errors out about missing files on the file system' do
|
||||
FileUtils.rm_f(lfs_object.file.path)
|
||||
|
||||
expect { run_rake_task('gitlab:lfs:check') }.to output(/No such file.*#{Regexp.quote(lfs_object.file.path)}/).to_stdout
|
||||
end
|
||||
|
||||
it 'errors out about invalid checksum' do
|
||||
File.truncate(lfs_object.file.path, 0)
|
||||
|
||||
expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout
|
||||
end
|
||||
end
|
||||
end
|
|
@ -2,7 +2,7 @@ require 'rake_helper'
|
|||
|
||||
describe 'gitlab:lfs namespace rake task' do
|
||||
before :all do
|
||||
Rake.application.rake_require 'tasks/gitlab/lfs'
|
||||
Rake.application.rake_require 'tasks/gitlab/lfs/migrate'
|
||||
end
|
||||
|
||||
describe 'migrate' do
|
28
spec/tasks/gitlab/uploads/check_rake_spec.rb
Normal file
28
spec/tasks/gitlab/uploads/check_rake_spec.rb
Normal file
|
@ -0,0 +1,28 @@
|
|||
require 'rake_helper'
|
||||
|
||||
describe 'gitlab:uploads rake tasks' do
|
||||
describe 'check' do
|
||||
let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) }
|
||||
|
||||
before do
|
||||
Rake.application.rake_require('tasks/gitlab/uploads/check')
|
||||
stub_env('VERBOSE' => 'true')
|
||||
end
|
||||
|
||||
it 'outputs the integrity check for each batch' do
|
||||
expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout
|
||||
end
|
||||
|
||||
it 'errors out about missing files on the file system' do
|
||||
missing_upload = create(:upload)
|
||||
|
||||
expect { run_rake_task('gitlab:uploads:check') }.to output(/No such file.*#{Regexp.quote(missing_upload.absolute_path)}/).to_stdout
|
||||
end
|
||||
|
||||
it 'errors out about invalid checksum' do
|
||||
upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')
|
||||
|
||||
expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in a new issue