[cherry-pick] '4862-verify-file-checksums'

See merge request gitlab-org/gitlab-ee!4753
This commit is contained in:
Sean McGivern 2018-03-05 11:16:17 +00:00 committed by Micaël Bergeron
parent 2387ef2b4a
commit edbcde8877
16 changed files with 423 additions and 11 deletions

View file

@ -24,4 +24,8 @@ class LfsObject < ActiveRecord::Base
.where(lfs_objects_projects: { id: nil })
.destroy_all
end
def self.calculate_oid(path)
Digest::SHA256.file(path).hexdigest
end
end

View file

@ -78,34 +78,41 @@ Example output:
## Uploaded Files Integrity
The uploads check Rake task will loop through all uploads in the database
and run two checks to determine the integrity of each file:
Various types of file can be uploaded to a GitLab installation by users.
Checksums are generated and stored in the database upon upload, and integrity
checks using those checksums can be run. These checks also detect missing files.
1. Check if the file exists on the file system.
1. Check if the checksum of the file on the file system matches the checksum in the database.
Currently, integrity checks are supported for the following types of file:
* LFS objects
* User uploads
**Omnibus Installation**
```
sudo gitlab-rake gitlab:lfs:check
sudo gitlab-rake gitlab:uploads:check
```
**Source Installation**
```bash
sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
```
This task also accepts some environment variables which you can use to override
These tasks also accept some environment variables which you can use to override
certain values:
Variable | Type | Description
-------- | ---- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
Variable | Type | Description
--------- | ------- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
```bash
sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
```

View file

@ -0,0 +1,64 @@
module Gitlab
  module Verify
    # Base class for verifiers that walk a relation in batches and compare the
    # checksum stored for each record with a freshly calculated one.
    #
    # Subclasses must implement #name, #describe, #relation,
    # #expected_checksum and #actual_checksum; the versions defined here all
    # raise NotImplementedError.
    class BatchVerifier
      attr_reader :batch_size, :start, :finish

      # batch_size - passed to #in_batches as `of:`
      # start      - passed to #in_batches as `start:` (optional)
      # finish     - passed to #in_batches as `finish:` (optional)
      def initialize(batch_size:, start: nil, finish: nil)
        @batch_size = batch_size
        @start = start
        @finish = finish
      end

      # Yields a Range of IDs and a Hash of failed verifications (object => error),
      # once for every batch of the relation.
      def run_batches(&blk)
        relation.in_batches( # rubocop: disable Cop/InBatches
          of: batch_size,
          start: start,
          finish: finish
        ) do |batch|
          id_range = batch.first.id..batch.last.id

          yield(id_range, run_batch(batch))
        end
      end

      # Human-readable name of the kind of object being verified
      def name
        raise NotImplementedError
      end

      # Human-readable description of a single verified object
      def describe(_object)
        raise NotImplementedError
      end

      private

      # Verifies every record of the batch and returns a Hash containing only
      # the failures (object => error).
      def run_batch(relation)
        relation.each_with_object({}) do |object, failed|
          outcome = verify(object)
          failed.store(*outcome) if outcome
        end
      end

      # Returns nil when the checksums agree, otherwise an [object, error] pair.
      # Any StandardError raised while obtaining either checksum is reported as
      # that object's failure.
      def verify(object)
        stored = expected_checksum(object)
        recalculated = actual_checksum(object)

        raise 'Checksum missing' unless stored.present?
        return if stored == recalculated

        raise 'Checksum mismatch'
      rescue StandardError => error
        [object, error]
      end

      # This should return an ActiveRecord::Relation suitable for calling #in_batches on
      def relation
        raise NotImplementedError
      end

      # The checksum we expect the object to have
      def expected_checksum(_object)
        raise NotImplementedError
      end

      # The freshly-recalculated checksum of the object
      def actual_checksum(_object)
        raise NotImplementedError
      end
    end
  end
end

View file

@ -0,0 +1,27 @@
module Gitlab
  module Verify
    # BatchVerifier for LFS objects: the stored OID is compared with the OID
    # recalculated from the object's file.
    class LfsObjects < BatchVerifier
      def name
        'LFS objects'
      end

      # Identifies an LFS object by its OID in failure listings.
      def describe(object)
        oid = object.oid

        "LFS object: #{oid}"
      end

      private

      # Restricted to the `with_files_stored_locally` scope of LfsObject.
      def relation
        LfsObject.with_files_stored_locally
      end

      # The OID recorded on the LFS object.
      def expected_checksum(lfs_object)
        lfs_object.oid
      end

      # The OID recalculated from the file at the object's path.
      def actual_checksum(lfs_object)
        file_path = lfs_object.file.path

        LfsObject.calculate_oid(file_path)
      end
    end
  end
end

View file

@ -0,0 +1,53 @@
module Gitlab
  module Verify
    # Drives a verifier from a Rake task and prints a per-batch summary of
    # failures to standard output.
    class RakeTask
      # Builds a verifier of class +verify_kls+ from the environment and runs it.
      #
      # Environment variables read:
      #   BATCH   - batch size, converted with #to_i (200 when unset)
      #   ID_FROM - passed to the verifier as `start:`
      #   ID_TO   - passed to the verifier as `finish:`
      #   VERBOSE - converted with Gitlab::Utils.to_boolean
      def self.run!(verify_kls)
        batch_size = ENV.fetch('BATCH', 200).to_i
        verifier = verify_kls.new(batch_size: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO'])
        verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE'])

        new(verifier, verbose).run!
      end

      attr_reader :verifier, :output

      def initialize(verifier, verbose)
        @verifier = verifier
        @verbose = verbose
      end

      # Prints a heading, one summary line per batch, then 'Done!'.
      def run!
        say "Checking integrity of #{verifier.name}"

        verifier.run_batches do |*batch_result|
          run_batch(*batch_result)
        end

        say 'Done!'
      end

      # Always returns a strict true/false, whatever was passed as +verbose+.
      def verbose?
        @verbose ? true : false
      end

      private

      def say(text)
        puts(text) # rubocop:disable Rails/Output
      end

      # Prints the failure count for the batch (green when empty, red
      # otherwise) and, in verbose mode, one line for each failure.
      def run_batch(range, failures)
        summary_color =
          if failures.empty?
            :green
          else
            :red
          end

        summary = "- #{range}: Failures: #{failures.count}"
        say summary.color(summary_color)

        if verbose?
          failures.each do |object, error|
            say " - #{verifier.describe(object)}: #{error.inspect}".color(:red)
          end
        end
      end
    end
  end
end

View file

@ -0,0 +1,27 @@
module Gitlab
  module Verify
    # BatchVerifier for uploads: the checksum stored on the record is compared
    # with a digest recalculated from the file at the upload's absolute path.
    class Uploads < BatchVerifier
      def name
        'Uploads'
      end

      # Identifies an upload by its ID in failure listings.
      def describe(object)
        upload_id = object.id

        "Upload: #{upload_id}"
      end

      private

      # Restricted to the `with_files_stored_locally` scope of Upload.
      def relation
        Upload.with_files_stored_locally
      end

      # The checksum recorded on the upload.
      def expected_checksum(upload)
        upload.checksum
      end

      # The digest recalculated from the file on disk.
      def actual_checksum(upload)
        file_path = upload.absolute_path

        Upload.hexdigest(file_path)
      end
    end
  end
end

View file

@ -0,0 +1,8 @@
# Defines the `gitlab:lfs:check` task, which hands the LFS object verifier to
# the shared verification task runner.
namespace :gitlab do
  namespace :lfs do
    desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
    task check: :environment do
      verifier_class = Gitlab::Verify::LfsObjects

      Gitlab::Verify::RakeTask.run!(verifier_class)
    end
  end
end

View file

@ -9,4 +9,14 @@ FactoryBot.define do
# Attaches the dk.png fixture as the object's file, declared with the PNG
# MIME type (the previous literal "`/png" was not a valid content type).
trait :with_file do
  file { fixture_file_upload(Rails.root.join('spec/fixtures/dk.png'), 'image/png') }
end
# The uniqueness constraint means we can't use the correct OID for all LFS
# objects, so the test needs to decide which (if any) object gets it.
#
# Declared in block form, like the other traits here: static attribute
# values are deprecated in newer FactoryBot releases.
trait :correct_oid do
  oid { 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75' }
end
# Sets the object's file store to the uploader's REMOTE store constant.
trait(:object_storage) do
  file_store { LfsObjectUploader::Store::REMOTE }
end
end

View file

@ -0,0 +1,51 @@
require 'spec_helper'
describe Gitlab::Verify::LfsObjects do
  include GitlabVerifyHelpers

  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
    let!(:objects) { create_list(:lfs_object, 3, :with_file) }
  end

  describe '#run_batches' do
    # Created eagerly so that every example starts with one LFS object built
    # with the :correct_oid trait.
    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }

    let(:failures) { collect_failures }
    let(:failure) { failures[lfs_object] }

    it 'passes LFS objects with the correct file' do
      expect(failures).to eq({})
    end

    it 'fails LFS objects with a missing file' do
      file_path = lfs_object.file.path
      FileUtils.rm_f(file_path)

      expect(failures.keys).to contain_exactly(lfs_object)
      expect(failure).to be_a(Errno::ENOENT)
      expect(failure.to_s).to include(file_path)
    end

    it 'fails LFS objects with a mismatched oid' do
      # Emptying the file changes its digest without removing it.
      File.truncate(lfs_object.file.path, 0)

      expect(failures.keys).to contain_exactly(lfs_object)
      expect(failure.to_s).to include('Checksum mismatch')
    end

    context 'with remote files' do
      before do
        stub_lfs_object_storage
      end

      it 'skips LFS objects in object storage' do
        local_failure = create(:lfs_object)
        create(:lfs_object, :object_storage)

        collected = {}
        verifier = described_class.new(batch_size: 10)
        verifier.run_batches { |_, failed| collected.merge!(failed) }

        expect(collected.keys).to contain_exactly(local_failure)
      end
    end
  end
end

View file

@ -0,0 +1,60 @@
require 'spec_helper'
describe Gitlab::Verify::Uploads do
  include GitlabVerifyHelpers

  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
    let(:projects) { create_list(:project, 3, :with_avatar) }
    let!(:objects) { projects.flat_map { |project| project.uploads } }
  end

  describe '#run_batches' do
    let(:project) { create(:project, :with_avatar) }

    # Eager, so the project (and its avatar upload) exists before each example.
    let!(:upload) { project.uploads.first }

    let(:failures) { collect_failures }
    let(:failure) { failures[upload] }

    it 'passes uploads with the correct file' do
      expect(failures).to eq({})
    end

    it 'fails uploads with a missing file' do
      file_path = upload.absolute_path
      FileUtils.rm_f(file_path)

      expect(failures.keys).to contain_exactly(upload)
      expect(failure).to be_a(Errno::ENOENT)
      expect(failure.to_s).to include(file_path)
    end

    it 'fails uploads with a mismatched checksum' do
      upload.update!(checksum: 'something incorrect')

      expect(failures.keys).to contain_exactly(upload)
      expect(failure.to_s).to include('Checksum mismatch')
    end

    it 'fails uploads with a missing precalculated checksum' do
      upload.update!(checksum: '')

      expect(failures.keys).to contain_exactly(upload)
      expect(failure.to_s).to include('Checksum missing')
    end

    context 'with remote files' do
      before do
        stub_uploads_object_storage(AvatarUploader)
      end

      it 'skips uploads in object storage' do
        local_failure = create(:upload)
        create(:upload, :object_storage)

        collected = {}
        verifier = described_class.new(batch_size: 10)
        verifier.run_batches { |_, failed| collected.merge!(failed) }

        expect(collected.keys).to contain_exactly(local_failure)
      end
    end
  end
end

View file

@ -0,0 +1,45 @@
# Shared batching examples. The including group must define `objects` (at
# least three records) and provide `collect_ranges`.
RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
  describe 'batching' do
    # The Range covering exactly one object's ID.
    def single_id_range(object)
      object.id..object.id
    end

    let(:first_batch) { single_id_range(objects[0]) }
    let(:second_batch) { single_id_range(objects[1]) }
    let(:third_batch) { single_id_range(objects[2]) }

    it 'iterates through objects in batches' do
      all_batches = [first_batch, second_batch, third_batch]

      expect(collect_ranges).to eq(all_batches)
    end

    it 'allows the starting ID to be specified' do
      ranges = collect_ranges(start: second_batch.first)

      expect(ranges).to eq([second_batch, third_batch])
    end

    it 'allows the finishing ID to be specified' do
      ranges = collect_ranges(finish: second_batch.last)

      expect(ranges).to eq([first_batch, second_batch])
    end
  end
end
# Spec helpers for verifier classes. They rely on `described_class` being a
# class whose initializer accepts `batch_size:` (plus optional `start:` and
# `finish:`) and whose #run_batches yields (range, failures) pairs. A batch
# size of 1 is always used.
module GitlabVerifyHelpers
  # Returns the Range yielded for each batch, in yield order.
  #
  # args - optional Hash of extra keyword arguments for the verifier
  #        (e.g. start:, finish:); batch_size is always overridden with 1.
  def collect_ranges(args = {})
    # Splat explicitly: Ruby 3 no longer converts a positional Hash into
    # keyword arguments, so passing the Hash as-is raises ArgumentError.
    verifier = described_class.new(**args.merge(batch_size: 1))

    collect_results(verifier).map(&:first)
  end

  # Returns one Hash merging the failures reported by every batch.
  def collect_failures
    verifier = described_class.new(batch_size: 1)

    collect_results(verifier).each_with_object({}) do |(_range, failures), merged|
      merged.merge!(failures)
    end
  end

  # Returns an Array with the arguments of every yield made by
  # verifier#run_batches.
  def collect_results(verifier)
    results = []
    verifier.run_batches { |*args| results << args }
    results
  end
end

View file

@ -0,0 +1,28 @@
require 'rake_helper'
describe 'gitlab:lfs rake tasks' do
  describe 'check' do
    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }

    before do
      Rake.application.rake_require('tasks/gitlab/lfs/check')

      # VERBOSE makes the task list each failure, which the examples match on.
      stub_env('VERBOSE' => 'true')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      file_path = lfs_object.file.path
      FileUtils.rm_f(file_path)

      missing_file_message = /No such file.*#{Regexp.quote(file_path)}/

      expect { run_rake_task('gitlab:lfs:check') }.to output(missing_file_message).to_stdout
    end

    it 'errors out about invalid checksum' do
      # Emptying the file changes its digest without removing it.
      File.truncate(lfs_object.file.path, 0)

      expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout
    end
  end
end

View file

@ -2,7 +2,7 @@ require 'rake_helper'
describe 'gitlab:lfs namespace rake task' do
before :all do
Rake.application.rake_require 'tasks/gitlab/lfs'
Rake.application.rake_require 'tasks/gitlab/lfs/migrate'
end
describe 'migrate' do

View file

@ -0,0 +1,28 @@
require 'rake_helper'
describe 'gitlab:uploads rake tasks' do
  describe 'check' do
    let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) }

    before do
      Rake.application.rake_require('tasks/gitlab/uploads/check')

      # VERBOSE makes the task list each failure, which the examples match on.
      stub_env('VERBOSE' => 'true')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      missing_upload = create(:upload)
      missing_file_message = /No such file.*#{Regexp.quote(missing_upload.absolute_path)}/

      expect { run_rake_task('gitlab:uploads:check') }.to output(missing_file_message).to_stdout
    end

    it 'errors out about invalid checksum' do
      wrong_checksum = '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e'
      upload.update_column(:checksum, wrong_checksum)

      expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout
    end
  end
end