Add local project uploads cleanup task

This commit is contained in:
Michael Kozono 2018-07-26 14:23:33 -07:00
parent 400925c480
commit 3cbd8b1343
6 changed files with 578 additions and 5 deletions

View file

@ -0,0 +1,5 @@
---
title: Add local project uploads cleanup task
merge_request: 20863
author:
type: added

View file

@ -22,3 +22,34 @@ sudo gitlab-rake gitlab:cleanup:repos
# installation from source
bundle exec rake gitlab:cleanup:repos RAILS_ENV=production
```
Clean up local project upload files if they don't exist in GitLab database. The
task attempts to fix the file if it can find its project, otherwise it moves the
file to a lost and found directory.
```
# omnibus-gitlab
sudo gitlab-rake gitlab:cleanup:project_uploads
# installation from source
bundle exec rake gitlab:cleanup:project_uploads RAILS_ENV=production
```
Example output:
```
$ sudo gitlab-rake gitlab:cleanup:project_uploads
I, [2018-07-27T12:08:27.671559 #89817] INFO -- : Looking for orphaned project uploads to clean up. Dry run...
D, [2018-07-27T12:08:28.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out
I, [2018-07-27T12:08:28.689869 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out
I, [2018-07-27T12:08:28.755624 #89817] INFO -- : Can fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt
I, [2018-07-27T12:08:28.760257 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png
I, [2018-07-27T12:08:28.764470 #89817] INFO -- : To cleanup these files run this command with DRY_RUN=false
$ sudo gitlab-rake gitlab:cleanup:project_uploads DRY_RUN=false
I, [2018-07-27T12:08:32.944414 #89936] INFO -- : Looking for orphaned project uploads to clean up...
D, [2018-07-27T12:08:33.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out
I, [2018-07-27T12:08:33.689869 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out
I, [2018-07-27T12:08:33.755624 #89817] INFO -- : Did fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt
I, [2018-07-27T12:08:33.760257 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png
```

View file

@ -0,0 +1,66 @@
# frozen_string_literal: true
module Gitlab
module Cleanup
class ProjectUploadFileFinder
FIND_BATCH_SIZE = 500
ABSOLUTE_UPLOAD_DIR = FileUploader.root.freeze
EXCLUDED_SYSTEM_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/-/*".freeze
EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze
EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze
# Paths are relative to the upload directory
def each_file_batch(batch_size: FIND_BATCH_SIZE, &block)
cmd = build_find_command(ABSOLUTE_UPLOAD_DIR)
Open3.popen2(*cmd) do |stdin, stdout, status_thread|
yield_paths_in_batches(stdout, batch_size, &block)
raise "Find command failed" unless status_thread.value.success?
end
end
private
def yield_paths_in_batches(stdout, batch_size, &block)
paths = []
stdout.each_line("\0") do |line|
paths << line.chomp("\0")
if paths.size >= batch_size
yield(paths)
paths = []
end
end
yield(paths) if paths.any?
end
def build_find_command(search_dir)
cmd = %W[find -L #{search_dir}
-type f
! ( -path #{EXCLUDED_SYSTEM_UPLOADS_PATH} -prune )
! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune )
! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune )
-print0]
ionice = which_ionice
cmd = %W[#{ionice} -c Idle] + cmd if ionice
log_msg = "find command: \"#{cmd.join(' ')}\""
Rails.logger.info log_msg
cmd
end
def which_ionice
Gitlab::Utils.which('ionice')
rescue StandardError
# In this case, returning false is relatively safe,
# even though it isn't very nice
false
end
end
end
end

View file

@ -0,0 +1,125 @@
# frozen_string_literal: true
module Gitlab
module Cleanup
class ProjectUploads
LOST_AND_FOUND = File.join(ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR, '-', 'project-lost-found')
attr_reader :logger
def initialize(logger: nil)
@logger = logger || Rails.logger
end
def run!(dry_run: true)
logger.info "Looking for orphaned project uploads to clean up#{'. Dry run' if dry_run}..."
each_orphan_file do |path, upload_path|
result = cleanup(path, upload_path, dry_run)
logger.info result
end
end
private
def cleanup(path, upload_path, dry_run)
# This happened in staging:
# `find` returned a path on which `File.delete` raised `Errno::ENOENT`
return "Cannot find file: #{path}" unless File.exist?(path)
correct_path = upload_path && find_correct_path(upload_path)
if correct_path
move(path, correct_path, 'fix', dry_run)
else
move_to_lost_and_found(path, dry_run)
end
end
# Accepts a path in the form of "#{hex_secret}/#{filename}"
def find_correct_path(upload_path)
upload = Upload.find_by(uploader: 'FileUploader', path: upload_path)
return unless upload && upload.local?
upload.absolute_path
rescue => e
logger.error e.message
# absolute_path depends on a lot of code. If it doesn't work, then it
# it doesn't matter if the upload file is in the right place. Treat it
# as uncorrectable.
# I.e. the project record might be missing, which raises an exception.
nil
end
def move_to_lost_and_found(path, dry_run)
new_path = path.sub(/\A#{ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR}/, LOST_AND_FOUND)
move(path, new_path, 'move to lost and found', dry_run)
end
def move(path, new_path, prefix, dry_run)
action = "#{prefix} #{path} -> #{new_path}"
if dry_run
"Can #{action}"
else
begin
FileUtils.mkdir_p(File.dirname(new_path))
FileUtils.mv(path, new_path)
"Did #{action}"
rescue => e
"Error during #{action}: #{e.inspect}"
end
end
end
# Yields absolute paths of project upload files that are not in the
# uploads table
def each_orphan_file
ProjectUploadFileFinder.new.each_file_batch do |file_paths|
logger.debug "Processing batch of #{file_paths.size} project upload file paths, starting with #{file_paths.first}"
file_paths.each do |path|
pup = ProjectUploadPath.from_path(path)
yield(path, pup.upload_path) if pup.orphan?
end
end
end
class ProjectUploadPath
PROJECT_FULL_PATH_REGEX = %r{\A#{FileUploader.root}/(.+)/(\h+/[^/]+)\z}.freeze
attr_reader :full_path, :upload_path
def initialize(full_path, upload_path)
@full_path = full_path
@upload_path = upload_path
end
def self.from_path(path)
path_matched = path.match(PROJECT_FULL_PATH_REGEX)
return new(nil, nil) unless path_matched
new(path_matched[1], path_matched[2])
end
def orphan?
return true if full_path.nil? || upload_path.nil?
# It's possible to reduce to one query, but `where_full_path_in` is complex
!Upload.exists?(path: upload_path, model_id: project_id, model_type: 'Project', uploader: 'FileUploader')
end
private
def project_id
@project_id ||= Project.where_full_path_in([full_path]).pluck(:id)
end
end
end
end
end

View file

@ -7,9 +7,8 @@ namespace :gitlab do
desc "GitLab | Cleanup | Clean namespaces"
task dirs: :gitlab_environment do
warn_user_is_not_gitlab
remove_flag = ENV['REMOVE']
namespaces = Namespace.pluck(:path)
namespaces = Namespace.pluck(:path)
namespaces << HASHED_REPOSITORY_NAME # add so that it will be ignored
Gitlab.config.repositories.storages.each do |name, repository_storage|
git_base_path = Gitlab::GitalyClient::StorageSettings.allow_disk_access { repository_storage.legacy_disk_path }
@ -31,7 +30,7 @@ namespace :gitlab do
end
all_dirs.each do |dir_path|
if remove_flag
if remove?
if FileUtils.rm_rf dir_path
puts "Removed...#{dir_path}".color(:red)
else
@ -43,7 +42,7 @@ namespace :gitlab do
end
end
unless remove_flag
unless remove?
puts "To cleanup this directories run this command with REMOVE=true".color(:yellow)
end
end
@ -104,5 +103,37 @@ namespace :gitlab do
puts "To block these users run this command with BLOCK=true".color(:yellow)
end
end
desc "GitLab | Cleanup | Clean orphaned project uploads"
task project_uploads: :gitlab_environment do
warn_user_is_not_gitlab
cleaner = Gitlab::Cleanup::ProjectUploads.new(logger: logger)
cleaner.run!(dry_run: dry_run?)
if dry_run?
logger.info "To clean up these files run this command with DRY_RUN=false".color(:yellow)
end
end
def remove?
ENV['REMOVE'] == 'true'
end
def dry_run?
ENV['DRY_RUN'] != 'false'
end
def logger
return @logger if defined?(@logger)
@logger = if Rails.env.development? || Rails.env.production?
Logger.new(STDOUT).tap do |stdout_logger|
stdout_logger.extend(ActiveSupport::Logger.broadcast(Rails.logger))
end
else
Rails.logger
end
end
end
end

View file

@ -5,7 +5,7 @@ describe 'gitlab:cleanup rake tasks' do
Rake.application.rake_require 'tasks/gitlab/cleanup'
end
describe 'cleanup' do
describe 'cleanup namespaces and repos' do
let(:storages) do
{
'default' => Gitlab::GitalyClient::StorageSettings.new(@default_storage_hash.merge('path' => 'tmp/tests/default_storage'))
@ -67,4 +67,319 @@ describe 'gitlab:cleanup rake tasks' do
end
end
end
describe 'cleanup:project_uploads' do
context 'orphaned project upload file' do
context 'when an upload record matching the secret and filename is found' do
context 'when the project is still in legacy storage' do
let!(:orphaned) { create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage)) }
let!(:correct_path) { orphaned.absolute_path }
let!(:other_project) { create(:project, :legacy_storage) }
let!(:orphaned_path) { correct_path.sub(/#{orphaned.model.full_path}/, other_project.full_path) }
before do
FileUtils.mkdir_p(File.dirname(orphaned_path))
FileUtils.mv(correct_path, orphaned_path)
end
it 'moves the file to its proper location' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Did fix #{orphaned_path} -> #{correct_path}")
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_falsey
expect(File.exist?(correct_path)).to be_truthy
end
it 'a dry run does not move the file' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Can fix #{orphaned_path} -> #{correct_path}")
expect(Rails.logger).to receive(:info)
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
end
context 'when the project record is missing (Upload#absolute_path raises error)' do
let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', other_project.full_path, orphaned.path) }
before do
orphaned.model.delete
end
it 'moves the file to lost and found' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_falsey
expect(File.exist?(lost_and_found_path)).to be_truthy
end
it 'a dry run does not move the file' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(Rails.logger).to receive(:info)
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
end
end
end
context 'when the project was moved to hashed storage' do
let!(:orphaned) { create(:upload, :issuable_upload, :with_file) }
let!(:correct_path) { orphaned.absolute_path }
let!(:orphaned_path) { File.join(FileUploader.root, 'foo', 'bar', orphaned.path) }
before do
FileUtils.mkdir_p(File.dirname(orphaned_path))
FileUtils.mv(correct_path, orphaned_path)
end
it 'moves the file to its proper location' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Did fix #{orphaned_path} -> #{correct_path}")
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_falsey
expect(File.exist?(correct_path)).to be_truthy
end
it 'a dry run does not move the file' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Can fix #{orphaned_path} -> #{correct_path}")
expect(Rails.logger).to receive(:info)
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(correct_path)).to be_falsey
end
end
end
context 'when a matching upload record can not be found' do
context 'when the file path fits the known pattern' do
let!(:orphaned) { create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage)) }
let!(:orphaned_path) { orphaned.absolute_path }
let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', orphaned.model.full_path, orphaned.path) }
before do
orphaned.delete
end
it 'moves the file to lost and found' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_falsey
expect(File.exist?(lost_and_found_path)).to be_truthy
end
it 'a dry run does not move the file' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(Rails.logger).to receive(:info)
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
end
end
context 'when the file path does not fit the known pattern' do
let!(:invalid_path) { File.join('group', 'file.jpg') }
let!(:orphaned_path) { File.join(FileUploader.root, invalid_path) }
let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', invalid_path) }
before do
FileUtils.mkdir_p(File.dirname(orphaned_path))
FileUtils.touch(orphaned_path)
end
after do
File.delete(orphaned_path) if File.exist?(orphaned_path)
end
it 'moves the file to lost and found' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_falsey
expect(File.exist?(lost_and_found_path)).to be_truthy
end
it 'a dry run does not move the file' do
expect(Rails.logger).to receive(:info).twice
expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}")
expect(Rails.logger).to receive(:info)
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
expect(File.exist?(lost_and_found_path)).to be_falsey
end
end
end
end
context 'non-orphaned project upload file' do
it 'does not move the file' do
tracked = create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage))
tracked_path = tracked.absolute_path
expect(Rails.logger).not_to receive(:info).with(/move|fix/i)
expect(File.exist?(tracked_path)).to be_truthy
stub_env('DRY_RUN', 'false')
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(tracked_path)).to be_truthy
end
end
context 'ignorable cases' do
shared_examples_for 'does not move anything' do
it 'does not move even an orphan file' do
orphaned = create(:upload, :issuable_upload, :with_file, model: project)
orphaned_path = orphaned.absolute_path
orphaned.delete
expect(File.exist?(orphaned_path)).to be_truthy
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(orphaned_path)).to be_truthy
end
end
# Because we aren't concerned about these, and can save a lot of
# processing time by ignoring them. If we wish to cleanup hashed storage
# directories, it should simply require removing this test and modifying
# the find command.
context 'when the file is already in hashed storage' do
let(:project) { create(:project) }
before do
stub_env('DRY_RUN', 'false')
expect(Rails.logger).not_to receive(:info).with(/move|fix/i)
end
it_behaves_like 'does not move anything'
end
context 'when DRY_RUN env var is unset' do
let(:project) { create(:project, :legacy_storage) }
it_behaves_like 'does not move anything'
end
context 'when DRY_RUN env var is true' do
let(:project) { create(:project, :legacy_storage) }
before do
stub_env('DRY_RUN', 'true')
end
it_behaves_like 'does not move anything'
end
context 'when DRY_RUN env var is foo' do
let(:project) { create(:project, :legacy_storage) }
before do
stub_env('DRY_RUN', 'foo')
end
it_behaves_like 'does not move anything'
end
it 'does not move any non-project (FileUploader) uploads' do
stub_env('DRY_RUN', 'false')
paths = []
orphaned1 = create(:upload, :personal_snippet_upload, :with_file)
orphaned2 = create(:upload, :namespace_upload, :with_file)
orphaned3 = create(:upload, :attachment_upload, :with_file)
paths << orphaned1.absolute_path
paths << orphaned2.absolute_path
paths << orphaned3.absolute_path
Upload.delete_all
expect(Rails.logger).not_to receive(:info).with(/move|fix/i)
paths.each do |path|
expect(File.exist?(path)).to be_truthy
end
run_rake_task('gitlab:cleanup:project_uploads')
paths.each do |path|
expect(File.exist?(path)).to be_truthy
end
end
it 'does not move any uploads in tmp (which would interfere with ongoing upload activity)' do
stub_env('DRY_RUN', 'false')
path = File.join(FileUploader.root, 'tmp', 'foo.jpg')
FileUtils.mkdir_p(File.dirname(path))
FileUtils.touch(path)
expect(Rails.logger).not_to receive(:info).with(/move|fix/i)
expect(File.exist?(path)).to be_truthy
run_rake_task('gitlab:cleanup:project_uploads')
expect(File.exist?(path)).to be_truthy
end
end
end
end