diff --git a/changelogs/unreleased/mk-add-local-project-uploads-cleanup-task.yml b/changelogs/unreleased/mk-add-local-project-uploads-cleanup-task.yml new file mode 100644 index 00000000000..9d38b353a41 --- /dev/null +++ b/changelogs/unreleased/mk-add-local-project-uploads-cleanup-task.yml @@ -0,0 +1,5 @@ +--- +title: Add local project uploads cleanup task +merge_request: 20863 +author: +type: added diff --git a/doc/raketasks/cleanup.md b/doc/raketasks/cleanup.md index cf891cd90ad..e2eb342361a 100644 --- a/doc/raketasks/cleanup.md +++ b/doc/raketasks/cleanup.md @@ -22,3 +22,34 @@ sudo gitlab-rake gitlab:cleanup:repos # installation from source bundle exec rake gitlab:cleanup:repos RAILS_ENV=production ``` + +Clean up local project upload files if they don't exist in GitLab database. The +task attempts to fix the file if it can find its project, otherwise it moves the +file to a lost and found directory. + +``` +# omnibus-gitlab +sudo gitlab-rake gitlab:cleanup:project_uploads + +# installation from source +bundle exec rake gitlab:cleanup:project_uploads RAILS_ENV=production +``` + +Example output: + +``` +$ sudo gitlab-rake gitlab:cleanup:project_uploads +I, [2018-07-27T12:08:27.671559 #89817] INFO -- : Looking for orphaned project uploads to clean up. Dry run... +D, [2018-07-27T12:08:28.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out +I, [2018-07-27T12:08:28.689869 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out +I, [2018-07-27T12:08:28.755624 #89817] INFO -- : Can fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt +I, [2018-07-27T12:08:28.760257 #89817] INFO -- : Can move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png +I, [2018-07-27T12:08:28.764470 #89817] INFO -- : To cleanup these files run this command with DRY_RUN=false + +$ sudo gitlab-rake gitlab:cleanup:project_uploads DRY_RUN=false +I, [2018-07-27T12:08:32.944414 #89936] INFO -- : Looking for orphaned project uploads to clean up... +D, [2018-07-27T12:08:33.293568 #89817] DEBUG -- : Processing batch of 500 project upload file paths, starting with /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out +I, [2018-07-27T12:08:33.689869 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/test.out -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/test.out +I, [2018-07-27T12:08:33.755624 #89817] INFO -- : Did fix /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/qux/foo/bar/89a0f7b0b97008a4a18cedccfdcd93fb/foo.txt +I, [2018-07-27T12:08:33.760257 #89817] INFO -- : Did move to lost and found /opt/gitlab/embedded/service/gitlab-rails/public/uploads/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png -> /opt/gitlab/embedded/service/gitlab-rails/public/uploads/-/project-lost-found/foo/bar/1dd6f0f7eefd2acc4c2233f89a0f7b0b/image.png +``` \ No newline at end of file diff --git a/lib/gitlab/cleanup/project_upload_file_finder.rb b/lib/gitlab/cleanup/project_upload_file_finder.rb new file mode 100644 index 00000000000..2ee8b60e76a --- /dev/null +++ b/lib/gitlab/cleanup/project_upload_file_finder.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +module Gitlab + module Cleanup + class ProjectUploadFileFinder + FIND_BATCH_SIZE = 500 + ABSOLUTE_UPLOAD_DIR = FileUploader.root.freeze + EXCLUDED_SYSTEM_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/-/*".freeze + EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*".freeze + EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*".freeze + + # Paths are relative to the upload directory + def each_file_batch(batch_size: FIND_BATCH_SIZE, &block) + cmd = build_find_command(ABSOLUTE_UPLOAD_DIR) + + Open3.popen2(*cmd) do |stdin, stdout, status_thread| + yield_paths_in_batches(stdout, batch_size, &block) + + raise "Find command failed" unless status_thread.value.success? + end + end + + private + + def yield_paths_in_batches(stdout, batch_size, &block) + paths = [] + + stdout.each_line("\0") do |line| + paths << line.chomp("\0") + + if paths.size >= batch_size + yield(paths) + paths = [] + end + end + + yield(paths) if paths.any? + end + + def build_find_command(search_dir) + cmd = %W[find -L #{search_dir} + -type f + ! ( -path #{EXCLUDED_SYSTEM_UPLOADS_PATH} -prune ) + ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune ) + ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune ) + -print0] + + ionice = which_ionice + cmd = %W[#{ionice} -c Idle] + cmd if ionice + + log_msg = "find command: \"#{cmd.join(' ')}\"" + Rails.logger.info log_msg + + cmd + end + + def which_ionice + Gitlab::Utils.which('ionice') + rescue StandardError + # In this case, returning false is relatively safe, + # even though it isn't very nice + false + end + end + end +end diff --git a/lib/gitlab/cleanup/project_uploads.rb b/lib/gitlab/cleanup/project_uploads.rb new file mode 100644 index 00000000000..b88e00311d5 --- /dev/null +++ b/lib/gitlab/cleanup/project_uploads.rb @@ -0,0 +1,125 @@ +# frozen_string_literal: true + +module Gitlab + module Cleanup + class ProjectUploads + LOST_AND_FOUND = File.join(ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR, '-', 'project-lost-found') + + attr_reader :logger + + def initialize(logger: nil) + @logger = logger || Rails.logger + end + + def run!(dry_run: true) + logger.info "Looking for orphaned project uploads to clean up#{'. Dry run' if dry_run}..." + + each_orphan_file do |path, upload_path| + result = cleanup(path, upload_path, dry_run) + + logger.info result + end + end + + private + + def cleanup(path, upload_path, dry_run) + # This happened in staging: + # `find` returned a path on which `File.delete` raised `Errno::ENOENT` + return "Cannot find file: #{path}" unless File.exist?(path) + + correct_path = upload_path && find_correct_path(upload_path) + + if correct_path + move(path, correct_path, 'fix', dry_run) + else + move_to_lost_and_found(path, dry_run) + end + end + + # Accepts a path in the form of "#{hex_secret}/#{filename}" + def find_correct_path(upload_path) + upload = Upload.find_by(uploader: 'FileUploader', path: upload_path) + return unless upload && upload.local? + + upload.absolute_path + rescue => e + logger.error e.message + + # absolute_path depends on a lot of code. If it doesn't work, then it + # it doesn't matter if the upload file is in the right place. Treat it + # as uncorrectable. + # I.e. the project record might be missing, which raises an exception. + nil + end + + def move_to_lost_and_found(path, dry_run) + new_path = path.sub(/\A#{ProjectUploadFileFinder::ABSOLUTE_UPLOAD_DIR}/, LOST_AND_FOUND) + + move(path, new_path, 'move to lost and found', dry_run) + end + + def move(path, new_path, prefix, dry_run) + action = "#{prefix} #{path} -> #{new_path}" + + if dry_run + "Can #{action}" + else + begin + FileUtils.mkdir_p(File.dirname(new_path)) + FileUtils.mv(path, new_path) + + "Did #{action}" + rescue => e + "Error during #{action}: #{e.inspect}" + end + end + end + + # Yields absolute paths of project upload files that are not in the + # uploads table + def each_orphan_file + ProjectUploadFileFinder.new.each_file_batch do |file_paths| + logger.debug "Processing batch of #{file_paths.size} project upload file paths, starting with #{file_paths.first}" + + file_paths.each do |path| + pup = ProjectUploadPath.from_path(path) + + yield(path, pup.upload_path) if pup.orphan? + end + end + end + + class ProjectUploadPath + PROJECT_FULL_PATH_REGEX = %r{\A#{FileUploader.root}/(.+)/(\h+/[^/]+)\z}.freeze + + attr_reader :full_path, :upload_path + + def initialize(full_path, upload_path) + @full_path = full_path + @upload_path = upload_path + end + + def self.from_path(path) + path_matched = path.match(PROJECT_FULL_PATH_REGEX) + return new(nil, nil) unless path_matched + + new(path_matched[1], path_matched[2]) + end + + def orphan? + return true if full_path.nil? || upload_path.nil? + + # It's possible to reduce to one query, but `where_full_path_in` is complex + !Upload.exists?(path: upload_path, model_id: project_id, model_type: 'Project', uploader: 'FileUploader') + end + + private + + def project_id + @project_id ||= Project.where_full_path_in([full_path]).pluck(:id) + end + end + end + end +end diff --git a/lib/tasks/gitlab/cleanup.rake b/lib/tasks/gitlab/cleanup.rake index 5e07b12ee1c..a2feb074b1d 100644 --- a/lib/tasks/gitlab/cleanup.rake +++ b/lib/tasks/gitlab/cleanup.rake @@ -7,9 +7,8 @@ namespace :gitlab do desc "GitLab | Cleanup | Clean namespaces" task dirs: :gitlab_environment do warn_user_is_not_gitlab - remove_flag = ENV['REMOVE'] - namespaces = Namespace.pluck(:path) + namespaces = Namespace.pluck(:path) namespaces << HASHED_REPOSITORY_NAME # add so that it will be ignored Gitlab.config.repositories.storages.each do |name, repository_storage| git_base_path = Gitlab::GitalyClient::StorageSettings.allow_disk_access { repository_storage.legacy_disk_path } @@ -31,7 +30,7 @@ namespace :gitlab do end all_dirs.each do |dir_path| - if remove_flag + if remove? if FileUtils.rm_rf dir_path puts "Removed...#{dir_path}".color(:red) else @@ -43,7 +42,7 @@ namespace :gitlab do end end - unless remove_flag + unless remove? puts "To cleanup this directories run this command with REMOVE=true".color(:yellow) end end @@ -104,5 +103,37 @@ namespace :gitlab do puts "To block these users run this command with BLOCK=true".color(:yellow) end end + + desc "GitLab | Cleanup | Clean orphaned project uploads" + task project_uploads: :gitlab_environment do + warn_user_is_not_gitlab + + cleaner = Gitlab::Cleanup::ProjectUploads.new(logger: logger) + cleaner.run!(dry_run: dry_run?) + + if dry_run? + logger.info "To clean up these files run this command with DRY_RUN=false".color(:yellow) + end + end + + def remove? + ENV['REMOVE'] == 'true' + end + + def dry_run? + ENV['DRY_RUN'] != 'false' + end + + def logger + return @logger if defined?(@logger) + + @logger = if Rails.env.development? || Rails.env.production? + Logger.new(STDOUT).tap do |stdout_logger| + stdout_logger.extend(ActiveSupport::Logger.broadcast(Rails.logger)) + end + else + Rails.logger + end + end end end diff --git a/spec/tasks/gitlab/cleanup_rake_spec.rb b/spec/tasks/gitlab/cleanup_rake_spec.rb index 2bf873c923f..ba08ece1b4b 100644 --- a/spec/tasks/gitlab/cleanup_rake_spec.rb +++ b/spec/tasks/gitlab/cleanup_rake_spec.rb @@ -5,7 +5,7 @@ describe 'gitlab:cleanup rake tasks' do Rake.application.rake_require 'tasks/gitlab/cleanup' end - describe 'cleanup' do + describe 'cleanup namespaces and repos' do let(:storages) do { 'default' => Gitlab::GitalyClient::StorageSettings.new(@default_storage_hash.merge('path' => 'tmp/tests/default_storage')) @@ -67,4 +67,319 @@ describe 'gitlab:cleanup rake tasks' do end end end + + describe 'cleanup:project_uploads' do + context 'orphaned project upload file' do + context 'when an upload record matching the secret and filename is found' do + context 'when the project is still in legacy storage' do + let!(:orphaned) { create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage)) } + let!(:correct_path) { orphaned.absolute_path } + let!(:other_project) { create(:project, :legacy_storage) } + let!(:orphaned_path) { correct_path.sub(/#{orphaned.model.full_path}/, other_project.full_path) } + + before do + FileUtils.mkdir_p(File.dirname(orphaned_path)) + FileUtils.mv(correct_path, orphaned_path) + end + + it 'moves the file to its proper location' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Did fix #{orphaned_path} -> #{correct_path}") + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_falsey + expect(File.exist?(correct_path)).to be_truthy + end + + it 'a dry run does not move the file' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Can fix #{orphaned_path} -> #{correct_path}") + expect(Rails.logger).to receive(:info) + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + end + + context 'when the project record is missing (Upload#absolute_path raises error)' do + let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', other_project.full_path, orphaned.path) } + + before do + orphaned.model.delete + end + + it 'moves the file to lost and found' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_falsey + expect(File.exist?(lost_and_found_path)).to be_truthy + end + + it 'a dry run does not move the file' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + expect(Rails.logger).to receive(:info) + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + end + end + end + + context 'when the project was moved to hashed storage' do + let!(:orphaned) { create(:upload, :issuable_upload, :with_file) } + let!(:correct_path) { orphaned.absolute_path } + let!(:orphaned_path) { File.join(FileUploader.root, 'foo', 'bar', orphaned.path) } + + before do + FileUtils.mkdir_p(File.dirname(orphaned_path)) + FileUtils.mv(correct_path, orphaned_path) + end + + it 'moves the file to its proper location' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Did fix #{orphaned_path} -> #{correct_path}") + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_falsey + expect(File.exist?(correct_path)).to be_truthy + end + + it 'a dry run does not move the file' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Can fix #{orphaned_path} -> #{correct_path}") + expect(Rails.logger).to receive(:info) + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(correct_path)).to be_falsey + end + end + end + + context 'when a matching upload record can not be found' do + context 'when the file path fits the known pattern' do + let!(:orphaned) { create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage)) } + let!(:orphaned_path) { orphaned.absolute_path } + let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', orphaned.model.full_path, orphaned.path) } + + before do + orphaned.delete + end + + it 'moves the file to lost and found' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_falsey + expect(File.exist?(lost_and_found_path)).to be_truthy + end + + it 'a dry run does not move the file' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + expect(Rails.logger).to receive(:info) + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + end + end + + context 'when the file path does not fit the known pattern' do + let!(:invalid_path) { File.join('group', 'file.jpg') } + let!(:orphaned_path) { File.join(FileUploader.root, invalid_path) } + let!(:lost_and_found_path) { File.join(FileUploader.root, '-', 'project-lost-found', invalid_path) } + + before do + FileUtils.mkdir_p(File.dirname(orphaned_path)) + FileUtils.touch(orphaned_path) + end + + after do + File.delete(orphaned_path) if File.exist?(orphaned_path) + end + + it 'moves the file to lost and found' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Did move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_falsey + expect(File.exist?(lost_and_found_path)).to be_truthy + end + + it 'a dry run does not move the file' do + expect(Rails.logger).to receive(:info).twice + expect(Rails.logger).to receive(:info).with("Can move to lost and found #{orphaned_path} -> #{lost_and_found_path}") + expect(Rails.logger).to receive(:info) + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + expect(File.exist?(lost_and_found_path)).to be_falsey + end + end + end + end + + context 'non-orphaned project upload file' do + it 'does not move the file' do + tracked = create(:upload, :issuable_upload, :with_file, model: build(:project, :legacy_storage)) + tracked_path = tracked.absolute_path + + expect(Rails.logger).not_to receive(:info).with(/move|fix/i) + expect(File.exist?(tracked_path)).to be_truthy + + stub_env('DRY_RUN', 'false') + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(tracked_path)).to be_truthy + end + end + + context 'ignorable cases' do + shared_examples_for 'does not move anything' do + it 'does not move even an orphan file' do + orphaned = create(:upload, :issuable_upload, :with_file, model: project) + orphaned_path = orphaned.absolute_path + orphaned.delete + + expect(File.exist?(orphaned_path)).to be_truthy + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(orphaned_path)).to be_truthy + end + end + + # Because we aren't concerned about these, and can save a lot of + # processing time by ignoring them. If we wish to cleanup hashed storage + # directories, it should simply require removing this test and modifying + # the find command. + context 'when the file is already in hashed storage' do + let(:project) { create(:project) } + + before do + stub_env('DRY_RUN', 'false') + expect(Rails.logger).not_to receive(:info).with(/move|fix/i) + end + + it_behaves_like 'does not move anything' + end + + context 'when DRY_RUN env var is unset' do + let(:project) { create(:project, :legacy_storage) } + + it_behaves_like 'does not move anything' + end + + context 'when DRY_RUN env var is true' do + let(:project) { create(:project, :legacy_storage) } + + before do + stub_env('DRY_RUN', 'true') + end + + it_behaves_like 'does not move anything' + end + + context 'when DRY_RUN env var is foo' do + let(:project) { create(:project, :legacy_storage) } + + before do + stub_env('DRY_RUN', 'foo') + end + + it_behaves_like 'does not move anything' + end + + it 'does not move any non-project (FileUploader) uploads' do + stub_env('DRY_RUN', 'false') + + paths = [] + orphaned1 = create(:upload, :personal_snippet_upload, :with_file) + orphaned2 = create(:upload, :namespace_upload, :with_file) + orphaned3 = create(:upload, :attachment_upload, :with_file) + paths << orphaned1.absolute_path + paths << orphaned2.absolute_path + paths << orphaned3.absolute_path + Upload.delete_all + + expect(Rails.logger).not_to receive(:info).with(/move|fix/i) + paths.each do |path| + expect(File.exist?(path)).to be_truthy + end + + run_rake_task('gitlab:cleanup:project_uploads') + + paths.each do |path| + expect(File.exist?(path)).to be_truthy + end + end + + it 'does not move any uploads in tmp (which would interfere with ongoing upload activity)' do + stub_env('DRY_RUN', 'false') + + path = File.join(FileUploader.root, 'tmp', 'foo.jpg') + FileUtils.mkdir_p(File.dirname(path)) + FileUtils.touch(path) + + expect(Rails.logger).not_to receive(:info).with(/move|fix/i) + expect(File.exist?(path)).to be_truthy + + run_rake_task('gitlab:cleanup:project_uploads') + + expect(File.exist?(path)).to be_truthy + end + end + end end