gitlab-org--gitlab-foss/lib/gitlab/background_migration/prepare_untracked_uploads.rb

174 lines
5.4 KiB
Ruby
Raw Normal View History

2017-11-27 01:57:21 -05:00
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
2017-11-27 01:57:21 -05:00
# This class finds all non-hashed uploaded file paths and saves them to a
# `untracked_files_for_uploads` table.
class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
# For bulk_queue_background_migration_jobs_by_range
include Database::MigrationHelpers
include ::Gitlab::Utils::StrongMemoize
2017-11-27 01:57:21 -05:00
# Number of file paths collected from `find` before each bulk INSERT.
FIND_BATCH_SIZE = 500
# Name of the uploads directory relative to the configured storage path.
RELATIVE_UPLOAD_DIR = "uploads"
# Absolute directory that is scanned for uploaded files.
ABSOLUTE_UPLOAD_DIR = File.join(
  Gitlab.config.uploads.storage_path,
  RELATIVE_UPLOAD_DIR
)
# Name of the background migration scheduled once the paths are stored.
FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'
# Matches the storage-path prefix (including the trailing slash) so it can be
# stripped from the absolute paths printed by `find`. The path is escaped so
# that regex metacharacters in it (`.`, `+`, `(`, ...) are matched literally
# instead of being interpreted as pattern syntax.
START_WITH_ROOT_REGEX = %r{\A#{Regexp.escape(Gitlab.config.uploads.storage_path.to_s)}/}.freeze
# `find -path` patterns for the sub-trees that must not be recorded.
EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*"
EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*"
2017-11-27 01:57:21 -05:00
# This class is used to iterate over batches of
# `untracked_files_for_uploads` rows.
#
# It is a minimal model scoped to this migration: the surrounding code uses
# it to delete all rows, check whether any rows exist, reach the database
# connection, and as the relation handed to the job scheduler.
class UntrackedFile < ActiveRecord::Base
  # NOTE(review): presumably provides the batch-iteration helper relied on
  # by the scheduling code — confirm against the EachBatch concern.
  include EachBatch
  # The table is created on demand by this migration, so the name is pinned
  # explicitly rather than inferred from the class name.
  self.table_name = 'untracked_files_for_uploads'
end
# Entry point of the migration. Makes sure the temporary tracking table
# exists, fills it with the paths of all untracked uploads, and then either
# schedules the follow-up jobs or, when no paths were found, drops the table.
#
# Returns whatever the final step (scheduling or dropping) returns.
def perform
  ensure_temporary_tracking_table_exists

  # Postgres < 9.5 lacks ON CONFLICT DO NOTHING, and emulating
  # insert-if-not-exists without it would be slow. On those versions we
  # therefore begin from an empty table, which keeps bulk inserts possible
  # (~30x faster than row-by-row inserts: roughly 20 minutes instead of
  # ~10 hours at GitLab.com scale).
  # Every other installation gets bulk inserts *and* can retry this job
  # without clearing and reinserting what is already stored.
  clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
  store_untracked_file_paths

  # Nothing was found, so there is nothing for the follow-up jobs to do.
  return drop_temp_table if UntrackedFile.all.empty?

  schedule_populate_untracked_uploads_jobs
end
private
# Creates the `untracked_files_for_uploads` table unless it already exists.
# The table has a single non-null `path` column (max 600 characters) with a
# unique index on it.
def ensure_temporary_tracking_table_exists
  tracking_table = :untracked_files_for_uploads
  return if ActiveRecord::Base.connection.table_exists?(tracking_table)

  UntrackedFile.connection.create_table(tracking_table) do |table|
    table.string :path, limit: 600, null: false
    table.index :path, unique: true
  end
end
2017-11-20 19:27:24 -05:00
# Removes every row currently stored in the tracking table by calling
# `delete_all` on the model. `perform` only invokes this when duplicate rows
# cannot be ignored at insert time, so that a re-run starts from an empty
# table.
def clear_untracked_file_paths
  UntrackedFile.delete_all
end
# Walks the uploads directory in batches of FIND_BATCH_SIZE paths and inserts
# each batch into the tracking table. Does nothing (and returns nil) when the
# uploads directory does not exist.
def store_untracked_file_paths
  upload_root = ABSOLUTE_UPLOAD_DIR
  return unless Dir.exist?(upload_root)

  each_file_batch(upload_root, FIND_BATCH_SIZE) { |batch| insert_file_paths(batch) }
end
2017-11-08 18:05:08 -05:00
# Runs the `find` command for `search_dir` and hands its output to
# `yield_paths_in_batches`, which yields arrays of at most `batch_size` paths
# to the given block.
#
# Raises a RuntimeError ("Find command failed") when the command exits with a
# non-success status.
def each_file_batch(search_dir, batch_size, &block)
  find_command = build_find_command(search_dir)

  Open3.popen2(*find_command) do |_stdin, find_output, waiter|
    yield_paths_in_batches(find_output, batch_size, &block)

    # `value` blocks until the child process has exited.
    exit_status = waiter.value
    raise "Find command failed" unless exit_status.success?
  end
end
2017-11-08 18:05:08 -05:00
# Reads NUL-separated paths from `stdout`, strips the storage-root prefix
# (START_WITH_ROOT_REGEX) from each one, and yields them in arrays of
# `batch_size` entries. A final, smaller array is yielded for any leftover
# paths; nothing is yielded when there is no input.
def yield_paths_in_batches(stdout, batch_size, &block)
  batch = []

  stdout.each_line("\0") do |record|
    batch.push(record.chomp("\0").sub(START_WITH_ROOT_REGEX, ''))
    next if batch.size < batch_size

    yield(batch)
    # Start a fresh array so the batch just yielded is never mutated.
    batch = []
  end

  yield(batch) unless batch.empty?
end
# Builds the argv array for the `find` invocation that lists every regular
# file under `search_dir` (following symlinks), skipping the hashed and tmp
# upload sub-trees and printing NUL-terminated paths. When `ionice` is
# available the command is prefixed with `ionice -c Idle`.
#
# The final command is logged and returned as an Array of Strings.
def build_find_command(search_dir)
  find_args = [
    'find', '-L', search_dir.to_s,
    '-type', 'f',
    '!', '(', '-path', EXCLUDED_HASHED_UPLOADS_PATH, '-prune', ')',
    '!', '(', '-path', EXCLUDED_TMP_UPLOADS_PATH, '-prune', ')',
    '-print0'
  ]

  ionice_path = which_ionice
  full_command = ionice_path ? [ionice_path.to_s, '-c', 'Idle', *find_args] : find_args

  message = "PrepareUntrackedUploads find command: \"#{full_command.join(' ')}\""
  Rails.logger.info message # rubocop:disable Gitlab/RailsLogger

  full_command
end
2017-11-17 16:49:25 -05:00
# Looks up the `ionice` executable through Gitlab::Utils.which and returns
# whatever that lookup returns. Any StandardError raised by the lookup is
# swallowed and reported as `false`, so the caller simply runs `find`
# without the ionice prefix.
def which_ionice
  Gitlab::Utils.which('ionice')
rescue StandardError
  # Falling back to false is relatively safe here, even though it means the
  # find command will not be "niced".
  false
end
# Inserts the given batch of paths into the tracking table with a single
# INSERT statement built by `insert_sql`.
#
# file_paths - Array of path Strings to record.
#
# Returns the result of `connection.execute`, or nil when the batch is empty.
def insert_file_paths(file_paths)
  # An empty batch would render as `INSERT INTO ... VALUES ;`, which is not
  # valid SQL, so skip the round-trip entirely.
  return if file_paths.empty?

  ActiveRecord::Base.connection.execute(insert_sql(file_paths))
end
2017-11-15 07:51:28 -05:00
2017-11-27 01:57:21 -05:00
# Returns the INSERT statement for the given paths. On Postgres >= 9.5 the
# statement carries `ON CONFLICT DO NOTHING` so that paths already present
# are skipped; older versions get a plain INSERT.
def insert_sql(file_paths)
  conflict_clause = postgresql_pre_9_5? ? '' : ' ON CONFLICT DO NOTHING'

  "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}#{conflict_clause};"
end
2017-11-20 19:27:24 -05:00
# Builds the `<table> (path) VALUES (...), (...)` fragment of the INSERT
# statement. Each path is passed through ActiveRecord's `sanitize_sql_array`
# so it is quoted before being embedded in the SQL string.
def table_columns_and_values_for_insert(file_paths)
  value_tuples = file_paths.map do |path|
    ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', path]) # rubocop:disable GitlabSecurity/PublicSend
  end

  "#{UntrackedFile.table_name} (path) VALUES #{value_tuples.join(', ')}"
end
2017-11-20 19:27:24 -05:00
# True when bulk INSERTs can skip rows that already exist, i.e. when the
# database is not Postgres < 9.5 and `insert_sql` can therefore append
# `ON CONFLICT DO NOTHING`.
def can_bulk_insert_and_ignore_duplicates?
  !postgresql_pre_9_5?
end
2017-11-15 07:51:28 -05:00
# Whether the database version, read as a float from
# Gitlab::Database.version, is lower than 9.5. The answer is memoized under
# the :postgresql_pre_9_5 key, so the version is only looked up once per
# instance.
def postgresql_pre_9_5?
  strong_memoize(:postgresql_pre_9_5) { Gitlab::Database.version.to_f < 9.5 }
end
# Queues the follow-up migration (FOLLOW_UP_MIGRATION) over the rows of the
# tracking table, using the `bulk_queue_background_migration_jobs_by_range`
# helper from Database::MigrationHelpers.
def schedule_populate_untracked_uploads_jobs
  bulk_queue_background_migration_jobs_by_range(UntrackedFile, FOLLOW_UP_MIGRATION)
end
# Drops the `untracked_files_for_uploads` table if it exists. Skipped
# entirely (returning nil) in the test environment.
def drop_temp_table
  # Dropping a table intermittently breaks test cleanup
  return if Rails.env.test?

  UntrackedFile.connection.drop_table(:untracked_files_for_uploads, if_exists: true)
end
end
end
end