gitlab-org--gitlab-foss/db/post_migrate/20200511083541_cleanup_proj...

264 lines
8.8 KiB
Ruby

# frozen_string_literal: true
# rubocop:disable Migration/PreventStrings
# This migration cleans up Projects that were orphaned when their namespace was deleted
# Instead of deleting them, we:
# - Find (or create) the Ghost User
# - Create (if it does not already exist) a `lost-and-found` group owned by the Ghost User
# - Find orphaned projects --> namespace_id can not be found in namespaces
# - Move the orphaned projects to the `lost-and-found` group
# (while making them private and setting `archived=true`)
#
# On GitLab.com (2020-05-11) this migration will update 66 orphaned projects
class CleanupProjectsWithMissingNamespace < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers

# GitLab migration flag: this migration is declared as not requiring downtime
DOWNTIME = false

# Value written to the visibility_level column of the lost-and-found group
# and of every relocated project
VISIBILITY_PRIVATE = 0

# members.access_level value used for the Ghost User's membership
# in the lost-and-found group
ACCESS_LEVEL_OWNER = 50

# Number of projects examined per iteration.
# Orphaned projects are expected to be extremely rare
# (66 orphaned projects out of a total 13.6M),
# so 10K should be a safe choice
BATCH_SIZE = 10_000

# Do not wrap the migration in a single transaction
disable_ddl_transaction!
# Migration-local model for the `user_details` table.
# Used by User to store the bio of the Ghost User.
class UserDetail < ActiveRecord::Base
self.table_name = 'user_details'
# Points at the migration-local User model rather than the application one
belongs_to :user, class_name: 'CleanupProjectsWithMissingNamespace::User'
end
# Migration-local model for the `users` table.
# It carries just enough behaviour to find (or create) the Ghost User and the
# `lost-and-found` group owned by that user.
class User < ActiveRecord::Base
  self.table_name = 'users'

  LOST_AND_FOUND_GROUP = 'lost-and-found'
  USER_TYPE_GHOST = 5
  DEFAULT_PROJECTS_LIMIT = 100_000

  default_value_for :admin, false
  default_value_for :can_create_group, true # we need this to create the group
  default_value_for :can_create_team, false
  default_value_for :project_view, :files
  default_value_for :notified_of_own_activity, false
  default_value_for :preferred_language, I18n.default_locale

  has_one :user_detail, class_name: 'CleanupProjectsWithMissingNamespace::UserDetail'
  has_one :namespace, -> { where(type: nil) },
          foreign_key: :owner_id, inverse_of: :owner, autosave: true,
          class_name: 'CleanupProjectsWithMissingNamespace::Namespace'

  before_save :ensure_namespace_correct
  before_save :ensure_bio_is_assigned_to_user_details, if: :bio_changed?

  enum project_view: { readme: 0, activity: 1, files: 2 }

  # before_save hook: builds the personal namespace when the user has none,
  # otherwise mirrors a changed username / name onto the existing namespace
  def ensure_namespace_correct
    return build_namespace(path: username, name: name) unless namespace

    namespace.path = username if username_changed?
    namespace.name = name if name_changed?
  end

  # before_save hook (only when the bio changed): copies the bio, cut to at
  # most 255 characters, to the user_detail record unless the
  # :migrate_bio_to_user_details feature is disabled
  def ensure_bio_is_assigned_to_user_details
    unless Feature.disabled?(:migrate_bio_to_user_details, default_enabled: true)
      user_detail.bio = bio.to_s[0, 255]
    end
  end

  # Returns the associated user_detail, building a new one when none is present
  def user_detail
    detail = super
    detail.present? ? detail : build_user_detail
  end

  # Return (or create if necessary) the `lost-and-found` group
  def lost_and_found_group
    found_group = existing_lost_and_found_group
    return found_group if found_group

    Group.create_unique_group(self, LOST_AND_FOUND_GROUP)
  end

  # Looks up a group that this user owns (non-pending GroupMember with owner
  # access) and whose name starts with LOST_AND_FOUND_GROUP.
  # Returns nil when there is no such group.
  def existing_lost_and_found_group
    # There should only be one Group for User Ghost starting with LOST_AND_FOUND_GROUP
    owned_groups = Group
      .joins('INNER JOIN members ON namespaces.id = members.source_id')
      .where('namespaces.type = ?', 'Group')
      .where('members.type = ?', 'GroupMember')
      .where('members.source_type = ?', 'Namespace')
      .where('members.user_id = ?', id)
      .where('members.requested_at IS NULL')
      .where('members.access_level = ?', ACCESS_LEVEL_OWNER)

    name_starts_with_prefix = Group.arel_table[:name].matches("#{LOST_AND_FOUND_GROUP}%")

    owned_groups.find_by(name_starts_with_prefix)
  end

  # Return (or create if necessary) the ghost user
  def self.ghost
    ghost_users = where(user_type: USER_TYPE_GHOST)

    unique_internal(ghost_users, 'ghost', 'ghost%s@example.com') do |ghost_user|
      ghost_user.bio = _('This is a "Ghost User", created to hold all issues authored by users that have since been deleted. This user cannot be removed.')
      ghost_user.name = 'Ghost User'
    end
  end

  # Returns the first user in `scope`; when the scope is empty a new user is
  # created through create_unique_internal
  def self.unique_internal(scope, username, email_pattern, &block)
    already_present = scope.first
    return already_present if already_present

    create_unique_internal(scope, username, email_pattern, &block)
  end

  # Creates the internal user while holding an exclusive lease.
  # `creation_block` is handed to User.create! to fill in extra attributes.
  def self.create_unique_internal(scope, username, email_pattern, &creation_block)
    # Since we only want a single one of these in an instance, we use an
    # exclusive lease to ensure that this block is never run concurrently.
    lease_key = "user:unique_internal:#{username}"
    lease = Gitlab::ExclusiveLease.new(lease_key, timeout: 1.minute.to_i)

    # Keep trying until we obtain the lease. To prevent hammering Redis too
    # much we'll wait for a bit between retries.
    sleep(1) until (uuid = lease.try_obtain)

    # Recheck if the user is already present. One might have been
    # added between the time the caller last checked
    # and the time we acquired the lock.
    existing_user = uncached { scope.first }
    return existing_user if existing_user.present?

    # One Uniquify instance is used for both the username and the email
    uniquify = Uniquify.new
    unique_username = uniquify.string(username) { |candidate| User.find_by_username(candidate) }
    email_for = ->(n) { Kernel.sprintf(email_pattern, n) }
    unique_email = uniquify.string(email_for) { |candidate| User.find_by_email(candidate) }

    User.create!(
      username: unique_username,
      email: unique_email,
      user_type: USER_TYPE_GHOST,
      projects_limit: DEFAULT_PROJECTS_LIMIT,
      state: :active,
      &creation_block
    )
  ensure
    Gitlab::ExclusiveLease.cancel(lease_key, uuid)
  end
end
# Migration-local model for the `namespaces` table.
# Base class of the migration-local Group model.
class Namespace < ActiveRecord::Base
self.table_name = 'namespaces'
# Owner is resolved through the migration-local User model
belongs_to :owner, class_name: 'CleanupProjectsWithMissingNamespace::User'
end
# Migration-local model for group rows stored in the `namespaces` table
class Group < Namespace
  # Disable STI to allow us to manually set "type = 'Group'"
  # Otherwise rails forces "type = CleanupProjectsWithMissingNamespace::Group"
  self.inheritance_column = :_type_disabled

  # Creates a private top-level group and makes `user` its owner.
  #
  # user       - the User that receives the owner membership
  # group_name - preferred name; Uniquify is asked for an alternative while a
  #              top-level namespace with the candidate name already exists
  #
  # Returns the newly created Group.
  # Raises (through create!) when the group or the membership cannot be saved;
  # in that case neither row is kept.
  def self.create_unique_group(user, group_name)
    # 'lost-and-found' may be already defined, find a unique one
    # NOTE(review): only `name` is checked here although the same value is also
    # used as `path` below -- confirm a clashing top-level path cannot exist
    group_name = Uniquify.new.string(group_name) do |str|
      Group.where(parent_id: nil, name: str).exists?
    end

    # This migration runs with disable_ddl_transaction!, so the two inserts
    # are wrapped in an explicit transaction. Without it, a failure while
    # adding the owner would leave an ownerless group behind; a re-run would
    # not find that group (the lookup requires an owner membership) and
    # would create yet another one.
    Group.transaction do
      group = Group.create!(
        name: group_name,
        path: group_name,
        type: 'Group',
        description: 'Group to store orphaned projects',
        visibility_level: VISIBILITY_PRIVATE
      )

      # No need to create a route for the lost-and-found group
      GroupMember.add_user(group, user, ACCESS_LEVEL_OWNER)

      group
    end
  end
end
# Migration-local model for the `members` table.
# Base class of the migration-local GroupMember model.
class Member < ActiveRecord::Base
self.table_name = 'members'
end
# Migration-local model for group memberships stored in the `members` table
class GroupMember < Member
  NOTIFICATION_SETTING_GLOBAL = 3

  # STI is switched off so that "type = 'GroupMember'" can be set by hand;
  # otherwise rails forces "type = CleanupProjectsWithMissingNamespace::GroupMember"
  self.inheritance_column = :_type_disabled

  class << self
    # Creates the membership row that links `user` to the namespace `source`
    # with the given `access_level`. Returns the created GroupMember and
    # raises (through create!) when the row cannot be saved.
    def add_user(source, user, access_level)
      membership_attributes = {
        type: 'GroupMember',
        source_id: source.id,
        user_id: user.id,
        source_type: 'Namespace',
        access_level: access_level,
        notification_level: NOTIFICATION_SETTING_GLOBAL
      }

      GroupMember.create!(membership_attributes)
    end
  end
end
# Migration-local model for the `projects` table, iterated in batches
class Project < ActiveRecord::Base
  self.table_name = 'projects'

  include ::EachBatch

  class << self
    # Restricts the relation to projects whose namespace_id does not match
    # the id of any row in `namespaces`
    def without_namespace
      # All heredoc lines share one indent so that, after `<<~` strips it and
      # the trailing newline is chomped, the condition text is unchanged
      missing_namespace_condition = <<~SQL.chomp
        NOT EXISTS (
        SELECT 1
        FROM namespaces
        WHERE projects.namespace_id = namespaces.id
        )
      SQL

      where(missing_namespace_condition)
    end
  end
end
# Moves every project without an existing namespace into the `lost-and-found`
# group of the Ghost User, archiving it and making it private on the way
def up
  # Reset the column information of all the models that update the database
  # to ensure the Active Record's knowledge of the table structure is current
  [User, Namespace, Member, Project].each(&:reset_column_information)

  # Find or Create the ghost user, then find or create its `lost-and-found` group
  lost_and_found = User.ghost.lost_and_found_group

  # The SET clause is the same for every batch, so it is built once up front
  relocate_orphans_sql = <<~SQL
    namespace_id = #{lost_and_found.id},
    archived = TRUE,
    visibility_level = #{VISIBILITY_PRIVATE},
    -- Names are expected to be unique inside their namespace
    -- (uniqueness validation on namespace_id, name)
    -- Attach the id to the name and path to make sure that they are unique
    name = name || '_' || id::text,
    path = path || '_' || id::text
  SQL

  # With BATCH_SIZE=10K and projects.count=13.6M
  # ~1360 iterations will be run:
  # - each requires on average ~160ms for relation.without_namespace
  # - worst case scenario is that 66 of those batches will trigger an update (~200ms each)
  # In general, we expect less than 5% (=66/13.6M x 10K) to trigger an update
  # Expected total run time: ~235 seconds (== 220 seconds + 14 seconds)
  Project.each_batch(of: BATCH_SIZE) do |batch|
    batch.without_namespace.update_all(relocate_orphans_sql)
  end
end
# Rollback entry point. Deliberately empty: this migration cannot be reverted.
def down
# no-op: the original state for those projects was inconsistent
# Also, the original namespace_id for each project is lost during the update
end
end
# rubocop:enable Migration/PreventStrings