Backfill project_repositories for legacy storage projects

Adds a background migration that will ensure all projects that
are on legacy storage have a row in `project_repositories`.
This commit is contained in:
Douglas Barbosa Alexandre 2018-12-18 19:02:36 -02:00
parent 5d68c23792
commit 754f66113e
No known key found for this signature in database
GPG key ID: F1E98EF6393565A0
9 changed files with 423 additions and 176 deletions

View file

@ -0,0 +1,26 @@
# frozen_string_literal: true
# Migration that schedules the `BackfillLegacyProjectRepositories`
# background migration for the rows of the `projects` table.
class BackfillProjectRepositoriesForLegacyStorageProjects < ActiveRecord::Migration[5.0]
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false
  BATCH_SIZE = 1_000
  DELAY_INTERVAL = 5.minutes
  MIGRATION = 'BackfillLegacyProjectRepositories'

  disable_ddl_transaction!

  # Migration-local model bound to the `projects` table. It mixes in
  # EachBatch, which the scheduling helper presumably relies on to walk the
  # table in batches (NOTE(review): confirm against the helper).
  class Project < ActiveRecord::Base
    include EachBatch

    self.table_name = 'projects'
  end

  # Queues the MIGRATION jobs through the range-based scheduling helper.
  # BATCH_SIZE is passed explicitly so the constant declared above is the
  # value actually used, instead of silently relying on the helper's default.
  def up
    queue_background_migration_jobs_by_range_at_intervals(
      Project,
      MIGRATION,
      DELAY_INTERVAL,
      batch_size: BATCH_SIZE
    )
  end

  def down
    # no-op: since there could have been existing rows before the migration do not remove anything
  end
end

View file

@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20181212104941) do
ActiveRecord::Schema.define(version: 20181218192239) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"

View file

@ -2,132 +2,13 @@
module Gitlab
module BackgroundMigration
# Class that will fill the project_repositories table
# for all projects that are on hashed storage and for which
# an entry is missing in this table.
class BackfillHashedProjectRepositories
# Shard model: a bare ActiveRecord class bound to the `shards` table.
class Shard < ActiveRecord::Base
self.table_name = 'shards'
end
# Class that will find or create the shard by name.
# There is only a small set of shards, which would
# not change quickly, so look them up from memory
# instead of hitting the DB each time.
class ShardFinder
  # Returns the ID of the shard called +name+.
  #
  # A known shard is answered from the in-memory name => id cache. An
  # unknown one is inserted first. Either way the return value is the ID:
  # the creation path used to leak the freshly created Shard record because
  # `create!` ends in `tap`.
  def find_shard_id(name)
    shard_id = shards[name]
    return shard_id if shard_id

    # `requires_new: true` asks for a new (nested) transaction around the
    # insert.
    Shard.transaction(requires_new: true) do
      create!(name).id
    end
  rescue ActiveRecord::RecordNotUnique
    # The insert was rejected as a duplicate: refresh the cache and look the
    # name up again.
    reload!
    retry
  end

  private

  # Inserts the shard and records its ID in the cache; returns the record.
  def create!(name)
    Shard.create!(name: name).tap { |shard| @shards[name] = shard.id }
  end

  # Memoized name => id Hash, loaded on first use.
  def shards
    @shards ||= reload!
  end

  # Rebuilds the cache from every row of `shards` and returns it.
  def reload!
    @shards = Shard.all.map { |shard| [shard.name, shard.id] }.to_h
  end
end
# ProjectRepository model: a bare ActiveRecord class bound to the
# `project_repositories` table; each row belongs to one project.
class ProjectRepository < ActiveRecord::Base
self.table_name = 'project_repositories'
belongs_to :project, inverse_of: :project_repository
end
# Project model: bound to the `projects` table, with the scopes and path
# helpers needed to backfill rows for projects on hashed storage.
class Project < ActiveRecord::Base
  self.table_name = 'projects'

  HASHED_PATH_PREFIX = '@hashed'
  HASHED_STORAGE_FEATURES = { repository: 1, attachments: 2 }.freeze

  has_one :project_repository, inverse_of: :project

  # Projects whose storage_version is at least the :repository feature level.
  def self.on_hashed_storage
    minimum_version = HASHED_STORAGE_FEATURES[:repository]

    where(Project.arel_table[:storage_version].gteq(minimum_version))
  end

  # Projects that have no matching row in `project_repositories`.
  def self.without_project_repository
    missing_row = ProjectRepository.arel_table[:project_id].eq(nil)

    joins(left_outer_join_project_repository).where(missing_row)
  end

  # Arel join sources for projects LEFT OUTER JOIN project_repositories.
  def self.left_outer_join_project_repository
    projects = Project.arel_table
    repositories = ProjectRepository.arel_table
    same_project = projects[:id].eq(repositories[:project_id])

    projects.join(repositories, Arel::Nodes::OuterJoin).on(same_project).join_sources
  end

  # Truthy when storage_version is set and at least 1.
  def hashed_storage?
    version = storage_version

    version && version >= 1
  end

  # "@hashed/<first two hex chars>/<next two>/<full hash>".
  def hashed_disk_path
    [HASHED_PATH_PREFIX, disk_hash[0..1], disk_hash[2..3], disk_hash].join('/')
  end

  # Memoized SHA2 hex digest of the project ID rendered as a string.
  def disk_hash
    @disk_hash ||= Digest::SHA2.hexdigest(id.to_s)
  end
end
# Inserts into `project_repositories`, through a single
# Gitlab::Database.bulk_insert call, the rows that #project_repositories
# builds for the start_id..stop_id bounds.
def perform(start_id, stop_id)
Gitlab::Database.bulk_insert(:project_repositories, project_repositories(start_id, stop_id))
end
# Class that will fill the project_repositories table for projects that
# are on hashed storage and for which an entry is missing in this table.
class BackfillHashedProjectRepositories < BackfillProjectRepositories
private
def project_repositories(start_id, stop_id)
def projects
Project.on_hashed_storage
.without_project_repository
.where(id: start_id..stop_id)
.map { |project| build_attributes_for_project(project) }
.compact
end
# Builds the `project_repositories` attributes for +project+.
# Evaluates to nil when the project is not on hashed storage.
def build_attributes_for_project(project)
  if project.hashed_storage?
    shard_id = find_shard_id(project.repository_storage)

    { project_id: project.id, shard_id: shard_id, disk_path: project.hashed_disk_path }
  end
end
# Resolves the shard ID for a repository storage name through the
# memoized ShardFinder.
def find_shard_id(repository_storage)
shard_finder.find_shard_id(repository_storage)
end
# Memoized ShardFinder, so the same finder (and its in-memory shard cache)
# serves every lookup made through this instance.
def shard_finder
@shard_finder ||= ShardFinder.new
end
end
end

View file

@ -0,0 +1,15 @@
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# Class that will fill the project_repositories table for projects that
# are on legacy storage and for which an entry is missing in this table.
class BackfillLegacyProjectRepositories < BackfillProjectRepositories
private
# Base relation used by the parent class: projects on legacy storage,
# with their parent namespace eager-loaded via `with_parent`.
def projects
Project.with_parent.on_legacy_storage
end
end
end
end

View file

@ -0,0 +1,219 @@
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# Class that will fill the project_repositories table
# for projects for which an entry is missing in this table.
class BackfillProjectRepositories
# Raised by Routable#build_full_path when a record has its parent foreign
# key set but the parent record itself cannot be loaded.
OrphanedNamespaceError = Class.new(StandardError)
# Shard model: a bare ActiveRecord class bound to the `shards` table.
class Shard < ActiveRecord::Base
self.table_name = 'shards'
end
# Class that will find or create the shard by name.
# There is only a small set of shards, which would
# not change quickly, so look them up from memory
# instead of hitting the DB each time.
class ShardFinder
  # Returns the ID of the shard called +name+.
  #
  # A known shard is answered from the in-memory name => id cache. An
  # unknown one is inserted first. Either way the return value is the ID:
  # the creation path used to leak the freshly created Shard record because
  # `create!` ends in `tap`.
  def find_shard_id(name)
    shard_id = shards[name]
    return shard_id if shard_id

    # `requires_new: true` asks for a new (nested) transaction around the
    # insert.
    Shard.transaction(requires_new: true) do
      create!(name).id
    end
  rescue ActiveRecord::RecordNotUnique
    # The insert was rejected as a duplicate: refresh the cache and look the
    # name up again.
    reload!
    retry
  end

  private

  # Inserts the shard and records its ID in the cache; returns the record.
  def create!(name)
    Shard.create!(name: name).tap { |shard| @shards[name] = shard.id }
  end

  # Memoized name => id Hash, loaded on first use.
  def shards
    @shards ||= reload!
  end

  # Rebuilds the cache from every row of `shards` and returns it.
  def reload!
    @shards = Shard.all.map { |shard| [shard.name, shard.id] }.to_h
  end
end
# Strategy objects that compute a project's on-disk repository path.
module Storage
  # Class that returns the disk path for a project using hashed storage
  class HashedProject
    ROOT_PATH_PREFIX = '@hashed'

    attr_accessor :project

    def initialize(project)
      @project = project
    end

    # "@hashed/<first two hex chars>/<next two>/<full hash>".
    def disk_path
      digest = disk_hash

      [ROOT_PATH_PREFIX, digest[0..1], digest[2..3], digest].join('/')
    end

    # Memoized SHA2 hex digest of the project ID rendered as a string.
    def disk_hash
      @disk_hash ||= Digest::SHA2.hexdigest(project.id.to_s)
    end
  end

  # Class that returns the disk path for a project using legacy storage
  class LegacyProject
    attr_accessor :project

    def initialize(project)
      @project = project
    end

    # The legacy path is simply the project's full path.
    def disk_path
      project.full_path
    end
  end
end
# Concern shared by Project and Namespace that derives the full route of a
# record by walking up its chain of parents.
module Routable
  extend ActiveSupport::Concern

  # Memoized full path of this record.
  def full_path
    @full_path ||= build_full_path
  end

  # A record without a parent is addressed by its own path; otherwise the
  # parent's full path is prepended. Raises OrphanedNamespaceError when the
  # parent foreign key is set but no parent record is found.
  def build_full_path
    if has_parent?
      raise OrphanedNamespaceError if parent.nil?

      parent.full_path + '/' + path
    else
      path
    end
  end

  # Value of the foreign key column behind the :parent association
  # (nil, hence falsy, when the record has no parent).
  def has_parent?
    foreign_key = association(:parent).reflection.foreign_key

    read_attribute(foreign_key)
  end
end
# Namespace model: bound to the `namespaces` table and routable, so that
# nested namespaces can contribute to a project's full path.
class Namespace < ActiveRecord::Base
self.table_name = 'namespaces'
# Unset the inheritance column so rows are not resolved through
# single-table inheritance.
self.inheritance_column = nil
include Routable
# Self-referential hierarchy: an optional parent namespace, plus the
# projects and child namespaces that point back at this one.
belongs_to :parent, class_name: 'Namespace', inverse_of: 'namespaces'
has_many :projects, inverse_of: :parent
has_many :namespaces, inverse_of: :parent
end
# ProjectRepository model: a bare ActiveRecord class bound to the
# `project_repositories` table; each row belongs to one project.
class ProjectRepository < ActiveRecord::Base
self.table_name = 'project_repositories'
belongs_to :project, inverse_of: :project_repository
end
# Project model: bound to the `projects` table, routable through its parent
# namespace, and able to report its disk path for either storage type.
class Project < ActiveRecord::Base
  self.table_name = 'projects'

  include Routable

  HASHED_STORAGE_FEATURES = { repository: 1, attachments: 2 }.freeze

  # Eager-loads the parent namespace alongside the projects.
  scope :with_parent, -> { includes(:parent) }

  belongs_to :parent, class_name: 'Namespace', foreign_key: :namespace_id, inverse_of: 'projects'
  has_one :project_repository, inverse_of: :project

  delegate :disk_path, to: :storage

  # Projects whose storage_version is at least the :repository feature level.
  def self.on_hashed_storage
    minimum_version = HASHED_STORAGE_FEATURES[:repository]

    where(Project.arel_table[:storage_version].gteq(minimum_version))
  end

  # Projects whose storage_version is NULL or 0.
  def self.on_legacy_storage
    unset = Project.arel_table[:storage_version].eq(nil)
    zero = Project.arel_table[:storage_version].eq(0)

    where(unset.or(zero))
  end

  # Projects that have no matching row in `project_repositories`.
  def self.without_project_repository
    missing_row = ProjectRepository.arel_table[:project_id].eq(nil)

    joins(left_outer_join_project_repository).where(missing_row)
  end

  # Arel join sources for projects LEFT OUTER JOIN project_repositories.
  def self.left_outer_join_project_repository
    projects = Project.arel_table
    repositories = ProjectRepository.arel_table
    same_project = projects[:id].eq(repositories[:project_id])

    projects.join(repositories, Arel::Nodes::OuterJoin).on(same_project).join_sources
  end

  # Memoized path strategy matching the project's storage type.
  def storage
    @storage ||= begin
      strategy = hashed_storage? ? Storage::HashedProject : Storage::LegacyProject
      strategy.new(self)
    end
  end

  # Truthy when storage_version is set and at least the :repository level.
  def hashed_storage?
    version = storage_version

    version && version >= HASHED_STORAGE_FEATURES[:repository]
  end
end
# Inserts into `project_repositories`, through a single
# Gitlab::Database.bulk_insert call, the rows that #project_repositories
# builds for the start_id..stop_id bounds.
def perform(start_id, stop_id)
Gitlab::Database.bulk_insert(:project_repositories, project_repositories(start_id, stop_id))
end
private
# Relation of candidate projects. This base implementation only raises
# NotImplementedError; subclasses are expected to override it.
def projects
raise NotImplementedError,
"#{self.class} does not implement #{__method__}"
end
# Attribute hashes for every candidate project in start_id..stop_id that has
# no `project_repositories` row yet; nil entries are dropped.
def project_repositories(start_id, stop_id)
  id_range = start_id..stop_id
  missing = projects.without_project_repository.where(id: id_range)
  rows = missing.map { |project| build_attributes_for_project(project) }

  rows.compact
end
# Builds the `project_repositories` attributes for +project+.
#
# Returns a Hash with :project_id, :shard_id and :disk_path, or nil when the
# project's path cannot be computed because a namespace in its parent chain
# is missing (OrphanedNamespaceError). The caller compacts nil away, so one
# orphaned project no longer aborts the insert for the whole ID range.
def build_attributes_for_project(project)
  # Resolve the path first so that no shard is looked up (or created) for a
  # project that ends up being skipped.
  disk_path = project.disk_path

  {
    project_id: project.id,
    shard_id: find_shard_id(project.repository_storage),
    disk_path: disk_path
  }
rescue OrphanedNamespaceError => e
  # Leave a trace instead of silently dropping the project.
  Rails.logger.error("#{self.class}: skipping project #{project.id} (#{e.class})")
  nil
end
# Resolves the shard ID for a repository storage name through the
# memoized ShardFinder.
def find_shard_id(repository_storage)
shard_finder.find_shard_id(repository_storage)
end
# Memoized ShardFinder, so the same finder (and its in-memory shard cache)
# serves every lookup made through this instance.
def shard_finder
@shard_finder ||= ShardFinder.new
end
end
end
end

View file

@ -0,0 +1,12 @@
# frozen_string_literal: true
FactoryBot.define do
# Factory for a project_repository record attached to a project.
factory :project_repository do
project
# Once built, copy the shard name and the disk path over from the
# associated project.
after(:build) do |project_repository, _|
project_repository.shard_name = project_repository.project.repository_storage
project_repository.disk_path = project_repository.project.disk_path
end
end
end

View file

@ -3,59 +3,14 @@
require 'spec_helper'
describe Gitlab::BackgroundMigration::BackfillHashedProjectRepositories, :migration, schema: 20181130102132 do
let(:namespaces) { table(:namespaces) }
let(:project_repositories) { table(:project_repositories) }
let(:projects) { table(:projects) }
let(:shards) { table(:shards) }
let(:group) { namespaces.create!(name: 'foo', path: 'foo') }
let(:shard) { shards.create!(name: 'default') }
# Examples for ShardFinder#find_shard_id: creating a missing shard, finding
# an existing one, and answering repeated lookups without new queries.
describe described_class::ShardFinder do
describe '#find_shard_id' do
it 'creates a new shard when it does not exist yet' do
expect { subject.find_shard_id('other') }.to change(shards, :count).by(1)
end
it 'returns the shard when it exists' do
shards.create(id: 5, name: 'other')
shard_id = subject.find_shard_id('other')
expect(shard_id).to eq(5)
end
it 'only queries the database once to retrieve shards' do
# First call warms the finder; the second must then issue no query at all.
subject.find_shard_id('default')
expect { subject.find_shard_id('default') }.not_to exceed_query_limit(0)
end
end
end
# Examples for the class-level scopes of the migration's Project model.
describe described_class::Project do
describe '.on_hashed_storage' do
it 'finds projects with repository on hashed storage' do
# storage_version 1 and 2 are expected to match; 0 and nil are not.
projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1)
projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 2)
projects.create!(id: 3, name: 'baz', path: 'baz', namespace_id: group.id, storage_version: 0)
projects.create!(id: 4, name: 'zoo', path: 'zoo', namespace_id: group.id, storage_version: nil)
expect(described_class.on_hashed_storage.pluck(:id)).to match_array([1, 2])
end
end
describe '.without_project_repository' do
it 'finds projects which do not have a projects_repositories entry' do
projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id)
projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id)
# Only project 2 gets a repository row, so only project 1 should be returned.
project_repositories.create!(project_id: 2, disk_path: '@phony/foo/bar', shard_id: shard.id)
expect(described_class.without_project_repository.pluck(:id)).to contain_exactly(1)
end
end
end
describe '#perform' do
let(:namespaces) { table(:namespaces) }
let(:project_repositories) { table(:project_repositories) }
let(:projects) { table(:projects) }
let(:shards) { table(:shards) }
let(:group) { namespaces.create!(name: 'foo', path: 'foo') }
let(:shard) { shards.create!(name: 'default') }
it 'creates a project_repository row for projects on hashed storage that need one' do
projects.create!(id: 1, name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1)
projects.create!(id: 2, name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 2)

View file

@ -0,0 +1,45 @@
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::BackgroundMigration::BackfillLegacyProjectRepositories, :migration, schema: 20181218192239 do
  describe '#perform' do
    let(:namespaces) { table(:namespaces) }
    let(:project_repositories) { table(:project_repositories) }
    let(:projects) { table(:projects) }
    let(:shards) { table(:shards) }
    let(:group) { namespaces.create!(name: 'foo', path: 'foo') }
    let(:shard) { shards.create!(name: 'default') }

    # Creates a project row named +name+ (also used as its path) in the group.
    def create_project(name, **attributes)
      projects.create!(name: name, path: name, namespace_id: group.id, **attributes)
    end

    # Runs the migration from ID 1 up to the most recently created project.
    def run_migration
      described_class.new.perform(1, projects.last.id)
    end

    it 'creates a project_repository row for projects on legacy storage that need one' do
      create_project('foo', id: 1, storage_version: nil)
      create_project('bar', id: 2, storage_version: 0)

      expect { run_migration }.to change(project_repositories, :count).by(2)
    end

    it 'does nothing for projects on legacy storage that have already a project_repository row' do
      create_project('foo', id: 1, storage_version: 0)
      project_repositories.create!(project_id: 1, disk_path: 'phony/foo/bar', shard_id: shard.id)

      expect { run_migration }.not_to change(project_repositories, :count)
    end

    it 'does nothing for projects on hashed storage' do
      create_project('foo', storage_version: 1)

      expect { run_migration }.not_to change(project_repositories, :count)
    end

    it 'inserts rows in a single query' do
      create_project('foo', storage_version: 0, repository_storage: shard.name)

      # Baseline: the queries needed to backfill a single project.
      control_count = ActiveRecord::QueryRecorder.new { run_migration }

      create_project('bar', storage_version: 0, repository_storage: shard.name)
      create_project('zoo', storage_version: 0, repository_storage: shard.name)

      expect { run_migration }.not_to exceed_query_limit(control_count)
    end
  end
end

View file

@ -0,0 +1,94 @@
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::BackgroundMigration::BackfillProjectRepositories do
  let(:group) { create(:group, name: 'foo', path: 'foo') }

  describe described_class::ShardFinder do
    let(:shard) { create(:shard, name: 'default') }

    describe '#find_shard_id' do
      it 'creates a new shard when it does not exist yet' do
        expect { subject.find_shard_id('other') }.to change(Shard, :count).by(1)
      end

      it 'returns the shard when it exists' do
        existing_shard = create(:shard, name: 'other')

        expect(subject.find_shard_id('other')).to eq(existing_shard.id)
      end

      it 'only queries the database once to retrieve shards' do
        # First call warms the finder; the second must then issue no query.
        subject.find_shard_id('default')

        expect { subject.find_shard_id('default') }.not_to exceed_query_limit(0)
      end
    end
  end

  describe described_class::Project do
    # Two projects on hashed storage (versions 1 and 2), followed by two on
    # legacy storage (version 0 and no version at all).
    let!(:hashed_project_v1) { create(:project, name: 'foo', path: 'foo', namespace: group, storage_version: 1) }
    let!(:hashed_project_v2) { create(:project, name: 'bar', path: 'bar', namespace: group, storage_version: 2) }
    let!(:legacy_project_v0) { create(:project, name: 'baz', path: 'baz', namespace: group, storage_version: 0) }
    let!(:legacy_project_nil) { create(:project, name: 'zoo', path: 'zoo', namespace: group, storage_version: nil) }

    describe '.on_hashed_storage' do
      it 'finds projects with repository on hashed storage' do
        ids = described_class.on_hashed_storage.pluck(:id)

        expect(ids).to contain_exactly(hashed_project_v1.id, hashed_project_v2.id)
      end
    end

    describe '.on_legacy_storage' do
      it 'finds projects with repository on legacy storage' do
        ids = described_class.on_legacy_storage.pluck(:id)

        expect(ids).to contain_exactly(legacy_project_v0.id, legacy_project_nil.id)
      end
    end

    describe '.without_project_repository' do
      it 'finds projects which do not have a projects_repositories entry' do
        create(:project_repository, project: hashed_project_v1)
        create(:project_repository, project: legacy_project_v0)

        ids = described_class.without_project_repository.pluck(:id)

        expect(ids).to contain_exactly(hashed_project_v2.id, legacy_project_nil.id)
      end
    end

    describe '#disk_path' do
      context 'for projects on hashed storage' do
        it 'returns the correct disk_path' do
          migration_project = described_class.find(hashed_project_v1.id)

          expect(migration_project.disk_path).to eq(hashed_project_v1.disk_path)
        end
      end

      context 'for projects on legacy storage' do
        it 'returns the correct disk_path' do
          migration_project = described_class.find(legacy_project_v0.id)

          expect(migration_project.disk_path).to eq(legacy_project_v0.disk_path)
        end

        it 'raises OrphanedNamespaceError when any parent namespace does not exist' do
          subgroup = create(:group, parent: group)
          orphaned_project = create(:project, name: 'baz', path: 'baz', namespace: subgroup, storage_version: nil)

          # Point the subgroup at a parent ID one past the highest existing one.
          subgroup.update_column(:parent_id, Namespace.maximum(:id).succ)

          migration_project = described_class.find(orphaned_project.id)

          expect { migration_project.disk_path }
            .to raise_error(Gitlab::BackgroundMigration::BackfillProjectRepositories::OrphanedNamespaceError)
        end
      end
    end
  end
end