From 5237a55d62f8dcb021a041741b3f09cad7784a36 Mon Sep 17 00:00:00 2001
From: Toon Claes
Date: Fri, 30 Nov 2018 17:03:29 +0100
Subject: [PATCH] Fill project_repositories for hashed storage

This adds a background migration that will ensure all projects that are
on hashed storage have a row in `project_repositories`.

Related issue: https://gitlab.com/gitlab-org/gitlab-ce/issues/48527
---
 ...c-backfill-hashed-project_repositories.yml |   5 +
 ...32_backfill_hashed_project_repositories.rb |  35 ++++
 .../backfill_hashed_project_repositories.rb   | 141 +++++++++++++++
 ...ckfill_hashed_project_repositories_spec.rb | 169 ++++++++++++++++++
 4 files changed, 350 insertions(+)
 create mode 100644 changelogs/unreleased/tc-backfill-hashed-project_repositories.yml
 create mode 100644 db/post_migrate/20181130102132_backfill_hashed_project_repositories.rb
 create mode 100644 lib/gitlab/background_migration/backfill_hashed_project_repositories.rb
 create mode 100644 spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb

diff --git a/changelogs/unreleased/tc-backfill-hashed-project_repositories.yml b/changelogs/unreleased/tc-backfill-hashed-project_repositories.yml
new file mode 100644
index 00000000000..90a5c8c4e2c
--- /dev/null
+++ b/changelogs/unreleased/tc-backfill-hashed-project_repositories.yml
@@ -0,0 +1,5 @@
+---
+title: Fill project_repositories for hashed storage projects
+merge_request: 23482
+author:
+type: added
diff --git a/db/post_migrate/20181130102132_backfill_hashed_project_repositories.rb b/db/post_migrate/20181130102132_backfill_hashed_project_repositories.rb
new file mode 100644
index 00000000000..b989d9fb43d
--- /dev/null
+++ b/db/post_migrate/20181130102132_backfill_hashed_project_repositories.rb
@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+
+# Schedules background jobs that create the missing `project_repositories`
+# rows for projects on hashed storage.
+class BackfillHashedProjectRepositories < ActiveRecord::Migration[5.0]
+  include Gitlab::Database::MigrationHelpers
+
+  DOWNTIME = false
+  BATCH_SIZE = 1_000
+  DELAY_INTERVAL = 1.minute
+  MIGRATION = 'BackfillHashedProjectRepositories'
+
+  disable_ddl_transaction!
+
+  # Migration-local model for the `projects` table
+  class Project < ActiveRecord::Base
+    include EachBatch
+
+    self.table_name = 'projects'
+  end
+
+  def up
+    queue_background_migration_jobs_by_range_at_intervals(
+      Project,
+      MIGRATION,
+      DELAY_INTERVAL,
+      batch_size: BATCH_SIZE
+    )
+  end
+
+  def down
+    # Since there could have been existing rows before the migration
+    # do not remove anything
+  end
+end
diff --git a/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb b/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb
new file mode 100644
index 00000000000..88696dd1aa6
--- /dev/null
+++ b/lib/gitlab/background_migration/backfill_hashed_project_repositories.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module BackgroundMigration
+    # Class that will create rows in project_repositories for all
+    # projects that are on hashed storage
+    class BackfillHashedProjectRepositories
+      # Model for a Shard
+      class Shard < ActiveRecord::Base
+        self.table_name = 'shards'
+
+        # Finds the shard with the given name, creating it when missing.
+        # When a concurrent insert makes the creation fail on the unique
+        # index, the lookup is retried and picks up the existing row.
+        def self.by_name(name)
+          find_or_create_by!(name: name)
+        rescue ActiveRecord::RecordNotUnique
+          retry
+        end
+      end
+
+      # Class that will find or create the shard by name.
+      # There is only a small set of shards, which would not change quickly,
+      # so look them up from memory instead of hitting the DB each time.
+      class ShardFinder
+        def find(name)
+          shards.detect { |shard| shard.name == name } || create!(name)
+        rescue ActiveRecord::RecordNotUnique
+          # The shard was created elsewhere in the meantime: reload the
+          # in-memory list and look it up again.
+          load!
+          retry
+        end
+
+        private
+
+        def create!(name)
+          Shard.create!(name: name).tap { |shard| @shards << shard }
+        end
+
+        def shards
+          @shards || load!
+        end
+
+        def load!
+          @shards = Shard.all.to_a
+        end
+      end
+
+      # Model for a ProjectRepository
+      class ProjectRepository < ActiveRecord::Base
+        self.table_name = 'project_repositories'
+
+        belongs_to :project, inverse_of: :project_repository
+      end
+
+      # Model for a Project
+      class Project < ActiveRecord::Base
+        self.table_name = 'projects'
+
+        HASHED_PATH_PREFIX = '@hashed'
+        HASHED_STORAGE_FEATURES = {
+          repository: 1,
+          attachments: 2
+        }.freeze
+
+        has_one :project_repository, inverse_of: :project
+
+        class << self
+          def on_hashed_storage
+            where(arel_table[:storage_version].gteq(HASHED_STORAGE_FEATURES[:repository]))
+          end
+
+          def without_project_repository
+            joins(left_outer_join_project_repository)
+              .where(ProjectRepository.arel_table[:project_id].eq(nil))
+          end
+
+          private
+
+          # Built with Arel so the same join works on Rails 4, which has no
+          # `left_outer_joins` query method, as well as on Rails 5.
+          def left_outer_join_project_repository
+            projects_table = arel_table
+            repositories_table = ProjectRepository.arel_table
+
+            projects_table
+              .join(repositories_table, Arel::Nodes::OuterJoin)
+              .on(projects_table[:id].eq(repositories_table[:project_id]))
+              .join_sources
+          end
+        end
+
+        # Returns the attributes of the project_repositories row for this
+        # project, or nil when the project is not on hashed storage.
+        def project_repository_attributes(shard_finder)
+          return unless hashed_storage?
+
+          {
+            project_id: id,
+            shard_id: shard_finder.find(repository_storage).id,
+            disk_path: hashed_disk_path
+          }
+        end
+
+        private
+
+        def hashed_storage?
+          storage_version && storage_version >= HASHED_STORAGE_FEATURES[:repository]
+        end
+
+        def hashed_disk_path
+          "#{HASHED_PATH_PREFIX}/#{disk_hash[0..1]}/#{disk_hash[2..3]}/#{disk_hash}"
+        end
+
+        def disk_hash
+          @disk_hash ||= Digest::SHA2.hexdigest(id.to_s) if id
+        end
+      end
+
+      # Inserts the missing project_repositories rows for the hashed storage
+      # projects with an ID from start_id up to and including stop_id.
+      def perform(start_id, stop_id)
+        Gitlab::Database.bulk_insert(:project_repositories, project_repositories(start_id, stop_id))
+      end
+
+      private
+
+      def project_repositories(start_id, stop_id)
+        Project.on_hashed_storage.without_project_repository
+          .where(id: start_id..stop_id)
+          .map { |project| project.project_repository_attributes(shard_finder) }
+          .compact
+      end
+
+      def shard_finder
+        @shard_finder ||= ShardFinder.new
+      end
+    end
+  end
+end
diff --git a/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb b/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb
new file mode 100644
index 00000000000..d2f499ffa64
--- /dev/null
+++ b/spec/lib/gitlab/background_migration/backfill_hashed_project_repositories_spec.rb
@@ -0,0 +1,169 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Gitlab::BackgroundMigration::BackfillHashedProjectRepositories, :migration, schema: 20181130102132 do
+  let(:shards) { table(:shards) }
+  let(:namespaces) { table(:namespaces) }
+  let(:projects) { table(:projects) }
+  let(:project_repositories) { table(:project_repositories) }
+  let(:group) { namespaces.create!(name: 'foo', path: 'foo') }
+  let(:default_shard) { shards.create!(name: 'default') }
+
+  describe described_class::Shard do
+    describe '.by_name' do
+      it 'creates the shard when it does not exist yet' do
+        expect do
+          described_class.by_name('other')
+        end.to change(shards, :count).by(1)
+      end
+
+      it 'returns the existing shard' do
+        expect(described_class.by_name(default_shard.name).id).to eq(default_shard.id)
+      end
+    end
+  end
+
+  describe described_class::ShardFinder do
+    describe '#find' do
+      subject(:finder) { described_class.new }
+
+      it 'creates the shard by name' do
+        expect(finder).to receive(:create!).and_call_original
+
+        expect(finder.find('default')).to be_present
+      end
+
+      it 'does not try to create existing shards' do
+        shards.create!(name: 'default')
+
+        expect(finder).not_to receive(:create!)
+
+        finder.find('default')
+      end
+
+      it 'only queries the database once for shards' do
+        finder.find('default')
+
+        expect do
+          finder.find('default')
+        end.not_to exceed_query_limit(0)
+      end
+
+      it 'creates a new shard when it does not exist yet' do
+        expect do
+          finder.find('other')
+        end.to change(shards, :count).by(1)
+      end
+
+      it 'only creates a new shard once' do
+        finder.find('other')
+
+        expect do
+          finder.find('other')
+        end.not_to change(shards, :count)
+      end
+
+      it 'is not vulnerable to race conditions' do
+        finder.find('default')
+
+        other_shard = shards.create!(name: 'other')
+
+        expect(finder.find('other').id).to eq(other_shard.id)
+      end
+    end
+  end
+
+  describe described_class::Project do
+    describe '.on_hashed_storage' do
+      it 'finds projects with repository on hashed storage' do
+        hashed_projects = [
+          projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1),
+          projects.create!(name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 2)
+        ]
+
+        projects.create!(name: 'baz', path: 'baz', namespace_id: group.id, storage_version: 0)
+        projects.create!(name: 'quz', path: 'quz', namespace_id: group.id, storage_version: nil)
+
+        expect(described_class.on_hashed_storage.pluck(:id)).to match_array(hashed_projects.map(&:id))
+      end
+    end
+
+    describe '.without_project_repository' do
+      it 'finds projects which do not have a projects_repositories row' do
+        without_project = projects.create!(name: 'foo', path: 'foo', namespace_id: group.id)
+        with_project = projects.create!(name: 'bar', path: 'bar', namespace_id: group.id)
+        project_repositories.create!(project_id: with_project.id, disk_path: '@phony/foo/bar', shard_id: default_shard.id)
+
+        expect(described_class.without_project_repository.pluck(:id)).to contain_exactly(without_project.id)
+      end
+    end
+
+    describe '#project_repository_attributes' do
+      let(:shard_finder) { Gitlab::BackgroundMigration::BackfillHashedProjectRepositories::ShardFinder.new }
+
+      it 'composes the correct attributes for project_repository' do
+        shiny_shard = shards.create!(name: 'shiny')
+        project = projects.create!(id: 5, name: 'foo', path: 'foo', namespace_id: group.id, repository_storage: shiny_shard.name, storage_version: 1)
+
+        expected_attributes = {
+          project_id: project.id,
+          shard_id: shiny_shard.id,
+          disk_path: '@hashed/ef/2d/ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
+        }
+
+        expect(described_class.find(project.id).project_repository_attributes(shard_finder)).to eq(expected_attributes)
+      end
+
+      it 'returns nil for a project not on hashed storage' do
+        project = projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 0)
+
+        expect(described_class.find(project.id).project_repository_attributes(shard_finder)).to be_nil
+      end
+    end
+  end
+
+  describe '#perform' do
+    def perform!
+      described_class.new.perform(1, projects.last.id)
+    end
+
+    it 'create project_repository row for hashed storage project' do
+      projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1)
+
+      expect do
+        perform!
+      end.to change(project_repositories, :count).by(1)
+    end
+
+    it 'does nothing for projects that have already a project_repository' do
+      project = projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1)
+      project_repositories.create!(project_id: project.id, disk_path: '@phony/foo/bar', shard_id: default_shard.id)
+
+      expect do
+        perform!
+      end.not_to change(project_repositories, :count)
+    end
+
+    it 'does nothing for projects on legacy storage' do
+      projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 0)
+
+      expect do
+        perform!
+      end.not_to change(project_repositories, :count)
+    end
+
+    it 'inserts rows in a single query' do
+      projects.create!(name: 'foo', path: 'foo', namespace_id: group.id, storage_version: 1, repository_storage: default_shard.name)
+
+      control_count = ActiveRecord::QueryRecorder.new do
+        perform!
+      end
+
+      projects.create!(name: 'bar', path: 'bar', namespace_id: group.id, storage_version: 1, repository_storage: default_shard.name)
+      projects.create!(name: 'quz', path: 'quz', namespace_id: group.id, storage_version: 1, repository_storage: default_shard.name)
+
+      expect { perform! }.not_to exceed_query_limit(control_count)
+    end
+  end
+end