Add support to migrate existing projects to Hashed Storage async

This commit is contained in:
Gabriel Mazetto 2017-09-06 07:16:26 +02:00 committed by Nick Thomas
parent 38607b48b6
commit f4de14d71f
No known key found for this signature in database
GPG key ID: 2A313A47AFADACE9
16 changed files with 686 additions and 4 deletions

View file

@ -245,6 +245,9 @@ class Project < ActiveRecord::Base
# Filters on the pending_delete flag.
scope :pending_delete, -> { where(pending_delete: true) }
scope :without_deleted, -> { where(pending_delete: false) }
# Storage-type filters: a storage_version of 1 or higher selects hashed
# storage; NULL or 0 selects legacy storage.
scope :with_hashed_storage, -> { where('storage_version >= 1') }
scope :with_legacy_storage, -> { where(storage_version: [nil, 0]) }
# Ordering scopes (most recently active first / most starred first).
scope :sorted_by_activity, -> { reorder(last_activity_at: :desc) }
scope :sorted_by_stars, -> { reorder('projects.star_count DESC') }
@ -1550,18 +1553,44 @@ class Project < ActiveRecord::Base
end
# Whether the project is still on legacy storage.
#
# Both an unset storage_version (nil) and an explicit 0 count as legacy.
#
# @return [Boolean]
def legacy_storage?
  [nil, 0].include?(storage_version)
end
# Whether the project uses hashed storage, i.e. storage_version is 1 or higher.
#
# @return [Boolean, nil] nil when storage_version is not set
def hashed_storage?
  version = storage_version
  version && version >= 1
end
# True only for an already-persisted record whose path attribute has changed.
def renamed?
  saved = persisted?
  saved && path_changed?
end
# Flags the repository as read-only and schedules the hashed storage migration.
#
# Does nothing (returns nil) when the project is already on hashed storage.
# When either the repository or the wiki reference counter is above zero,
# the job is delayed by Gitlab::ReferenceCounter::REFERENCE_EXPIRE_TIME;
# otherwise it is enqueued immediately.
#
# Returns whatever the worker's scheduling call returns.
def migrate_to_hashed_storage!
  return if hashed_storage?

  update!(repository_read_only: true)

  in_use = repo_reference_count > 0 || wiki_reference_count > 0
  return ProjectMigrateHashedStorageWorker.perform_async(id) unless in_use

  ProjectMigrateHashedStorageWorker.perform_in(Gitlab::ReferenceCounter::REFERENCE_EXPIRE_TIME, id)
end
# Assigns storage_version through the parent implementation and drops the
# memoized @storage whenever the attribute is reported as changed.
def storage_version=(value)
  super

  return unless storage_version_changed?

  @storage = nil
end
# Returns the value Gitlab::GlRepository.gl_repository builds for this project.
#
# is_wiki - forwarded unchanged; presumably selects the wiki repository
#           instead of the main one (confirm in Gitlab::GlRepository).
def gl_repository(is_wiki:)
Gitlab::GlRepository.gl_repository(self, is_wiki)
end
private
def storage
@storage ||=
if self.storage_version && self.storage_version >= 1
if hashed_storage?
Storage::HashedProject.new(self)
else
Storage::LegacyProject.new(self)
@ -1574,6 +1603,14 @@ class Project < ActiveRecord::Base
end
end
# Current value of the reference counter keyed by the main repository identifier.
def repo_reference_count
  counter = Gitlab::ReferenceCounter.new(gl_repository(is_wiki: false))
  counter.value
end
# Current value of the reference counter keyed by the wiki repository identifier.
def wiki_reference_count
  counter = Gitlab::ReferenceCounter.new(gl_repository(is_wiki: true))
  counter.value
end
# set last_activity_at to the same as created_at
def set_last_activity_at
update_column(:last_activity_at, self.created_at)

View file

@ -4,6 +4,7 @@ module Storage
delegate :gitlab_shell, :repository_storage_path, to: :project
# Name of the top-level folder that holds hashed repositories.
ROOT_PATH_PREFIX = '@hashed'.freeze
# storage_version value assigned to a project once it uses hashed storage.
STORAGE_VERSION = 1
def initialize(project)
@project = project

View file

@ -0,0 +1,68 @@
module Projects
  # Moves a project's repository (and its wiki repository, when one exists)
  # from the current disk path to the hashed storage path, then marks the
  # project as hashed and writable again.
  class HashedStorageMigrationService < BaseService
    include Gitlab::ShellAdapter

    attr_reader :old_disk_path, :new_disk_path

    # project - the Project to migrate
    # logger  - logger used for warnings; Rails.logger is used when nil
    def initialize(project, logger = nil)
      @project = project
      # Use the injected logger. The former `@logger ||= Rails.logger` never
      # read the argument, so a logger passed by the caller was ignored.
      @logger = logger || Rails.logger
    end

    # Runs the migration.
    #
    # Returns nil when the project is already hashed or when a move failed
    # (after attempting to roll back). On success returns the block's value
    # when a block is given, otherwise the result of the move.
    #
    # Raises whatever project.save! raises when the record cannot be saved.
    #
    # NOTE(review): on a failed move neither repository_read_only nor the
    # in-memory storage_version is reset here - confirm callers handle that.
    def execute
      return if project.hashed_storage?

      @old_disk_path = project.disk_path
      # Checked before storage_version is switched, i.e. while the project
      # still reports its pre-migration disk path.
      has_wiki = project.wiki.repository_exists?

      project.storage_version = Storage::HashedProject::STORAGE_VERSION
      project.ensure_storage_path_exists

      @new_disk_path = project.disk_path

      result = move_repository(@old_disk_path, @new_disk_path)

      if has_wiki
        result &&= move_repository("#{@old_disk_path}.wiki", "#{@new_disk_path}.wiki")
      end

      unless result
        rollback_folder_move
        return
      end

      project.repository_read_only = false
      project.save!

      block_given? ? yield : result
    end

    private

    # Moves "<from_name>.git" to "<to_name>.git" inside the project's storage.
    #
    # Returns false when the repository exists at neither location, true when
    # only the target exists (treated as already moved), otherwise the result
    # of gitlab_shell.mv_repository.
    def move_repository(from_name, to_name)
      from_exists = gitlab_shell.exists?(project.repository_storage_path, "#{from_name}.git")
      to_exists = gitlab_shell.exists?(project.repository_storage_path, "#{to_name}.git")

      # If we don't find the repository on either original or target we should log that as it could be an issue if the
      # project was not originally empty.
      if !from_exists && !to_exists
        logger.warn "Can't find a repository on either source or target paths for #{project.full_path} (ID=#{project.id}) ..."
        return false
      elsif !from_exists
        # Repository has been moved already.
        return true
      end

      gitlab_shell.mv_repository(project.repository_storage_path, from_name, to_name)
    end

    # Attempts to move both the repository and the wiki back to the old path.
    def rollback_folder_move
      move_repository(@new_disk_path, @old_disk_path)
      move_repository("#{@new_disk_path}.wiki", "#{@old_disk_path}.wiki")
    end

    def logger
      @logger
    end
  end
end

View file

@ -0,0 +1,11 @@
# Sidekiq worker that runs the hashed storage migration for one project.
class ProjectMigrateHashedStorageWorker
  include Sidekiq::Worker
  include DedicatedSidekiqQueue

  # project_id - ID of the project to migrate
  #
  # Returns nil without doing anything when the project cannot be found or
  # is pending deletion.
  def perform(project_id)
    project = Project.find_by(id: project_id)
    return unless project
    return if project.pending_delete?

    migration = ::Projects::HashedStorageMigrationService.new(project, logger)
    migration.execute
  end
end

View file

@ -0,0 +1,30 @@
# Sidekiq worker that walks a range of project IDs and asks each project to
# schedule its own migration to hashed storage.
class StorageMigratorWorker
  include Sidekiq::Worker
  include DedicatedSidekiqQueue

  BATCH_SIZE = 100

  # start  - lowest project ID to include (no lower bound when nil)
  # finish - highest project ID to include (no upper bound when nil)
  def perform(start, finish)
    candidates = build_relation(start, finish).with_route

    candidates.find_each(batch_size: BATCH_SIZE) do |project|
      migrate_with_logging(project)
    end
  end

  # Builds the Project relation restricted to the given inclusive ID bounds.
  def build_relation(start, finish)
    id_column = Project.arel_table[:id]

    scope = Project
    scope = scope.where(id_column.gteq(start)) if start
    scope = scope.where(id_column.lteq(finish)) if finish
    scope
  end

  private

  # Logs the start of the migration and triggers it; any StandardError is
  # logged and swallowed so the remaining projects are still processed.
  def migrate_with_logging(project)
    Rails.logger.info "Starting storage migration of #{project.full_path} (ID=#{project.id})..."

    project.migrate_to_hashed_storage!
  rescue => err
    Rails.logger.error("#{err.message} migrating storage of #{project.full_path} (ID=#{project.id}), trace - #{err.backtrace}")
  end
end

View file

@ -0,0 +1,5 @@
---
title: Script to migrate project's repositories to new Hashed Storage
merge_request: 14067
author:
type: added

View file

@ -62,3 +62,5 @@
- [update_user_activity, 1]
- [propagate_service_template, 1]
- [background_migration, 1]
- [project_migrate_hashed_storage, 1]
- [storage_migrator, 1]

View file

@ -0,0 +1,107 @@
# Repository Storage Rake Tasks
This is a collection of rake tasks you can use to help you list and migrate
existing projects from Legacy storage to the new Hashed storage type.
You can read more about the storage types [here][storage-types].
## List projects on Legacy storage
To have a simple summary of projects using **Legacy** storage:
**Omnibus Installation**
```bash
gitlab-rake gitlab:storage:legacy_projects
```
**Source Installation**
```bash
rake gitlab:storage:legacy_projects
```
------
To list projects using **Legacy** storage:
**Omnibus Installation**
```bash
gitlab-rake gitlab:storage:list_legacy_projects
```
**Source Installation**
```bash
rake gitlab:storage:list_legacy_projects
```
## List projects on Hashed storage
To have a simple summary of projects using **Hashed** storage:
**Omnibus Installation**
```bash
gitlab-rake gitlab:storage:hashed_projects
```
**Source Installation**
```bash
rake gitlab:storage:hashed_projects
```
------
To list projects using **Hashed** storage:
**Omnibus Installation**
```bash
gitlab-rake gitlab:storage:list_hashed_projects
```
**Source Installation**
```bash
rake gitlab:storage:list_hashed_projects
```
## Migrate existing projects to Hashed storage
Before migrating your existing projects, you should
[enable hashed storage][storage-migration] for the new projects as well.
This task will schedule all your existing projects to be migrated to the
**Hashed** storage type:
**Omnibus Installation**
```bash
gitlab-rake gitlab:storage:migrate_to_hashed
```
**Source Installation**
```bash
rake gitlab:storage:migrate_to_hashed
```
You can monitor the progress in the _Admin > Monitoring > Background jobs_ screen.
There is a specific queue you can watch to see how long it will take to finish: **project_migrate_hashed_storage**
After it reaches zero, you can confirm every project has been migrated by running the commands above.
If you find it necessary, you can run this migration script again to schedule missing projects.
Any error or warning will be logged in the Sidekiq log file.
[storage-types]: ../repository_storage_types.md
[storage-migration]: ../repository_storage_types.md#how-to-migrate-to-hashed-storage

View file

@ -0,0 +1,69 @@
# Repository Storage Types
> [Introduced][ce-28283] in GitLab 10.0.
## Legacy Storage
Legacy Storage is the storage behavior prior to version 10.0. For historical reasons, GitLab replicated the same
mapping structure as the project URLs:
* Project's repository: `#{namespace}/#{project_name}.git`
* Project's wiki: `#{namespace}/#{project_name}.wiki.git`
This structure made it simple to migrate from existing solutions to GitLab and easy for administrators to find where the
repository is stored.
On the other hand this has some drawbacks:
The storage location will concentrate a huge number of top-level namespaces. The impact can be reduced by the introduction of [multiple storage paths][storage-paths].
Because Backups are a snapshot of the same URL mapping, if you try to recover a very old backup, you need to verify
if any project has taken the place of an old removed project sharing the same URL. This means that `mygroup/myproject`
from your backup may not be the same original project that is today in the same URL.
Any change in the URL will need to be reflected on disk (when groups / users or projects are renamed). This can add a lot
of load in big installations, and can be even worse if they are using any type of network-based filesystem.
Last, for GitLab Geo, this storage type means we have to synchronize the disk state and replicate renames in the correct
order, or we may end up with the wrong repository or temporarily missing data.
## Hashed Storage
Hashed Storage is the new storage behavior we are rolling out with 10.0. It's not enabled by default yet, but we
encourage everyone to try it and take the time to fix any script you may have that depends on the old behavior.
Instead of coupling project URL and the folder structure where the repository will be stored on disk, we are coupling
a hash, based on the project's ID.
This makes the folder structure immutable, and therefore eliminates any requirement to synchronize state from URLs to
disk structure. This means that renaming a group, user or project will cost only the database transaction, and will take
effect immediately.
The hash also helps to spread the repositories more evenly on the disk, so the top-level directory will contain fewer
folders than the total number of top-level namespaces.
Hash format is based on hexadecimal representation of SHA256: `SHA256(project.id)`.
Top-level folder uses first 2 characters, followed by another folder with the next 2 characters. They are both stored in
a special folder `@hashed`, to co-exist with existing Legacy projects:
```ruby
# Project's repository:
"@hashed/#{hash[0..1]}/#{hash[2..3]}/#{hash}.git"
# Wiki's repository:
"@hashed/#{hash[0..1]}/#{hash[2..3]}/#{hash}.wiki.git"
```
This new format also makes it possible to restore backups with confidence, as when restoring a repository from the backup,
you will never mistakenly restore a repository in the wrong project (considering the backup is made after the migration).
### How to migrate to Hashed Storage
In GitLab, go to **Admin > Settings**, find the **Repository Storage** section and select
"_Create new projects using hashed storage paths_".
To migrate your existing projects to the new storage type, check the specific [rake tasks].
[ce-28283]: https://gitlab.com/gitlab-org/gitlab-ce/issues/28283
[rake tasks]: raketasks/storage.md#migrate-existing-projects-to-hashed-storage
[storage-paths]: repository_storage_paths.md

View file

@ -0,0 +1,85 @@
namespace :gitlab do
namespace :storage do
# Enqueues one StorageMigratorWorker job per batch of legacy-storage project
# IDs, printing a dot for every batch scheduled.
desc 'GitLab | Storage | Migrate existing projects to Hashed Storage'
task migrate_to_hashed: :environment do
  pending_count = Project.with_legacy_storage.count

  if pending_count.zero?
    puts 'There are no projects using legacy storage. Nothing to do!'
  else
    print "Enqueuing migration of #{pending_count} projects in batches of #{batch_size}"

    project_id_batches do |first_id, last_id|
      StorageMigratorWorker.perform_async(first_id, last_id)

      print '.'
    end

    puts ' Done!'
  end
end
# Summary / listing tasks. Each one passes the matching Project scope to the
# shared projects_summary or projects_list helper.
desc 'GitLab | Storage | Summary of existing projects using Legacy Storage'
task legacy_projects: :environment do
  projects_summary(Project.with_legacy_storage)
end

desc 'GitLab | Storage | List existing projects using Legacy Storage'
task list_legacy_projects: :environment do
  projects_list(Project.with_legacy_storage)
end

desc 'GitLab | Storage | Summary of existing projects using Hashed Storage'
task hashed_projects: :environment do
  projects_summary(Project.with_hashed_storage)
end

desc 'GitLab | Storage | List existing projects using Hashed Storage'
task list_hashed_projects: :environment do
  projects_list(Project.with_hashed_storage)
end
# Batch size taken from the BATCH environment variable (default: 200),
# converted with to_i.
def batch_size
  configured = ENV['BATCH'] || 200
  configured.to_i
end
# Walks legacy-storage projects in batches of batch_size, optionally bounded
# by the ID_FROM / ID_TO environment variables, and yields the lowest and
# highest project ID of every batch.
def project_id_batches(&block)
  legacy_projects = Project.with_legacy_storage
  batch_options = { of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO'] }

  legacy_projects.in_batches(**batch_options) do |relation| # rubocop: disable Cop/InBatches
    yield(*relation.pluck(:id).minmax)
  end
end
# Prints how many records the relation contains and returns that count.
def projects_summary(relation)
  relation.count.tap do |total|
    puts "* Found #{total} projects".color(:green)
  end
end
# Prints a summary followed by one line per project, up to LIMIT entries
# (environment variable, default: 500).
#
# The limit is checked before each line is printed, so a LIMIT of zero (or a
# negative value) lists no projects at all instead of always printing one.
#
# Returns nil.
def projects_list(relation)
  projects_count = projects_summary(relation)

  projects = relation.with_route
  limit = ENV.fetch('LIMIT', 500).to_i

  return unless projects_count > 0

  puts " ! Displaying first #{limit} projects..." if projects_count > limit

  shown = 0
  projects.find_in_batches(batch_size: batch_size) do |batch|
    batch.each do |project|
      # Leave the whole method as soon as the limit has been reached.
      return if shown >= limit # rubocop:disable Lint/NonLocalExitFromIterator

      shown += 1
      puts " - #{project.full_path} (id: #{project.id})".color(:red)
    end
  end
end
end
end

View file

@ -143,7 +143,13 @@ FactoryGirl.define do
end
end
trait :read_only_repository do
trait :wiki_repo do
after(:create) do |project|
raise 'Failed to create wiki repository!' unless project.create_wiki
end
end
trait :readonly do
repository_read_only true
end

View file

@ -2363,10 +2363,22 @@ describe Project do
# Legacy storage is reported for both an unset and a zero storage_version.
describe '#legacy_storage?' do
  it 'returns true when storage_version is nil' do
    project = build(:project, storage_version: nil)

    expect(project.legacy_storage?).to be_truthy
  end

  it 'returns true when the storage_version is 0' do
    project = build(:project, storage_version: 0)

    expect(project.legacy_storage?).to be_truthy
  end
end
# `project` comes from the enclosing context, which is outside this hunk.
describe '#hashed_storage?' do
it 'returns false' do
expect(project.hashed_storage?).to be_falsey
end
end
describe '#rename_repo' do
@ -2425,6 +2437,38 @@ describe Project do
expect(project.pages_path).to eq(File.join(Settings.pages.path, project.namespace.full_path, project.path))
end
end
# Scheduling behaviour of the migration; `project` comes from the enclosing
# context, which is outside this hunk.
describe '#migrate_to_hashed_storage!' do
# Only truthiness of the return value is checked here.
it 'returns true' do
expect(project.migrate_to_hashed_storage!).to be_truthy
end
it 'flags as readonly' do
expect { project.migrate_to_hashed_storage! }.to change { project.repository_read_only }.to(true)
end
it 'schedules ProjectMigrateHashedStorageWorker with delayed start when the project repo is in use' do
# Bump the main repository's reference counter so it reads as in use.
Gitlab::ReferenceCounter.new(project.gl_repository(is_wiki: false)).increase
expect(ProjectMigrateHashedStorageWorker).to receive(:perform_in)
project.migrate_to_hashed_storage!
end
it 'schedules ProjectMigrateHashedStorageWorker with delayed start when the wiki repo is in use' do
# Same as above, but for the wiki repository's counter.
Gitlab::ReferenceCounter.new(project.gl_repository(is_wiki: true)).increase
expect(ProjectMigrateHashedStorageWorker).to receive(:perform_in)
project.migrate_to_hashed_storage!
end
# No counter was increased here, so the job is expected to be enqueued immediately.
it 'schedules ProjectMigrateHashedStorageWorker' do
expect(ProjectMigrateHashedStorageWorker).to receive(:perform_async).with(project.id)
project.migrate_to_hashed_storage!
end
end
end
context 'hashed storage' do
@ -2438,6 +2482,18 @@ describe Project do
allow(project).to receive(:gitlab_shell).and_return(gitlab_shell)
end
# Storage predicates inside the 'hashed storage' context; `project` is set up
# by that context, outside this hunk.
describe '#legacy_storage?' do
it 'returns false' do
expect(project.legacy_storage?).to be_falsey
end
end
describe '#hashed_storage?' do
it 'returns true' do
expect(project.hashed_storage?).to be_truthy
end
end
describe '#base_dir' do
it 'returns base_dir based on hash of project id' do
expect(project.base_dir).to eq('@hashed/6b/86')
@ -2508,6 +2564,26 @@ describe Project do
expect(project.pages_path).to eq(File.join(Settings.pages.path, project.namespace.full_path, project.path))
end
end
# Inside the 'hashed storage' context the migration is expected to be a no-op.
describe '#migrate_to_hashed_storage!' do
it 'returns nil' do
expect(project.migrate_to_hashed_storage!).to be_nil
end
it 'does not flag as readonly' do
expect { project.migrate_to_hashed_storage! }.not_to change { project.repository_read_only }
end
end
end
# Verifies that the keyword argument is handed over positionally together
# with the project itself.
describe '#gl_repository' do
let(:project) { create(:project) }
it 'delegates to Gitlab::GlRepository.gl_repository' do
expect(Gitlab::GlRepository).to receive(:gl_repository).with(project, true)
project.gl_repository(is_wiki: true)
end
end
describe '#has_ci?' do

View file

@ -0,0 +1,74 @@
require 'spec_helper'
# Exercises the migration service against a project that has both a
# repository and a wiki repository.
describe Projects::HashedStorageMigrationService do
let(:gitlab_shell) { Gitlab::Shell.new }
let(:project) { create(:project, :empty_repo, :wiki_repo) }
let(:service) { described_class.new(project) }
let(:legacy_storage) { Storage::LegacyProject.new(project) }
let(:hashed_storage) { Storage::HashedProject.new(project) }
describe '#execute' do
before do
# Make the service use the same shell instance the examples inspect and
# set expectations on.
allow(service).to receive(:gitlab_shell) { gitlab_shell }
end
context 'when succeeds' do
it 'renames project and wiki repositories' do
service.execute
expect(gitlab_shell.exists?(project.repository_storage_path, "#{hashed_storage.disk_path}.git")).to be_truthy
expect(gitlab_shell.exists?(project.repository_storage_path, "#{hashed_storage.disk_path}.wiki.git")).to be_truthy
end
it 'updates project to be hashed and not readonly' do
service.execute
expect(project.hashed_storage?).to be_truthy
expect(project.repository_read_only).to be_falsey
end
it 'move operation is called for both repositories' do
expect_move_repository(project.disk_path, hashed_storage.disk_path)
expect_move_repository("#{project.disk_path}.wiki", "#{hashed_storage.disk_path}.wiki")
service.execute
end
end
context 'when one move fails' do
it 'rollsback repositories to original name' do
from_name = project.disk_path
to_name = hashed_storage.disk_path
# All moves run for real except the main repository move, which is
# stubbed to report failure.
allow(service).to receive(:move_repository).and_call_original
allow(service).to receive(:move_repository).with(from_name, to_name).once { false } # will disable first move only
expect(service).to receive(:rollback_folder_move).and_call_original
service.execute
expect(gitlab_shell.exists?(project.repository_storage_path, "#{hashed_storage.disk_path}.git")).to be_falsey
expect(gitlab_shell.exists?(project.repository_storage_path, "#{hashed_storage.disk_path}.wiki.git")).to be_falsey
end
context 'when rollback fails' do
before do
# Pre-move the main repository to the hashed path, so that only the
# wiki is still at its legacy location when the service runs.
from_name = legacy_storage.disk_path
to_name = hashed_storage.disk_path
hashed_storage.ensure_storage_path_exists
gitlab_shell.mv_repository(project.repository_storage_path, from_name, to_name)
end
it 'does not try to move nil repository over hashed' do
# The only mv_repository expectation set is for the wiki move.
expect_move_repository("#{project.disk_path}.wiki", "#{hashed_storage.disk_path}.wiki")
service.execute
end
end
end
# Expects gitlab_shell.mv_repository to be called with the given names while
# still performing the real move.
def expect_move_repository(from_name, to_name)
expect(gitlab_shell).to receive(:mv_repository).with(project.repository_storage_path, from_name, to_name).and_call_original
end
end
end

View file

@ -0,0 +1,52 @@
require 'rake_helper'
# Covers how gitlab:storage:migrate_to_hashed splits legacy projects into
# StorageMigratorWorker jobs depending on the BATCH environment variable.
describe 'gitlab:storage rake tasks' do
before do
Rake.application.rake_require 'tasks/gitlab/storage'
stub_warn_user_is_not_gitlab
end
describe 'migrate_to_hashed rake task' do
context '0 legacy projects' do
it 'does nothing' do
expect(StorageMigratorWorker).not_to receive(:perform_async)
run_rake_task('gitlab:storage:migrate_to_hashed')
end
end
context '5 legacy projects' do
let(:projects) { create_list(:project, 5, storage_version: 0) }
context 'in batches of 1' do
before do
stub_env('BATCH' => 1)
end
it 'enqueues one StorageMigratorWorker per project' do
# With one project per batch, start and finish are the same ID.
projects.each do |project|
expect(StorageMigratorWorker).to receive(:perform_async).with(project.id, project.id)
end
run_rake_task('gitlab:storage:migrate_to_hashed')
end
end
context 'in batches of 2' do
before do
stub_env('BATCH' => 2)
end
it 'enqueues one StorageMigratorWorker per 2 projects' do
projects.map(&:id).sort.each_slice(2) do |first, last|
# The final slice holds a single ID, so it is both start and finish.
last ||= first
expect(StorageMigratorWorker).to receive(:perform_async).with(first, last)
end
run_rake_task('gitlab:storage:migrate_to_hashed')
end
end
end
end
end

View file

@ -0,0 +1,29 @@
require 'spec_helper'
# The worker must skip missing or pending-delete projects and otherwise hand
# the project and its logger to the migration service.
describe ProjectMigrateHashedStorageWorker do
  describe '#perform' do
    let(:project) { create(:project, :empty_repo) }
    let(:pending_delete_project) { create(:project, :empty_repo, pending_delete: true) }

    it 'skips when project no longer exists' do
      nonexistent_id = 999999999999

      expect(::Projects::HashedStorageMigrationService).not_to receive(:new)
      subject.perform(nonexistent_id)
    end

    it 'skips when project is pending delete' do
      expect(::Projects::HashedStorageMigrationService).not_to receive(:new)

      subject.perform(pending_delete_project.id)
    end

    # Description corrected: this example covers the migration, not a removal.
    it 'delegates migration to service class' do
      service = double('service')
      expect(::Projects::HashedStorageMigrationService).to receive(:new).with(project, subject.logger).and_return(service)
      expect(service).to receive(:execute)

      subject.perform(project.id)
    end
  end
end

View file

@ -0,0 +1,30 @@
require 'spec_helper'
# Runs the worker over the ID range spanned by two freshly created projects.
describe StorageMigratorWorker do
subject(:worker) { described_class.new }
let(:projects) { create_list(:project, 2) }
describe '#perform' do
let(:ids) { projects.map(&:id) }
it 'enqueue jobs to ProjectMigrateHashedStorageWorker' do
expect(ProjectMigrateHashedStorageWorker).to receive(:perform_async).twice
worker.perform(ids.min, ids.max)
end
it 'sets projects as read only' do
# Stub the per-project job so only the read-only flag is under test.
allow(ProjectMigrateHashedStorageWorker).to receive(:perform_async).twice
worker.perform(ids.min, ids.max)
projects.each do |project|
expect(project.reload.repository_read_only?).to be_truthy
end
end
it 'rescues and log exceptions' do
# Every project raises; the worker is expected to swallow the error.
allow_any_instance_of(Project).to receive(:migrate_to_hashed_storage!).and_raise(StandardError)
expect { worker.perform(ids.min, ids.max) }.not_to raise_error
end
end
end