From d83ee2bbd10d8fe2f2e9521bb3c266cf696aa98c Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Fri, 2 Jun 2017 17:12:36 +0200 Subject: [PATCH] Add the ability to perform background migrations Background migrations can be used to perform long running data migrations without these blocking a deployment procedure. See MR https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11854 for more information. --- app/workers/background_migration_worker.rb | 23 ++ config/sidekiq_queues.yml | 1 + doc/development/README.md | 1 + doc/development/background_migrations.md | 205 ++++++++++++++++++ lib/gitlab/background_migration.rb | 31 +++ lib/gitlab/background_migration/.gitkeep | 0 spec/lib/gitlab/background_migration_spec.rb | 48 ++++ .../background_migration_worker_spec.rb | 13 ++ 8 files changed, 322 insertions(+) create mode 100644 app/workers/background_migration_worker.rb create mode 100644 doc/development/background_migrations.md create mode 100644 lib/gitlab/background_migration.rb create mode 100644 lib/gitlab/background_migration/.gitkeep create mode 100644 spec/lib/gitlab/background_migration_spec.rb create mode 100644 spec/workers/background_migration_worker_spec.rb diff --git a/app/workers/background_migration_worker.rb b/app/workers/background_migration_worker.rb new file mode 100644 index 00000000000..e85e221d353 --- /dev/null +++ b/app/workers/background_migration_worker.rb @@ -0,0 +1,23 @@ +class BackgroundMigrationWorker + include Sidekiq::Worker + include DedicatedSidekiqQueue + + # Schedules a number of jobs in bulk + # + # The `jobs` argument should be an Array of Arrays, each sub-array must be in + # the form: + # + # [migration-class, [arg1, arg2, ...]] + def self.perform_bulk(*jobs) + Sidekiq::Client.push_bulk('class' => self, + 'queue' => sidekiq_options['queue'], + 'args' => jobs) + end + + # Performs the background migration. + # + # See Gitlab::BackgroundMigration.perform for more information. + def perform(class_name, arguments = []) + Gitlab::BackgroundMigration.perform(class_name, arguments) + end +end diff --git a/config/sidekiq_queues.yml b/config/sidekiq_queues.yml index 93df2d6f5ff..1d9e69a2408 100644 --- a/config/sidekiq_queues.yml +++ b/config/sidekiq_queues.yml @@ -54,3 +54,4 @@ - [system_hook_push, 1] - [update_user_activity, 1] - [propagate_service_template, 1] + - [background_migration, 1] diff --git a/doc/development/README.md b/doc/development/README.md index 40addfd8a4c..9496a87d84d 100644 --- a/doc/development/README.md +++ b/doc/development/README.md @@ -53,6 +53,7 @@ - [Serializing Data](serializing_data.md) - [Polymorphic Associations](polymorphic_associations.md) - [Single Table Inheritance](single_table_inheritance.md) +- [Background Migrations](background_migrations.md) ## i18n diff --git a/doc/development/background_migrations.md b/doc/development/background_migrations.md new file mode 100644 index 00000000000..0239e6b3163 --- /dev/null +++ b/doc/development/background_migrations.md @@ -0,0 +1,205 @@ +# Background Migrations + +Background migrations can be used to perform data migrations that would +otherwise take a very long time (hours, days, years, etc) to complete. For +example, you can use background migrations to migrate data so that instead of +storing data in a single JSON column the data is stored in a separate table. + +## When To Use Background Migrations + +In the vast majority of cases you will want to use a regular Rails migration +instead. Background migrations should _only_ be used when migrating _data_ in +tables that have so many rows this process would take hours when performed in a +regular Rails migration. + +Background migrations _may not_ be used to perform schema migrations, they +should only be used for data migrations. + +Some examples where background migrations can be useful: + +* Migrating events from one table to multiple separate tables. +* Populating one column based on JSON stored in another column. +* Migrating data that depends on the output of exernal services (e.g. an API). + +## Isolation + +Background migrations must be isolated and can not use application code (e.g. +models defined in `app/models`). Since these migrations can take a long time to +run it's possible for new versions to be deployed while they are still running. + +It's also possible for different migrations to be executed at the same time. +This means that different background migrations should not migrate data in a +way that would cause conflicts. + +## How It Works + +Background migrations are simple classes that define a `perform` method. A +Sidekiq worker will then execute such a class, passing any arguments to it. All +migration classes must be defined in the namespace +`Gitlab::BackgroundMigration`, the files should be placed in the directory +`lib/gitlab/background_migration/`. + +## Scheduling + +Scheduling a migration can be done in either a regular migration or a +post-deployment migration. To do so, simply use the following code while +replacing the class name and arguments with whatever values are necessary for +your migration: + +```ruby +BackgroundMigrationWorker.perform_async('BackgroundMigrationClassName', [arg1, arg2, ...]) +``` + +Usually it's better to schedule jobs in bulk, for this you can use +`BackgroundMigrationWorker.perform_bulk`: + +```ruby +BackgroundMigrationWorker.perform_bulk( + ['BackgroundMigrationClassName', [1]], + ['BackgroundMigrationClassName', [2]], + ... +) +``` + +You'll also need to make sure that newly created data is either migrated, or +saved in both the old and new version upon creation. For complex and time +consuming migrations it's best to schedule a background job using an +`after_create` hook so this doesn't affect response timings. The same applies to +updates. Removals in turn can be handled by simply defining foreign keys with +cascading deletes. + +## Cleaning Up + +Because background migrations can take a long time you can't immediately clean +things up after scheduling them. For example, you can't drop a column that's +used in the migration process as this would cause jobs to fail. This means that +you'll need to add a separate _post deployment_ migration in a future release +that finishes any remaining jobs before cleaning things up (e.g. removing a +column). + +As an example, say you want to migrate the data from column `foo` (containing a +big JSON blob) to column `bar` (containing a string). The process for this would +roughly be as follows: + +1. Release A: + 1. Create a migration class that perform the migration for a row with a given ID. + 1. Deploy the code for this release, this should include some code that will + schedule jobs for newly created data (e.g. using an `after_create` hook). + 1. Schedule jobs for all existing rows in a post-deployment migration. It's + possible some newly created rows may be scheduled twice so your migration + should take care of this. +1. Release B: + 1. Deploy code so that the application starts using the new column and stops + scheduling jobs for newly created data. + 1. In a post-deployment migration you'll need to ensure no jobs remain. To do + so you can use `Gitlab::BackgroundMigration.steal` to process any remaining + jobs before continueing. + 1. Remove the old column. + +## Example + +To explain all this, let's use the following example: the table `services` has a +field called `properties` which is stored in JSON. For all rows you want to +extract the `url` key from this JSON object and store it in the `services.url` +column. There are millions of services and parsing JSON is slow, thus you can't +do this in a regular migration. + +To do this using a background migration we'll start with defining our migration +class: + +```ruby +class Gitlab::BackgroundMigration::ExtractServicesUrl + class Service < ActiveRecord::Base + self.table_name = 'services' + end + + def perform(service_id) + # A row may be removed between scheduling and starting of a job, thus we + # need to make sure the data is still present before doing any work. + service = Service.select(:properties).find_by(id: service_id) + + return unless service + + begin + json = JSON.load(service.properties) + rescue JSON::ParserError + # If the JSON is invalid we don't want to keep the job around forever, + # instead we'll just leave the "url" field to whatever the default value + # is. + return + end + + service.update(url: json['url']) if json['url'] + end +end +``` + +Next we'll need to adjust our code so we schedule the above migration for newly +created and updated services. We can do this using something along the lines of +the following: + +```ruby +class Service < ActiveRecord::Base + after_commit :schedule_service_migration, on: :update + after_commit :schedule_service_migration, on: :create + + def schedule_service_migration + BackgroundMigrationWorker.perform_async('ExtractServicesUrl', [id]) + end +end +``` + +We're using `after_commit` here to ensure the Sidekiq job is not scheduled +before the transaction completes as doing so can lead to race conditions where +the changes are not yet visible to the worker. + +Next we'll need a post-deployment migration that schedules the migration for +existing data. Since we're dealing with a lot of rows we'll schedule jobs in +batches instead of doing this one by one: + +```ruby +class ScheduleExtractServicesUrl < ActiveRecord::Migration + disable_ddl_transaction! + + class Service < ActiveRecord::Base + self.table_name = 'services' + end + + def up + Service.select(:id).in_batches do |relation| + jobs = relation.pluck(:id).map do |id| + ['ExtractServicesUrl', [id]] + end + + BackgroundMigrationWorker.perform_bulk(jobs) + end + end + + def down + end +end +``` + +Once deployed our application will continue using the data as before but at the +same time will ensure that both existing and new data is migrated. + +In the next release we can remove the `after_commit` hooks and related code. We +will also need to add a post-deployment migration that consumes any remaining +jobs. Such a migration would look like this: + +```ruby +class ConsumeRemainingExtractServicesUrlJobs < ActiveRecord::Migration + disable_ddl_transaction! + + def up + Gitlab::BackgroundMigration.steal('ExtractServicesUrl') + end + + def down + end +end +``` + +This migration will then process any jobs for the ExtractServicesUrl migration +and continue once all jobs have been processed. Once done you can safely remove +the `services.properties` column. diff --git a/lib/gitlab/background_migration.rb b/lib/gitlab/background_migration.rb new file mode 100644 index 00000000000..914a3b72abd --- /dev/null +++ b/lib/gitlab/background_migration.rb @@ -0,0 +1,31 @@ +module Gitlab + module BackgroundMigration + # Begins stealing jobs from the background migrations queue, blocking the + # caller until all jobs have been completed. + # + # steal_class - The name of the class for which to steal jobs. + def self.steal(steal_class) + queue = Sidekiq::Queue. + new(BackgroundMigrationWorker.sidekiq_options['queue']) + + queue.each do |job| + migration_class, migration_args = job.args + + next unless migration_class == steal_class + + perform(migration_class, migration_args) + + job.delete + end + end + + # class_name - The name of the background migration class as defined in the + # Gitlab::BackgroundMigration namespace. + # + # arguments - The arguments to pass to the background migration's "perform" + # method. + def self.perform(class_name, arguments) + const_get(class_name).new.perform(*arguments) + end + end +end diff --git a/lib/gitlab/background_migration/.gitkeep b/lib/gitlab/background_migration/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/spec/lib/gitlab/background_migration_spec.rb b/spec/lib/gitlab/background_migration_spec.rb new file mode 100644 index 00000000000..f2073b9bcb3 --- /dev/null +++ b/spec/lib/gitlab/background_migration_spec.rb @@ -0,0 +1,48 @@ +require 'spec_helper' + +describe Gitlab::BackgroundMigration do + describe '.steal' do + it 'steals jobs from a queue' do + queue = [double(:job, args: ['Foo', [10, 20]])] + + allow(Sidekiq::Queue).to receive(:new). + with(BackgroundMigrationWorker.sidekiq_options['queue']). + and_return(queue) + + expect(queue[0]).to receive(:delete) + + expect(described_class).to receive(:perform).with('Foo', [10, 20]) + + described_class.steal('Foo') + end + + it 'does not steal jobs for a different migration' do + queue = [double(:job, args: ['Foo', [10, 20]])] + + allow(Sidekiq::Queue).to receive(:new). + with(BackgroundMigrationWorker.sidekiq_options['queue']). + and_return(queue) + + expect(described_class).not_to receive(:perform) + + expect(queue[0]).not_to receive(:delete) + + described_class.steal('Bar') + end + end + + describe '.perform' do + it 'performs a background migration' do + instance = double(:instance) + klass = double(:klass, new: instance) + + expect(described_class).to receive(:const_get). + with('Foo'). + and_return(klass) + + expect(instance).to receive(:perform).with(10, 20) + + described_class.perform('Foo', [10, 20]) + end + end +end diff --git a/spec/workers/background_migration_worker_spec.rb b/spec/workers/background_migration_worker_spec.rb new file mode 100644 index 00000000000..0d742ae9dc7 --- /dev/null +++ b/spec/workers/background_migration_worker_spec.rb @@ -0,0 +1,13 @@ +require 'spec_helper' + +describe BackgroundMigrationWorker do + describe '.perform' do + it 'performs a background migration' do + expect(Gitlab::BackgroundMigration). + to receive(:perform). + with('Foo', [10, 20]) + + described_class.new.perform('Foo', [10, 20]) + end + end +end