Migrate MR commits and diffs to new tables
Previously, we stored these as serialised fields - `st_{commits,diffs}` - on the `merge_request_diffs` table. These now have their own tables - `merge_request_diff_{commits,diffs}` - with a column for each attribute of the serialised data. Add a background migration to go through the existing MR diffs and migrate them to the new format. Ignore any contents that cannot be displayed. Assuming that we have 5 million rows to migrate, and each batch of 2,500 rows can be completed in 5 minutes, this will take about 7 days to migrate everything.
This commit is contained in:
parent
1018ab0516
commit
f2d50af917
5 changed files with 388 additions and 5 deletions
|
@ -85,11 +85,7 @@ class MergeRequestDiff < ActiveRecord::Base
|
||||||
|
|
||||||
def raw_diffs(options = {})
|
def raw_diffs(options = {})
|
||||||
if options[:ignore_whitespace_change]
|
if options[:ignore_whitespace_change]
|
||||||
@diffs_no_whitespace ||=
|
@diffs_no_whitespace ||= compare.diffs(options)
|
||||||
Gitlab::Git::Compare.new(
|
|
||||||
repository.raw_repository,
|
|
||||||
safe_start_commit_sha,
|
|
||||||
head_commit_sha).diffs(options)
|
|
||||||
else
|
else
|
||||||
@raw_diffs ||= {}
|
@raw_diffs ||= {}
|
||||||
@raw_diffs[options] ||= load_diffs(options)
|
@raw_diffs[options] ||= load_diffs(options)
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
class ScheduleMergeRequestDiffMigrations < ActiveRecord::Migration
|
||||||
|
include Gitlab::Database::MigrationHelpers
|
||||||
|
|
||||||
|
DOWNTIME = false
|
||||||
|
BATCH_SIZE = 2500
|
||||||
|
MIGRATION = 'DeserializeMergeRequestDiffsAndCommits'
|
||||||
|
|
||||||
|
disable_ddl_transaction!
|
||||||
|
|
||||||
|
class MergeRequestDiff < ActiveRecord::Base
|
||||||
|
self.table_name = 'merge_request_diffs'
|
||||||
|
|
||||||
|
include ::EachBatch
|
||||||
|
end
|
||||||
|
|
||||||
|
# Assuming that there are 5 million rows affected (which is more than on
|
||||||
|
# GitLab.com), and that each batch of 2,500 rows takes up to 5 minutes, then
|
||||||
|
# we can migrate all the rows in 7 days.
|
||||||
|
#
|
||||||
|
# On staging, plucking the IDs themselves takes 5 seconds.
|
||||||
|
def up
|
||||||
|
non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL'
|
||||||
|
|
||||||
|
MergeRequestDiff.where(non_empty).each_batch(of: BATCH_SIZE) do |relation, index|
|
||||||
|
range = relation.pluck('MIN(id)', 'MAX(id)').first
|
||||||
|
|
||||||
|
BackgroundMigrationWorker.perform_in(index * 5.minutes, MIGRATION, range)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def down
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,107 @@
|
||||||
|
module Gitlab
|
||||||
|
module BackgroundMigration
|
||||||
|
class DeserializeMergeRequestDiffsAndCommits
|
||||||
|
attr_reader :diff_ids, :commit_rows, :file_rows
|
||||||
|
|
||||||
|
class MergeRequestDiff < ActiveRecord::Base
|
||||||
|
self.table_name = 'merge_request_diffs'
|
||||||
|
end
|
||||||
|
|
||||||
|
BUFFER_ROWS = 1000
|
||||||
|
|
||||||
|
def perform(start_id, stop_id)
|
||||||
|
merge_request_diffs = MergeRequestDiff
|
||||||
|
.select(:id, :st_commits, :st_diffs)
|
||||||
|
.where('st_commits IS NOT NULL OR st_diffs IS NOT NULL')
|
||||||
|
.where(id: start_id..stop_id)
|
||||||
|
|
||||||
|
reset_buffers!
|
||||||
|
|
||||||
|
merge_request_diffs.each do |merge_request_diff|
|
||||||
|
commits, files = single_diff_rows(merge_request_diff)
|
||||||
|
|
||||||
|
diff_ids << merge_request_diff.id
|
||||||
|
commit_rows.concat(commits)
|
||||||
|
file_rows.concat(files)
|
||||||
|
|
||||||
|
if diff_ids.length > BUFFER_ROWS ||
|
||||||
|
commit_rows.length > BUFFER_ROWS ||
|
||||||
|
file_rows.length > BUFFER_ROWS
|
||||||
|
|
||||||
|
flush_buffers!
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
flush_buffers!
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def reset_buffers!
|
||||||
|
@diff_ids = []
|
||||||
|
@commit_rows = []
|
||||||
|
@file_rows = []
|
||||||
|
end
|
||||||
|
|
||||||
|
def flush_buffers!
|
||||||
|
if diff_ids.any?
|
||||||
|
MergeRequestDiff.transaction do
|
||||||
|
Gitlab::Database.bulk_insert('merge_request_diff_commits', commit_rows)
|
||||||
|
Gitlab::Database.bulk_insert('merge_request_diff_files', file_rows)
|
||||||
|
|
||||||
|
MergeRequestDiff.where(id: diff_ids).update_all(st_commits: nil, st_diffs: nil)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
reset_buffers!
|
||||||
|
end
|
||||||
|
|
||||||
|
def single_diff_rows(merge_request_diff)
|
||||||
|
sha_attribute = Gitlab::Database::ShaAttribute.new
|
||||||
|
commits = YAML.load(merge_request_diff.st_commits) rescue []
|
||||||
|
|
||||||
|
commit_rows = commits.map.with_index do |commit, index|
|
||||||
|
commit_hash = commit.to_hash.with_indifferent_access.except(:parent_ids)
|
||||||
|
sha = commit_hash.delete(:id)
|
||||||
|
|
||||||
|
commit_hash.merge(
|
||||||
|
merge_request_diff_id: merge_request_diff.id,
|
||||||
|
relative_order: index,
|
||||||
|
sha: sha_attribute.type_cast_for_database(sha)
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
diffs = YAML.load(merge_request_diff.st_diffs) rescue []
|
||||||
|
diffs = [] unless valid_raw_diffs?(diffs)
|
||||||
|
|
||||||
|
file_rows = diffs.map.with_index do |diff, index|
|
||||||
|
diff_hash = diff.to_hash.with_indifferent_access.merge(
|
||||||
|
binary: false,
|
||||||
|
merge_request_diff_id: merge_request_diff.id,
|
||||||
|
relative_order: index
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compatibility with old diffs created with Psych.
|
||||||
|
diff_hash.tap do |hash|
|
||||||
|
diff_text = hash[:diff]
|
||||||
|
|
||||||
|
if diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only?
|
||||||
|
hash[:binary] = true
|
||||||
|
hash[:diff] = [diff_text].pack('m0')
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
[commit_rows, file_rows]
|
||||||
|
end
|
||||||
|
|
||||||
|
# Unlike MergeRequestDiff#valid_raw_diff?, don't count Rugged objects as
|
||||||
|
# valid, because we don't render them usefully anyway.
|
||||||
|
def valid_raw_diffs?(diffs)
|
||||||
|
return false unless diffs.respond_to?(:each)
|
||||||
|
|
||||||
|
diffs.all? { |diff| diff.is_a?(Hash) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,188 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe Gitlab::BackgroundMigration::DeserializeMergeRequestDiffsAndCommits do
|
||||||
|
describe '#perform' do
|
||||||
|
set(:merge_request) { create(:merge_request) }
|
||||||
|
set(:merge_request_diff) { merge_request.merge_request_diff }
|
||||||
|
let(:updated_merge_request_diff) { MergeRequestDiff.find(merge_request_diff.id) }
|
||||||
|
|
||||||
|
def diffs_to_hashes(diffs)
|
||||||
|
diffs.as_json(only: Gitlab::Git::Diff::SERIALIZE_KEYS).map(&:with_indifferent_access)
|
||||||
|
end
|
||||||
|
|
||||||
|
def quote_yaml(value)
|
||||||
|
MergeRequestDiff.connection.quote(YAML.dump(value))
|
||||||
|
end
|
||||||
|
|
||||||
|
def convert_to_yaml(merge_request_diff_id, commits, diffs)
|
||||||
|
MergeRequestDiff.where(id: merge_request_diff_id).update_all(
|
||||||
|
"st_commits = #{quote_yaml(commits)}, st_diffs = #{quote_yaml(diffs)}"
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
shared_examples 'updated MR diff' do
|
||||||
|
before do
|
||||||
|
convert_to_yaml(merge_request_diff.id, commits, diffs)
|
||||||
|
|
||||||
|
MergeRequestDiffCommit.delete_all
|
||||||
|
MergeRequestDiffFile.delete_all
|
||||||
|
|
||||||
|
subject.perform(merge_request_diff.id, merge_request_diff.id)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'creates correct entries in the merge_request_diff_commits table' do
|
||||||
|
expect(updated_merge_request_diff.merge_request_diff_commits.count).to eq(commits.count)
|
||||||
|
expect(updated_merge_request_diff.commits.map(&:to_hash)).to eq(commits)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'creates correct entries in the merge_request_diff_files table' do
|
||||||
|
expect(updated_merge_request_diff.merge_request_diff_files.count).to eq(expected_diffs.count)
|
||||||
|
expect(diffs_to_hashes(updated_merge_request_diff.raw_diffs)).to eq(expected_diffs)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'sets the st_commits and st_diffs columns to nil' do
|
||||||
|
expect(updated_merge_request_diff.st_commits_before_type_cast).to be_nil
|
||||||
|
expect(updated_merge_request_diff.st_diffs_before_type_cast).to be_nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the diff IDs passed do not exist' do
|
||||||
|
it 'does not raise' do
|
||||||
|
expect { subject.perform(0, 0) }.not_to raise_exception
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diff has no serialised commits or diffs' do
|
||||||
|
before do
|
||||||
|
merge_request_diff.update(st_commits: nil, st_diffs: nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'does not raise' do
|
||||||
|
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
|
||||||
|
.not_to raise_exception
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'processing multiple merge request diffs' do
|
||||||
|
let(:start_id) { described_class::MergeRequestDiff.minimum(:id) }
|
||||||
|
let(:stop_id) { described_class::MergeRequestDiff.maximum(:id) }
|
||||||
|
|
||||||
|
before do
|
||||||
|
merge_request.reload_diff(true)
|
||||||
|
|
||||||
|
convert_to_yaml(start_id, merge_request_diff.commits, merge_request_diff.diffs)
|
||||||
|
convert_to_yaml(stop_id, updated_merge_request_diff.commits, updated_merge_request_diff.diffs)
|
||||||
|
|
||||||
|
MergeRequestDiffCommit.delete_all
|
||||||
|
MergeRequestDiffFile.delete_all
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when BUFFER_ROWS is exceeded' do
|
||||||
|
before do
|
||||||
|
stub_const("#{described_class}::BUFFER_ROWS", 1)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'updates and continues' do
|
||||||
|
expect(described_class::MergeRequestDiff).to receive(:transaction).twice
|
||||||
|
|
||||||
|
subject.perform(start_id, stop_id)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when BUFFER_ROWS is not exceeded' do
|
||||||
|
it 'only updates once' do
|
||||||
|
expect(described_class::MergeRequestDiff).to receive(:transaction).once
|
||||||
|
|
||||||
|
subject.perform(start_id, stop_id)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diff update fails' do
|
||||||
|
before do
|
||||||
|
allow(described_class::MergeRequestDiff)
|
||||||
|
.to receive(:update_all).and_raise(ActiveRecord::Rollback)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'does not add any diff commits' do
|
||||||
|
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
|
||||||
|
.not_to change { MergeRequestDiffCommit.count }
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'does not add any diff files' do
|
||||||
|
expect { subject.perform(merge_request_diff.id, merge_request_diff.id) }
|
||||||
|
.not_to change { MergeRequestDiffFile.count }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diff has valid commits and diffs' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:diffs) { diffs_to_hashes(merge_request_diff.merge_request_diff_files) }
|
||||||
|
let(:expected_diffs) { diffs }
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diffs have binary content' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:expected_diffs) { diffs }
|
||||||
|
|
||||||
|
# The start of a PDF created by Illustrator
|
||||||
|
let(:binary_string) do
|
||||||
|
"\x25\x50\x44\x46\x2d\x31\x2e\x35\x0d\x25\xe2\xe3\xcf\xd3\x0d\x0a".force_encoding(Encoding::BINARY)
|
||||||
|
end
|
||||||
|
|
||||||
|
let(:diffs) do
|
||||||
|
[
|
||||||
|
{
|
||||||
|
'diff' => binary_string,
|
||||||
|
'new_path' => 'path',
|
||||||
|
'old_path' => 'path',
|
||||||
|
'a_mode' => '100644',
|
||||||
|
'b_mode' => '100644',
|
||||||
|
'new_file' => false,
|
||||||
|
'renamed_file' => false,
|
||||||
|
'deleted_file' => false,
|
||||||
|
'too_large' => false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
end
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diff has commits, but no diffs' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:diffs) { [] }
|
||||||
|
let(:expected_diffs) { diffs }
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diffs have invalid content' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:diffs) { ['--broken-diff'] }
|
||||||
|
let(:expected_diffs) { [] }
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diffs are Rugged::Patch instances' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) }
|
||||||
|
let(:diffs) { first_commit.diff_from_parent.patches }
|
||||||
|
let(:expected_diffs) { [] }
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when the merge request diffs are Rugged::Diff::Delta instances' do
|
||||||
|
let(:commits) { merge_request_diff.commits.map(&:to_hash) }
|
||||||
|
let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) }
|
||||||
|
let(:diffs) { first_commit.diff_from_parent.deltas }
|
||||||
|
let(:expected_diffs) { [] }
|
||||||
|
|
||||||
|
include_examples 'updated MR diff'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,59 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
require Rails.root.join('db', 'post_migrate', '20170703130158_schedule_merge_request_diff_migrations')
|
||||||
|
|
||||||
|
describe ScheduleMergeRequestDiffMigrations, :migration, :sidekiq do
|
||||||
|
matcher :be_scheduled_migration do |time, *expected|
|
||||||
|
match do |migration|
|
||||||
|
BackgroundMigrationWorker.jobs.any? do |job|
|
||||||
|
job['args'] == [migration, expected] &&
|
||||||
|
job['at'].to_i == time.to_i
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
failure_message do |migration|
|
||||||
|
"Migration `#{migration}` with args `#{expected.inspect}` not scheduled!"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
let(:merge_request_diffs) { table(:merge_request_diffs) }
|
||||||
|
let(:merge_requests) { table(:merge_requests) }
|
||||||
|
let(:projects) { table(:projects) }
|
||||||
|
|
||||||
|
before do
|
||||||
|
stub_const("#{described_class.name}::BATCH_SIZE", 1)
|
||||||
|
|
||||||
|
projects.create!(id: 1, name: 'gitlab', path: 'gitlab')
|
||||||
|
|
||||||
|
merge_requests.create!(id: 1, target_project_id: 1, source_project_id: 1, target_branch: 'feature', source_branch: 'master')
|
||||||
|
|
||||||
|
merge_request_diffs.create!(id: 1, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: nil)
|
||||||
|
merge_request_diffs.create!(id: 2, merge_request_id: 1, st_commits: nil, st_diffs: YAML.dump([]))
|
||||||
|
merge_request_diffs.create!(id: 3, merge_request_id: 1, st_commits: nil, st_diffs: nil)
|
||||||
|
merge_request_diffs.create!(id: 4, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: YAML.dump([]))
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'correctly schedules background migrations' do
|
||||||
|
Sidekiq::Testing.fake! do
|
||||||
|
Timecop.freeze do
|
||||||
|
migrate!
|
||||||
|
|
||||||
|
expect(described_class::MIGRATION).to be_scheduled_migration(5.minutes.from_now, 1, 1)
|
||||||
|
expect(described_class::MIGRATION).to be_scheduled_migration(10.minutes.from_now, 2, 2)
|
||||||
|
expect(described_class::MIGRATION).to be_scheduled_migration(15.minutes.from_now, 4, 4)
|
||||||
|
expect(BackgroundMigrationWorker.jobs.size).to eq 3
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'schedules background migrations' do
|
||||||
|
Sidekiq::Testing.inline! do
|
||||||
|
non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL'
|
||||||
|
|
||||||
|
expect(merge_request_diffs.where(non_empty).count).to eq 3
|
||||||
|
|
||||||
|
migrate!
|
||||||
|
|
||||||
|
expect(merge_request_diffs.where(non_empty).count).to eq 0
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in a new issue