From f2d50af917b878a98e06b994ac32c0718f3d0b78 Mon Sep 17 00:00:00 2001 From: Sean McGivern Date: Mon, 3 Jul 2017 15:48:59 +0100 Subject: [PATCH] Migrate MR commits and diffs to new tables Previously, we stored these as serialised fields - `st_{commits,diffs}` - on the `merge_request_diffs` table. These now have their own tables - `merge_request_diff_{commits,diffs}` - with a column for each attribute of the serialised data. Add a background migration to go through the existing MR diffs and migrate them to the new format. Ignore any contents that cannot be displayed. Assuming that we have 5 million rows to migrate, and each batch of 2,500 rows can be completed in 5 minutes, this will take about 7 days to migrate everything. --- app/models/merge_request_diff.rb | 6 +- ..._schedule_merge_request_diff_migrations.rb | 33 +++ ...rialize_merge_request_diffs_and_commits.rb | 107 ++++++++++ ...ze_merge_request_diffs_and_commits_spec.rb | 188 ++++++++++++++++++ ...dule_merge_request_diff_migrations_spec.rb | 59 ++++++ 5 files changed, 388 insertions(+), 5 deletions(-) create mode 100644 db/post_migrate/20170703130158_schedule_merge_request_diff_migrations.rb create mode 100644 lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits.rb create mode 100644 spec/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits_spec.rb create mode 100644 spec/migrations/schedule_merge_request_diff_migrations_spec.rb diff --git a/app/models/merge_request_diff.rb b/app/models/merge_request_diff.rb index ec87aee9310..d9d746ccf41 100644 --- a/app/models/merge_request_diff.rb +++ b/app/models/merge_request_diff.rb @@ -85,11 +85,7 @@ class MergeRequestDiff < ActiveRecord::Base def raw_diffs(options = {}) if options[:ignore_whitespace_change] - @diffs_no_whitespace ||= - Gitlab::Git::Compare.new( - repository.raw_repository, - safe_start_commit_sha, - head_commit_sha).diffs(options) + @diffs_no_whitespace ||= compare.diffs(options) else @raw_diffs ||= {} @raw_diffs[options] ||= load_diffs(options) diff --git a/db/post_migrate/20170703130158_schedule_merge_request_diff_migrations.rb b/db/post_migrate/20170703130158_schedule_merge_request_diff_migrations.rb new file mode 100644 index 00000000000..17a9dc293f1 --- /dev/null +++ b/db/post_migrate/20170703130158_schedule_merge_request_diff_migrations.rb @@ -0,0 +1,33 @@ +class ScheduleMergeRequestDiffMigrations < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + DOWNTIME = false + BATCH_SIZE = 2500 + MIGRATION = 'DeserializeMergeRequestDiffsAndCommits' + + disable_ddl_transaction! + + class MergeRequestDiff < ActiveRecord::Base + self.table_name = 'merge_request_diffs' + + include ::EachBatch + end + + # Assuming that there are 5 million rows affected (which is more than on + # GitLab.com), and that each batch of 2,500 rows takes up to 5 minutes, then + # we can migrate all the rows in 7 days. + # + # On staging, plucking the IDs themselves takes 5 seconds. + def up + non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL' + + MergeRequestDiff.where(non_empty).each_batch(of: BATCH_SIZE) do |relation, index| + range = relation.pluck('MIN(id)', 'MAX(id)').first + + BackgroundMigrationWorker.perform_in(index * 5.minutes, MIGRATION, range) + end + end + + def down + end +end diff --git a/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits.rb b/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits.rb new file mode 100644 index 00000000000..0fbc6b70989 --- /dev/null +++ b/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits.rb @@ -0,0 +1,107 @@ +module Gitlab + module BackgroundMigration + class DeserializeMergeRequestDiffsAndCommits + attr_reader :diff_ids, :commit_rows, :file_rows + + class MergeRequestDiff < ActiveRecord::Base + self.table_name = 'merge_request_diffs' + end + + BUFFER_ROWS = 1000 + + def perform(start_id, stop_id) + merge_request_diffs = MergeRequestDiff + .select(:id, :st_commits, :st_diffs) + .where('st_commits IS NOT NULL OR st_diffs IS NOT NULL') + .where(id: start_id..stop_id) + + reset_buffers! + + merge_request_diffs.each do |merge_request_diff| + commits, files = single_diff_rows(merge_request_diff) + + diff_ids << merge_request_diff.id + commit_rows.concat(commits) + file_rows.concat(files) + + if diff_ids.length > BUFFER_ROWS || + commit_rows.length > BUFFER_ROWS || + file_rows.length > BUFFER_ROWS + + flush_buffers! + end + end + + flush_buffers! + end + + private + + def reset_buffers! + @diff_ids = [] + @commit_rows = [] + @file_rows = [] + end + + def flush_buffers! + if diff_ids.any? + MergeRequestDiff.transaction do + Gitlab::Database.bulk_insert('merge_request_diff_commits', commit_rows) + Gitlab::Database.bulk_insert('merge_request_diff_files', file_rows) + + MergeRequestDiff.where(id: diff_ids).update_all(st_commits: nil, st_diffs: nil) + end + end + + reset_buffers! + end + + def single_diff_rows(merge_request_diff) + sha_attribute = Gitlab::Database::ShaAttribute.new + commits = YAML.load(merge_request_diff.st_commits) rescue [] + + commit_rows = commits.map.with_index do |commit, index| + commit_hash = commit.to_hash.with_indifferent_access.except(:parent_ids) + sha = commit_hash.delete(:id) + + commit_hash.merge( + merge_request_diff_id: merge_request_diff.id, + relative_order: index, + sha: sha_attribute.type_cast_for_database(sha) + ) + end + + diffs = YAML.load(merge_request_diff.st_diffs) rescue [] + diffs = [] unless valid_raw_diffs?(diffs) + + file_rows = diffs.map.with_index do |diff, index| + diff_hash = diff.to_hash.with_indifferent_access.merge( + binary: false, + merge_request_diff_id: merge_request_diff.id, + relative_order: index + ) + + # Compatibility with old diffs created with Psych. + diff_hash.tap do |hash| + diff_text = hash[:diff] + + if diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only? + hash[:binary] = true + hash[:diff] = [diff_text].pack('m0') + end + end + end + + [commit_rows, file_rows] + end + + # Unlike MergeRequestDiff#valid_raw_diff?, don't count Rugged objects as + # valid, because we don't render them usefully anyway. + def valid_raw_diffs?(diffs) + return false unless diffs.respond_to?(:each) + + diffs.all? { |diff| diff.is_a?(Hash) } + end + end + end +end diff --git a/spec/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits_spec.rb b/spec/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits_spec.rb new file mode 100644 index 00000000000..18843cbe992 --- /dev/null +++ b/spec/lib/gitlab/background_migration/deserialize_merge_request_diffs_and_commits_spec.rb @@ -0,0 +1,188 @@ +require 'spec_helper' + +describe Gitlab::BackgroundMigration::DeserializeMergeRequestDiffsAndCommits do + describe '#perform' do + set(:merge_request) { create(:merge_request) } + set(:merge_request_diff) { merge_request.merge_request_diff } + let(:updated_merge_request_diff) { MergeRequestDiff.find(merge_request_diff.id) } + + def diffs_to_hashes(diffs) + diffs.as_json(only: Gitlab::Git::Diff::SERIALIZE_KEYS).map(&:with_indifferent_access) + end + + def quote_yaml(value) + MergeRequestDiff.connection.quote(YAML.dump(value)) + end + + def convert_to_yaml(merge_request_diff_id, commits, diffs) + MergeRequestDiff.where(id: merge_request_diff_id).update_all( + "st_commits = #{quote_yaml(commits)}, st_diffs = #{quote_yaml(diffs)}" + ) + end + + shared_examples 'updated MR diff' do + before do + convert_to_yaml(merge_request_diff.id, commits, diffs) + + MergeRequestDiffCommit.delete_all + MergeRequestDiffFile.delete_all + + subject.perform(merge_request_diff.id, merge_request_diff.id) + end + + it 'creates correct entries in the merge_request_diff_commits table' do + expect(updated_merge_request_diff.merge_request_diff_commits.count).to eq(commits.count) + expect(updated_merge_request_diff.commits.map(&:to_hash)).to eq(commits) + end + + it 'creates correct entries in the merge_request_diff_files table' do + expect(updated_merge_request_diff.merge_request_diff_files.count).to eq(expected_diffs.count) + expect(diffs_to_hashes(updated_merge_request_diff.raw_diffs)).to eq(expected_diffs) + end + + it 'sets the st_commits and st_diffs columns to nil' do + expect(updated_merge_request_diff.st_commits_before_type_cast).to be_nil + expect(updated_merge_request_diff.st_diffs_before_type_cast).to be_nil + end + end + + context 'when the diff IDs passed do not exist' do + it 'does not raise' do + expect { subject.perform(0, 0) }.not_to raise_exception + end + end + + context 'when the merge request diff has no serialised commits or diffs' do + before do + merge_request_diff.update(st_commits: nil, st_diffs: nil) + end + + it 'does not raise' do + expect { subject.perform(merge_request_diff.id, merge_request_diff.id) } + .not_to raise_exception + end + end + + context 'processing multiple merge request diffs' do + let(:start_id) { described_class::MergeRequestDiff.minimum(:id) } + let(:stop_id) { described_class::MergeRequestDiff.maximum(:id) } + + before do + merge_request.reload_diff(true) + + convert_to_yaml(start_id, merge_request_diff.commits, merge_request_diff.diffs) + convert_to_yaml(stop_id, updated_merge_request_diff.commits, updated_merge_request_diff.diffs) + + MergeRequestDiffCommit.delete_all + MergeRequestDiffFile.delete_all + end + + context 'when BUFFER_ROWS is exceeded' do + before do + stub_const("#{described_class}::BUFFER_ROWS", 1) + end + + it 'updates and continues' do + expect(described_class::MergeRequestDiff).to receive(:transaction).twice + + subject.perform(start_id, stop_id) + end + end + + context 'when BUFFER_ROWS is not exceeded' do + it 'only updates once' do + expect(described_class::MergeRequestDiff).to receive(:transaction).once + + subject.perform(start_id, stop_id) + end + end + end + + context 'when the merge request diff update fails' do + before do + allow(described_class::MergeRequestDiff) + .to receive(:update_all).and_raise(ActiveRecord::Rollback) + end + + it 'does not add any diff commits' do + expect { subject.perform(merge_request_diff.id, merge_request_diff.id) } + .not_to change { MergeRequestDiffCommit.count } + end + + it 'does not add any diff files' do + expect { subject.perform(merge_request_diff.id, merge_request_diff.id) } + .not_to change { MergeRequestDiffFile.count } + end + end + + context 'when the merge request diff has valid commits and diffs' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:diffs) { diffs_to_hashes(merge_request_diff.merge_request_diff_files) } + let(:expected_diffs) { diffs } + + include_examples 'updated MR diff' + end + + context 'when the merge request diffs have binary content' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:expected_diffs) { diffs } + + # The start of a PDF created by Illustrator + let(:binary_string) do + "\x25\x50\x44\x46\x2d\x31\x2e\x35\x0d\x25\xe2\xe3\xcf\xd3\x0d\x0a".force_encoding(Encoding::BINARY) + end + + let(:diffs) do + [ + { + 'diff' => binary_string, + 'new_path' => 'path', + 'old_path' => 'path', + 'a_mode' => '100644', + 'b_mode' => '100644', + 'new_file' => false, + 'renamed_file' => false, + 'deleted_file' => false, + 'too_large' => false + } + ] + end + + include_examples 'updated MR diff' + end + + context 'when the merge request diff has commits, but no diffs' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:diffs) { [] } + let(:expected_diffs) { diffs } + + include_examples 'updated MR diff' + end + + context 'when the merge request diffs have invalid content' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:diffs) { ['--broken-diff'] } + let(:expected_diffs) { [] } + + include_examples 'updated MR diff' + end + + context 'when the merge request diffs are Rugged::Patch instances' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) } + let(:diffs) { first_commit.diff_from_parent.patches } + let(:expected_diffs) { [] } + + include_examples 'updated MR diff' + end + + context 'when the merge request diffs are Rugged::Diff::Delta instances' do + let(:commits) { merge_request_diff.commits.map(&:to_hash) } + let(:first_commit) { merge_request.project.repository.commit(merge_request_diff.head_commit_sha) } + let(:diffs) { first_commit.diff_from_parent.deltas } + let(:expected_diffs) { [] } + + include_examples 'updated MR diff' + end + end +end diff --git a/spec/migrations/schedule_merge_request_diff_migrations_spec.rb b/spec/migrations/schedule_merge_request_diff_migrations_spec.rb new file mode 100644 index 00000000000..f95bd6e3511 --- /dev/null +++ b/spec/migrations/schedule_merge_request_diff_migrations_spec.rb @@ -0,0 +1,59 @@ +require 'spec_helper' +require Rails.root.join('db', 'post_migrate', '20170703130158_schedule_merge_request_diff_migrations') + +describe ScheduleMergeRequestDiffMigrations, :migration, :sidekiq do + matcher :be_scheduled_migration do |time, *expected| + match do |migration| + BackgroundMigrationWorker.jobs.any? do |job| + job['args'] == [migration, expected] && + job['at'].to_i == time.to_i + end + end + + failure_message do |migration| + "Migration `#{migration}` with args `#{expected.inspect}` not scheduled!" + end + end + + let(:merge_request_diffs) { table(:merge_request_diffs) } + let(:merge_requests) { table(:merge_requests) } + let(:projects) { table(:projects) } + + before do + stub_const("#{described_class.name}::BATCH_SIZE", 1) + + projects.create!(id: 1, name: 'gitlab', path: 'gitlab') + + merge_requests.create!(id: 1, target_project_id: 1, source_project_id: 1, target_branch: 'feature', source_branch: 'master') + + merge_request_diffs.create!(id: 1, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: nil) + merge_request_diffs.create!(id: 2, merge_request_id: 1, st_commits: nil, st_diffs: YAML.dump([])) + merge_request_diffs.create!(id: 3, merge_request_id: 1, st_commits: nil, st_diffs: nil) + merge_request_diffs.create!(id: 4, merge_request_id: 1, st_commits: YAML.dump([]), st_diffs: YAML.dump([])) + end + + it 'correctly schedules background migrations' do + Sidekiq::Testing.fake! do + Timecop.freeze do + migrate! + + expect(described_class::MIGRATION).to be_scheduled_migration(5.minutes.from_now, 1, 1) + expect(described_class::MIGRATION).to be_scheduled_migration(10.minutes.from_now, 2, 2) + expect(described_class::MIGRATION).to be_scheduled_migration(15.minutes.from_now, 4, 4) + expect(BackgroundMigrationWorker.jobs.size).to eq 3 + end + end + end + + it 'schedules background migrations' do + Sidekiq::Testing.inline! do + non_empty = 'st_commits IS NOT NULL OR st_diffs IS NOT NULL' + + expect(merge_request_diffs.where(non_empty).count).to eq 3 + + migrate! + + expect(merge_request_diffs.where(non_empty).count).to eq 0 + end + end +end