Remove cleaned up OIDs from database and cache

This commit is contained in:
Nick Thomas 2019-03-25 14:29:51 +00:00
parent d7eb886b9f
commit 8973f32d42
No known key found for this signature in database
GPG key ID: 2A313A47AFADACE9
14 changed files with 307 additions and 128 deletions

View file

@ -1 +1 @@
1.40.0
1.41.0

View file

@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord
joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil)
end
scope :by_project_id, -> (project_id) do
joins(:merge_request).where(merge_requests: { target_project_id: project_id })
end
scope :recent, -> { order(id: :desc).limit(100) }
scope :files_in_database, -> { where(stored_externally: [false, nil]) }

View file

@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord
joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'")
end
scope :referencing_sha, -> (oids, project_id:) do
joins(:diff_note).where(notes: { project_id: project_id, commit_id: oids })
end
delegate :original_position, :project, to: :diff_note
belongs_to :diff_note, inverse_of: :note_diff_file

View file

@ -18,9 +18,6 @@ module Projects
# per rewritten object, with the old and new SHAs space-separated. It can be
# used to update or remove content that references the objects that BFG has
# altered
#
# Currently, only the project repository is modified by this service, but we
# may wish to modify other data sources in the future.
def execute
apply_bfg_object_map!
@ -41,10 +38,52 @@ module Projects
raise NoUploadError unless project.bfg_object_map.exists?
project.bfg_object_map.open do |io|
repository_cleaner.apply_bfg_object_map(io)
repository_cleaner.apply_bfg_object_map_stream(io) do |response|
cleanup_diffs(response)
end
end
end
def cleanup_diffs(response)
old_commit_shas = extract_old_commit_shas(response.entries)
ActiveRecord::Base.transaction do
cleanup_merge_request_diffs(old_commit_shas)
cleanup_note_diff_files(old_commit_shas)
end
end
def extract_old_commit_shas(batch)
batch.lazy.select { |entry| entry.type == :COMMIT }.map(&:old_oid).force
end
def cleanup_merge_request_diffs(old_commit_shas)
merge_request_diffs = MergeRequestDiff
.by_project_id(project.id)
.by_commit_sha(old_commit_shas)
# It's important to run the ActiveRecord callbacks here
merge_request_diffs.destroy_all # rubocop:disable Cop/DestroyAll
# TODO: ensure the highlight cache is removed immediately. It's too hard
# to calculate the Redis keys at present.
#
# https://gitlab.com/gitlab-org/gitlab-ce/issues/61115
end
def cleanup_note_diff_files(old_commit_shas)
# Pluck the IDs instead of running the query twice to ensure we clear the
# cache for exactly the note diffs we remove
ids = NoteDiffFile
.referencing_sha(old_commit_shas, project_id: project.id)
.pluck_primary_key
NoteDiffFile.id_in(ids).delete_all
# A highlighted version of the diff is stored in redis. Remove it now.
Gitlab::DiscussionsDiff::HighlightCache.clear_multiple(ids)
end
def repository_cleaner
@repository_cleaner ||= Gitlab::Git::RepositoryCleaner.new(repository.raw)
end

View file

@ -0,0 +1,5 @@
---
title: Remove cleaned up OIDs from database and cache
merge_request: 26555
author:
type: added

View file

@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved.
`git gc` against the repository. You will receive an email once it has
completed.
This process will remove some copies of the rewritten commits from GitLab's
cache and database, but there are still numerous gaps in coverage - at present,
some of the copies may persist indefinitely. [Clearing the instance cache]
(../../../administration/raketasks/maintenance.md#clear-redis-cache) may help to
remove some of them, but it should not be depended on for security purposes!
## Using `git filter-branch`
1. Navigate to your repository:

View file

@ -52,6 +52,19 @@ module Gitlab
end
end
# Clears multiple cache keys at once.
#
# raw_keys - An Array of unique cache keys, without namespaces.
#
# It returns the number of cache keys cleared. Ex.: 42
def clear_multiple(raw_keys)
return [] if raw_keys.empty?
keys = raw_keys.map { |id| cache_key_for(id) }
Redis::Cache.with { |redis| redis.del(keys) }
end
def cache_key_for(raw_key)
"#{cache_key_prefix}:#{raw_key}"
end

View file

@ -12,9 +12,9 @@ module Gitlab
@repository = repository
end
def apply_bfg_object_map(io)
def apply_bfg_object_map_stream(io, &blk)
wrapped_gitaly_errors do
gitaly_cleanup_client.apply_bfg_object_map(io)
gitaly_cleanup_client.apply_bfg_object_map_stream(io, &blk)
end
end

View file

@ -12,25 +12,32 @@ module Gitlab
@storage = repository.storage
end
def apply_bfg_object_map(io)
first_request = Gitaly::ApplyBfgObjectMapRequest.new(repository: gitaly_repo)
def apply_bfg_object_map_stream(io, &blk)
responses = GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map_stream,
build_object_map_enum(io),
timeout: GitalyClient.no_timeout
)
enum = Enumerator.new do |y|
y.yield first_request
responses.each(&blk)
end
private
def build_object_map_enum(io)
Enumerator.new do |y|
# First request. For simplicity, doesn't include any object map data
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(repository: gitaly_repo)
# Now stream the BFG object map file to gitaly in chunks
while data = io.read(RepositoryService::MAX_MSG_SIZE)
y.yield Gitaly::ApplyBfgObjectMapRequest.new(object_map: data)
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(object_map: data)
break if io&.eof?
end
end
GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map,
enum,
timeout: GitalyClient.no_timeout
)
end
end
end

View file

@ -3,31 +3,32 @@
require 'spec_helper'
describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
def fake_file(offset)
{
text: 'foo',
type: 'new',
index: 2 + offset,
old_pos: 10 + offset,
new_pos: 11 + offset,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
end
let(:mapping) do
{
3 => [
fake_file(0),
fake_file(1)
],
4 => [
fake_file(2)
]
}
end
describe '#write_multiple' do
it 'sets multiple keys serializing content as JSON' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blops>blips</blops>'
}
]
}
described_class.write_multiple(mapping)
mapping.each do |key, value|
@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
describe '#read_multiple' do
it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
found = described_class.read_multiple(mapping.keys)
expect(found.size).to eq(1)
expect(found.size).to eq(2)
expect(found.first.size).to eq(2)
expect(found.first).to all(be_a(Gitlab::Diff::Line))
end
it 'returns nil when cached key is not found' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
found = described_class.read_multiple([2, 3])
@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
expect(found.size).to eq(2)
expect(found.first).to eq(nil)
expect(found.second.size).to eq(1)
expect(found.second.size).to eq(2)
expect(found.second).to all(be_a(Gitlab::Diff::Line))
end
end
describe '#clear_multiple' do
it 'removes all named keys' do
described_class.write_multiple(mapping)
described_class.clear_multiple(mapping.keys)
expect(described_class.read_multiple(mapping.keys)).to all(be_nil)
end
it 'only removed named keys' do
to_clear, to_leave = mapping.keys
described_class.write_multiple(mapping)
described_class.clear_multiple([to_clear])
cleared, left = described_class.read_multiple([to_clear, to_leave])
expect(cleared).to be_nil
expect(left).to all(be_a(Gitlab::Diff::Line))
end
end
end

View file

@ -6,55 +6,62 @@ describe Gitlab::Git::RepositoryCleaner do
let(:project) { create(:project, :repository) }
let(:repository) { project.repository }
let(:head_sha) { repository.head_commit.id }
let(:object_map_data) { "#{head_sha} #{'0' * 40}" }
let(:object_map_data) { "#{head_sha} #{Gitlab::Git::BLANK_SHA}" }
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
subject(:cleaner) { described_class.new(repository.raw) }
describe '#apply_bfg_object_map' do
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
shared_examples_for '#apply_bfg_object_map_stream' do
before do
(clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) }
end
context 'from StringIO' do
let(:object_map) { StringIO.new(object_map_data) }
it 'removes internal references' do
entries = []
it 'removes internal references' do
cleaner.apply_bfg_object_map(object_map)
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
end
end
end
context 'from Gitlab::HttpIO' do
let(:url) { 'http://example.com/bfg_object_map.txt' }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
around do |example|
tempfile.write(object_map_data)
tempfile.close
example.run
ensure
tempfile.unlink
cleaner.apply_bfg_object_map_stream(object_map) do |rsp|
entries.concat(rsp.entries)
end
it 'removes internal references' do
stub_remote_url_200(url, tempfile.path)
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(false) }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(true) }
cleaner.apply_bfg_object_map(object_map)
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
end
expect(entries).to contain_exactly(
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: head_sha,
new_oid: Gitlab::Git::BLANK_SHA
)
)
end
end
end
describe '#apply_bfg_object_map_stream (from StringIO)' do
let(:object_map) { StringIO.new(object_map_data) }
include_examples '#apply_bfg_object_map_stream'
end
describe '#apply_bfg_object_map_stream (from Gitlab::HttpIO)' do
let(:url) { 'http://example.com/bfg_object_map.txt' }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
around do |example|
tempfile.write(object_map_data)
tempfile.close
stub_remote_url_200(url, tempfile.path)
example.run
ensure
tempfile.unlink
end
include_examples '#apply_bfg_object_map_stream'
end
end

View file

@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do
let(:relative_path) { project.disk_path + '.git' }
let(:client) { described_class.new(project.repository) }
describe '#apply_bfg_object_map' do
it 'sends an apply_bfg_object_map message' do
describe '#apply_bfg_object_map_stream' do
it 'sends an apply_bfg_object_map_stream message' do
expect_any_instance_of(Gitaly::CleanupService::Stub)
.to receive(:apply_bfg_object_map)
.to receive(:apply_bfg_object_map_stream)
.with(kind_of(Enumerator), kind_of(Hash))
.and_return(double)
.and_return([])
client.apply_bfg_object_map(StringIO.new)
client.apply_bfg_object_map_stream(StringIO.new)
end
end
end

View file

@ -10,4 +10,31 @@ describe NoteDiffFile do
describe 'validations' do
it { is_expected.to validate_presence_of(:diff_note) }
end
describe '.referencing_sha' do
let!(:diff_note) { create(:diff_note_on_commit) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:project) { diff_note.project }
it 'finds note diff files by project and sha' do
found = described_class.referencing_sha(diff_note.commit_id, project_id: project.id)
expect(found).to contain_exactly(note_diff_file)
end
it 'excludes note diff files with the wrong project' do
other_project = create(:project)
found = described_class.referencing_sha(diff_note.commit_id, project_id: other_project.id)
expect(found).to be_empty
end
it 'excludes note diff files with the wrong sha' do
found = described_class.referencing_sha(Gitlab::Git::BLANK_SHA, project_id: project.id)
expect(found).to be_empty
end
end
end

View file

@ -6,13 +6,13 @@ describe Projects::CleanupService do
let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) }
let(:object_map) { project.bfg_object_map }
let(:cleaner) { service.__send__(:repository_cleaner) }
subject(:service) { described_class.new(project) }
describe '#execute' do
it 'runs the apply_bfg_object_map gitaly RPC' do
expect_next_instance_of(Gitlab::Git::RepositoryCleaner) do |cleaner|
expect(cleaner).to receive(:apply_bfg_object_map).with(kind_of(IO))
end
it 'runs the apply_bfg_object_map_stream gitaly RPC' do
expect(cleaner).to receive(:apply_bfg_object_map_stream).with(kind_of(IO))
service.execute
end
@ -37,10 +37,91 @@ describe Projects::CleanupService do
expect(object_map.exists?).to be_falsy
end
context 'with a tainted merge request diff' do
let(:merge_request) { create(:merge_request, source_project: project, target_project: project) }
let(:diff) { merge_request.merge_request_diff }
let(:entry) { build_entry(diff.commits.first.id) }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_falsy
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_truthy
end
end
context 'with a tainted diff note' do
let(:diff_note) { create(:diff_note_on_commit, project: project) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:entry) { build_entry(diff_note.commit_id) }
let(:highlight_cache) { Gitlab::DiscussionsDiff::HighlightCache }
let(:cache_id) { note_diff_file.id }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_falsy
end
it 'removes the highlight cache from redis' do
write_cache(highlight_cache, cache_id, [{}])
expect(read_cache(highlight_cache, cache_id)).not_to be_nil
service.execute
expect(read_cache(highlight_cache, cache_id)).to be_nil
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_truthy
end
end
it 'raises an error if no object map can be found' do
object_map.remove!
expect { service.execute }.to raise_error(described_class::NoUploadError)
end
end
def build_entry(old_oid)
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: old_oid,
new_oid: Gitlab::Git::BLANK_SHA
)
end
def read_cache(cache, key)
cache.read_multiple([key]).first
end
def write_cache(cache, key, value)
cache.write_multiple(key => value)
end
end