Refine Git garbage collection

This commit is contained in:
Jacob Vosmaer 2016-10-27 14:59:52 +02:00
parent e4c05de75c
commit 6bcc52a536
16 changed files with 410 additions and 83 deletions

View file

@ -117,6 +117,11 @@ class Admin::ApplicationSettingsController < Admin::ApplicationController
:send_user_confirmation_email,
:container_registry_token_expire_delay,
:enabled_git_access_protocol,
:housekeeping_enabled,
:housekeeping_bitmaps_enabled,
:housekeeping_incremental_repack_period,
:housekeeping_full_repack_period,
:housekeeping_gc_period,
repository_storages: [],
restricted_visibility_levels: [],
import_sources: [],

View file

@ -85,6 +85,18 @@ class ApplicationSetting < ActiveRecord::Base
presence: { message: 'Domain blacklist cannot be empty if Blacklist is enabled.' },
if: :domain_blacklist_enabled?
validates :housekeeping_incremental_repack_period,
presence: true,
numericality: { only_integer: true, greater_than: 0 }
validates :housekeeping_full_repack_period,
presence: true,
numericality: { only_integer: true, greater_than: :housekeeping_incremental_repack_period }
validates :housekeeping_gc_period,
presence: true,
numericality: { only_integer: true, greater_than: :housekeeping_full_repack_period }
validates_each :restricted_visibility_levels do |record, attr, value|
unless value.nil?
value.each do |level|
@ -168,6 +180,11 @@ class ApplicationSetting < ActiveRecord::Base
container_registry_token_expire_delay: 5,
repository_storages: ['default'],
user_default_external: false,
housekeeping_enabled: true,
housekeeping_bitmaps_enabled: true,
housekeeping_incremental_repack_period: 10,
housekeeping_full_repack_period: 50,
housekeeping_gc_period: 200,
)
end

View file

@ -7,6 +7,8 @@
#
module Projects
class HousekeepingService < BaseService
include Gitlab::CurrentSettings
LEASE_TIMEOUT = 3600
class LeaseTaken < StandardError
@ -20,13 +22,14 @@ module Projects
end
def execute
raise LeaseTaken unless try_obtain_lease
lease_uuid = try_obtain_lease
raise LeaseTaken unless lease_uuid.present?
execute_gitlab_shell_gc
execute_gitlab_shell_gc(lease_uuid)
end
def needed?
@project.pushes_since_gc >= 10
pushes_since_gc > 0 && period_match? && housekeeping_enabled?
end
def increment!
@ -37,19 +40,59 @@ module Projects
private
def execute_gitlab_shell_gc
GitGarbageCollectWorker.perform_async(@project.id)
def execute_gitlab_shell_gc(lease_uuid)
GitGarbageCollectWorker.perform_async(@project.id, task, lease_key, lease_uuid)
ensure
Gitlab::Metrics.measure(:reset_pushes_since_gc) do
@project.reset_pushes_since_gc
if pushes_since_gc >= gc_period
Gitlab::Metrics.measure(:reset_pushes_since_gc) do
@project.reset_pushes_since_gc
end
end
end
def try_obtain_lease
Gitlab::Metrics.measure(:obtain_housekeeping_lease) do
lease = ::Gitlab::ExclusiveLease.new("project_housekeeping:#{@project.id}", timeout: LEASE_TIMEOUT)
lease = ::Gitlab::ExclusiveLease.new(lease_key, timeout: LEASE_TIMEOUT)
lease.try_obtain
end
end
def lease_key
"project_housekeeping:#{@project.id}"
end
def pushes_since_gc
@project.pushes_since_gc
end
def task
if pushes_since_gc % gc_period == 0
:gc
elsif pushes_since_gc % full_repack_period == 0
:full_repack
else
:incremental_repack
end
end
def period_match?
[gc_period, full_repack_period, repack_period].any? { |period| pushes_since_gc % period == 0 }
end
def housekeeping_enabled?
current_application_settings.housekeeping_enabled
end
def gc_period
current_application_settings.housekeeping_gc_period
end
def full_repack_period
current_application_settings.housekeeping_full_repack_period
end
def repack_period
current_application_settings.housekeeping_incremental_repack_period
end
end
end

View file

@ -422,5 +422,44 @@
Enable this option to include the name of the author of the issue,
merge request or comment in the email body instead.
%fieldset
%legend Automatic Git repository housekeeping
.form-group
.col-sm-offset-2.col-sm-10
.checkbox
= f.label :housekeeping_enabled do
= f.check_box :housekeeping_enabled
Enable automatic repository housekeeping (git repack, git gc)
.help-block
If you keep automatic housekeeping disabled for a long time Git
repository access on your GitLab server will become slower and your
repositories will use more disk space. We recommend to always leave
this enabled.
.checkbox
= f.label :housekeeping_bitmaps_enabled do
= f.check_box :housekeeping_bitmaps_enabled
Enable Git pack file bitmap creation
.help-block
Creating pack file bitmaps makes housekeeping take a little longer but
bitmaps should accelerate 'git clone' performance.
.form-group
= f.label :housekeeping_incremental_repack_period, 'Incremental repack period', class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :housekeeping_incremental_repack_period, class: 'form-control'
.help-block
Number of Git pushes after which an incremental 'git repack' is run.
.form-group
= f.label :housekeeping_full_repack_period, 'Full repack period', class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :housekeeping_full_repack_period, class: 'form-control'
.help-block
Number of Git pushes after which a full 'git repack' is run.
.form-group
= f.label :housekeeping_gc_period, 'Git GC period', class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :housekeeping_gc_period, class: 'form-control'
.help-block
Number of Git pushes after which 'git gc' is run.
.form-actions
= f.submit 'Save', class: 'btn btn-save'

View file

@ -1,17 +1,58 @@
class GitGarbageCollectWorker
include Sidekiq::Worker
include Gitlab::ShellAdapter
include DedicatedSidekiqQueue
include Gitlab::CurrentSettings
sidekiq_options retry: false
def perform(project_id)
def perform(project_id, task = :gc, lease_key = nil, lease_uuid = nil)
project = Project.find(project_id)
task = task.to_sym
cmd = command(task)
repo_path = project.repository.path_to_repo
description = "'#{cmd.join(' ')}' in #{repo_path}"
Gitlab::GitLogger.info(description)
output, status = Gitlab::Popen.popen(cmd, repo_path)
Gitlab::GitLogger.error("#{description} failed:\n#{output}") unless status.zero?
gitlab_shell.gc(project.repository_storage_path, project.path_with_namespace)
# Refresh the branch cache in case garbage collection caused a ref lookup to fail
flush_ref_caches(project) if task == :gc
ensure
Gitlab::ExclusiveLease.cancel(lease_key, lease_uuid) if lease_key.present? && lease_uuid.present?
end
private
def command(task)
case task
when :gc
git(write_bitmaps: bitmaps_enabled?) + %w[gc]
when :full_repack
git(write_bitmaps: bitmaps_enabled?) + %w[repack -A -d --pack-kept-objects]
when :incremental_repack
# Normal git repack fails when bitmaps are enabled. It is impossible to
# create a bitmap here anyway.
git(write_bitmaps: false) + %w[repack -d]
else
raise "Invalid gc task: #{task.inspect}"
end
end
def flush_ref_caches(project)
project.repository.after_create_branch
project.repository.branch_names
project.repository.has_visible_content?
end
def bitmaps_enabled?
current_application_settings.housekeeping_bitmaps_enabled
end
def git(write_bitmaps:)
config_value = write_bitmaps ? 'true' : 'false'
%W[git -c repack.writeBitmaps=#{config_value}]
end
end

View file

@ -0,0 +1,4 @@
---
title: Finer-grained Git gargage collection
merge_request: 6588
author:

View file

@ -0,0 +1,32 @@
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddHousekeepingToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
# When a migration requires downtime you **must** uncomment the following
# constant and define a short and easy to understand explanation as to why the
# migration requires downtime.
# DOWNTIME_REASON = ''
disable_ddl_transaction!
def up
add_column_with_default(:application_settings, :housekeeping_enabled, :boolean, default: true, allow_null: false)
add_column_with_default(:application_settings, :housekeeping_bitmaps_enabled, :boolean, default: true, allow_null: false)
add_column_with_default(:application_settings, :housekeeping_incremental_repack_period, :integer, default: 10, allow_null: false)
add_column_with_default(:application_settings, :housekeeping_full_repack_period, :integer, default: 50, allow_null: false)
add_column_with_default(:application_settings, :housekeeping_gc_period, :integer, default: 200, allow_null: false)
end
def down
remove_column(:application_settings, :housekeeping_enabled, :boolean, default: true, allow_null: false)
remove_column(:application_settings, :housekeeping_bitmaps_enabled, :boolean, default: true, allow_null: false)
remove_column(:application_settings, :housekeeping_incremental_repack_period, :integer, default: 10, allow_null: false)
remove_column(:application_settings, :housekeeping_full_repack_period, :integer, default: 50, allow_null: false)
remove_column(:application_settings, :housekeeping_gc_period, :integer, default: 200, allow_null: false)
end
end

View file

@ -98,6 +98,11 @@ ActiveRecord::Schema.define(version: 20161103171205) do
t.text "help_page_text_html"
t.text "shared_runners_text_html"
t.text "after_sign_up_text_html"
t.boolean "housekeeping_enabled", default: true, null: false
t.boolean "housekeeping_bitmaps_enabled", default: true, null: false
t.integer "housekeeping_incremental_repack_period", default: 10, null: false
t.integer "housekeeping_full_repack_period", default: 50, null: false
t.integer "housekeeping_gc_period", default: 200, null: false
end
create_table "audit_events", force: :cascade do |t|

View file

@ -3,6 +3,14 @@
> [Introduced][ce-2371] in GitLab 8.4.
---
## Automatic housekeeping
GitLab automatically runs `git gc` and `git repack` on repositories
after Git pushes. If needed you can change how often this happens, or
to turn it off, go to **Admin area > Settings**
(`/admin/application_settings`).
## Manual housekeeping
The housekeeping function runs `git gc` ([man page][man]) on the current
project Git repository.

View file

@ -127,19 +127,6 @@ module Gitlab
'rm-project', storage, "#{name}.git"])
end
# Gc repository
#
# storage - project storage path
# path - project path with namespace
#
# Ex.
# gc("/path/to/storage", "gitlab/gitlab-ci")
#
def gc(storage, path)
Gitlab::Utils.system_silent([gitlab_shell_projects_path, 'gc',
storage, "#{path}.git"])
end
# Add new key to gitlab-shell
#
# Ex.

View file

@ -1,66 +1,52 @@
require 'securerandom'
module Gitlab
# This class implements an 'exclusive lease'. We call it a 'lease'
# because it has a set expiry time. We call it 'exclusive' because only
# one caller may obtain a lease for a given key at a time. The
# implementation is intended to work across GitLab processes and across
# servers. It is a 'cheap' alternative to using SQL queries and updates:
# servers. It is a cheap alternative to using SQL queries and updates:
# you do not need to change the SQL schema to start using
# ExclusiveLease.
#
# It is important to choose the timeout wisely. If the timeout is very
# high (1 hour) then the throughput of your operation gets very low (at
# most once an hour). If the timeout is lower than how long your
# operation may take then you cannot count on exclusivity. For example,
# if the timeout is 10 seconds and you do an operation which may take 20
# seconds then two overlapping operations may hold a lease for the same
# key at the same time.
#
# This class has no 'cancel' method. I originally decided against adding
# it because it would add complexity and a false sense of security. The
# complexity: instead of setting '1' we would have to set a UUID, and to
# delete it we would have to execute Lua on the Redis server to only
# delete the key if the value was our own UUID. Otherwise there is a
# chance that when you intend to cancel your lease you actually delete
# someone else's. The false sense of security: you cannot design your
# system to rely too much on the lease being cancelled after use because
# the calling (Ruby) process may crash or be killed. You _cannot_ count
# on begin/ensure blocks to cancel a lease, because the 'ensure' does
# not always run. Think of 'kill -9' from the Unicorn master for
# instance.
#
# If you find that leases are getting in your way, ask yourself: would
# it be enough to lower the lease timeout? Another thing that might be
# appropriate is to only use a lease for bulk/automated operations, and
# to ignore the lease when you get a single 'manual' user request (a
# button click).
#
class ExclusiveLease
def initialize(key, timeout:)
@key, @timeout = key, timeout
LUA_CANCEL_SCRIPT = <<-EOS
local key, uuid = KEYS[1], ARGV[1]
if redis.call("get", key) == uuid then
redis.call("del", key)
end
EOS
def self.cancel(key, uuid)
Gitlab::Redis.with do |redis|
redis.eval(LUA_CANCEL_SCRIPT, keys: [redis_key(key)], argv: [uuid])
end
end
# Try to obtain the lease. Return true on success,
def self.redis_key(key)
"gitlab:exclusive_lease:#{key}"
end
def initialize(key, timeout:)
@redis_key = self.class.redis_key(key)
@timeout = timeout
@uuid = SecureRandom.uuid
end
# Try to obtain the lease. Return lease UUID on success,
# false if the lease is already taken.
def try_obtain
# Performing a single SET is atomic
Gitlab::Redis.with do |redis|
!!redis.set(redis_key, '1', nx: true, ex: @timeout)
redis.set(@redis_key, @uuid, nx: true, ex: @timeout) && @uuid
end
end
# Returns true if the key for this lease is set.
def exists?
Gitlab::Redis.with do |redis|
redis.exists(redis_key)
redis.exists(@redis_key)
end
end
# No #cancel method. See comments above!
private
def redis_key
"gitlab:exclusive_lease:#{@key}"
end
end
end

View file

@ -14,7 +14,6 @@ describe Gitlab::Shell, lib: true do
it { is_expected.to respond_to :add_repository }
it { is_expected.to respond_to :remove_repository }
it { is_expected.to respond_to :fork_repository }
it { is_expected.to respond_to :gc }
it { is_expected.to respond_to :add_namespace }
it { is_expected.to respond_to :rm_namespace }
it { is_expected.to respond_to :mv_namespace }

View file

@ -5,32 +5,47 @@ describe Gitlab::ExclusiveLease, type: :redis do
describe '#try_obtain' do
it 'cannot obtain twice before the lease has expired' do
lease = Gitlab::ExclusiveLease.new(unique_key, timeout: 3600)
expect(lease.try_obtain).to eq(true)
lease = described_class.new(unique_key, timeout: 3600)
expect(lease.try_obtain).to be_present
expect(lease.try_obtain).to eq(false)
end
it 'can obtain after the lease has expired' do
timeout = 1
lease = Gitlab::ExclusiveLease.new(unique_key, timeout: timeout)
lease = described_class.new(unique_key, timeout: timeout)
lease.try_obtain # start the lease
sleep(2 * timeout) # lease should have expired now
expect(lease.try_obtain).to eq(true)
expect(lease.try_obtain).to be_present
end
end
describe '#exists?' do
it 'returns true for an existing lease' do
lease = Gitlab::ExclusiveLease.new(unique_key, timeout: 3600)
lease = described_class.new(unique_key, timeout: 3600)
lease.try_obtain
expect(lease.exists?).to eq(true)
end
it 'returns false for a lease that does not exist' do
lease = Gitlab::ExclusiveLease.new(unique_key, timeout: 3600)
lease = described_class.new(unique_key, timeout: 3600)
expect(lease.exists?).to eq(false)
end
end
describe '.cancel' do
it 'can cancel a lease' do
uuid = new_lease(unique_key)
expect(uuid).to be_present
expect(new_lease(unique_key)).to eq(false)
described_class.cancel(unique_key, uuid)
expect(new_lease(unique_key)).to be_present
end
def new_lease(key)
described_class.new(key, timeout: 3600).try_obtain
end
end
end

View file

@ -98,6 +98,24 @@ describe ApplicationSetting, models: true do
end
end
end
context 'housekeeping settings' do
it { is_expected.not_to allow_value(0).for(:housekeeping_incremental_repack_period) }
it 'wants the full repack period to be longer than the incremental repack period' do
subject.housekeeping_incremental_repack_period = 2
subject.housekeeping_full_repack_period = 1
expect(subject).not_to be_valid
end
it 'wants the gc period to be longer than the full repack period' do
subject.housekeeping_full_repack_period = 2
subject.housekeeping_gc_period = 1
expect(subject).not_to be_valid
end
end
end
context 'restricted signup domains' do

View file

@ -14,8 +14,10 @@ describe Projects::HousekeepingService do
describe '#execute' do
it 'enqueues a sidekiq job' do
expect(subject).to receive(:try_obtain_lease).and_return(true)
expect(GitGarbageCollectWorker).to receive(:perform_async).with(project.id)
expect(subject).to receive(:try_obtain_lease).and_return(:the_uuid)
expect(subject).to receive(:lease_key).and_return(:the_lease_key)
expect(subject).to receive(:task).and_return(:the_task)
expect(GitGarbageCollectWorker).to receive(:perform_async).with(project.id, :the_task, :the_lease_key, :the_uuid)
subject.execute
expect(project.reload.pushes_since_gc).to eq(0)
@ -58,4 +60,26 @@ describe Projects::HousekeepingService do
end.to change { project.pushes_since_gc }.from(0).to(1)
end
end
it 'uses all three kinds of housekeeping we offer' do
allow(subject).to receive(:try_obtain_lease).and_return(:the_uuid)
allow(subject).to receive(:lease_key).and_return(:the_lease_key)
# At push 200
expect(GitGarbageCollectWorker).to receive(:perform_async).with(project.id, :gc, :the_lease_key, :the_uuid).
exactly(1).times
# At push 50, 100, 150
expect(GitGarbageCollectWorker).to receive(:perform_async).with(project.id, :full_repack, :the_lease_key, :the_uuid).
exactly(3).times
# At push 10, 20, ... (except those above)
expect(GitGarbageCollectWorker).to receive(:perform_async).with(project.id, :incremental_repack, :the_lease_key, :the_uuid).
exactly(16).times
201.times do
subject.increment!
subject.execute if subject.needed?
end
expect(project.pushes_since_gc).to eq(1)
end
end

View file

@ -1,3 +1,6 @@
require 'digest'
require 'fileutils'
require 'spec_helper'
describe GitGarbageCollectWorker do
@ -6,16 +9,12 @@ describe GitGarbageCollectWorker do
subject { GitGarbageCollectWorker.new }
before do
allow(subject).to receive(:gitlab_shell).and_return(shell)
end
describe "#perform" do
it "runs `git gc`" do
expect(shell).to receive(:gc).with(
project.repository_storage_path,
project.path_with_namespace).
and_return(true)
it "flushes ref caches when the task is 'gc'" do
expect(subject).to receive(:command).with(:gc).and_return([:the, :command])
expect(Gitlab::Popen).to receive(:popen).
with([:the, :command], project.repository.path_to_repo).and_return(["", 0])
expect_any_instance_of(Repository).to receive(:after_create_branch).and_call_original
expect_any_instance_of(Repository).to receive(:branch_names).and_call_original
expect_any_instance_of(Repository).to receive(:branch_count).and_call_original
@ -23,5 +22,110 @@ describe GitGarbageCollectWorker do
subject.perform(project.id)
end
shared_examples 'gc tasks' do
before { allow(subject).to receive(:bitmaps_enabled?).and_return(bitmaps_enabled) }
it 'incremental repack adds a new packfile' do
create_objects(project)
before_packs = packs(project)
expect(before_packs.count).to be >= 1
subject.perform(project.id, 'incremental_repack')
after_packs = packs(project)
# Exactly one new pack should have been created
expect(after_packs.count).to eq(before_packs.count + 1)
# Previously existing packs are still around
expect(before_packs & after_packs).to eq(before_packs)
end
it 'full repack consolidates into 1 packfile' do
create_objects(project)
subject.perform(project.id, 'incremental_repack')
before_packs = packs(project)
expect(before_packs.count).to be >= 2
subject.perform(project.id, 'full_repack')
after_packs = packs(project)
expect(after_packs.count).to eq(1)
# Previously existing packs should be gone now
expect(after_packs - before_packs).to eq(after_packs)
expect(File.exist?(bitmap_path(after_packs.first))).to eq(bitmaps_enabled)
end
it 'gc consolidates into 1 packfile and updates packed-refs' do
create_objects(project)
before_packs = packs(project)
before_packed_refs = packed_refs(project)
expect(before_packs.count).to be >= 1
subject.perform(project.id, 'gc')
after_packed_refs = packed_refs(project)
after_packs = packs(project)
expect(after_packs.count).to eq(1)
# Previously existing packs should be gone now
expect(after_packs - before_packs).to eq(after_packs)
# The packed-refs file should have been updated during 'git gc'
expect(before_packed_refs).not_to eq(after_packed_refs)
expect(File.exist?(bitmap_path(after_packs.first))).to eq(bitmaps_enabled)
end
end
context 'with bitmaps enabled' do
let(:bitmaps_enabled) { true }
include_examples 'gc tasks'
end
context 'with bitmaps disabled' do
let(:bitmaps_enabled) { false }
include_examples 'gc tasks'
end
end
# Create a new commit on a random new branch
def create_objects(project)
rugged = project.repository.rugged
old_commit = rugged.branches.first.target
new_commit_sha = Rugged::Commit.create(
rugged,
message: "hello world #{SecureRandom.hex(6)}",
author: Gitlab::Git::committer_hash(email: 'foo@bar', name: 'baz'),
committer: Gitlab::Git::committer_hash(email: 'foo@bar', name: 'baz'),
tree: old_commit.tree,
parents: [old_commit],
)
project.repository.update_ref!(
"refs/heads/#{SecureRandom.hex(6)}",
new_commit_sha,
Gitlab::Git::BLANK_SHA
)
end
def packs(project)
Dir["#{project.repository.path_to_repo}/objects/pack/*.pack"]
end
def packed_refs(project)
path = "#{project.repository.path_to_repo}/packed-refs"
FileUtils.touch(path)
File.read(path)
end
def bitmap_path(pack)
pack.sub(/\.pack\z/, '.bitmap')
end
end