Detect changes to LFS pointers for pruning and integrity check
Gitlab::Git::Blob.batch_lfs_metadata can be used to check for LFS pointers. It uses a lazy enumorator and filters by blob size
This commit is contained in:
parent
b5e7035c1a
commit
fb3f9c6e50
8 changed files with 313 additions and 41 deletions
|
@ -12,6 +12,12 @@ module Gitlab
|
|||
# blob data should use load_all_data!.
|
||||
MAX_DATA_DISPLAY_SIZE = 10.megabytes
|
||||
|
||||
# These limits are used as a heuristic to ignore files which can't be LFS
|
||||
# pointers. The format of these is described in
|
||||
# https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer
|
||||
LFS_POINTER_MIN_SIZE = 120.bytes
|
||||
LFS_POINTER_MAX_SIZE = 200.bytes
|
||||
|
||||
attr_accessor :name, :path, :size, :data, :mode, :id, :commit_id, :loaded_size, :binary
|
||||
|
||||
class << self
|
||||
|
@ -30,16 +36,7 @@ module Gitlab
|
|||
if is_enabled
|
||||
Gitlab::GitalyClient::BlobService.new(repository).get_blob(oid: sha, limit: MAX_DATA_DISPLAY_SIZE)
|
||||
else
|
||||
blob = repository.lookup(sha)
|
||||
|
||||
next unless blob.is_a?(Rugged::Blob)
|
||||
|
||||
new(
|
||||
id: blob.oid,
|
||||
size: blob.size,
|
||||
data: blob.content(MAX_DATA_DISPLAY_SIZE),
|
||||
binary: blob.binary?
|
||||
)
|
||||
rugged_raw(repository, sha, limit: MAX_DATA_DISPLAY_SIZE)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -59,10 +56,25 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
|
||||
# Find LFS blobs given an array of sha ids
|
||||
# Returns array of Gitlab::Git::Blob
|
||||
# Does not guarantee blob data will be set
|
||||
def batch_lfs_pointers(repository, blob_ids)
|
||||
blob_ids.lazy
|
||||
.select { |sha| possible_lfs_blob?(repository, sha) }
|
||||
.map { |sha| rugged_raw(repository, sha, limit: LFS_POINTER_MAX_SIZE) }
|
||||
.select(&:lfs_pointer?)
|
||||
.force
|
||||
end
|
||||
|
||||
def binary?(data)
|
||||
EncodingHelper.detect_libgit2_binary?(data)
|
||||
end
|
||||
|
||||
def size_could_be_lfs?(size)
|
||||
size.between?(LFS_POINTER_MIN_SIZE, LFS_POINTER_MAX_SIZE)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
# Recursive search of blob id by path
|
||||
|
@ -167,6 +179,29 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
def rugged_raw(repository, sha, limit:)
|
||||
blob = repository.lookup(sha)
|
||||
|
||||
return unless blob.is_a?(Rugged::Blob)
|
||||
|
||||
new(
|
||||
id: blob.oid,
|
||||
size: blob.size,
|
||||
data: blob.content(limit),
|
||||
binary: blob.binary?
|
||||
)
|
||||
end
|
||||
|
||||
# Efficient lookup to determine if object size
|
||||
# and type make it a possible LFS blob without loading
|
||||
# blob content into memory with repository.lookup(sha)
|
||||
def possible_lfs_blob?(repository, sha)
|
||||
object_header = repository.rugged.read_header(sha)
|
||||
|
||||
object_header[:type] == :blob &&
|
||||
size_could_be_lfs?(object_header[:len])
|
||||
end
|
||||
end
|
||||
|
||||
def initialize(options)
|
||||
|
@ -226,7 +261,7 @@ module Gitlab
|
|||
# size
|
||||
# see https://github.com/github/git-lfs/blob/v1.1.0/docs/spec.md#the-pointer
|
||||
def lfs_pointer?
|
||||
has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
|
||||
self.class.size_could_be_lfs?(size) && has_lfs_version_key? && lfs_oid.present? && lfs_size.present?
|
||||
end
|
||||
|
||||
def lfs_oid
|
||||
|
|
36
lib/gitlab/git/lfs_changes.rb
Normal file
36
lib/gitlab/git/lfs_changes.rb
Normal file
|
@ -0,0 +1,36 @@
|
|||
module Gitlab
|
||||
module Git
|
||||
class LfsChanges
|
||||
def initialize(repository, newrev)
|
||||
@repository = repository
|
||||
@newrev = newrev
|
||||
end
|
||||
|
||||
def new_pointers(object_limit: nil, not_in: nil)
|
||||
@new_pointers ||= begin
|
||||
object_ids = new_objects(not_in: not_in)
|
||||
object_ids = object_ids.take(object_limit) if object_limit
|
||||
|
||||
Gitlab::Git::Blob.batch_lfs_pointers(@repository, object_ids)
|
||||
end
|
||||
end
|
||||
|
||||
def all_pointers
|
||||
object_ids = rev_list.all_objects(require_path: true)
|
||||
|
||||
Gitlab::Git::Blob.batch_lfs_pointers(@repository, object_ids)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def new_objects(not_in:)
|
||||
rev_list.new_objects(require_path: true, lazy: true, not_in: not_in)
|
||||
end
|
||||
|
||||
def rev_list
|
||||
::Gitlab::Git::RevList.new(path_to_repo: @repository.path_to_repo,
|
||||
newrev: @newrev)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -290,6 +290,14 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
|
||||
def batch_existence(object_ids, existing: true)
|
||||
filter_method = existing ? :select : :reject
|
||||
|
||||
object_ids.public_send(filter_method) do |oid| # rubocop:disable GitlabSecurity/PublicSend
|
||||
rugged.exists?(oid)
|
||||
end
|
||||
end
|
||||
|
||||
# Returns an Array of branch and tag names
|
||||
def ref_names
|
||||
branch_names + tag_names
|
||||
|
|
|
@ -13,11 +13,31 @@ module Gitlab
|
|||
@path_to_repo = path_to_repo
|
||||
end
|
||||
|
||||
# This method returns an array of new references
|
||||
# This method returns an array of new commit references
|
||||
def new_refs
|
||||
execute([*base_args, newrev, '--not', '--all'])
|
||||
end
|
||||
|
||||
# Finds newly added objects
|
||||
# Returns an array of shas
|
||||
#
|
||||
# Can skip objects which do not have a path using required_path: true
|
||||
# This skips commit objects and root trees, which might not be needed when
|
||||
# looking for blobs
|
||||
#
|
||||
# Can return a lazy enumerator to limit work done on megabytes of data
|
||||
def new_objects(require_path: nil, lazy: false, not_in: nil)
|
||||
object_output = execute([*base_args, newrev, *not_in_refs(not_in), '--objects'])
|
||||
|
||||
objects_from_output(object_output, require_path: require_path, lazy: lazy)
|
||||
end
|
||||
|
||||
def all_objects(require_path: nil)
|
||||
object_output = execute([*base_args, '--all', '--objects'])
|
||||
|
||||
objects_from_output(object_output, require_path: require_path, lazy: true)
|
||||
end
|
||||
|
||||
# This methods returns an array of missed references
|
||||
#
|
||||
# Should become obsolete after https://gitlab.com/gitlab-org/gitaly/issues/348.
|
||||
|
@ -27,6 +47,13 @@ module Gitlab
|
|||
|
||||
private
|
||||
|
||||
def not_in_refs(references)
|
||||
return ['--not', '--all'] unless references
|
||||
return [] if references.empty?
|
||||
|
||||
references.prepend('--not')
|
||||
end
|
||||
|
||||
def execute(args)
|
||||
output, status = popen(args, nil, Gitlab::Git::Env.to_env_hash)
|
||||
|
||||
|
@ -44,6 +71,22 @@ module Gitlab
|
|||
'rev-list'
|
||||
]
|
||||
end
|
||||
|
||||
def objects_from_output(object_output, require_path: nil, lazy: nil)
|
||||
objects = object_output.lazy.map do |output_line|
|
||||
sha, path = output_line.split(' ', 2)
|
||||
|
||||
next if require_path && path.blank?
|
||||
|
||||
sha
|
||||
end.reject(&:nil?)
|
||||
|
||||
if lazy
|
||||
objects
|
||||
else
|
||||
objects.force
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -143,6 +143,16 @@ describe Gitlab::Git::Blob, seed_helper: true do
|
|||
expect(blob.loaded_size).to eq(blob_size)
|
||||
end
|
||||
end
|
||||
|
||||
context 'when sha references a tree' do
|
||||
it 'returns nil' do
|
||||
tree = Gitlab::Git::Commit.find(repository, 'master').tree
|
||||
|
||||
blob = Gitlab::Git::Blob.raw(repository, tree.oid)
|
||||
|
||||
expect(blob).to be_nil
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.raw' do
|
||||
|
@ -226,6 +236,51 @@ describe Gitlab::Git::Blob, seed_helper: true do
|
|||
end
|
||||
end
|
||||
|
||||
describe '.batch_lfs_pointers' do
|
||||
let(:tree_object) { Gitlab::Git::Commit.find(repository, 'master').tree }
|
||||
|
||||
let(:non_lfs_blob) do
|
||||
Gitlab::Git::Blob.find(
|
||||
repository,
|
||||
'master',
|
||||
'README.md'
|
||||
)
|
||||
end
|
||||
|
||||
let(:lfs_blob) do
|
||||
Gitlab::Git::Blob.find(
|
||||
repository,
|
||||
'33bcff41c232a11727ac6d660bd4b0c2ba86d63d',
|
||||
'files/lfs/image.jpg'
|
||||
)
|
||||
end
|
||||
|
||||
it 'returns a list of Gitlab::Git::Blob' do
|
||||
blobs = described_class.batch_lfs_pointers(repository, [lfs_blob.id])
|
||||
|
||||
expect(blobs.count).to eq(1)
|
||||
expect(blobs).to all( be_a(Gitlab::Git::Blob) )
|
||||
end
|
||||
|
||||
it 'silently ignores tree objects' do
|
||||
blobs = described_class.batch_lfs_pointers(repository, [tree_object.oid])
|
||||
|
||||
expect(blobs).to eq([])
|
||||
end
|
||||
|
||||
it 'silently ignores non lfs objects' do
|
||||
blobs = described_class.batch_lfs_pointers(repository, [non_lfs_blob.id])
|
||||
|
||||
expect(blobs).to eq([])
|
||||
end
|
||||
|
||||
it 'avoids loading large blobs into memory' do
|
||||
expect(repository).not_to receive(:lookup)
|
||||
|
||||
described_class.batch_lfs_pointers(repository, [non_lfs_blob.id])
|
||||
end
|
||||
end
|
||||
|
||||
describe 'encoding' do
|
||||
context 'file with russian text' do
|
||||
let(:blob) { Gitlab::Git::Blob.find(repository, SeedRepo::Commit::ID, "encoding/russian.rb") }
|
||||
|
|
48
spec/lib/gitlab/git/lfs_changes_spec.rb
Normal file
48
spec/lib/gitlab/git/lfs_changes_spec.rb
Normal file
|
@ -0,0 +1,48 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Git::LfsChanges do
|
||||
let(:project) { create(:project, :repository) }
|
||||
let(:newrev) { '54fcc214b94e78d7a41a9a8fe6d87a5e59500e51' }
|
||||
let(:blob_object_id) { '0c304a93cb8430108629bbbcaa27db3343299bc0' }
|
||||
|
||||
subject { described_class.new(project.repository, newrev) }
|
||||
|
||||
describe 'new_pointers' do
|
||||
before do
|
||||
allow_any_instance_of(Gitlab::Git::RevList).to receive(:new_objects).and_return([blob_object_id])
|
||||
end
|
||||
|
||||
it 'uses rev-list to find new objects' do
|
||||
rev_list = double
|
||||
allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list)
|
||||
|
||||
expect(rev_list).to receive(:new_objects).and_return([])
|
||||
|
||||
subject.new_pointers
|
||||
end
|
||||
|
||||
it 'filters new objects to find lfs pointers' do
|
||||
expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, [blob_object_id])
|
||||
|
||||
subject.new_pointers(object_limit: 1)
|
||||
end
|
||||
|
||||
it 'limits new_objects using object_limit' do
|
||||
expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, [])
|
||||
|
||||
subject.new_pointers(object_limit: 0)
|
||||
end
|
||||
end
|
||||
|
||||
describe 'all_pointers' do
|
||||
it 'uses rev-list to find all objects' do
|
||||
rev_list = double
|
||||
allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list)
|
||||
allow(rev_list).to receive(:all_objects).and_return([blob_object_id])
|
||||
|
||||
expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, [blob_object_id])
|
||||
|
||||
subject.all_pointers
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1310,6 +1310,24 @@ describe Gitlab::Git::Repository, seed_helper: true do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#batch_existence' do
|
||||
let(:refs) { ['deadbeef', SeedRepo::RubyBlob::ID, '909e6157199'] }
|
||||
|
||||
it 'returns existing refs back' do
|
||||
result = repository.batch_existence(refs)
|
||||
|
||||
expect(result).to eq([SeedRepo::RubyBlob::ID])
|
||||
end
|
||||
|
||||
context 'existing: true' do
|
||||
it 'inverts meaning and returns non-existing refs' do
|
||||
result = repository.batch_existence(refs, existing: false)
|
||||
|
||||
expect(result).to eq(%w(deadbeef 909e6157199))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '#local_branches' do
|
||||
before(:all) do
|
||||
@repo = Gitlab::Git::Repository.new('default', TEST_MUTABLE_REPO_PATH, '')
|
||||
|
|
|
@ -2,53 +2,82 @@ require 'spec_helper'
|
|||
|
||||
describe Gitlab::Git::RevList do
|
||||
let(:project) { create(:project, :repository) }
|
||||
let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
|
||||
|
||||
before do
|
||||
expect(Gitlab::Git::Env).to receive(:all).and_return({
|
||||
allow(Gitlab::Git::Env).to receive(:all).and_return({
|
||||
GIT_OBJECT_DIRECTORY: 'foo',
|
||||
GIT_ALTERNATE_OBJECT_DIRECTORIES: 'bar'
|
||||
})
|
||||
end
|
||||
|
||||
context "#new_refs" do
|
||||
let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
|
||||
def stub_popen_rev_list(*additional_args, output:)
|
||||
expect(rev_list).to receive(:popen).with([
|
||||
Gitlab.config.git.bin_path,
|
||||
"--git-dir=#{project.repository.path_to_repo}",
|
||||
'rev-list',
|
||||
*additional_args
|
||||
],
|
||||
nil,
|
||||
{
|
||||
'GIT_OBJECT_DIRECTORY' => 'foo',
|
||||
'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
|
||||
}).and_return([output, 0])
|
||||
end
|
||||
|
||||
context "#new_refs" do
|
||||
it 'calls out to `popen`' do
|
||||
expect(rev_list).to receive(:popen).with([
|
||||
Gitlab.config.git.bin_path,
|
||||
"--git-dir=#{project.repository.path_to_repo}",
|
||||
'rev-list',
|
||||
'newrev',
|
||||
'--not',
|
||||
'--all'
|
||||
],
|
||||
nil,
|
||||
{
|
||||
'GIT_OBJECT_DIRECTORY' => 'foo',
|
||||
'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
|
||||
}).and_return(["sha1\nsha2", 0])
|
||||
stub_popen_rev_list('newrev', '--not', '--all', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.new_refs).to eq(%w[sha1 sha2])
|
||||
end
|
||||
end
|
||||
|
||||
context '#new_objects' do
|
||||
it 'fetches list of newly pushed objects using rev-list' do
|
||||
stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.new_objects).to eq(%w[sha1 sha2])
|
||||
end
|
||||
|
||||
it 'can skip pathless objects' do
|
||||
stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2 path/to/file")
|
||||
|
||||
expect(rev_list.new_objects(require_path: true)).to eq(%w[sha2])
|
||||
end
|
||||
|
||||
it 'can return a lazy enumerator' do
|
||||
stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.new_objects(lazy: true)).to be_a Enumerator::Lazy
|
||||
end
|
||||
|
||||
it 'can accept list of references to exclude' do
|
||||
stub_popen_rev_list('newrev', '--not', 'master', '--objects', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.new_objects(not_in: ['master'])).to eq(%w[sha1 sha2])
|
||||
end
|
||||
|
||||
it 'handles empty list of references to exclude as listing all known objects' do
|
||||
stub_popen_rev_list('newrev', '--objects', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.new_objects(not_in: [])).to eq(%w[sha1 sha2])
|
||||
end
|
||||
end
|
||||
|
||||
context '#all_objects' do
|
||||
it 'fetches list of all pushed objects using rev-list' do
|
||||
stub_popen_rev_list('--all', '--objects', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.all_objects.force).to eq(%w[sha1 sha2])
|
||||
end
|
||||
end
|
||||
|
||||
context "#missed_ref" do
|
||||
let(:rev_list) { described_class.new(oldrev: 'oldrev', newrev: 'newrev', path_to_repo: project.repository.path_to_repo) }
|
||||
|
||||
it 'calls out to `popen`' do
|
||||
expect(rev_list).to receive(:popen).with([
|
||||
Gitlab.config.git.bin_path,
|
||||
"--git-dir=#{project.repository.path_to_repo}",
|
||||
'rev-list',
|
||||
'--max-count=1',
|
||||
'oldrev',
|
||||
'^newrev'
|
||||
],
|
||||
nil,
|
||||
{
|
||||
'GIT_OBJECT_DIRECTORY' => 'foo',
|
||||
'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar'
|
||||
}).and_return(["sha1\nsha2", 0])
|
||||
stub_popen_rev_list('--max-count=1', 'oldrev', '^newrev', output: "sha1\nsha2")
|
||||
|
||||
expect(rev_list.missed_ref).to eq(%w[sha1 sha2])
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue