diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb index a4336facee5..cc6c7609ec7 100644 --- a/lib/gitlab/git/blob.rb +++ b/lib/gitlab/git/blob.rb @@ -12,6 +12,12 @@ module Gitlab # blob data should use load_all_data!. MAX_DATA_DISPLAY_SIZE = 10.megabytes + # These limits are used as a heuristic to ignore files which can't be LFS + # pointers. The format of these is described in + # https://github.com/git-lfs/git-lfs/blob/master/docs/spec.md#the-pointer + LFS_POINTER_MIN_SIZE = 120.bytes + LFS_POINTER_MAX_SIZE = 200.bytes + attr_accessor :name, :path, :size, :data, :mode, :id, :commit_id, :loaded_size, :binary class << self @@ -30,16 +36,7 @@ module Gitlab if is_enabled Gitlab::GitalyClient::BlobService.new(repository).get_blob(oid: sha, limit: MAX_DATA_DISPLAY_SIZE) else - blob = repository.lookup(sha) - - next unless blob.is_a?(Rugged::Blob) - - new( - id: blob.oid, - size: blob.size, - data: blob.content(MAX_DATA_DISPLAY_SIZE), - binary: blob.binary? - ) + rugged_raw(repository, sha, limit: MAX_DATA_DISPLAY_SIZE) end end end @@ -59,10 +56,25 @@ module Gitlab end end + # Find LFS blobs given an array of sha ids + # Returns array of Gitlab::Git::Blob + # Does not guarantee blob data will be set + def batch_lfs_pointers(repository, blob_ids) + blob_ids.lazy + .select { |sha| possible_lfs_blob?(repository, sha) } + .map { |sha| rugged_raw(repository, sha, limit: LFS_POINTER_MAX_SIZE) } + .select(&:lfs_pointer?) + .force + end + def binary?(data) EncodingHelper.detect_libgit2_binary?(data) end + def size_could_be_lfs?(size) + size.between?(LFS_POINTER_MIN_SIZE, LFS_POINTER_MAX_SIZE) + end + private # Recursive search of blob id by path @@ -167,6 +179,29 @@ module Gitlab end end end + + def rugged_raw(repository, sha, limit:) + blob = repository.lookup(sha) + + return unless blob.is_a?(Rugged::Blob) + + new( + id: blob.oid, + size: blob.size, + data: blob.content(limit), + binary: blob.binary? + ) + end + + # Efficient lookup to determine if object size + # and type make it a possible LFS blob without loading + # blob content into memory with repository.lookup(sha) + def possible_lfs_blob?(repository, sha) + object_header = repository.rugged.read_header(sha) + + object_header[:type] == :blob && + size_could_be_lfs?(object_header[:len]) + end end def initialize(options) @@ -226,7 +261,7 @@ module Gitlab # size # see https://github.com/github/git-lfs/blob/v1.1.0/docs/spec.md#the-pointer def lfs_pointer? - has_lfs_version_key? && lfs_oid.present? && lfs_size.present? + self.class.size_could_be_lfs?(size) && has_lfs_version_key? && lfs_oid.present? && lfs_size.present? end def lfs_oid diff --git a/lib/gitlab/git/lfs_changes.rb b/lib/gitlab/git/lfs_changes.rb new file mode 100644 index 00000000000..2749e2e69e2 --- /dev/null +++ b/lib/gitlab/git/lfs_changes.rb @@ -0,0 +1,36 @@ +module Gitlab + module Git + class LfsChanges + def initialize(repository, newrev) + @repository = repository + @newrev = newrev + end + + def new_pointers(object_limit: nil, not_in: nil) + @new_pointers ||= begin + object_ids = new_objects(not_in: not_in) + object_ids = object_ids.take(object_limit) if object_limit + + Gitlab::Git::Blob.batch_lfs_pointers(@repository, object_ids) + end + end + + def all_pointers + object_ids = rev_list.all_objects(require_path: true) + + Gitlab::Git::Blob.batch_lfs_pointers(@repository, object_ids) + end + + private + + def new_objects(not_in:) + rev_list.new_objects(require_path: true, lazy: true, not_in: not_in) + end + + def rev_list + ::Gitlab::Git::RevList.new(path_to_repo: @repository.path_to_repo, + newrev: @newrev) + end + end + end +end diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb index fc8af38d4d9..a9e4e1130c3 100644 --- a/lib/gitlab/git/repository.rb +++ b/lib/gitlab/git/repository.rb @@ -290,6 +290,14 @@ module Gitlab end end + def batch_existence(object_ids, existing: true) + filter_method = existing ? :select : :reject + + object_ids.public_send(filter_method) do |oid| # rubocop:disable GitlabSecurity/PublicSend + rugged.exists?(oid) + end + end + # Returns an Array of branch and tag names def ref_names branch_names + tag_names diff --git a/lib/gitlab/git/rev_list.rb b/lib/gitlab/git/rev_list.rb index 60b2a4ec411..e0c884aceaa 100644 --- a/lib/gitlab/git/rev_list.rb +++ b/lib/gitlab/git/rev_list.rb @@ -13,11 +13,31 @@ module Gitlab @path_to_repo = path_to_repo end - # This method returns an array of new references + # This method returns an array of new commit references def new_refs execute([*base_args, newrev, '--not', '--all']) end + # Finds newly added objects + # Returns an array of shas + # + # Can skip objects which do not have a path using required_path: true + # This skips commit objects and root trees, which might not be needed when + # looking for blobs + # + # Can return a lazy enumerator to limit work done on megabytes of data + def new_objects(require_path: nil, lazy: false, not_in: nil) + object_output = execute([*base_args, newrev, *not_in_refs(not_in), '--objects']) + + objects_from_output(object_output, require_path: require_path, lazy: lazy) + end + + def all_objects(require_path: nil) + object_output = execute([*base_args, '--all', '--objects']) + + objects_from_output(object_output, require_path: require_path, lazy: true) + end + # This methods returns an array of missed references # # Should become obsolete after https://gitlab.com/gitlab-org/gitaly/issues/348. @@ -27,6 +47,13 @@ module Gitlab private + def not_in_refs(references) + return ['--not', '--all'] unless references + return [] if references.empty? + + references.prepend('--not') + end + def execute(args) output, status = popen(args, nil, Gitlab::Git::Env.to_env_hash) @@ -44,6 +71,22 @@ module Gitlab 'rev-list' ] end + + def objects_from_output(object_output, require_path: nil, lazy: nil) + objects = object_output.lazy.map do |output_line| + sha, path = output_line.split(' ', 2) + + next if require_path && path.blank? + + sha + end.reject(&:nil?) + + if lazy + objects + else + objects.force + end + end end end end diff --git a/spec/lib/gitlab/git/blob_spec.rb b/spec/lib/gitlab/git/blob_spec.rb index 412a0093d97..c04a9688503 100644 --- a/spec/lib/gitlab/git/blob_spec.rb +++ b/spec/lib/gitlab/git/blob_spec.rb @@ -143,6 +143,16 @@ describe Gitlab::Git::Blob, seed_helper: true do expect(blob.loaded_size).to eq(blob_size) end end + + context 'when sha references a tree' do + it 'returns nil' do + tree = Gitlab::Git::Commit.find(repository, 'master').tree + + blob = Gitlab::Git::Blob.raw(repository, tree.oid) + + expect(blob).to be_nil + end + end end describe '.raw' do @@ -226,6 +236,51 @@ describe Gitlab::Git::Blob, seed_helper: true do end end + describe '.batch_lfs_pointers' do + let(:tree_object) { Gitlab::Git::Commit.find(repository, 'master').tree } + + let(:non_lfs_blob) do + Gitlab::Git::Blob.find( + repository, + 'master', + 'README.md' + ) + end + + let(:lfs_blob) do + Gitlab::Git::Blob.find( + repository, + '33bcff41c232a11727ac6d660bd4b0c2ba86d63d', + 'files/lfs/image.jpg' + ) + end + + it 'returns a list of Gitlab::Git::Blob' do + blobs = described_class.batch_lfs_pointers(repository, [lfs_blob.id]) + + expect(blobs.count).to eq(1) + expect(blobs).to all( be_a(Gitlab::Git::Blob) ) + end + + it 'silently ignores tree objects' do + blobs = described_class.batch_lfs_pointers(repository, [tree_object.oid]) + + expect(blobs).to eq([]) + end + + it 'silently ignores non lfs objects' do + blobs = described_class.batch_lfs_pointers(repository, [non_lfs_blob.id]) + + expect(blobs).to eq([]) + end + + it 'avoids loading large blobs into memory' do + expect(repository).not_to receive(:lookup) + + described_class.batch_lfs_pointers(repository, [non_lfs_blob.id]) + end + end + describe 'encoding' do context 'file with russian text' do let(:blob) { Gitlab::Git::Blob.find(repository, SeedRepo::Commit::ID, "encoding/russian.rb") } diff --git a/spec/lib/gitlab/git/lfs_changes_spec.rb b/spec/lib/gitlab/git/lfs_changes_spec.rb new file mode 100644 index 00000000000..c2d2c6e1bc8 --- /dev/null +++ b/spec/lib/gitlab/git/lfs_changes_spec.rb @@ -0,0 +1,48 @@ +require 'spec_helper' + +describe Gitlab::Git::LfsChanges do + let(:project) { create(:project, :repository) } + let(:newrev) { '54fcc214b94e78d7a41a9a8fe6d87a5e59500e51' } + let(:blob_object_id) { '0c304a93cb8430108629bbbcaa27db3343299bc0' } + + subject { described_class.new(project.repository, newrev) } + + describe 'new_pointers' do + before do + allow_any_instance_of(Gitlab::Git::RevList).to receive(:new_objects).and_return([blob_object_id]) + end + + it 'uses rev-list to find new objects' do + rev_list = double + allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list) + + expect(rev_list).to receive(:new_objects).and_return([]) + + subject.new_pointers + end + + it 'filters new objects to find lfs pointers' do + expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, [blob_object_id]) + + subject.new_pointers(object_limit: 1) + end + + it 'limits new_objects using object_limit' do + expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, []) + + subject.new_pointers(object_limit: 0) + end + end + + describe 'all_pointers' do + it 'uses rev-list to find all objects' do + rev_list = double + allow(Gitlab::Git::RevList).to receive(:new).and_return(rev_list) + allow(rev_list).to receive(:all_objects).and_return([blob_object_id]) + + expect(Gitlab::Git::Blob).to receive(:batch_lfs_pointers).with(project.repository, [blob_object_id]) + + subject.all_pointers + end + end +end diff --git a/spec/lib/gitlab/git/repository_spec.rb b/spec/lib/gitlab/git/repository_spec.rb index b161d25b96c..0fa88d17f5d 100644 --- a/spec/lib/gitlab/git/repository_spec.rb +++ b/spec/lib/gitlab/git/repository_spec.rb @@ -1336,6 +1336,24 @@ describe Gitlab::Git::Repository, seed_helper: true do end end + describe '#batch_existence' do + let(:refs) { ['deadbeef', SeedRepo::RubyBlob::ID, '909e6157199'] } + + it 'returns existing refs back' do + result = repository.batch_existence(refs) + + expect(result).to eq([SeedRepo::RubyBlob::ID]) + end + + context 'existing: true' do + it 'inverts meaning and returns non-existing refs' do + result = repository.batch_existence(refs, existing: false) + + expect(result).to eq(%w(deadbeef 909e6157199)) + end + end + end + describe '#local_branches' do before(:all) do @repo = Gitlab::Git::Repository.new('default', TEST_MUTABLE_REPO_PATH, '') diff --git a/spec/lib/gitlab/git/rev_list_spec.rb b/spec/lib/gitlab/git/rev_list_spec.rb index c0eac98d718..643a4b2d03e 100644 --- a/spec/lib/gitlab/git/rev_list_spec.rb +++ b/spec/lib/gitlab/git/rev_list_spec.rb @@ -2,53 +2,82 @@ require 'spec_helper' describe Gitlab::Git::RevList do let(:project) { create(:project, :repository) } + let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) } before do - expect(Gitlab::Git::Env).to receive(:all).and_return({ + allow(Gitlab::Git::Env).to receive(:all).and_return({ GIT_OBJECT_DIRECTORY: 'foo', GIT_ALTERNATE_OBJECT_DIRECTORIES: 'bar' }) end - context "#new_refs" do - let(:rev_list) { described_class.new(newrev: 'newrev', path_to_repo: project.repository.path_to_repo) } + def stub_popen_rev_list(*additional_args, output:) + expect(rev_list).to receive(:popen).with([ + Gitlab.config.git.bin_path, + "--git-dir=#{project.repository.path_to_repo}", + 'rev-list', + *additional_args + ], + nil, + { + 'GIT_OBJECT_DIRECTORY' => 'foo', + 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar' + }).and_return([output, 0]) + end + context "#new_refs" do it 'calls out to `popen`' do - expect(rev_list).to receive(:popen).with([ - Gitlab.config.git.bin_path, - "--git-dir=#{project.repository.path_to_repo}", - 'rev-list', - 'newrev', - '--not', - '--all' - ], - nil, - { - 'GIT_OBJECT_DIRECTORY' => 'foo', - 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar' - }).and_return(["sha1\nsha2", 0]) + stub_popen_rev_list('newrev', '--not', '--all', output: "sha1\nsha2") expect(rev_list.new_refs).to eq(%w[sha1 sha2]) end end + context '#new_objects' do + it 'fetches list of newly pushed objects using rev-list' do + stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2") + + expect(rev_list.new_objects).to eq(%w[sha1 sha2]) + end + + it 'can skip pathless objects' do + stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2 path/to/file") + + expect(rev_list.new_objects(require_path: true)).to eq(%w[sha2]) + end + + it 'can return a lazy enumerator' do + stub_popen_rev_list('newrev', '--not', '--all', '--objects', output: "sha1\nsha2") + + expect(rev_list.new_objects(lazy: true)).to be_a Enumerator::Lazy + end + + it 'can accept list of references to exclude' do + stub_popen_rev_list('newrev', '--not', 'master', '--objects', output: "sha1\nsha2") + + expect(rev_list.new_objects(not_in: ['master'])).to eq(%w[sha1 sha2]) + end + + it 'handles empty list of references to exclude as listing all known objects' do + stub_popen_rev_list('newrev', '--objects', output: "sha1\nsha2") + + expect(rev_list.new_objects(not_in: [])).to eq(%w[sha1 sha2]) + end + end + + context '#all_objects' do + it 'fetches list of all pushed objects using rev-list' do + stub_popen_rev_list('--all', '--objects', output: "sha1\nsha2") + + expect(rev_list.all_objects.force).to eq(%w[sha1 sha2]) + end + end + context "#missed_ref" do let(:rev_list) { described_class.new(oldrev: 'oldrev', newrev: 'newrev', path_to_repo: project.repository.path_to_repo) } it 'calls out to `popen`' do - expect(rev_list).to receive(:popen).with([ - Gitlab.config.git.bin_path, - "--git-dir=#{project.repository.path_to_repo}", - 'rev-list', - '--max-count=1', - 'oldrev', - '^newrev' - ], - nil, - { - 'GIT_OBJECT_DIRECTORY' => 'foo', - 'GIT_ALTERNATE_OBJECT_DIRECTORIES' => 'bar' - }).and_return(["sha1\nsha2", 0]) + stub_popen_rev_list('--max-count=1', 'oldrev', '^newrev', output: "sha1\nsha2") expect(rev_list.missed_ref).to eq(%w[sha1 sha2]) end