Merge branch 'improve-metadata-access-performance' into 'master'

Improve metadata access performance

See merge request gitlab-org/gitlab-ce!20493
Commit c7b8afcd90 by Grzegorz Bizon, 2018-07-10 09:21:11 +00:00
15 changed files with 431 additions and 317 deletions


@@ -437,9 +437,9 @@ module Ci
end
def artifacts_metadata_entry(path, **options)
artifacts_metadata.use_file do |metadata_path|
artifacts_metadata.open do |metadata_stream|
metadata = Gitlab::Ci::Build::Artifacts::Metadata.new(
metadata_path,
metadata_stream,
path,
**options)
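
The switch from `use_file` to `open` is the heart of the change: `use_file` materialises the metadata as a local file, which for Object Storage means downloading the whole object, while `open` yields an IO-like stream that, for remote files, is the new `Gitlab::HttpIO` and only fetches the byte ranges the reader actually touches. A minimal sketch of the two calling patterns, assuming a `Ci::Build` instance named `build` with remotely stored metadata; the trailing `to_entry` call is not shown in the hunk and is only assumed here:

```ruby
# Before: the whole metadata object is pulled down to a local tempfile first.
build.artifacts_metadata.use_file do |metadata_path|
  Gitlab::Ci::Build::Artifacts::Metadata.new(metadata_path, 'some/path').to_entry
end

# After: an IO-like stream is yielded instead; for Object Storage this is
# Gitlab::HttpIO, so only the chunks the gzip reader needs are requested.
build.artifacts_metadata.open do |metadata_stream|
  Gitlab::Ci::Build::Artifacts::Metadata.new(metadata_stream, 'some/path').to_entry
end
```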


@@ -71,6 +71,28 @@ class GitlabUploader < CarrierWave::Uploader::Base
File.join('/', self.class.base_dir, dynamic_segment, filename)
end
def cached_size
size
end
def open
stream =
if file_storage?
File.open(path, "rb") if path
else
::Gitlab::HttpIO.new(url, cached_size) if url
end
return unless stream
return stream unless block_given?
begin
yield(stream)
ensure
stream.close
end
end
private
# Designed to be overridden by child uploaders that have a dynamic path
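
The new `#open` picks the backing stream (a local `File` when on file storage, a `Gitlab::HttpIO` when on object storage) and then follows the usual Ruby block-or-return contract: with a block the stream is yielded and always closed in the `ensure`, without one the caller receives the stream and owns its lifetime. A standalone sketch of that control flow using plain `File` I/O only; nothing below is GitLab API:

```ruby
# Plain-Ruby illustration of the block-or-return pattern used by #open above.
def open_stream(path)
  stream = File.open(path, "rb") if path && File.exist?(path)
  return unless stream              # nothing to open -> nil, a block is never called
  return stream unless block_given? # no block -> hand the stream to the caller

  begin
    yield(stream)                   # block form: the block's result is returned
  ensure
    stream.close                    # ...and the stream is always closed
  end
end

File.write("example.bin", "hello")
open_stream("example.bin") { |io| puts io.read } # closed automatically
io = open_stream("example.bin")                  # caller must close this one
puts io.read
io.close
```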


@@ -18,14 +18,6 @@ class JobArtifactUploader < GitlabUploader
dynamic_segment
end
def open
if file_storage?
File.open(path, "rb") if path
else
::Gitlab::Ci::Trace::HttpIO.new(url, cached_size) if url
end
end
private
def dynamic_segment


@@ -0,0 +1,5 @@
---
title: Access metadata directly from Object Storage
merge_request:
author:
type: performance


@@ -7,14 +7,15 @@ module Gitlab
module Artifacts
class Metadata
ParserError = Class.new(StandardError)
InvalidStreamError = Class.new(StandardError)
VERSION_PATTERN = /^[\w\s]+(\d+\.\d+\.\d+)/
INVALID_PATH_PATTERN = %r{(^\.?\.?/)|(/\.?\.?/)}
attr_reader :file, :path, :full_version
attr_reader :stream, :path, :full_version
def initialize(file, path, **opts)
@file, @path, @opts = file, path, opts
def initialize(stream, path, **opts)
@stream, @path, @opts = stream, path, opts
@full_version = read_version
end
@@ -103,7 +104,17 @@ module Gitlab
end
def gzip(&block)
Zlib::GzipReader.open(@file, &block)
raise InvalidStreamError, "Invalid stream" unless @stream
# restart gzip reading
@stream.seek(0)
gz = Zlib::GzipReader.new(@stream)
yield(gz)
rescue Zlib::Error => e
raise InvalidStreamError, e.message
ensure
gz&.finish
end
end
end
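
Reading gzip from a stream instead of a path changes the failure modes: the stream is rewound with `seek(0)` so `GzipReader` starts at the gzip header, `Zlib::Error` (raised for non-gzip content such as the zip fixture used in the specs below) is converted into `InvalidStreamError`, and `gz&.finish` tears down the reader without closing the underlying stream, which still belongs to the uploader's `open` block. A self-contained sketch of the same pattern using `StringIO`, for illustration only:

```ruby
require 'zlib'
require 'stringio'

# Build a small gzip payload in memory.
gzipped = StringIO.new
Zlib::GzipWriter.wrap(gzipped) { |gz| gz.write("path/to/entry\n") }

stream = StringIO.new(gzipped.string)
stream.read(4)                    # pretend something already consumed the stream
stream.seek(0)                    # rewind so GzipReader sees the gzip header
gz = Zlib::GzipReader.new(stream)
puts gz.read                      # => path/to/entry
gz.finish                         # releases the reader without closing `stream`

begin
  Zlib::GzipReader.new(StringIO.new("not gzip at all"))
rescue Zlib::Error => e
  puts "would surface as InvalidStreamError: #{e.message}"  # "not in gzip format"
end
```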


@@ -1,197 +0,0 @@
##
# This class is compatible with IO class (https://ruby-doc.org/core-2.3.1/IO.html)
# source: https://gitlab.com/snippets/1685610
module Gitlab
module Ci
class Trace
class HttpIO
BUFFER_SIZE = 128.kilobytes
InvalidURLError = Class.new(StandardError)
FailedToGetChunkError = Class.new(StandardError)
attr_reader :uri, :size
attr_reader :tell
attr_reader :chunk, :chunk_range
alias_method :pos, :tell
def initialize(url, size)
raise InvalidURLError unless ::Gitlab::UrlSanitizer.valid?(url)
@uri = URI(url)
@size = size
@tell = 0
end
def close
# no-op
end
def binmode
# no-op
end
def binmode?
true
end
def path
nil
end
def url
@uri.to_s
end
def seek(pos, where = IO::SEEK_SET)
new_pos =
case where
when IO::SEEK_END
size + pos
when IO::SEEK_SET
pos
when IO::SEEK_CUR
tell + pos
else
-1
end
raise 'new position is outside of file' if new_pos < 0 || new_pos > size
@tell = new_pos
end
def eof?
tell == size
end
def each_line
until eof?
line = readline
break if line.nil?
yield(line)
end
end
def read(length = nil, outbuf = "")
out = ""
length ||= size - tell
until length <= 0 || eof?
data = get_chunk
break if data.empty?
chunk_bytes = [BUFFER_SIZE - chunk_offset, length].min
chunk_data = data.byteslice(0, chunk_bytes)
out << chunk_data
@tell += chunk_data.bytesize
length -= chunk_data.bytesize
end
# If outbuf is passed, we put the output into the buffer. This supports IO.copy_stream functionality
if outbuf
outbuf.slice!(0, outbuf.bytesize)
outbuf << out
end
out
end
def readline
out = ""
until eof?
data = get_chunk
new_line = data.index("\n")
if !new_line.nil?
out << data[0..new_line]
@tell += new_line + 1
break
else
out << data
@tell += data.bytesize
end
end
out
end
def write(data)
raise NotImplementedError
end
def truncate(offset)
raise NotImplementedError
end
def flush
raise NotImplementedError
end
def present?
true
end
private
##
# The below methods are not implemented in IO class
#
def in_range?
@chunk_range&.include?(tell)
end
def get_chunk
unless in_range?
response = Net::HTTP.start(uri.hostname, uri.port, proxy_from_env: true, use_ssl: uri.scheme == 'https') do |http|
http.request(request)
end
raise FailedToGetChunkError unless response.code == '200' || response.code == '206'
@chunk = response.body.force_encoding(Encoding::BINARY)
@chunk_range = response.content_range
##
# Note: If the provider does not return content_range, then we set it to the range we requested
# Provider: minio
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# Provider: AWS
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# Provider: GCS
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPOK 200
@chunk_range ||= (chunk_start...(chunk_start + @chunk.bytesize))
end
@chunk[chunk_offset..BUFFER_SIZE]
end
def request
Net::HTTP::Get.new(uri).tap do |request|
request.set_range(chunk_start, BUFFER_SIZE)
end
end
def chunk_offset
tell % BUFFER_SIZE
end
def chunk_start
(tell / BUFFER_SIZE) * BUFFER_SIZE
end
def chunk_end
[chunk_start + BUFFER_SIZE, size].min
end
end
end
end
end

lib/gitlab/http_io.rb (new file, 193 lines)

@@ -0,0 +1,193 @@
##
# This class is compatible with IO class (https://ruby-doc.org/core-2.3.1/IO.html)
# source: https://gitlab.com/snippets/1685610
module Gitlab
class HttpIO
BUFFER_SIZE = 128.kilobytes
InvalidURLError = Class.new(StandardError)
FailedToGetChunkError = Class.new(StandardError)
attr_reader :uri, :size
attr_reader :tell
attr_reader :chunk, :chunk_range
alias_method :pos, :tell
def initialize(url, size)
raise InvalidURLError unless ::Gitlab::UrlSanitizer.valid?(url)
@uri = URI(url)
@size = size
@tell = 0
end
def close
# no-op
end
def binmode
# no-op
end
def binmode?
true
end
def path
nil
end
def url
@uri.to_s
end
def seek(pos, where = IO::SEEK_SET)
new_pos =
case where
when IO::SEEK_END
size + pos
when IO::SEEK_SET
pos
when IO::SEEK_CUR
tell + pos
else
-1
end
raise 'new position is outside of file' if new_pos < 0 || new_pos > size
@tell = new_pos
end
def eof?
tell == size
end
def each_line
until eof?
line = readline
break if line.nil?
yield(line)
end
end
def read(length = nil, outbuf = "")
out = ""
length ||= size - tell
until length <= 0 || eof?
data = get_chunk
break if data.empty?
chunk_bytes = [BUFFER_SIZE - chunk_offset, length].min
chunk_data = data.byteslice(0, chunk_bytes)
out << chunk_data
@tell += chunk_data.bytesize
length -= chunk_data.bytesize
end
# If outbuf is passed, we put the output into the buffer. This supports IO.copy_stream functionality
if outbuf
outbuf.slice!(0, outbuf.bytesize)
outbuf << out
end
out
end
def readline
out = ""
until eof?
data = get_chunk
new_line = data.index("\n")
if !new_line.nil?
out << data[0..new_line]
@tell += new_line + 1
break
else
out << data
@tell += data.bytesize
end
end
out
end
def write(data)
raise NotImplementedError
end
def truncate(offset)
raise NotImplementedError
end
def flush
raise NotImplementedError
end
def present?
true
end
private
##
# The below methods are not implemented in IO class
#
def in_range?
@chunk_range&.include?(tell)
end
def get_chunk
unless in_range?
response = Net::HTTP.start(uri.hostname, uri.port, proxy_from_env: true, use_ssl: uri.scheme == 'https') do |http|
http.request(request)
end
raise FailedToGetChunkError unless response.code == '200' || response.code == '206'
@chunk = response.body.force_encoding(Encoding::BINARY)
@chunk_range = response.content_range
##
# Note: If the provider does not return content_range, then we set it to the range we requested
# Provider: minio
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# Provider: AWS
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# Provider: GCS
# - When the file size is larger than the requested Content-Range, the Content-Range is included in responses with Net::HTTPPartialContent 206
# - When the file size is smaller than the requested Content-Range, the Content-Range is included in responses with Net::HTTPOK 200
@chunk_range ||= (chunk_start...(chunk_start + @chunk.bytesize))
end
@chunk[chunk_offset..BUFFER_SIZE]
end
def request
Net::HTTP::Get.new(uri).tap do |request|
request.set_range(chunk_start, BUFFER_SIZE)
end
end
def chunk_offset
tell % BUFFER_SIZE
end
def chunk_start
(tell / BUFFER_SIZE) * BUFFER_SIZE
end
def chunk_end
[chunk_start + BUFFER_SIZE, size].min
end
end
end
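
`Gitlab::HttpIO` keeps a single `BUFFER_SIZE` chunk in memory and turns reads into at most one ranged GET per chunk: `chunk_start` rounds the current position down to a 128 KB boundary for the `Range` header, `chunk_offset` is where the position falls inside the cached chunk, and `@chunk_range` decides whether the cached chunk can be reused. A small standalone check of that arithmetic in plain Ruby, with the constant mirroring the class above:

```ruby
BUFFER_SIZE = 128 * 1024 # 128.kilobytes once ActiveSupport is loaded

def chunk_start(tell)
  (tell / BUFFER_SIZE) * BUFFER_SIZE
end

def chunk_offset(tell)
  tell % BUFFER_SIZE
end

tell = 300_000          # current read position in the remote object
puts chunk_start(tell)  # => 262144: the GET carries "Range: bytes=262144-393215"
puts chunk_offset(tell) # => 37856: where this read starts inside the cached chunk
# Further reads are served from the cached chunk until the position leaves
# 262144...393216, at which point the next 128 KB slice is fetched.
```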


@@ -216,17 +216,19 @@ describe Projects::JobsController, :clean_gitlab_redis_shared_state do
end
context 'when trace artifact is in ObjectStorage' do
let(:url) { 'http://object-storage/trace' }
let(:file_path) { expand_fixture_path('trace/sample_trace') }
let!(:job) { create(:ci_build, :success, :trace_artifact, pipeline: pipeline) }
before do
allow_any_instance_of(JobArtifactUploader).to receive(:file_storage?) { false }
allow_any_instance_of(JobArtifactUploader).to receive(:url) { remote_trace_url }
allow_any_instance_of(JobArtifactUploader).to receive(:size) { remote_trace_size }
allow_any_instance_of(JobArtifactUploader).to receive(:url) { url }
allow_any_instance_of(JobArtifactUploader).to receive(:size) { File.size(file_path) }
end
context 'when there are no network issues' do
before do
stub_remote_trace_206
stub_remote_url_206(url, file_path)
get_trace
end
@@ -241,11 +243,11 @@ describe Projects::JobsController, :clean_gitlab_redis_shared_state do
context 'when there is a network issue' do
before do
stub_remote_trace_500
stub_remote_url_500(url)
end
it 'returns a trace' do
expect { get_trace }.to raise_error(Gitlab::Ci::Trace::HttpIO::FailedToGetChunkError)
expect { get_trace }.to raise_error(Gitlab::HttpIO::FailedToGetChunkError)
end
end
end


@@ -2,13 +2,21 @@ require 'spec_helper'
describe Gitlab::Ci::Build::Artifacts::Metadata do
def metadata(path = '', **opts)
described_class.new(metadata_file_path, path, **opts)
described_class.new(metadata_file_stream, path, **opts)
end
let(:metadata_file_path) do
Rails.root + 'spec/fixtures/ci_build_artifacts_metadata.gz'
end
let(:metadata_file_stream) do
File.open(metadata_file_path) if metadata_file_path
end
after do
metadata_file_stream&.close
end
context 'metadata file exists' do
describe '#find_entries! empty string' do
subject { metadata('').find_entries! }
@@ -86,11 +94,21 @@ describe Gitlab::Ci::Build::Artifacts::Metadata do
end
context 'metadata file does not exist' do
let(:metadata_file_path) { '' }
let(:metadata_file_path) { nil }
describe '#find_entries!' do
it 'raises error' do
expect { metadata.find_entries! }.to raise_error(Errno::ENOENT)
expect { metadata.find_entries! }.to raise_error(described_class::InvalidStreamError, /Invalid stream/)
end
end
end
context 'metadata file is invalid' do
let(:metadata_file_path) { Rails.root + 'spec/fixtures/ci_build_artifacts.zip' }
describe '#find_entries!' do
it 'raises error' do
expect { metadata.find_entries! }.to raise_error(described_class::InvalidStreamError, /not in gzip format/)
end
end
end


@@ -1,11 +1,14 @@
require 'spec_helper'
describe Gitlab::Ci::Trace::HttpIO do
describe Gitlab::HttpIO do
include HttpIOHelpers
let(:http_io) { described_class.new(url, size) }
let(:url) { remote_trace_url }
let(:size) { remote_trace_size }
let(:url) { 'http://object-storage/trace' }
let(:file_path) { expand_fixture_path('trace/sample_trace') }
let(:file_body) { File.read(file_path).force_encoding(Encoding::BINARY) }
let(:size) { File.size(file_path) }
describe '#close' do
subject { http_io.close }
@@ -86,10 +89,10 @@ describe Gitlab::Ci::Trace::HttpIO do
describe '#each_line' do
subject { http_io.each_line }
let(:string_io) { StringIO.new(remote_trace_body) }
let(:string_io) { StringIO.new(file_body) }
before do
stub_remote_trace_206
stub_remote_url_206(url, file_path)
end
it 'yields lines' do
@@ -99,7 +102,7 @@ describe Gitlab::Ci::Trace::HttpIO do
context 'when buckets on GCS' do
context 'when BUFFER_SIZE is larger than file size' do
before do
stub_remote_trace_200
stub_remote_url_200(url, file_path)
set_larger_buffer_size_than(size)
end
@@ -117,7 +120,7 @@ describe Gitlab::Ci::Trace::HttpIO do
context 'when there are no network issue' do
before do
stub_remote_trace_206
stub_remote_url_206(url, file_path)
end
context 'when read whole size' do
@@ -129,7 +132,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body)
is_expected.to eq(file_body)
end
end
@@ -139,7 +142,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body)
is_expected.to eq(file_body)
end
end
end
@@ -153,7 +156,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body[0, length])
is_expected.to eq(file_body[0, length])
end
end
@@ -163,7 +166,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body[0, length])
is_expected.to eq(file_body[0, length])
end
end
end
@@ -177,7 +180,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body)
is_expected.to eq(file_body)
end
end
@@ -187,7 +190,7 @@ describe Gitlab::Ci::Trace::HttpIO do
end
it 'reads a trace' do
is_expected.to eq(remote_trace_body)
is_expected.to eq(file_body)
end
end
end
@@ -221,11 +224,11 @@ describe Gitlab::Ci::Trace::HttpIO do
let(:length) { nil }
before do
stub_remote_trace_500
stub_remote_url_500(url)
end
it 'reads a trace' do
expect { subject }.to raise_error(Gitlab::Ci::Trace::HttpIO::FailedToGetChunkError)
expect { subject }.to raise_error(Gitlab::HttpIO::FailedToGetChunkError)
end
end
end
@@ -233,15 +236,15 @@ describe Gitlab::Ci::Trace::HttpIO do
describe '#readline' do
subject { http_io.readline }
let(:string_io) { StringIO.new(remote_trace_body) }
let(:string_io) { StringIO.new(file_body) }
before do
stub_remote_trace_206
stub_remote_url_206(url, file_path)
end
shared_examples 'all line matching' do
it 'reads a line' do
(0...remote_trace_body.lines.count).each do
(0...file_body.lines.count).each do
expect(http_io.readline).to eq(string_io.readline)
end
end
@@ -251,11 +254,11 @@ describe Gitlab::Ci::Trace::HttpIO do
let(:length) { nil }
before do
stub_remote_trace_500
stub_remote_url_500(url)
end
it 'reads a trace' do
expect { subject }.to raise_error(Gitlab::Ci::Trace::HttpIO::FailedToGetChunkError)
expect { subject }.to raise_error(Gitlab::HttpIO::FailedToGetChunkError)
end
end


@@ -2687,4 +2687,58 @@ describe Ci::Build do
end
end
end
describe '#artifacts_metadata_entry' do
set(:build) { create(:ci_build, project: project) }
let(:path) { 'other_artifacts_0.1.2/another-subdirectory/banana_sample.gif' }
before do
stub_artifacts_object_storage
end
subject { build.artifacts_metadata_entry(path) }
context 'when using local storage' do
let!(:metadata) { create(:ci_job_artifact, :metadata, job: build) }
context 'for existing file' do
it 'does exist' do
is_expected.to be_exists
end
end
context 'for non-existing file' do
let(:path) { 'invalid-file' }
it 'does not exist' do
is_expected.not_to be_exists
end
end
end
context 'when using remote storage' do
include HttpIOHelpers
let!(:metadata) { create(:ci_job_artifact, :remote_store, :metadata, job: build) }
let(:file_path) { expand_fixture_path('ci_build_artifacts_metadata.gz') }
before do
stub_remote_url_206(metadata.file.url, file_path)
end
context 'for existing file' do
it 'does exist' do
is_expected.to be_exists
end
end
context 'for non-existing file' do
let(:path) { 'invalid-file' }
it 'does not exist' do
is_expected.not_to be_exists
end
end
end
end
end


@@ -535,12 +535,14 @@ describe API::Jobs do
context 'authorized user' do
context 'when trace is in ObjectStorage' do
let!(:job) { create(:ci_build, :trace_artifact, pipeline: pipeline) }
let(:url) { 'http://object-storage/trace' }
let(:file_path) { expand_fixture_path('trace/sample_trace') }
before do
stub_remote_trace_206
stub_remote_url_206(url, file_path)
allow_any_instance_of(JobArtifactUploader).to receive(:file_storage?) { false }
allow_any_instance_of(JobArtifactUploader).to receive(:url) { remote_trace_url }
allow_any_instance_of(JobArtifactUploader).to receive(:size) { remote_trace_size }
allow_any_instance_of(JobArtifactUploader).to receive(:url) { url }
allow_any_instance_of(JobArtifactUploader).to receive(:size) { File.size(file_path) }
end
it 'returns specific job trace' do


@@ -1,65 +1,49 @@
module HttpIOHelpers
def stub_remote_trace_206
WebMock.stub_request(:get, remote_trace_url)
.to_return { |request| remote_trace_response(request, 206) }
def stub_remote_url_206(url, file_path)
WebMock.stub_request(:get, url)
.to_return { |request| remote_url_response(file_path, request, 206) }
end
def stub_remote_trace_200
WebMock.stub_request(:get, remote_trace_url)
.to_return { |request| remote_trace_response(request, 200) }
def stub_remote_url_200(url, file_path)
WebMock.stub_request(:get, url)
.to_return { |request| remote_url_response(file_path, request, 200) }
end
def stub_remote_trace_500
WebMock.stub_request(:get, remote_trace_url)
def stub_remote_url_500(url)
WebMock.stub_request(:get, url)
.to_return(status: [500, "Internal Server Error"])
end
def remote_trace_url
"http://trace.com/trace"
end
def remote_trace_response(request, responce_status)
def remote_url_response(file_path, request, response_status)
range = request.headers['Range'].match(/bytes=(\d+)-(\d+)/)
body = File.read(file_path).force_encoding(Encoding::BINARY)
size = body.bytesize
{
status: responce_status,
headers: remote_trace_response_headers(responce_status, range[1].to_i, range[2].to_i),
body: range_trace_body(range[1].to_i, range[2].to_i)
status: response_status,
headers: remote_url_response_headers(response_status, range[1].to_i, range[2].to_i, size),
body: body[range[1].to_i..range[2].to_i]
}
end
def remote_trace_response_headers(responce_status, from, to)
headers = { 'Content-Type' => 'text/plain' }
if responce_status == 206
headers.merge('Content-Range' => "bytes #{from}-#{to}/#{remote_trace_size}")
def remote_url_response_headers(response_status, from, to, size)
{ 'Content-Type' => 'text/plain' }.tap do |headers|
if response_status == 206
headers.merge('Content-Range' => "bytes #{from}-#{to}/#{size}")
end
end
headers
end
def range_trace_body(from, to)
remote_trace_body[from..to]
end
def remote_trace_body
@remote_trace_body ||= File.read(expand_fixture_path('trace/sample_trace'))
.force_encoding(Encoding::BINARY)
end
def remote_trace_size
remote_trace_body.bytesize
end
def set_smaller_buffer_size_than(file_size)
blocks = (file_size / 128)
new_size = (blocks / 2) * 128
stub_const("Gitlab::Ci::Trace::HttpIO::BUFFER_SIZE", new_size)
stub_const("Gitlab::HttpIO::BUFFER_SIZE", new_size)
end
def set_larger_buffer_size_than(file_size)
blocks = (file_size / 128)
new_size = (blocks * 2) * 128
stub_const("Gitlab::Ci::Trace::HttpIO::BUFFER_SIZE", new_size)
stub_const("Gitlab::HttpIO::BUFFER_SIZE", new_size)
end
end
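
The helpers are no longer hard-wired to the trace fixture: each stub now takes the URL and the fixture file to serve, so the same module can back the trace specs and the new metadata specs alike. A hedged usage sketch; the URL, fixture and `path` are the ones used in the build spec above, and `metadata`/`build` are assumed to exist as in that spec:

```ruby
include HttpIOHelpers

# Happy path: serve ranged 206 responses for the remote metadata object.
stub_remote_url_206(metadata.file.url, expand_fixture_path('ci_build_artifacts_metadata.gz'))
expect(build.artifacts_metadata_entry(path)).to be_exists

# Failure path: reads that go through Gitlab::HttpIO then raise
# Gitlab::HttpIO::FailedToGetChunkError, as the http_io and controller specs assert.
stub_remote_url_500(metadata.file.url)
```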


@@ -68,4 +68,66 @@ describe GitlabUploader do
expect(subject.file.path).to match(/#{subject.cache_dir}/)
end
end
describe '#open' do
context 'when trace is stored in File storage' do
context 'when file exists' do
let(:file) do
fixture_file_upload('spec/fixtures/trace/sample_trace', 'text/plain')
end
before do
subject.store!(file)
end
it 'returns io stream' do
expect(subject.open).to be_a(IO)
end
it 'when passing block it yields' do
expect { |b| subject.open(&b) }.to yield_control
end
end
context 'when file does not exist' do
it 'returns nil' do
expect(subject.open).to be_nil
end
it 'when passing block it does not yield' do
expect { |b| subject.open(&b) }.not_to yield_control
end
end
end
context 'when trace is stored in Object storage' do
before do
allow(subject).to receive(:file_storage?) { false }
end
context 'when file exists' do
before do
allow(subject).to receive(:url) { 'http://object_storage.com/trace' }
end
it 'returns http io stream' do
expect(subject.open).to be_a(Gitlab::HttpIO)
end
it 'when passing block it yields' do
expect { |b| subject.open(&b) }.to yield_control.once
end
end
context 'when file does not exist' do
it 'returns nil' do
expect(subject.open).to be_nil
end
it 'when passing block it does not yield' do
expect { |b| subject.open(&b) }.not_to yield_control
end
end
end
end
end


@@ -23,43 +23,6 @@ describe JobArtifactUploader do
store_dir: %r[\h{2}/\h{2}/\h{64}/\d{4}_\d{1,2}_\d{1,2}/\d+/\d+\z]
end
describe '#open' do
subject { uploader.open }
context 'when trace is stored in File storage' do
context 'when file exists' do
let(:file) do
fixture_file_upload('spec/fixtures/trace/sample_trace', 'text/plain')
end
before do
uploader.store!(file)
end
it 'returns io stream' do
is_expected.to be_a(IO)
end
end
context 'when file does not exist' do
it 'returns nil' do
is_expected.to be_nil
end
end
end
context 'when trace is stored in Object storage' do
before do
allow(uploader).to receive(:file_storage?) { false }
allow(uploader).to receive(:url) { 'http://object_storage.com/trace' }
end
it 'returns http io stream' do
is_expected.to be_a(Gitlab::Ci::Trace::HttpIO)
end
end
end
context 'file is stored in valid local_path' do
let(:file) do
fixture_file_upload('spec/fixtures/ci_build_artifacts.zip', 'application/zip')