Improve performance and memory usage of project export

ActiveModel::Serialization is simple in that it recursively calls
`as_json` on each object to serialize everything. However, for a model
like a Project, this can generate a query for every single association,
which can add up to tens of thousands of queries and lead to memory
bloat.

To improve this, we can do several things:

1. We use `tree:` and `preload:` to automatically generate
   a list of all preloads that could be used to serialize
   objects in bulk.

2. We observe that a single project has many issues, merge requests,
   etc. Instead of serializing everything at once, which could lead to
   database timeouts and high memory usage, we take each top-level
   association and serialize the data in batches.

For example, we serialize the first 100 issues and preload all of
their associated events, notes, etc. before moving onto the next
batch. When we're done, we serialize merge requests in the same way.
We repeat this pattern for the remaining associations specified in
import_export.yml.
This commit is contained in:
Kamil Trzciński 2019-09-09 15:40:49 +00:00 committed by Stan Hu
parent 383f363589
commit 0e56c1e7cb
10 changed files with 547 additions and 19 deletions

View File

@ -0,0 +1,5 @@
---
title: Reduce N+1 when doing project export
merge_request: 32423
author:
type: performance

View File

@ -8,6 +8,7 @@ module Gitlab
@included_attributes = config[:included_attributes] || {}
@excluded_attributes = config[:excluded_attributes] || {}
@methods = config[:methods] || {}
@preloads = config[:preloads] || {}
end
def find_root(model_key)
@ -29,10 +30,26 @@ module Gitlab
only: @included_attributes[model_key],
except: @excluded_attributes[model_key],
methods: @methods[model_key],
include: resolve_model_tree(model_tree)
include: resolve_model_tree(model_tree),
preload: resolve_preloads(model_key, model_tree)
}.compact
end
def resolve_preloads(model_key, model_tree)
model_tree
.map { |submodel_key, submodel_tree| resolve_preload(model_key, submodel_key, submodel_tree) }
.compact
.to_h
.deep_merge(@preloads[model_key].to_h)
.presence
end
def resolve_preload(parent_model_key, model_key, model_tree)
return if @methods[parent_model_key]&.include?(model_key)
[model_key, resolve_preloads(model_key, model_tree)]
end
def resolve_model_tree(model_tree)
return unless model_tree

View File

@ -0,0 +1,108 @@
# frozen_string_literal: true
# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
# is simple in that it recursively calls `as_json` on each object to
# serialize everything. However, for a model like a Project, this can
# generate a query for every single association, which can add up to tens
# of thousands of queries and lead to memory bloat.
#
# To improve this, we can do several things:
# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
# to generate the necessary preload clauses.
#
# 2. We observe that a single project has many issues, merge requests,
# etc. Instead of serializing everything at once, which could lead to
# database timeouts and high memory usage, we take each top-level
# association and serialize the data in batches.
#
# For example, we serialize the first 100 issues and preload all of
# their associated events, notes, etc. before moving onto the next
# batch. When we're done, we serialize merge requests in the same way.
# We repeat this pattern for the remaining associations specified in
# import_export.yml.
module Gitlab
module ImportExport
class FastHashSerializer
attr_reader :subject, :tree
BATCH_SIZE = 100
def initialize(subject, tree, batch_size: BATCH_SIZE)
@subject = subject
@batch_size = batch_size
@tree = tree
end
# Serializes the subject into a Hash for the given option tree
# (e.g. Project#as_json)
def execute
simple_serialize.merge(serialize_includes)
end
private
def simple_serialize
subject.as_json(
tree.merge(include: nil, preloads: nil))
end
def serialize_includes
return {} unless includes
includes
.map(&method(:serialize_include_definition))
.compact
.to_h
end
# definition:
# { labels: { includes: ... } }
def serialize_include_definition(definition)
raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash)
raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one?
key = definition.first.first
options = definition.first.second
record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
return unless record
serialized_record = serialize_record(key, record, options)
return unless serialized_record
# `#as_json` always returns keys as `strings`
[key.to_s, serialized_record]
end
def serialize_record(key, record, options)
unless record.respond_to?(:as_json)
raise "Invalid type of #{key} is #{record.class}"
end
# no has-many relation
unless record.is_a?(ActiveRecord::Relation)
return record.as_json(options)
end
# has-many relation
data = []
record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
batch = batch.preload(preloads[key]) if preloads&.key?(key)
data += batch.as_json(options)
end
data
end
def includes
tree[:include]
end
def preloads
tree[:preload]
end
end
end
end

View File

@ -231,6 +231,16 @@ methods:
ci_pipelines:
- :notes
preloads:
statuses:
# TODO: We cannot preload tags, as they are not part of `GenericCommitStatus`
# tags: # needed by tag_list
project: # deprecated: needed by coverage_regex of Ci::Build
merge_requests:
source_project: # needed by source_branch_sha and diff_head_sha
target_project: # needed by target_branch_sha
assignees: # needed by assigne_id that is implemented by DeprecatedAssignee
# EE specific relationships and settings to include. All of this will be merged
# into the previous structures if EE is used.
ee:

View File

@ -41,7 +41,13 @@ module Gitlab
end
def serialize_project_tree
@project.as_json(reader.project_tree)
if Feature.enabled?(:export_fast_serialize, default_enabled: true)
Gitlab::ImportExport::FastHashSerializer
.new(@project, reader.project_tree)
.execute
else
@project.as_json(reader.project_tree)
end
end
def reader

View File

@ -20,20 +20,41 @@ describe Gitlab::ImportExport::AttributesFinder do
except: [:iid],
include: [
{ merge_request_diff: {
include: []
include: [],
preload: { source_project: nil }
} },
{ merge_request_test: { include: [] } }
],
only: [:id]
only: [:id],
preload: {
merge_request_diff: { source_project: nil },
merge_request_test: nil
}
} },
{ commit_statuses: {
include: [{ commit: { include: [] } }]
include: [{ commit: { include: [] } }],
preload: { commit: nil }
} },
{ project_members: {
include: [{ user: { include: [],
only: [:email] } }]
only: [:email] } }],
preload: { user: nil }
} }
]
],
preload: {
commit_statuses: {
commit: nil
},
issues: nil,
labels: nil,
merge_requests: {
merge_request_diff: { source_project: nil },
merge_request_test: nil
},
project_members: {
user: nil
}
}
}
end
@ -50,7 +71,8 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [:issues] })
is_expected.to match(
include: [{ issues: { include: [] } }]
include: [{ issues: { include: [] } }],
preload: { issues: nil }
)
end
@ -58,7 +80,8 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [:project_feature] })
is_expected.to match(
include: [{ project_feature: { include: [] } }]
include: [{ project_feature: { include: [] } }],
preload: { project_feature: nil }
)
end
@ -67,7 +90,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [] } },
{ snippets: { include: [] } }]
{ snippets: { include: [] } }],
preload: { issues: nil, snippets: nil }
)
end
@ -75,7 +99,9 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [issues: [:notes]] })
is_expected.to match(
include: [{ issues: { include: [{ notes: { include: [] } }] } }]
include: [{ issues: { include: [{ notes: { include: [] } }],
preload: { notes: nil } } }],
preload: { issues: { notes: nil } }
)
end
@ -85,7 +111,9 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ merge_requests:
{ include: [{ notes: { include: [] } },
{ merge_request_diff: { include: [] } }] } }]
{ merge_request_diff: { include: [] } }],
preload: { merge_request_diff: nil, notes: nil } } }],
preload: { merge_requests: { merge_request_diff: nil, notes: nil } }
)
end
@ -94,8 +122,11 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ merge_requests: {
include: [{ notes: { include: [{ author: { include: [] } }] } }]
} }]
include: [{ notes: { include: [{ author: { include: [] } }],
preload: { author: nil } } }],
preload: { notes: { author: nil } }
} }],
preload: { merge_requests: { notes: { author: nil } } }
)
end
@ -105,7 +136,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [],
only: [:name, :description] } }]
only: [:name, :description] } }],
preload: { issues: nil }
)
end
@ -115,7 +147,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { except: [:name],
include: [] } }]
include: [] } }],
preload: { issues: nil }
)
end
@ -127,7 +160,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { except: [:name],
include: [],
only: [:description] } }]
only: [:description] } }],
preload: { issues: nil }
)
end
@ -137,7 +171,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [],
methods: [:name] } }]
methods: [:name] } }],
preload: { issues: nil }
)
end

View File

@ -25,7 +25,7 @@ describe Gitlab::ImportExport::Config do
expect { subject }.not_to raise_error
expect(subject).to be_a(Hash)
expect(subject.keys).to contain_exactly(
:tree, :excluded_attributes, :included_attributes, :methods)
:tree, :excluded_attributes, :included_attributes, :methods, :preloads)
end
end
end
@ -55,6 +55,10 @@ describe Gitlab::ImportExport::Config do
events:
- :action
preloads:
statuses:
project:
ee:
tree:
project:
@ -71,6 +75,9 @@ describe Gitlab::ImportExport::Config do
- :type_ee
events_ee:
- :action_ee
preloads:
statuses:
bridge_ee:
EOF
end
@ -111,6 +118,11 @@ describe Gitlab::ImportExport::Config do
methods: {
labels: [:type],
events: [:action]
},
preloads: {
statuses: {
project: nil
}
}
}
)
@ -150,6 +162,12 @@ describe Gitlab::ImportExport::Config do
labels: [:type, :type_ee],
events: [:action],
events_ee: [:action_ee]
},
preloads: {
statuses: {
project: nil,
bridge_ee: nil
}
}
}
)

View File

@ -0,0 +1,272 @@
require 'spec_helper'
describe Gitlab::ImportExport::FastHashSerializer do
subject { described_class.new(project, tree).execute }
let!(:project) { setup_project }
let(:user) { create(:user) }
let(:shared) { project.import_export_shared }
let(:reader) { Gitlab::ImportExport::Reader.new(shared: shared) }
let(:tree) { reader.project_tree }
before do
project.add_maintainer(user)
allow_any_instance_of(MergeRequest).to receive(:source_branch_sha).and_return('ABCD')
allow_any_instance_of(MergeRequest).to receive(:target_branch_sha).and_return('DCBA')
end
it 'saves the correct hash' do
is_expected.to include({ 'description' => 'description', 'visibility_level' => 20 })
end
it 'has approvals_before_merge set' do
expect(subject['approvals_before_merge']).to eq(1)
end
it 'has milestones' do
expect(subject['milestones']).not_to be_empty
end
it 'has merge requests' do
expect(subject['merge_requests']).not_to be_empty
end
it 'has merge request\'s milestones' do
expect(subject['merge_requests'].first['milestone']).not_to be_empty
end
it 'has merge request\'s source branch SHA' do
expect(subject['merge_requests'].first['source_branch_sha']).to eq('ABCD')
end
it 'has merge request\'s target branch SHA' do
expect(subject['merge_requests'].first['target_branch_sha']).to eq('DCBA')
end
it 'has events' do
expect(subject['merge_requests'].first['milestone']['events']).not_to be_empty
end
it 'has snippets' do
expect(subject['snippets']).not_to be_empty
end
it 'has snippet notes' do
expect(subject['snippets'].first['notes']).not_to be_empty
end
it 'has releases' do
expect(subject['releases']).not_to be_empty
end
it 'has no author on releases' do
expect(subject['releases'].first['author']).to be_nil
end
it 'has the author ID on releases' do
expect(subject['releases'].first['author_id']).not_to be_nil
end
it 'has issues' do
expect(subject['issues']).not_to be_empty
end
it 'has issue comments' do
notes = subject['issues'].first['notes']
expect(notes).not_to be_empty
expect(notes.first['type']).to eq('DiscussionNote')
end
it 'has issue assignees' do
expect(subject['issues'].first['issue_assignees']).not_to be_empty
end
it 'has author on issue comments' do
expect(subject['issues'].first['notes'].first['author']).not_to be_empty
end
it 'has project members' do
expect(subject['project_members']).not_to be_empty
end
it 'has merge requests diffs' do
expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty
end
it 'has merge request diff files' do
expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_files']).not_to be_empty
end
it 'has merge request diff commits' do
expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_commits']).not_to be_empty
end
it 'has merge requests comments' do
expect(subject['merge_requests'].first['notes']).not_to be_empty
end
it 'has author on merge requests comments' do
expect(subject['merge_requests'].first['notes'].first['author']).not_to be_empty
end
it 'has pipeline stages' do
expect(subject.dig('ci_pipelines', 0, 'stages')).not_to be_empty
end
it 'has pipeline statuses' do
expect(subject.dig('ci_pipelines', 0, 'stages', 0, 'statuses')).not_to be_empty
end
it 'has pipeline builds' do
builds_count = subject
.dig('ci_pipelines', 0, 'stages', 0, 'statuses')
.count { |hash| hash['type'] == 'Ci::Build' }
expect(builds_count).to eq(1)
end
it 'has no when YML attributes but only the DB column' do
allow_any_instance_of(Ci::Pipeline)
.to receive(:ci_yaml_file)
.and_return(File.read(Rails.root.join('spec/support/gitlab_stubs/gitlab_ci.yml')))
expect_any_instance_of(Gitlab::Ci::YamlProcessor).not_to receive(:build_attributes)
subject
end
it 'has pipeline commits' do
expect(subject['ci_pipelines']).not_to be_empty
end
it 'has ci pipeline notes' do
expect(subject['ci_pipelines'].first['notes']).not_to be_empty
end
it 'has labels with no associations' do
expect(subject['labels']).not_to be_empty
end
it 'has labels associated to records' do
expect(subject['issues'].first['label_links'].first['label']).not_to be_empty
end
it 'has project and group labels' do
label_types = subject['issues'].first['label_links'].map { |link| link['label']['type'] }
expect(label_types).to match_array(%w(ProjectLabel GroupLabel))
end
it 'has priorities associated to labels' do
priorities = subject['issues'].first['label_links'].flat_map { |link| link['label']['priorities'] }
expect(priorities).not_to be_empty
end
it 'has issue resource label events' do
expect(subject['issues'].first['resource_label_events']).not_to be_empty
end
it 'has merge request resource label events' do
expect(subject['merge_requests'].first['resource_label_events']).not_to be_empty
end
it 'saves the correct service type' do
expect(subject['services'].first['type']).to eq('CustomIssueTrackerService')
end
it 'saves the properties for a service' do
expect(subject['services'].first['properties']).to eq('one' => 'value')
end
it 'has project feature' do
project_feature = subject['project_feature']
expect(project_feature).not_to be_empty
expect(project_feature["issues_access_level"]).to eq(ProjectFeature::DISABLED)
expect(project_feature["wiki_access_level"]).to eq(ProjectFeature::ENABLED)
expect(project_feature["builds_access_level"]).to eq(ProjectFeature::PRIVATE)
end
it 'has custom attributes' do
expect(subject['custom_attributes'].count).to eq(2)
end
it 'has badges' do
expect(subject['project_badges'].count).to eq(2)
end
it 'does not complain about non UTF-8 characters in MR diff files' do
ActiveRecord::Base.connection.execute("UPDATE merge_request_diff_files SET diff = '---\n- :diff: !binary |-\n LS0tIC9kZXYvbnVsbAorKysgYi9pbWFnZXMvbnVjb3IucGRmCkBAIC0wLDAg\n KzEsMTY3OSBAQAorJVBERi0xLjUNJeLjz9MNCisxIDAgb2JqDTw8L01ldGFk\n YXR'")
expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty
end
context 'project attributes' do
it 'does not contain the runners token' do
expect(subject).not_to include("runners_token" => 'token')
end
end
it 'has a board and a list' do
expect(subject['boards'].first['lists']).not_to be_empty
end
def setup_project
issue = create(:issue, assignees: [user])
snippet = create(:project_snippet)
release = create(:release)
group = create(:group)
project = create(:project,
:public,
:repository,
:issues_disabled,
:wiki_enabled,
:builds_private,
description: 'description',
issues: [issue],
snippets: [snippet],
releases: [release],
group: group,
approvals_before_merge: 1
)
project_label = create(:label, project: project)
group_label = create(:group_label, group: group)
create(:label_link, label: project_label, target: issue)
create(:label_link, label: group_label, target: issue)
create(:label_priority, label: group_label, priority: 1)
milestone = create(:milestone, project: project)
merge_request = create(:merge_request, source_project: project, milestone: milestone)
ci_build = create(:ci_build, project: project, when: nil)
ci_build.pipeline.update(project: project)
create(:commit_status, project: project, pipeline: ci_build.pipeline)
create(:milestone, project: project)
create(:discussion_note, noteable: issue, project: project)
create(:note, noteable: merge_request, project: project)
create(:note, noteable: snippet, project: project)
create(:note_on_commit,
author: user,
project: project,
commit_id: ci_build.pipeline.sha)
create(:resource_label_event, label: project_label, issue: issue)
create(:resource_label_event, label: group_label, merge_request: merge_request)
create(:event, :created, target: milestone, project: project, author: user)
create(:service, project: project, type: 'CustomIssueTrackerService', category: 'issue_tracker', properties: { one: 'value' })
create(:project_custom_attribute, project: project)
create(:project_custom_attribute, project: project)
create(:project_badge, project: project)
create(:project_badge, project: project)
board = create(:board, project: project, name: 'TestBoard')
create(:list, board: board, position: 0, label: project_label)
project
end
end

View File

@ -23,12 +23,65 @@ describe Gitlab::ImportExport::ProjectTreeSaver do
expect(project_tree_saver.save).to be true
end
context ':export_fast_serialize feature flag checks' do
before do
expect(Gitlab::ImportExport::Reader).to receive(:new).with(shared: shared).and_return(reader)
expect(reader).to receive(:project_tree).and_return(project_tree)
end
let(:serializer) { instance_double('Gitlab::ImportExport::FastHashSerializer') }
let(:reader) { instance_double('Gitlab::ImportExport::Reader') }
let(:project_tree) do
{
include: [{ issues: { include: [] } }],
preload: { issues: nil }
}
end
context 'when :export_fast_serialize feature is enabled' do
before do
stub_feature_flags(export_fast_serialize: true)
end
it 'uses FastHashSerializer' do
expect(Gitlab::ImportExport::FastHashSerializer)
.to receive(:new)
.with(project, project_tree)
.and_return(serializer)
expect(serializer).to receive(:execute)
project_tree_saver.save
end
end
context 'when :export_fast_serialize feature is disabled' do
before do
stub_feature_flags(export_fast_serialize: false)
end
it 'is serialized via built-in `as_json`' do
expect(project).to receive(:as_json).with(project_tree)
project_tree_saver.save
end
end
end
# It is mostly duplicated in
# `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb`
# except:
# context 'with description override' do
# context 'group members' do
# ^ These are specific for the ProjectTreeSaver
context 'JSON' do
let(:saved_project_json) do
project_tree_saver.save
project_json(project_tree_saver.full_path)
end
# It is not duplicated in
# `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb`
context 'with description override' do
let(:params) { { description: 'Foo Bar' } }
let(:project_tree_saver) { described_class.new(project: project, current_user: user, shared: shared, params: params) }

View File

@ -13,6 +13,10 @@ tree:
group_members:
- :user
preloads:
merge_request_diff:
source_project:
included_attributes:
merge_requests:
- :id