Add latest changes from gitlab-org/gitlab@master
This commit is contained in:
parent
2857ba3ce5
commit
1793989ccc
|
@ -17,7 +17,7 @@ module Types
|
||||||
@requires_argument = !!kwargs.delete(:requires_argument)
|
@requires_argument = !!kwargs.delete(:requires_argument)
|
||||||
@authorize = Array.wrap(kwargs.delete(:authorize))
|
@authorize = Array.wrap(kwargs.delete(:authorize))
|
||||||
kwargs[:complexity] = field_complexity(kwargs[:resolver_class], kwargs[:complexity])
|
kwargs[:complexity] = field_complexity(kwargs[:resolver_class], kwargs[:complexity])
|
||||||
@feature_flag = kwargs[:feature_flag]
|
@feature_flag = kwargs[:_deprecated_feature_flag]
|
||||||
kwargs = check_feature_flag(kwargs)
|
kwargs = check_feature_flag(kwargs)
|
||||||
@deprecation = gitlab_deprecation(kwargs)
|
@deprecation = gitlab_deprecation(kwargs)
|
||||||
after_connection_extensions = kwargs.delete(:late_extensions) || []
|
after_connection_extensions = kwargs.delete(:late_extensions) || []
|
||||||
|
@ -136,7 +136,7 @@ module Types
|
||||||
end
|
end
|
||||||
|
|
||||||
def check_feature_flag(args)
|
def check_feature_flag(args)
|
||||||
ff = args.delete(:feature_flag)
|
ff = args.delete(:_deprecated_feature_flag)
|
||||||
return args unless ff.present?
|
return args unless ff.present?
|
||||||
|
|
||||||
args[:description] = feature_documentation_message(ff, args[:description])
|
args[:description] = feature_documentation_message(ff, args[:description])
|
||||||
|
|
|
@ -22,7 +22,7 @@ module Types
|
||||||
type: Types::CustomEmojiType.connection_type,
|
type: Types::CustomEmojiType.connection_type,
|
||||||
null: true,
|
null: true,
|
||||||
description: 'Custom emoji within this namespace.',
|
description: 'Custom emoji within this namespace.',
|
||||||
feature_flag: :custom_emoji
|
_deprecated_feature_flag: :custom_emoji
|
||||||
|
|
||||||
field :share_with_group_lock,
|
field :share_with_group_lock,
|
||||||
type: GraphQL::Types::Boolean,
|
type: GraphQL::Types::Boolean,
|
||||||
|
|
|
@ -37,8 +37,8 @@ module Types
|
||||||
mount_mutation Mutations::Clusters::AgentTokens::Create
|
mount_mutation Mutations::Clusters::AgentTokens::Create
|
||||||
mount_mutation Mutations::Clusters::AgentTokens::Revoke
|
mount_mutation Mutations::Clusters::AgentTokens::Revoke
|
||||||
mount_mutation Mutations::Commits::Create, calls_gitaly: true
|
mount_mutation Mutations::Commits::Create, calls_gitaly: true
|
||||||
mount_mutation Mutations::CustomEmoji::Create, feature_flag: :custom_emoji
|
mount_mutation Mutations::CustomEmoji::Create, _deprecated_feature_flag: :custom_emoji
|
||||||
mount_mutation Mutations::CustomEmoji::Destroy, feature_flag: :custom_emoji
|
mount_mutation Mutations::CustomEmoji::Destroy, _deprecated_feature_flag: :custom_emoji
|
||||||
mount_mutation Mutations::CustomerRelations::Contacts::Create
|
mount_mutation Mutations::CustomerRelations::Contacts::Create
|
||||||
mount_mutation Mutations::CustomerRelations::Contacts::Update
|
mount_mutation Mutations::CustomerRelations::Contacts::Update
|
||||||
mount_mutation Mutations::CustomerRelations::Organizations::Create
|
mount_mutation Mutations::CustomerRelations::Organizations::Create
|
||||||
|
|
|
@ -61,7 +61,7 @@ module Types
|
||||||
Types::TimeTracking::TimelogCategoryType.connection_type,
|
Types::TimeTracking::TimelogCategoryType.connection_type,
|
||||||
null: true,
|
null: true,
|
||||||
description: "Timelog categories for the namespace.",
|
description: "Timelog categories for the namespace.",
|
||||||
feature_flag: :timelog_categories
|
_deprecated_feature_flag: :timelog_categories
|
||||||
|
|
||||||
markdown_field :description_html, null: true
|
markdown_field :description_html, null: true
|
||||||
|
|
||||||
|
|
|
@ -442,7 +442,7 @@ module Types
|
||||||
Types::TimeTracking::TimelogCategoryType.connection_type,
|
Types::TimeTracking::TimelogCategoryType.connection_type,
|
||||||
null: true,
|
null: true,
|
||||||
description: "Timelog categories for the project.",
|
description: "Timelog categories for the project.",
|
||||||
feature_flag: :timelog_categories
|
_deprecated_feature_flag: :timelog_categories
|
||||||
|
|
||||||
def timelog_categories
|
def timelog_categories
|
||||||
object.project_namespace.timelog_categories
|
object.project_namespace.timelog_categories
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
---
|
||||||
|
name: only_positive_pagination_values
|
||||||
|
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/93571
|
||||||
|
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/369225
|
||||||
|
milestone: '15.3'
|
||||||
|
type: development
|
||||||
|
group: group::source code
|
||||||
|
default_enabled: false
|
|
@ -98,11 +98,8 @@ See the [deprecating schema items](#deprecating-schema-items) section for how to
|
||||||
|
|
||||||
### Breaking change exemptions
|
### Breaking change exemptions
|
||||||
|
|
||||||
Two scenarios exist where schema items are exempt from the deprecation process,
|
Schema items [marked as alpha](#mark-schema-items-as-alpha) are exempt from the deprecation process,
|
||||||
and can be removed or changed at any time without notice. These are schema items that either:
|
and can be removed or changed at any time without notice.
|
||||||
|
|
||||||
- Use the [`feature_flag` property](#feature_flag-property) _and_ the flag is disabled by default.
|
|
||||||
- Are [marked as alpha](#mark-schema-items-as-alpha).
|
|
||||||
|
|
||||||
## Global IDs
|
## Global IDs
|
||||||
|
|
||||||
|
@ -472,92 +469,37 @@ end
|
||||||
|
|
||||||
## Feature flags
|
## Feature flags
|
||||||
|
|
||||||
Developers can add [feature flags](../development/feature_flags/index.md) to GraphQL
|
You can implement [feature flags](../development/feature_flags/index.md) in GraphQL to toggle:
|
||||||
fields in the following ways:
|
|
||||||
|
|
||||||
- Add the [`feature_flag` property](#feature_flag-property) to a field. This allows the field to be _hidden_
|
- The return value of a field.
|
||||||
from the GraphQL schema when the flag is disabled.
|
- The behavior of an argument or mutation.
|
||||||
- [Toggle the return value](#toggle-the-value-of-a-field) when resolving the field.
|
|
||||||
|
|
||||||
You can refer to these guidelines to decide which approach to use:
|
This can be done in a resolver, in the
|
||||||
|
|
||||||
- If your field is experimental, and its name or type is subject to
|
|
||||||
change, use the [`feature_flag` property](#feature_flag-property).
|
|
||||||
- If your field is stable and its definition doesn't change, even after the flag is
|
|
||||||
removed, [toggle the return value](#toggle-the-value-of-a-field) of the field instead. Note that
|
|
||||||
[all fields should be nullable](#nullable-fields) anyway.
|
|
||||||
- If your field will be accessed from frontend using the `@include` or `@skip` directive, [do not use the `feature_flag` property](#frontend-and-backend-feature-flag-strategies).
|
|
||||||
|
|
||||||
### `feature_flag` property
|
|
||||||
|
|
||||||
The `feature_flag` property allows you to toggle the field's
|
|
||||||
[visibility](https://graphql-ruby.org/authorization/visibility.html)
|
|
||||||
in the GraphQL schema. This removes the field from the schema
|
|
||||||
when the flag is disabled.
|
|
||||||
|
|
||||||
A description is [appended](https://gitlab.com/gitlab-org/gitlab/-/blob/497b556/app/graphql/types/base_field.rb#L44-53)
|
|
||||||
to the field indicating that it is behind a feature flag.
|
|
||||||
|
|
||||||
WARNING:
|
|
||||||
If a client queries for the field when the feature flag is disabled, the query
|
|
||||||
fails. Consider this when toggling the visibility of the feature on or off on
|
|
||||||
production.
|
|
||||||
|
|
||||||
The `feature_flag` property does not allow the use of
|
|
||||||
[feature gates based on actors](../development/feature_flags/index.md).
|
|
||||||
This means that the feature flag cannot be toggled only for particular
|
|
||||||
projects, groups, or users, but instead can only be toggled globally for
|
|
||||||
everyone.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
field :test_field, type: GraphQL::Types::String,
|
|
||||||
null: true,
|
|
||||||
description: 'Some test field.',
|
|
||||||
feature_flag: :my_feature_flag
|
|
||||||
```
|
|
||||||
|
|
||||||
### Frontend and Backend feature flag strategies
|
|
||||||
|
|
||||||
#### Directives
|
|
||||||
|
|
||||||
When feature flags are used in the frontend to control the `@include` and `@skip` directives, do not use the `feature_flag`
|
|
||||||
property on the server-side. For the accepted backend workaround, see [Toggle the value of a field](#toggle-the-value-of-a-field). It is recommended that the feature flag used in this approach be the same for frontend and backend.
|
|
||||||
|
|
||||||
Even if the frontend directives evaluate to `@include:false` / `@skip:true`, the guarded entity is sent to the backend and matched
|
|
||||||
against the GraphQL schema. We would then get an exception due to a schema mismatch. See the [frontend GraphQL guide](../development/fe_guide/graphql.md#the-include-directive) for more guidance.
|
|
||||||
|
|
||||||
#### Different versions of a query
|
|
||||||
|
|
||||||
See the guide frontend GraphQL guide for [different versions of a query](../development/fe_guide/graphql.md#different-versions-of-a-query), and [why it is not the preferred approach](../development/fe_guide/graphql.md#avoiding-multiple-query-versions)
|
|
||||||
|
|
||||||
### Toggle the value of a field
|
|
||||||
|
|
||||||
This method of using feature flags for fields is to toggle the
|
|
||||||
return value of the field. This can be done in the resolver, in the
|
|
||||||
type, or even in a model method, depending on your preference and
|
type, or even in a model method, depending on your preference and
|
||||||
situation.
|
situation.
|
||||||
|
|
||||||
Consider also [marking the field as Alpha](#mark-schema-items-as-alpha)
|
NOTE:
|
||||||
while the value of the field can be toggled. You can
|
It's recommended that you also [mark the item as Alpha](#mark-schema-items-as-alpha) while it is behind a feature flag.
|
||||||
[change or remove Alpha fields at any time](#breaking-change-exemptions) without needing to deprecate them.
|
This signals to consumers of the public GraphQL API that the field is not
|
||||||
This also signals to consumers of the public GraphQL API that the field is not
|
|
||||||
meant to be used yet.
|
meant to be used yet.
|
||||||
|
You can also
|
||||||
|
[change or remove Alpha items at any time](#breaking-change-exemptions) without needing to deprecate them. When the flag is removed, "release"
|
||||||
|
the schema item by removing its Alpha property to make it public.
|
||||||
|
|
||||||
When applying a feature flag to toggle the value of a field, the
|
### Descriptions for feature flagged items
|
||||||
`description` of the field must:
|
|
||||||
|
|
||||||
- State that the value of the field can be toggled by a feature flag.
|
When using a feature flag to toggle the value or behavior of a schema item, the
|
||||||
|
`description` of the item must:
|
||||||
|
|
||||||
|
- State that the value or behavior can be toggled by a feature flag.
|
||||||
- Name the feature flag.
|
- Name the feature flag.
|
||||||
- State what the field returns when the feature flag is disabled (or
|
- State what the field returns, or behavior is, when the feature flag is disabled (or
|
||||||
enabled, if more appropriate).
|
enabled, if more appropriate).
|
||||||
|
|
||||||
Example:
|
Example of a feature-flagged field:
|
||||||
|
|
||||||
```ruby
|
```ruby
|
||||||
field :foo, GraphQL::Types::String,
|
field :foo, GraphQL::Types::String, null: true,
|
||||||
null: true,
|
|
||||||
alpha: { milestone: '10.0' },
|
alpha: { milestone: '10.0' },
|
||||||
description: 'Some test field. Returns `null`' \
|
description: 'Some test field. Returns `null`' \
|
||||||
'if `my_feature_flag` feature flag is disabled.'
|
'if `my_feature_flag` feature flag is disabled.'
|
||||||
|
@ -567,6 +509,26 @@ def foo
|
||||||
end
|
end
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Example of a feature-flagged argument:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
argument :foo, type: GraphQL::Types::String, required: false,
|
||||||
|
alpha: { milestone: '10.0' },
|
||||||
|
description: 'Some test argument. Is ignored if ' \
|
||||||
|
'`my_feature_flag` feature flag is disabled.'
|
||||||
|
|
||||||
|
def resolve(args)
|
||||||
|
args.delete(:foo) unless Feature.enabled?(:my_feature_flag, object)
|
||||||
|
# ...
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
### `feature_flag` property (deprecated)
|
||||||
|
|
||||||
|
NOTE:
|
||||||
|
This property is deprecated and should no longer be used. The property
|
||||||
|
has been temporarily renamed to `_deprecated_feature_flag` and support for it will be removed in [#369202](https://gitlab.com/gitlab-org/gitlab/-/issues/369202).
|
||||||
|
|
||||||
## Deprecating schema items
|
## Deprecating schema items
|
||||||
|
|
||||||
The GitLab GraphQL API is versionless, which means we maintain backwards
|
The GitLab GraphQL API is versionless, which means we maintain backwards
|
||||||
|
|
|
@ -6,7 +6,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w
|
||||||
|
|
||||||
# Add a foreign key constraint to an existing column
|
# Add a foreign key constraint to an existing column
|
||||||
|
|
||||||
Foreign keys ensure consistency between related database tables. The current database review process **always** encourages you to add [foreign keys](../foreign_keys.md) when creating tables that reference records from other tables.
|
Foreign keys ensure consistency between related database tables. The current database review process **always** encourages you to add [foreign keys](foreign_keys.md) when creating tables that reference records from other tables.
|
||||||
|
|
||||||
Starting with Rails version 4, Rails includes migration helpers to add foreign key constraints
|
Starting with Rails version 4, Rails includes migration helpers to add foreign key constraints
|
||||||
to database tables. Before Rails 4, the only way for ensuring some level of consistency was the
|
to database tables. Before Rails 4, the only way for ensuring some level of consistency was the
|
||||||
|
|
|
@ -0,0 +1,179 @@
|
||||||
|
---
|
||||||
|
stage: Plan
|
||||||
|
group: Project Management
|
||||||
|
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
||||||
|
---
|
||||||
|
# Filtering by label
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
GitLab has [labels](../../user/project/labels.md) that can be assigned to issues,
|
||||||
|
merge requests, and epics. Labels on those objects are a many-to-many relation
|
||||||
|
through the polymorphic `label_links` table.
|
||||||
|
|
||||||
|
To filter these objects by multiple labels - for instance, 'all open
|
||||||
|
issues with the label ~Plan and the label ~backend' - we generate a
|
||||||
|
query containing a `GROUP BY` clause. In a simple form, this looks like:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT
|
||||||
|
issues.*
|
||||||
|
FROM
|
||||||
|
issues
|
||||||
|
INNER JOIN label_links ON label_links.target_id = issues.id
|
||||||
|
AND label_links.target_type = 'Issue'
|
||||||
|
INNER JOIN labels ON labels.id = label_links.label_id
|
||||||
|
WHERE
|
||||||
|
issues.project_id = 13083
|
||||||
|
AND (issues.state IN ('opened'))
|
||||||
|
AND labels.title IN ('Plan',
|
||||||
|
'backend')
|
||||||
|
GROUP BY
|
||||||
|
issues.id
|
||||||
|
HAVING (COUNT(DISTINCT labels.title) = 2)
|
||||||
|
ORDER BY
|
||||||
|
issues.updated_at DESC,
|
||||||
|
issues.id DESC
|
||||||
|
LIMIT 20 OFFSET 0
|
||||||
|
```
|
||||||
|
|
||||||
|
In particular, note that:
|
||||||
|
|
||||||
|
1. We `GROUP BY issues.id` so that we can ...
|
||||||
|
1. Use the `HAVING (COUNT(DISTINCT labels.title) = 2)` condition to ensure that
|
||||||
|
all matched issues have both labels.
|
||||||
|
|
||||||
|
This is more complicated than is ideal. It makes the query construction more
|
||||||
|
prone to errors (such as
|
||||||
|
[issue #15557](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/15557)).
|
||||||
|
|
||||||
|
## Attempt A: `WHERE EXISTS`
|
||||||
|
|
||||||
|
### Attempt A1: use multiple subqueries with `WHERE EXISTS`
|
||||||
|
|
||||||
|
In [issue #37137](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/37137)
|
||||||
|
and its associated [merge request](https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/14022),
|
||||||
|
we tried to replace the `GROUP BY` with multiple uses of `WHERE EXISTS`. For the
|
||||||
|
example above, this would give:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
WHERE (EXISTS (
|
||||||
|
SELECT
|
||||||
|
TRUE
|
||||||
|
FROM
|
||||||
|
label_links
|
||||||
|
INNER JOIN labels ON labels.id = label_links.label_id
|
||||||
|
WHERE
|
||||||
|
labels.title = 'Plan'
|
||||||
|
AND target_type = 'Issue'
|
||||||
|
AND target_id = issues.id))
|
||||||
|
AND (EXISTS (
|
||||||
|
SELECT
|
||||||
|
TRUE
|
||||||
|
FROM
|
||||||
|
label_links
|
||||||
|
INNER JOIN labels ON labels.id = label_links.label_id
|
||||||
|
WHERE
|
||||||
|
labels.title = 'backend'
|
||||||
|
AND target_type = 'Issue'
|
||||||
|
AND target_id = issues.id))
|
||||||
|
```
|
||||||
|
|
||||||
|
While this worked without schema changes, and did improve readability somewhat,
|
||||||
|
it did not improve query performance.
|
||||||
|
|
||||||
|
### Attempt A2: use label IDs in the `WHERE EXISTS` clause
|
||||||
|
|
||||||
|
In [merge request #34503](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/34503), we followed a similar approach to A1. But this time, we
|
||||||
|
did a separate query to fetch the IDs of the labels used in the filter so that we avoid the `JOIN` in the `EXISTS` clause and filter directly by
|
||||||
|
`label_links.label_id`. We also added a new index on `label_links` for the `target_id`, `label_id`, and `target_type` columns to speed up this query.
|
||||||
|
|
||||||
|
Finding the label IDs wasn't straightforward because there could be multiple labels with the same title within a single root namespace. We solved
|
||||||
|
this by grouping the label IDs by title and then using the array of IDs in the `EXISTS` clauses.
|
||||||
|
|
||||||
|
This resulted in a significant performance improvement. However, this optimization could not be applied to the dashboard pages
|
||||||
|
where we do not have a project or group context. We could not easily search for the label IDs here because that would mean searching across all
|
||||||
|
projects and groups that the user has access to.
|
||||||
|
|
||||||
|
## Attempt B: Denormalize using an array column
|
||||||
|
|
||||||
|
Having [removed MySQL support in GitLab 12.1](https://about.gitlab.com/blog/2019/06/27/removing-mysql-support/),
|
||||||
|
using [PostgreSQL's arrays](https://www.postgresql.org/docs/11/arrays.html) became more
|
||||||
|
tractable as we didn't have to support two databases. We discussed denormalizing
|
||||||
|
the `label_links` table for querying in
|
||||||
|
[issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651),
|
||||||
|
with two options: label IDs and titles.
|
||||||
|
|
||||||
|
We can think of both of those as array columns on `issues`, `merge_requests`,
|
||||||
|
and `epics`: `issues.label_ids` would be an array column of label IDs, and
|
||||||
|
`issues.label_titles` would be an array of label titles.
|
||||||
|
|
||||||
|
These array columns can be complemented with
|
||||||
|
[GIN indexes](https://www.postgresql.org/docs/11/gin-intro.html) to improve
|
||||||
|
matching.
|
||||||
|
|
||||||
|
### Attempt B1: store label IDs for each object
|
||||||
|
|
||||||
|
This has some strong advantages over titles:
|
||||||
|
|
||||||
|
1. Unless a label is deleted, or a project is moved, we never need to
|
||||||
|
bulk-update the denormalized column.
|
||||||
|
1. It uses less storage than the titles.
|
||||||
|
|
||||||
|
Unfortunately, our application design makes this hard. If we were able to query
|
||||||
|
just by label ID easily, we wouldn't need the `INNER JOIN labels` in the initial
|
||||||
|
query at the start of this document. GitLab allows users to filter by label
|
||||||
|
title across projects and even across groups, so a filter by the label ~Plan may
|
||||||
|
include labels with multiple distinct IDs.
|
||||||
|
|
||||||
|
We do not want users to have to know about the different IDs, which means that
|
||||||
|
given this data set:
|
||||||
|
|
||||||
|
| Project | ~Plan label ID | ~backend label ID |
|
||||||
|
| ------- | -------------- | ----------------- |
|
||||||
|
| A | 11 | 12 |
|
||||||
|
| B | 21 | 22 |
|
||||||
|
| C | 31 | 32 |
|
||||||
|
|
||||||
|
We would need something like:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
WHERE
|
||||||
|
label_ids @> ARRAY[11, 12]
|
||||||
|
OR label_ids @> ARRAY[21, 22]
|
||||||
|
OR label_ids @> ARRAY[31, 32]
|
||||||
|
```
|
||||||
|
|
||||||
|
This can get even more complicated when we consider that in some cases, there
|
||||||
|
might be two ~backend labels - with different IDs - that could apply to the same
|
||||||
|
object, so the number of combinations would balloon further.
|
||||||
|
|
||||||
|
### Attempt B2: store label titles for each object
|
||||||
|
|
||||||
|
From the perspective of updating the object, this is the worst
|
||||||
|
option. We have to bulk update the objects when:
|
||||||
|
|
||||||
|
1. The objects are moved from one project to another.
|
||||||
|
1. The project is moved from one group to another.
|
||||||
|
1. The label is renamed.
|
||||||
|
1. The label is deleted.
|
||||||
|
|
||||||
|
It also uses much more storage. Querying is simple, though:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
WHERE
|
||||||
|
label_titles @> ARRAY['Plan', 'backend']
|
||||||
|
```
|
||||||
|
|
||||||
|
And our [tests in issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651#note_188777346)
|
||||||
|
showed that this could be fast.
|
||||||
|
|
||||||
|
However, at present, the disadvantages outweigh the advantages.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
We found a method A2 that does not need denormalization and improves the query performance significantly. This
|
||||||
|
did not apply to all cases, but we were able to apply method A1 to the rest of the cases so that we remove the
|
||||||
|
`GROUP BY` and `HAVING` clauses in all scenarios.
|
||||||
|
|
||||||
|
This simplified the query and improved the performance in the most common cases.
|
|
@ -0,0 +1,199 @@
|
||||||
|
---
|
||||||
|
stage: Data Stores
|
||||||
|
group: Database
|
||||||
|
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
||||||
|
---
|
||||||
|
|
||||||
|
# Foreign Keys & Associations
|
||||||
|
|
||||||
|
When adding an association to a model you must also add a foreign key. For
|
||||||
|
example, say you have the following model:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class User < ActiveRecord::Base
|
||||||
|
has_many :posts
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a foreign key here on column `posts.user_id`. This ensures
|
||||||
|
that data consistency is enforced on database level. Foreign keys also mean that
|
||||||
|
the database can very quickly remove associated data (for example, when removing a
|
||||||
|
user), instead of Rails having to do this.
|
||||||
|
|
||||||
|
## Adding Foreign Keys In Migrations
|
||||||
|
|
||||||
|
Foreign keys can be added concurrently using `add_concurrent_foreign_key` as
|
||||||
|
defined in `Gitlab::Database::MigrationHelpers`. See the
|
||||||
|
[Migration Style Guide](../migration_style_guide.md) for more information.
|
||||||
|
|
||||||
|
Keep in mind that you can only safely add foreign keys to existing tables after
|
||||||
|
you have removed any orphaned rows. The method `add_concurrent_foreign_key`
|
||||||
|
does not take care of this so you must do so manually. See
|
||||||
|
[adding foreign key constraint to an existing column](add_foreign_key_to_existing_column.md).
|
||||||
|
|
||||||
|
## Updating Foreign Keys In Migrations
|
||||||
|
|
||||||
|
Sometimes a foreign key constraint must be changed, preserving the column
|
||||||
|
but updating the constraint condition. For example, moving from
|
||||||
|
`ON DELETE CASCADE` to `ON DELETE SET NULL` or vice-versa.
|
||||||
|
|
||||||
|
PostgreSQL does not prevent you from adding overlapping foreign keys. It
|
||||||
|
honors the most recently added constraint. This allows us to replace foreign keys without
|
||||||
|
ever losing foreign key protection on a column.
|
||||||
|
|
||||||
|
To replace a foreign key:
|
||||||
|
|
||||||
|
1. [Add the new foreign key without validation](add_foreign_key_to_existing_column.md#prevent-invalid-records)
|
||||||
|
|
||||||
|
The name of the foreign key constraint must be changed to add a new
|
||||||
|
foreign key before removing the old one.
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class ReplaceFkOnPackagesPackagesProjectId < Gitlab::Database::Migration[2.0]
|
||||||
|
disable_ddl_transaction!
|
||||||
|
|
||||||
|
NEW_CONSTRAINT_NAME = 'fk_new'
|
||||||
|
|
||||||
|
def up
|
||||||
|
add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :nullify, validate: false, name: NEW_CONSTRAINT_NAME)
|
||||||
|
end
|
||||||
|
|
||||||
|
def down
|
||||||
|
with_lock_retries do
|
||||||
|
remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :nullify, name: NEW_CONSTRAINT_NAME)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
1. [Validate the new foreign key](add_foreign_key_to_existing_column.md#validate-the-foreign-key)
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class ValidateFkNew < Gitlab::Database::Migration[2.0]
|
||||||
|
NEW_CONSTRAINT_NAME = 'fk_new'
|
||||||
|
|
||||||
|
# foreign key added in <link to MR or path to migration adding new FK>
|
||||||
|
def up
|
||||||
|
validate_foreign_key(:packages_packages, name: NEW_CONSTRAINT_NAME)
|
||||||
|
end
|
||||||
|
|
||||||
|
def down
|
||||||
|
# no-op
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Remove the old foreign key:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class RemoveFkOld < Gitlab::Database::Migration[2.0]
|
||||||
|
OLD_CONSTRAINT_NAME = 'fk_old'
|
||||||
|
|
||||||
|
# new foreign key added in <link to MR or path to migration adding new FK>
|
||||||
|
# and validated in <link to MR or path to migration validating new FK>
|
||||||
|
def up
|
||||||
|
remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :cascade, name: OLD_CONSTRAINT_NAME)
|
||||||
|
end
|
||||||
|
|
||||||
|
def down
|
||||||
|
# Validation is skipped here, so if rolled back, this will need to be revalidated in a separate migration
|
||||||
|
add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :cascade, validate: false, name: OLD_CONSTRAINT_NAME)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cascading Deletes
|
||||||
|
|
||||||
|
Every foreign key must define an `ON DELETE` clause, and in 99% of the cases
|
||||||
|
this should be set to `CASCADE`.
|
||||||
|
|
||||||
|
## Indexes
|
||||||
|
|
||||||
|
When adding a foreign key in PostgreSQL the column is not indexed automatically,
|
||||||
|
thus you must also add a concurrent index. Not doing so results in cascading
|
||||||
|
deletes being very slow.
|
||||||
|
|
||||||
|
## Naming foreign keys
|
||||||
|
|
||||||
|
By default Ruby on Rails uses the `_id` suffix for foreign keys. So we should
|
||||||
|
only use this suffix for associations between two tables. If you want to
|
||||||
|
reference an ID on a third party platform the `_xid` suffix is recommended.
|
||||||
|
|
||||||
|
The spec `spec/db/schema_spec.rb` tests if all columns with the `_id` suffix
|
||||||
|
have a foreign key constraint. So if that spec fails, don't add the column to
|
||||||
|
`IGNORED_FK_COLUMNS`, but instead add the FK constraint, or consider naming it
|
||||||
|
differently.
|
||||||
|
|
||||||
|
## Dependent Removals
|
||||||
|
|
||||||
|
Don't define options such as `dependent: :destroy` or `dependent: :delete` when
|
||||||
|
defining an association. Defining these options means Rails handles the
|
||||||
|
removal of data, instead of letting the database handle this in the most
|
||||||
|
efficient way possible.
|
||||||
|
|
||||||
|
In other words, this is bad and should be avoided at all costs:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class User < ActiveRecord::Base
|
||||||
|
has_many :posts, dependent: :destroy
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Should you truly have a need for this it should be approved by a database
|
||||||
|
specialist first.
|
||||||
|
|
||||||
|
You should also not define any `before_destroy` or `after_destroy` callbacks on
|
||||||
|
your models _unless_ absolutely required and only when approved by database
|
||||||
|
specialists. For example, if each row in a table has a corresponding file on a
|
||||||
|
file system it may be tempting to add a `after_destroy` hook. This however
|
||||||
|
introduces non database logic to a model, and means we can no longer rely on
|
||||||
|
foreign keys to remove the data as this would result in the file system data
|
||||||
|
being left behind. In such a case you should use a service class instead that
|
||||||
|
takes care of removing non database data.
|
||||||
|
|
||||||
|
In cases where the relation spans multiple databases you have even
|
||||||
|
further problems using `dependent: :destroy` or the above hooks. You can
|
||||||
|
read more about alternatives at
|
||||||
|
[Avoid `dependent: :nullify` and `dependent: :destroy` across databases](multiple_databases.md#avoid-dependent-nullify-and-dependent-destroy-across-databases).
|
||||||
|
|
||||||
|
## Alternative primary keys with `has_one` associations
|
||||||
|
|
||||||
|
Sometimes a `has_one` association is used to create a one-to-one relationship:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class User < ActiveRecord::Base
|
||||||
|
has_one :user_config
|
||||||
|
end
|
||||||
|
|
||||||
|
class UserConfig < ActiveRecord::Base
|
||||||
|
belongs_to :user
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
In these cases, there may be an opportunity to remove the unnecessary `id`
|
||||||
|
column on the associated table, `user_config.id` in this example. Instead,
|
||||||
|
the originating table ID can be used as the primary key for the associated
|
||||||
|
table:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
create_table :user_configs, id: false do |t|
|
||||||
|
t.references :users, primary_key: true, default: nil, index: false, foreign_key: { on_delete: :cascade }
|
||||||
|
...
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Setting `default: nil` ensures a primary key sequence is not created, and since the primary key
|
||||||
|
automatically gets an index, we set `index: false` to avoid creating a duplicate.
|
||||||
|
You also need to add the new primary key to the model:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class UserConfig < ActiveRecord::Base
|
||||||
|
self.primary_key = :user_id
|
||||||
|
|
||||||
|
belongs_to :user
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Using a foreign key as primary key saves space but can make
|
||||||
|
[batch counting](../service_ping/implement.md#batch-counters) in [Service Ping](../service_ping/index.md) less efficient.
|
||||||
|
Consider using a regular `id` column if the table is relevant for Service Ping.
|
|
@ -42,7 +42,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w
|
||||||
## Best practices
|
## Best practices
|
||||||
|
|
||||||
- [Adding database indexes](adding_database_indexes.md)
|
- [Adding database indexes](adding_database_indexes.md)
|
||||||
- [Foreign keys & associations](../foreign_keys.md)
|
- [Foreign keys & associations](foreign_keys.md)
|
||||||
- [Adding a foreign key constraint to an existing column](add_foreign_key_to_existing_column.md)
|
- [Adding a foreign key constraint to an existing column](add_foreign_key_to_existing_column.md)
|
||||||
- [`NOT NULL` constraints](not_null_constraints.md)
|
- [`NOT NULL` constraints](not_null_constraints.md)
|
||||||
- [Strings and the Text data type](strings_and_the_text_data_type.md)
|
- [Strings and the Text data type](strings_and_the_text_data_type.md)
|
||||||
|
@ -52,7 +52,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w
|
||||||
- [Hash indexes](../hash_indexes.md)
|
- [Hash indexes](../hash_indexes.md)
|
||||||
- [Storing SHA1 hashes as binary](../sha1_as_binary.md)
|
- [Storing SHA1 hashes as binary](../sha1_as_binary.md)
|
||||||
- [Iterating tables in batches](../iterating_tables_in_batches.md)
|
- [Iterating tables in batches](../iterating_tables_in_batches.md)
|
||||||
- [Insert into tables in batches](../insert_into_tables_in_batches.md)
|
- [Insert into tables in batches](insert_into_tables_in_batches.md)
|
||||||
- [Ordering table columns](ordering_table_columns.md)
|
- [Ordering table columns](ordering_table_columns.md)
|
||||||
- [Verifying database capabilities](../verifying_database_capabilities.md)
|
- [Verifying database capabilities](../verifying_database_capabilities.md)
|
||||||
- [Database Debugging and Troubleshooting](../database_debugging.md)
|
- [Database Debugging and Troubleshooting](../database_debugging.md)
|
||||||
|
@ -69,8 +69,8 @@ info: To determine the technical writer assigned to the Stage/Group associated w
|
||||||
|
|
||||||
## Case studies
|
## Case studies
|
||||||
|
|
||||||
- [Database case study: Filtering by label](../filtering_by_label.md)
|
- [Database case study: Filtering by label](filtering_by_label.md)
|
||||||
- [Database case study: Namespaces storage statistics](../namespaces_storage_statistics.md)
|
- [Database case study: Namespaces storage statistics](namespaces_storage_statistics.md)
|
||||||
|
|
||||||
## Miscellaneous
|
## Miscellaneous
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
---
|
||||||
|
stage: Data Stores
|
||||||
|
group: Database
|
||||||
|
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
||||||
|
description: "Sometimes it is necessary to store large amounts of records at once, which can be inefficient
|
||||||
|
when iterating collections and performing individual `save`s. With the arrival of `insert_all`
|
||||||
|
in Rails 6, which operates at the row level (that is, using `Hash`es), GitLab has added a set
|
||||||
|
of APIs that make it safe and simple to insert ActiveRecord objects in bulk."
|
||||||
|
---
|
||||||
|
|
||||||
|
# Insert into tables in batches
|
||||||
|
|
||||||
|
Sometimes it is necessary to store large amounts of records at once, which can be inefficient
|
||||||
|
when iterating collections and saving each record individually. With the arrival of
|
||||||
|
[`insert_all`](https://apidock.com/rails/ActiveRecord/Persistence/ClassMethods/insert_all)
|
||||||
|
in Rails 6, which operates at the row level (that is, using `Hash` objects), GitLab has added a set
|
||||||
|
of APIs that make it safe and simple to insert `ActiveRecord` objects in bulk.
|
||||||
|
|
||||||
|
## Prepare `ApplicationRecord`s for bulk insertion
|
||||||
|
|
||||||
|
In order for a model class to take advantage of the bulk insertion API, it has to include the
|
||||||
|
`BulkInsertSafe` concern first:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class MyModel < ApplicationRecord
|
||||||
|
# other includes here
|
||||||
|
# ...
|
||||||
|
include BulkInsertSafe # include this last
|
||||||
|
|
||||||
|
# ...
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
The `BulkInsertSafe` concern has two functions:
|
||||||
|
|
||||||
|
- It performs checks against your model class to ensure that it does not use ActiveRecord
|
||||||
|
APIs that are not safe to use with respect to bulk insertions (more on that below).
|
||||||
|
- It adds new class methods `bulk_insert!` and `bulk_upsert!`, which you can use to insert many records at once.
|
||||||
|
|
||||||
|
## Insert records with `bulk_insert!` and `bulk_upsert!`
|
||||||
|
|
||||||
|
If the target class passes the checks performed by `BulkInsertSafe`, you can insert an array of
|
||||||
|
ActiveRecord model objects as follows:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
records = [MyModel.new, ...]
|
||||||
|
|
||||||
|
MyModel.bulk_insert!(records)
|
||||||
|
```
|
||||||
|
|
||||||
|
Calls to `bulk_insert!` always attempt to insert _new records_. If instead
|
||||||
|
you would like to replace existing records with new values, while still inserting those
|
||||||
|
that do not already exist, then you can use `bulk_upsert!`:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
records = [MyModel.new, existing_model, ...]
|
||||||
|
|
||||||
|
MyModel.bulk_upsert!(records, unique_by: [:name])
|
||||||
|
```
|
||||||
|
|
||||||
|
In this example, `unique_by` specifies the columns by which records are considered to be
|
||||||
|
unique and as such are updated if they existed prior to insertion. For example, if
|
||||||
|
`existing_model` has a `name` attribute, and if a record with the same `name` value already
|
||||||
|
exists, its fields are updated with those of `existing_model`.
|
||||||
|
|
||||||
|
The `unique_by` parameter can also be passed as a `Symbol`, in which case it specifies
|
||||||
|
a database index by which a column is considered unique:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
MyModel.bulk_insert!(records, unique_by: :index_on_name)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Record validation
|
||||||
|
|
||||||
|
The `bulk_insert!` method guarantees that `records` are inserted transactionally, and
|
||||||
|
runs validations on each record prior to insertion. If any record fails to validate,
|
||||||
|
an error is raised and the transaction is rolled back. You can turn off validations via
|
||||||
|
the `:validate` option:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
MyModel.bulk_insert!(records, validate: false)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch size configuration
|
||||||
|
|
||||||
|
In those cases where the number of `records` is above a given threshold, insertions
|
||||||
|
occur in multiple batches. The default batch size is defined in
|
||||||
|
[`BulkInsertSafe::DEFAULT_BATCH_SIZE`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb).
|
||||||
|
Assuming a default threshold of 500, inserting 950 records
|
||||||
|
would result in two batches being written sequentially (of size 500 and 450, respectively).
|
||||||
|
You can override the default batch size via the `:batch_size` option:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
MyModel.bulk_insert!(records, batch_size: 100)
|
||||||
|
```
|
||||||
|
|
||||||
|
Assuming the same number of 950 records, this would result in 10 batches being written instead.
|
||||||
|
Since this also affects the number of `INSERT` statements that occur, make sure you measure the
|
||||||
|
performance impact this might have on your code. There is a trade-off between the number of
|
||||||
|
`INSERT` statements the database has to process and the size and cost of each `INSERT`.
|
||||||
|
|
||||||
|
### Handling duplicate records
|
||||||
|
|
||||||
|
NOTE:
|
||||||
|
This parameter applies only to `bulk_insert!`. If you intend to update existing
|
||||||
|
records, use `bulk_upsert!` instead.
|
||||||
|
|
||||||
|
It may happen that some records you are trying to insert already exist, which would result in
|
||||||
|
primary key conflicts. There are two ways to address this problem: failing fast by raising an
|
||||||
|
error or skipping duplicate records. The default behavior of `bulk_insert!` is to fail fast
|
||||||
|
and raise an `ActiveRecord::RecordNotUnique` error.
|
||||||
|
|
||||||
|
If this is undesirable, you can instead skip duplicate records with the `skip_duplicates` flag:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
MyModel.bulk_insert!(records, skip_duplicates: true)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Requirements for safe bulk insertions
|
||||||
|
|
||||||
|
Large parts of ActiveRecord's persistence API are built around the notion of callbacks. Many
|
||||||
|
of these callbacks fire in response to model life cycle events such as `save` or `create`.
|
||||||
|
These callbacks cannot be used with bulk insertions, since they are meant to be called for
|
||||||
|
every instance that is saved or created. Since these events do not fire when
|
||||||
|
records are inserted in bulk, we currently prevent their use.
|
||||||
|
|
||||||
|
The specifics around which callbacks are explicitly allowed are defined in
|
||||||
|
[`BulkInsertSafe`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb).
|
||||||
|
Consult the module source code for details. If your class uses callbacks that are not explicitly designated
|
||||||
|
safe and you `include BulkInsertSafe`, the application fails with an error.
|
||||||
|
|
||||||
|
### `BulkInsertSafe` versus `InsertAll`
|
||||||
|
|
||||||
|
Internally, `BulkInsertSafe` is based on `InsertAll`, and you may wonder when to choose
|
||||||
|
the former over the latter. To help you make the decision,
|
||||||
|
the key differences between these classes are listed in the table below.
|
||||||
|
|
||||||
|
| | Input type | Validates input | Specify batch size | Can bypass callbacks | Transactional |
|
||||||
|
|--------------- | -------------------- | --------------- | ------------------ | --------------------------------- | ------------- |
|
||||||
|
| `bulk_insert!` | ActiveRecord objects | Yes (optional) | Yes (optional) | No (prevents unsafe callback use) | Yes |
|
||||||
|
| `insert_all!` | Attribute hashes | No | No | Yes | Yes |
|
||||||
|
|
||||||
|
To summarize, `BulkInsertSafe` moves bulk inserts closer to how ActiveRecord objects
|
||||||
|
and inserts would normally behave. However, if all you need is to insert raw data in bulk, then
|
||||||
|
`insert_all` is more efficient.
|
||||||
|
|
||||||
|
## Insert `has_many` associations in bulk
|
||||||
|
|
||||||
|
A common use case is to save collections of associated relations through the owner side of the relation,
|
||||||
|
where the owned relation is associated to the owner through the `has_many` class method:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
owner = OwnerModel.new(owned_relations: array_of_owned_relations)
|
||||||
|
# saves all `owned_relations` one-by-one
|
||||||
|
owner.save!
|
||||||
|
```
|
||||||
|
|
||||||
|
This issues one `INSERT` statement, in its own transaction, for every record in `owned_relations`, which is inefficient if
|
||||||
|
`array_of_owned_relations` is large. To remedy this, the `BulkInsertableAssociations` concern can be
|
||||||
|
used to declare that the owner defines associations that are safe for bulk insertion:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
class OwnerModel < ApplicationRecord
|
||||||
|
# other includes here
|
||||||
|
# ...
|
||||||
|
include BulkInsertableAssociations # include this last
|
||||||
|
|
||||||
|
has_many :my_models
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
Here `my_models` must be declared `BulkInsertSafe` (as described previously) for bulk insertions
|
||||||
|
to happen. You can now insert any yet unsaved records as follows:
|
||||||
|
|
||||||
|
```ruby
|
||||||
|
BulkInsertableAssociations.with_bulk_insert do
|
||||||
|
owner = OwnerModel.new(my_models: array_of_my_model_instances)
|
||||||
|
# saves `my_models` using a single bulk insert (possibly via multiple batches)
|
||||||
|
owner.save!
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
You can still save relations that are not `BulkInsertSafe` in this block; they
|
||||||
|
simply are treated as if you had invoked `save` from outside the block.
|
||||||
|
|
||||||
|
## Known limitations
|
||||||
|
|
||||||
|
There are a few restrictions to how these APIs can be used:
|
||||||
|
|
||||||
|
- `BulkInsertableAssociations`:
|
||||||
|
- It is currently only compatible with `has_many` relations.
|
||||||
|
- It does not yet support `has_many through: ...` relations.
|
||||||
|
|
||||||
|
Moreover, input data should either be limited to around 1000 records at most,
|
||||||
|
or already batched prior to calling bulk insert. The `INSERT` statement runs in a single
|
||||||
|
transaction, so for large amounts of records it may negatively affect database stability.
|
|
@ -10,7 +10,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w
|
||||||
|
|
||||||
In relational databases (including PostgreSQL), foreign keys provide a way to link
|
In relational databases (including PostgreSQL), foreign keys provide a way to link
|
||||||
two database tables together, and ensure data-consistency between them. In GitLab,
|
two database tables together, and ensure data-consistency between them. In GitLab,
|
||||||
[foreign keys](../foreign_keys.md) are a vital part of the database design process.
|
[foreign keys](foreign_keys.md) are a vital part of the database design process.
|
||||||
Most of our database tables have foreign keys.
|
Most of our database tables have foreign keys.
|
||||||
|
|
||||||
With the ongoing database [decomposition work](https://gitlab.com/groups/gitlab-org/-/epics/6168),
|
With the ongoing database [decomposition work](https://gitlab.com/groups/gitlab-org/-/epics/6168),
|
||||||
|
|
|
@ -0,0 +1,193 @@
|
||||||
|
---
|
||||||
|
stage: none
|
||||||
|
group: unassigned
|
||||||
|
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
||||||
|
---
|
||||||
|
|
||||||
|
# Database case study: Namespaces storage statistics
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
On [Storage and limits management for groups](https://gitlab.com/groups/gitlab-org/-/epics/886),
|
||||||
|
we want to facilitate a method for easily viewing the amount of
|
||||||
|
storage consumed by a group, and allow easy management.
|
||||||
|
|
||||||
|
## Proposal
|
||||||
|
|
||||||
|
1. Create a new ActiveRecord model to hold the namespaces' statistics in an aggregated form (only for root [namespaces](../../user/namespace/index.md)).
|
||||||
|
1. Refresh the statistics in this model every time a project belonging to this namespace is changed.
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
In GitLab, we update the project storage statistics through a
|
||||||
|
[callback](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/project.rb#L97)
|
||||||
|
every time the project is saved.
|
||||||
|
|
||||||
|
The summary of those statistics per namespace is then retrieved
|
||||||
|
by [`Namespaces#with_statistics`](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/namespace.rb#L70) scope. Analyzing this query we noticed that:
|
||||||
|
|
||||||
|
- It takes up to `1.2` seconds for namespaces with over `15k` projects.
|
||||||
|
- It can't be analyzed with [ChatOps](../chatops_on_gitlabcom.md), as it times out.
|
||||||
|
|
||||||
|
Additionally, the pattern that is currently used to update the project statistics
|
||||||
|
(the callback) doesn't scale adequately. It is currently one of the largest
|
||||||
|
[database queries transactions on production](https://gitlab.com/gitlab-org/gitlab/-/issues/29070)
|
||||||
|
that takes the most time overall. We can't add one more query to it as
|
||||||
|
it increases the transaction's length.
|
||||||
|
|
||||||
|
Because of all of the above, we can't apply the same pattern to store
|
||||||
|
and update the namespaces statistics, as the `namespaces` table is one
|
||||||
|
of the largest tables on GitLab.com. Therefore we needed to find a performant
|
||||||
|
alternative method.
|
||||||
|
|
||||||
|
## Attempts
|
||||||
|
|
||||||
|
### Attempt A: PostgreSQL materialized view
|
||||||
|
|
||||||
|
The model can be updated through a refresh strategy based on a project-routes SQL query and a [materialized view](https://www.postgresql.org/docs/11/rules-materializedviews.html):
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT split_part("rs".path, '/', 1) as root_path,
|
||||||
|
COALESCE(SUM(ps.storage_size), 0) AS storage_size,
|
||||||
|
COALESCE(SUM(ps.repository_size), 0) AS repository_size,
|
||||||
|
COALESCE(SUM(ps.wiki_size), 0) AS wiki_size,
|
||||||
|
COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size,
|
||||||
|
COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size,
|
||||||
|
COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size,
|
||||||
|
COALESCE(SUM(ps.packages_size), 0) AS packages_size,
|
||||||
|
COALESCE(SUM(ps.snippets_size), 0) AS snippets_size,
|
||||||
|
COALESCE(SUM(ps.uploads_size), 0) AS uploads_size
|
||||||
|
FROM "projects"
|
||||||
|
INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project'
|
||||||
|
INNER JOIN project_statistics ps ON ps.project_id = projects.id
|
||||||
|
GROUP BY root_path
|
||||||
|
```
|
||||||
|
|
||||||
|
We could then execute the query with:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
REFRESH MATERIALIZED VIEW root_namespace_storage_statistics;
|
||||||
|
```
|
||||||
|
|
||||||
|
While this implied a single query update (and probably a fast one), it has some downsides:
|
||||||
|
|
||||||
|
- Materialized view syntax differs between PostgreSQL and MySQL. While this feature was being worked on, MySQL was still supported by GitLab.
|
||||||
|
- Rails does not have native support for materialized views. We'd need to use a specialized gem to take care of the management of the database views, which implies additional work.
|
||||||
|
|
||||||
|
### Attempt B: An update through a CTE
|
||||||
|
|
||||||
|
Similar to Attempt A: Model update done through a refresh strategy with a [Common Table Expression](https://www.postgresql.org/docs/9.1/queries-with.html)
|
||||||
|
|
||||||
|
```sql
|
||||||
|
WITH refresh AS (
|
||||||
|
SELECT split_part("rs".path, '/', 1) as root_path,
|
||||||
|
COALESCE(SUM(ps.storage_size), 0) AS storage_size,
|
||||||
|
COALESCE(SUM(ps.repository_size), 0) AS repository_size,
|
||||||
|
COALESCE(SUM(ps.wiki_size), 0) AS wiki_size,
|
||||||
|
COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size,
|
||||||
|
COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size,
|
||||||
|
COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size,
|
||||||
|
COALESCE(SUM(ps.packages_size), 0) AS packages_size,
|
||||||
|
COALESCE(SUM(ps.snippets_size), 0) AS snippets_size,
|
||||||
|
COALESCE(SUM(ps.uploads_size), 0) AS uploads_size
|
||||||
|
FROM "projects"
|
||||||
|
INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project'
|
||||||
|
INNER JOIN project_statistics ps ON ps.project_id = projects.id
|
||||||
|
GROUP BY root_path)
|
||||||
|
UPDATE namespace_storage_statistics
|
||||||
|
SET storage_size = refresh.storage_size,
|
||||||
|
repository_size = refresh.repository_size,
|
||||||
|
wiki_size = refresh.wiki_size,
|
||||||
|
lfs_objects_size = refresh.lfs_objects_size,
|
||||||
|
build_artifacts_size = refresh.build_artifacts_size,
|
||||||
|
pipeline_artifacts_size = refresh.pipeline_artifacts_size,
|
||||||
|
packages_size = refresh.packages_size,
|
||||||
|
snippets_size = refresh.snippets_size,
|
||||||
|
uploads_size = refresh.uploads_size
|
||||||
|
FROM refresh
|
||||||
|
INNER JOIN routes rs ON rs.path = refresh.root_path AND rs.source_type = 'Namespace'
|
||||||
|
WHERE namespace_storage_statistics.namespace_id = rs.source_id
|
||||||
|
```
|
||||||
|
|
||||||
|
Same benefits and downsides as attempt A.
|
||||||
|
|
||||||
|
### Attempt C: Get rid of the model and store the statistics on Redis
|
||||||
|
|
||||||
|
We could get rid of the model that stores the statistics in aggregated form and instead use a Redis Set.
|
||||||
|
This would be the [boring solution](https://about.gitlab.com/handbook/values/#boring-solutions) and the fastest one
|
||||||
|
to implement, as GitLab already includes Redis as part of its [Architecture](../architecture.md#redis).
|
||||||
|
|
||||||
|
The downside of this approach is that Redis does not provide the same persistence/consistency guarantees as PostgreSQL,
|
||||||
|
and this is information we can't afford to lose in a Redis failure.
|
||||||
|
|
||||||
|
### Attempt D: Tag the root namespace and its child namespaces
|
||||||
|
|
||||||
|
Directly relate the root namespace to its child namespaces, so
|
||||||
|
whenever a namespace is created without a parent, it is tagged
|
||||||
|
with the root namespace ID:
|
||||||
|
|
||||||
|
| ID | root ID | parent ID |
|
||||||
|
|:---|:--------|:----------|
|
||||||
|
| 1 | 1 | NULL |
|
||||||
|
| 2 | 1 | 1 |
|
||||||
|
| 3 | 1 | 2 |
|
||||||
|
|
||||||
|
To aggregate the statistics inside a namespace we'd execute something like:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT COUNT(...)
|
||||||
|
FROM projects
|
||||||
|
WHERE namespace_id IN (
|
||||||
|
SELECT id
|
||||||
|
FROM namespaces
|
||||||
|
WHERE root_id = X
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Even though this approach would make aggregating much easier, it has some major downsides:
|
||||||
|
|
||||||
|
- We'd have to migrate **all namespaces** by adding and filling a new column. Because of the size of the table, dealing with time/cost would be significant. The background migration would take approximately `153h`, see <https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/29772>.
|
||||||
|
- Background migration has to be shipped one release before, delaying the functionality by another milestone.
|
||||||
|
|
||||||
|
### Attempt E (final): Update the namespace storage statistics asynchronously
|
||||||
|
|
||||||
|
This approach consists of continuing to use the incremental statistics updates we already have,
|
||||||
|
but we refresh them through Sidekiq jobs and in different transactions:
|
||||||
|
|
||||||
|
1. Create a second table (`namespace_aggregation_schedules`) with two columns `id` and `namespace_id`.
|
||||||
|
1. Whenever the statistics of a project changes, insert a row into `namespace_aggregation_schedules`
|
||||||
|
- We don't insert a new row if there's already one related to the root namespace.
|
||||||
|
- Keeping in mind the length of the transaction that involves updating `project_statistics`(<https://gitlab.com/gitlab-org/gitlab/-/issues/29070>), the insertion should be done in a different transaction and through a Sidekiq Job.
|
||||||
|
1. After inserting the row, we schedule another worker to be executed asynchronously at two different moments:
|
||||||
|
- One enqueued for immediate execution and another one scheduled `1.5` hours later.
|
||||||
|
- We only schedule the jobs, if we can obtain a `1.5h` lease on Redis on a key based on the root namespace ID.
|
||||||
|
- If we can't obtain the lease, it indicates there's another aggregation already in progress, or scheduled in no more than `1.5h`.
|
||||||
|
1. This worker will:
|
||||||
|
- Update the root namespace storage statistics by querying all the namespaces through a service.
|
||||||
|
- Delete the related `namespace_aggregation_schedules` after the update.
|
||||||
|
1. Another Sidekiq job is also included to traverse any remaining rows on the `namespace_aggregation_schedules` table and schedule jobs for every pending row.
|
||||||
|
- This job is scheduled with cron to run every night (UTC).
|
||||||
|
|
||||||
|
This implementation has the following benefits:
|
||||||
|
|
||||||
|
- All the updates are done asynchronously, so we're not increasing the length of the transactions for `project_statistics`.
|
||||||
|
- We're doing the update in a single SQL query.
|
||||||
|
- It is compatible with PostgreSQL and MySQL.
|
||||||
|
- No background migration required.
|
||||||
|
|
||||||
|
The only downside of this approach is that namespaces' statistics are updated up to `1.5` hours after the change is done,
|
||||||
|
which means there's a time window in which the statistics are inaccurate. Because we're still not
|
||||||
|
[enforcing storage limits](https://gitlab.com/gitlab-org/gitlab/-/issues/17664), this is not a major problem.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Updating the storage statistics asynchronously was the least problematic and most
|
||||||
|
performant approach to aggregating the root namespaces.
|
||||||
|
|
||||||
|
All the details regarding this use case can be found on:
|
||||||
|
|
||||||
|
- <https://gitlab.com/gitlab-org/gitlab-foss/-/issues/62214>
|
||||||
|
- Merge Request with the implementation: <https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/28996>
|
||||||
|
|
||||||
|
The performance of the namespace storage statistics was measured in staging and production (GitLab.com). All results were posted
|
||||||
|
on <https://gitlab.com/gitlab-org/gitlab-foss/-/issues/64092>: No problem has been reported so far.
|
|
@ -597,7 +597,7 @@ export default {
|
||||||
Note that, even if the directive evaluates to `false`, the guarded entity is sent to the backend and
|
Note that, even if the directive evaluates to `false`, the guarded entity is sent to the backend and
|
||||||
matched against the GraphQL schema. So this approach requires that the feature-flagged entity
|
matched against the GraphQL schema. So this approach requires that the feature-flagged entity
|
||||||
exists in the schema, even if the feature flag is disabled. When the feature flag is turned off, it
|
exists in the schema, even if the feature flag is disabled. When the feature flag is turned off, it
|
||||||
is recommended that the resolver returns `null` at the very least using the same feature flag as the frontend. See the [API GraphQL guide](../api_graphql_styleguide.md#frontend-and-backend-feature-flag-strategies).
|
is recommended that the resolver returns `null` at the very least using the same feature flag as the frontend. See the [API GraphQL guide](../api_graphql_styleguide.md#feature-flags).
|
||||||
|
|
||||||
##### Different versions of a query
|
##### Different versions of a query
|
||||||
|
|
||||||
|
|
|
@ -1,179 +1,11 @@
|
||||||
---
|
---
|
||||||
stage: Plan
|
redirect_to: 'database/filtering_by_label.md'
|
||||||
group: Project Management
|
remove_date: '2022-11-05'
|
||||||
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
|
||||||
---
|
---
|
||||||
# Filtering by label
|
|
||||||
|
|
||||||
## Introduction
|
This document was moved to [another location](database/filtering_by_label.md).
|
||||||
|
|
||||||
GitLab has [labels](../user/project/labels.md) that can be assigned to issues,
|
<!-- This redirect file can be deleted after <2022-11-05>. -->
|
||||||
merge requests, and epics. Labels on those objects are a many-to-many relation
|
<!-- Redirects that point to other docs in the same project expire in three months. -->
|
||||||
through the polymorphic `label_links` table.
|
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
|
||||||
|
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
|
||||||
To filter these objects by multiple labels - for instance, 'all open
|
|
||||||
issues with the label ~Plan and the label ~backend' - we generate a
|
|
||||||
query containing a `GROUP BY` clause. In a simple form, this looks like:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
SELECT
|
|
||||||
issues.*
|
|
||||||
FROM
|
|
||||||
issues
|
|
||||||
INNER JOIN label_links ON label_links.target_id = issues.id
|
|
||||||
AND label_links.target_type = 'Issue'
|
|
||||||
INNER JOIN labels ON labels.id = label_links.label_id
|
|
||||||
WHERE
|
|
||||||
issues.project_id = 13083
|
|
||||||
AND (issues.state IN ('opened'))
|
|
||||||
AND labels.title IN ('Plan',
|
|
||||||
'backend')
|
|
||||||
GROUP BY
|
|
||||||
issues.id
|
|
||||||
HAVING (COUNT(DISTINCT labels.title) = 2)
|
|
||||||
ORDER BY
|
|
||||||
issues.updated_at DESC,
|
|
||||||
issues.id DESC
|
|
||||||
LIMIT 20 OFFSET 0
|
|
||||||
```
|
|
||||||
|
|
||||||
In particular, note that:
|
|
||||||
|
|
||||||
1. We `GROUP BY issues.id` so that we can ...
|
|
||||||
1. Use the `HAVING (COUNT(DISTINCT labels.title) = 2)` condition to ensure that
|
|
||||||
all matched issues have both labels.
|
|
||||||
|
|
||||||
This is more complicated than is ideal. It makes the query construction more
|
|
||||||
prone to errors (such as
|
|
||||||
[issue #15557](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/15557)).
|
|
||||||
|
|
||||||
## Attempt A: `WHERE EXISTS`
|
|
||||||
|
|
||||||
### Attempt A1: use multiple subqueries with `WHERE EXISTS`
|
|
||||||
|
|
||||||
In [issue #37137](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/37137)
|
|
||||||
and its associated [merge request](https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/14022),
|
|
||||||
we tried to replace the `GROUP BY` with multiple uses of `WHERE EXISTS`. For the
|
|
||||||
example above, this would give:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
WHERE (EXISTS (
|
|
||||||
SELECT
|
|
||||||
TRUE
|
|
||||||
FROM
|
|
||||||
label_links
|
|
||||||
INNER JOIN labels ON labels.id = label_links.label_id
|
|
||||||
WHERE
|
|
||||||
labels.title = 'Plan'
|
|
||||||
AND target_type = 'Issue'
|
|
||||||
AND target_id = issues.id))
|
|
||||||
AND (EXISTS (
|
|
||||||
SELECT
|
|
||||||
TRUE
|
|
||||||
FROM
|
|
||||||
label_links
|
|
||||||
INNER JOIN labels ON labels.id = label_links.label_id
|
|
||||||
WHERE
|
|
||||||
labels.title = 'backend'
|
|
||||||
AND target_type = 'Issue'
|
|
||||||
AND target_id = issues.id))
|
|
||||||
```
|
|
||||||
|
|
||||||
While this worked without schema changes, and did improve readability somewhat,
|
|
||||||
it did not improve query performance.
|
|
||||||
|
|
||||||
### Attempt A2: use label IDs in the `WHERE EXISTS` clause
|
|
||||||
|
|
||||||
In [merge request #34503](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/34503), we followed a similar approach to A1. But this time, we
|
|
||||||
did a separate query to fetch the IDs of the labels used in the filter so that we avoid the `JOIN` in the `EXISTS` clause and filter directly by
|
|
||||||
`label_links.label_id`. We also added a new index on `label_links` for the `target_id`, `label_id`, and `target_type` columns to speed up this query.
|
|
||||||
|
|
||||||
Finding the label IDs wasn't straightforward because there could be multiple labels with the same title within a single root namespace. We solved
|
|
||||||
this by grouping the label IDs by title and then using the array of IDs in the `EXISTS` clauses.
|
|
||||||
|
|
||||||
This resulted in a significant performance improvement. However, this optimization could not be applied to the dashboard pages
|
|
||||||
where we do not have a project or group context. We could not easily search for the label IDs here because that would mean searching across all
|
|
||||||
projects and groups that the user has access to.
|
|
||||||
|
|
||||||
## Attempt B: Denormalize using an array column
|
|
||||||
|
|
||||||
Having [removed MySQL support in GitLab 12.1](https://about.gitlab.com/blog/2019/06/27/removing-mysql-support/),
|
|
||||||
using [PostgreSQL's arrays](https://www.postgresql.org/docs/11/arrays.html) became more
|
|
||||||
tractable as we didn't have to support two databases. We discussed denormalizing
|
|
||||||
the `label_links` table for querying in
|
|
||||||
[issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651),
|
|
||||||
with two options: label IDs and titles.
|
|
||||||
|
|
||||||
We can think of both of those as array columns on `issues`, `merge_requests`,
|
|
||||||
and `epics`: `issues.label_ids` would be an array column of label IDs, and
|
|
||||||
`issues.label_titles` would be an array of label titles.
|
|
||||||
|
|
||||||
These array columns can be complemented with [GIN
|
|
||||||
indexes](https://www.postgresql.org/docs/11/gin-intro.html) to improve
|
|
||||||
matching.
|
|
||||||
|
|
||||||
### Attempt B1: store label IDs for each object
|
|
||||||
|
|
||||||
This has some strong advantages over titles:
|
|
||||||
|
|
||||||
1. Unless a label is deleted, or a project is moved, we never need to
|
|
||||||
bulk-update the denormalized column.
|
|
||||||
1. It uses less storage than the titles.
|
|
||||||
|
|
||||||
Unfortunately, our application design makes this hard. If we were able to query
|
|
||||||
just by label ID easily, we wouldn't need the `INNER JOIN labels` in the initial
|
|
||||||
query at the start of this document. GitLab allows users to filter by label
|
|
||||||
title across projects and even across groups, so a filter by the label ~Plan may
|
|
||||||
include labels with multiple distinct IDs.
|
|
||||||
|
|
||||||
We do not want users to have to know about the different IDs, which means that
|
|
||||||
given this data set:
|
|
||||||
|
|
||||||
| Project | ~Plan label ID | ~backend label ID |
|
|
||||||
| ------- | -------------- | ----------------- |
|
|
||||||
| A | 11 | 12 |
|
|
||||||
| B | 21 | 22 |
|
|
||||||
| C | 31 | 32 |
|
|
||||||
|
|
||||||
We would need something like:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
WHERE
|
|
||||||
label_ids @> ARRAY[11, 12]
|
|
||||||
OR label_ids @> ARRAY[21, 22]
|
|
||||||
OR label_ids @> ARRAY[31, 32]
|
|
||||||
```
|
|
||||||
|
|
||||||
This can get even more complicated when we consider that in some cases, there
|
|
||||||
might be two ~backend labels - with different IDs - that could apply to the same
|
|
||||||
object, so the number of combinations would balloon further.
|
|
||||||
|
|
||||||
### Attempt B2: store label titles for each object
|
|
||||||
|
|
||||||
From the perspective of updating the object, this is the worst
|
|
||||||
option. We have to bulk update the objects when:
|
|
||||||
|
|
||||||
1. The objects are moved from one project to another.
|
|
||||||
1. The project is moved from one group to another.
|
|
||||||
1. The label is renamed.
|
|
||||||
1. The label is deleted.
|
|
||||||
|
|
||||||
It also uses much more storage. Querying is simple, though:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
WHERE
|
|
||||||
label_titles @> ARRAY['Plan', 'backend']
|
|
||||||
```
|
|
||||||
|
|
||||||
And our [tests in issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651#note_188777346)
|
|
||||||
showed that this could be fast.
|
|
||||||
|
|
||||||
However, at present, the disadvantages outweigh the advantages.
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
We found a method A2 that does not need denormalization and improves the query performance significantly. This
|
|
||||||
did not apply to all cases, but we were able to apply method A1 to the rest of the cases so that we remove the
|
|
||||||
`GROUP BY` and `HAVING` clauses in all scenarios.
|
|
||||||
|
|
||||||
This simplified the query and improved the performance in the most common cases.
|
|
||||||
|
|
|
@ -1,200 +1,11 @@
|
||||||
---
|
---
|
||||||
stage: Data Stores
|
redirect_to: 'database/foreign_keys.md'
|
||||||
group: Database
|
remove_date: '2022-11-05'
|
||||||
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Foreign Keys & Associations
|
This document was moved to [another location](database/foreign_keys.md).
|
||||||
|
|
||||||
When adding an association to a model you must also add a foreign key. For
|
<!-- This redirect file can be deleted after <2022-11-05>. -->
|
||||||
example, say you have the following model:
|
<!-- Redirects that point to other docs in the same project expire in three months. -->
|
||||||
|
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
|
||||||
```ruby
|
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
|
||||||
class User < ActiveRecord::Base
|
|
||||||
has_many :posts
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
Add a foreign key here on column `posts.user_id`. This ensures
|
|
||||||
that data consistency is enforced on database level. Foreign keys also mean that
|
|
||||||
the database can very quickly remove associated data (for example, when removing a
|
|
||||||
user), instead of Rails having to do this.
|
|
||||||
|
|
||||||
## Adding Foreign Keys In Migrations
|
|
||||||
|
|
||||||
Foreign keys can be added concurrently using `add_concurrent_foreign_key` as
|
|
||||||
defined in `Gitlab::Database::MigrationHelpers`. See the [Migration Style
|
|
||||||
Guide](migration_style_guide.md) for more information.
|
|
||||||
|
|
||||||
Keep in mind that you can only safely add foreign keys to existing tables after
|
|
||||||
you have removed any orphaned rows. The method `add_concurrent_foreign_key`
|
|
||||||
does not take care of this so you must do so manually. See
|
|
||||||
[adding foreign key constraint to an existing column](database/add_foreign_key_to_existing_column.md).
|
|
||||||
|
|
||||||
## Updating Foreign Keys In Migrations
|
|
||||||
|
|
||||||
Sometimes a foreign key constraint must be changed, preserving the column
|
|
||||||
but updating the constraint condition. For example, moving from
|
|
||||||
`ON DELETE CASCADE` to `ON DELETE SET NULL` or vice-versa.
|
|
||||||
|
|
||||||
PostgreSQL does not prevent you from adding overlapping foreign keys. It
|
|
||||||
honors the most recently added constraint. This allows us to replace foreign keys without
|
|
||||||
ever losing foreign key protection on a column.
|
|
||||||
|
|
||||||
To replace a foreign key:
|
|
||||||
|
|
||||||
1. [Add the new foreign key without validation](database/add_foreign_key_to_existing_column.md#prevent-invalid-records)
|
|
||||||
|
|
||||||
The name of the foreign key constraint must be changed to add a new
|
|
||||||
foreign key before removing the old one.
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class ReplaceFkOnPackagesPackagesProjectId < Gitlab::Database::Migration[2.0]
|
|
||||||
disable_ddl_transaction!
|
|
||||||
|
|
||||||
NEW_CONSTRAINT_NAME = 'fk_new'
|
|
||||||
|
|
||||||
def up
|
|
||||||
add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :nullify, validate: false, name: NEW_CONSTRAINT_NAME)
|
|
||||||
end
|
|
||||||
|
|
||||||
def down
|
|
||||||
with_lock_retries do
|
|
||||||
remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :nullify, name: NEW_CONSTRAINT_NAME)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
1. [Validate the new foreign key](database/add_foreign_key_to_existing_column.md#validate-the-foreign-key)
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class ValidateFkNew < Gitlab::Database::Migration[2.0]
|
|
||||||
NEW_CONSTRAINT_NAME = 'fk_new'
|
|
||||||
|
|
||||||
# foreign key added in <link to MR or path to migration adding new FK>
|
|
||||||
def up
|
|
||||||
validate_foreign_key(:packages_packages, name: NEW_CONSTRAINT_NAME)
|
|
||||||
end
|
|
||||||
|
|
||||||
def down
|
|
||||||
# no-op
|
|
||||||
end
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
1. Remove the old foreign key:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class RemoveFkOld < Gitlab::Database::Migration[2.0]
|
|
||||||
OLD_CONSTRAINT_NAME = 'fk_old'
|
|
||||||
|
|
||||||
# new foreign key added in <link to MR or path to migration adding new FK>
|
|
||||||
# and validated in <link to MR or path to migration validating new FK>
|
|
||||||
def up
|
|
||||||
remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :cascade, name: OLD_CONSTRAINT_NAME)
|
|
||||||
end
|
|
||||||
|
|
||||||
def down
|
|
||||||
# Validation is skipped here, so if rolled back, this will need to be revalidated in a separate migration
|
|
||||||
add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :cascade, validate: false, name: OLD_CONSTRAINT_NAME)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
## Cascading Deletes
|
|
||||||
|
|
||||||
Every foreign key must define an `ON DELETE` clause, and in 99% of the cases
|
|
||||||
this should be set to `CASCADE`.
|
|
||||||
|
|
||||||
## Indexes
|
|
||||||
|
|
||||||
When adding a foreign key in PostgreSQL the column is not indexed automatically,
|
|
||||||
thus you must also add a concurrent index. Not doing so results in cascading
|
|
||||||
deletes being very slow.
|
|
||||||
|
|
||||||
## Naming foreign keys
|
|
||||||
|
|
||||||
By default Ruby on Rails uses the `_id` suffix for foreign keys. So we should
|
|
||||||
only use this suffix for associations between two tables. If you want to
|
|
||||||
reference an ID on a third party platform the `_xid` suffix is recommended.
|
|
||||||
|
|
||||||
The spec `spec/db/schema_spec.rb` tests if all columns with the `_id` suffix
|
|
||||||
have a foreign key constraint. So if that spec fails, don't add the column to
|
|
||||||
`IGNORED_FK_COLUMNS`, but instead add the FK constraint, or consider naming it
|
|
||||||
differently.
|
|
||||||
|
|
||||||
## Dependent Removals
|
|
||||||
|
|
||||||
Don't define options such as `dependent: :destroy` or `dependent: :delete` when
|
|
||||||
defining an association. Defining these options means Rails handles the
|
|
||||||
removal of data, instead of letting the database handle this in the most
|
|
||||||
efficient way possible.
|
|
||||||
|
|
||||||
In other words, this is bad and should be avoided at all costs:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class User < ActiveRecord::Base
|
|
||||||
has_many :posts, dependent: :destroy
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
Should you truly have a need for this it should be approved by a database
|
|
||||||
specialist first.
|
|
||||||
|
|
||||||
You should also not define any `before_destroy` or `after_destroy` callbacks on
|
|
||||||
your models _unless_ absolutely required and only when approved by database
|
|
||||||
specialists. For example, if each row in a table has a corresponding file on a
|
|
||||||
file system it may be tempting to add a `after_destroy` hook. This however
|
|
||||||
introduces non database logic to a model, and means we can no longer rely on
|
|
||||||
foreign keys to remove the data as this would result in the file system data
|
|
||||||
being left behind. In such a case you should use a service class instead that
|
|
||||||
takes care of removing non database data.
|
|
||||||
|
|
||||||
In cases where the relation spans multiple databases you have even
|
|
||||||
further problems using `dependent: :destroy` or the above hooks. You can
|
|
||||||
read more about alternatives at [Avoid `dependent: :nullify` and
|
|
||||||
`dependent: :destroy` across
|
|
||||||
databases](database/multiple_databases.md#avoid-dependent-nullify-and-dependent-destroy-across-databases).
|
|
||||||
|
|
||||||
## Alternative primary keys with `has_one` associations
|
|
||||||
|
|
||||||
Sometimes a `has_one` association is used to create a one-to-one relationship:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class User < ActiveRecord::Base
|
|
||||||
has_one :user_config
|
|
||||||
end
|
|
||||||
|
|
||||||
class UserConfig < ActiveRecord::Base
|
|
||||||
belongs_to :user
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
In these cases, there may be an opportunity to remove the unnecessary `id`
|
|
||||||
column on the associated table, `user_config.id` in this example. Instead,
|
|
||||||
the originating table ID can be used as the primary key for the associated
|
|
||||||
table:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
create_table :user_configs, id: false do |t|
|
|
||||||
t.references :users, primary_key: true, default: nil, index: false, foreign_key: { on_delete: :cascade }
|
|
||||||
...
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
Setting `default: nil` ensures a primary key sequence is not created, and since the primary key
|
|
||||||
automatically gets an index, we set `index: false` to avoid creating a duplicate.
|
|
||||||
You also need to add the new primary key to the model:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class UserConfig < ActiveRecord::Base
|
|
||||||
self.primary_key = :user_id
|
|
||||||
|
|
||||||
belongs_to :user
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
Using a foreign key as primary key saves space but can make
|
|
||||||
[batch counting](service_ping/implement.md#batch-counters) in [Service Ping](service_ping/index.md) less efficient.
|
|
||||||
Consider using a regular `id` column if the table is relevant for Service Ping.
|
|
||||||
|
|
|
@ -1,196 +1,11 @@
|
||||||
---
|
---
|
||||||
stage: Data Stores
|
redirect_to: 'database/insert_into_tables_in_batches.md'
|
||||||
group: Database
|
remove_date: '2022-11-05'
|
||||||
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
|
||||||
description: "Sometimes it is necessary to store large amounts of records at once, which can be inefficient
|
|
||||||
when iterating collections and performing individual `save`s. With the arrival of `insert_all`
|
|
||||||
in Rails 6, which operates at the row level (that is, using `Hash`es), GitLab has added a set
|
|
||||||
of APIs that make it safe and simple to insert ActiveRecord objects in bulk."
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Insert into tables in batches
|
This document was moved to [another location](database/insert_into_tables_in_batches.md).
|
||||||
|
|
||||||
Sometimes it is necessary to store large amounts of records at once, which can be inefficient
|
<!-- This redirect file can be deleted after <2022-11-05>. -->
|
||||||
when iterating collections and saving each record individually. With the arrival of
|
<!-- Redirects that point to other docs in the same project expire in three months. -->
|
||||||
[`insert_all`](https://apidock.com/rails/ActiveRecord/Persistence/ClassMethods/insert_all)
|
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
|
||||||
in Rails 6, which operates at the row level (that is, using `Hash` objects), GitLab has added a set
|
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
|
||||||
of APIs that make it safe and simple to insert `ActiveRecord` objects in bulk.
|
|
||||||
|
|
||||||
## Prepare `ApplicationRecord`s for bulk insertion
|
|
||||||
|
|
||||||
In order for a model class to take advantage of the bulk insertion API, it has to include the
|
|
||||||
`BulkInsertSafe` concern first:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class MyModel < ApplicationRecord
|
|
||||||
# other includes here
|
|
||||||
# ...
|
|
||||||
include BulkInsertSafe # include this last
|
|
||||||
|
|
||||||
# ...
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
The `BulkInsertSafe` concern has two functions:
|
|
||||||
|
|
||||||
- It performs checks against your model class to ensure that it does not use ActiveRecord
|
|
||||||
APIs that are not safe to use with respect to bulk insertions (more on that below).
|
|
||||||
- It adds new class methods `bulk_insert!` and `bulk_upsert!`, which you can use to insert many records at once.
|
|
||||||
|
|
||||||
## Insert records with `bulk_insert!` and `bulk_upsert!`
|
|
||||||
|
|
||||||
If the target class passes the checks performed by `BulkInsertSafe`, you can insert an array of
|
|
||||||
ActiveRecord model objects as follows:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
records = [MyModel.new, ...]
|
|
||||||
|
|
||||||
MyModel.bulk_insert!(records)
|
|
||||||
```
|
|
||||||
|
|
||||||
Calls to `bulk_insert!` always attempt to insert _new records_. If instead
|
|
||||||
you would like to replace existing records with new values, while still inserting those
|
|
||||||
that do not already exist, then you can use `bulk_upsert!`:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
records = [MyModel.new, existing_model, ...]
|
|
||||||
|
|
||||||
MyModel.bulk_upsert!(records, unique_by: [:name])
|
|
||||||
```
|
|
||||||
|
|
||||||
In this example, `unique_by` specifies the columns by which records are considered to be
|
|
||||||
unique and as such are updated if they existed prior to insertion. For example, if
|
|
||||||
`existing_model` has a `name` attribute, and if a record with the same `name` value already
|
|
||||||
exists, its fields are updated with those of `existing_model`.
|
|
||||||
|
|
||||||
The `unique_by` parameter can also be passed as a `Symbol`, in which case it specifies
|
|
||||||
a database index by which a column is considered unique:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
MyModel.bulk_insert!(records, unique_by: :index_on_name)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Record validation
|
|
||||||
|
|
||||||
The `bulk_insert!` method guarantees that `records` are inserted transactionally, and
|
|
||||||
runs validations on each record prior to insertion. If any record fails to validate,
|
|
||||||
an error is raised and the transaction is rolled back. You can turn off validations via
|
|
||||||
the `:validate` option:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
MyModel.bulk_insert!(records, validate: false)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Batch size configuration
|
|
||||||
|
|
||||||
In those cases where the number of `records` is above a given threshold, insertions
|
|
||||||
occur in multiple batches. The default batch size is defined in
|
|
||||||
[`BulkInsertSafe::DEFAULT_BATCH_SIZE`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb).
|
|
||||||
Assuming a default threshold of 500, inserting 950 records
|
|
||||||
would result in two batches being written sequentially (of size 500 and 450, respectively).
|
|
||||||
You can override the default batch size via the `:batch_size` option:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
MyModel.bulk_insert!(records, batch_size: 100)
|
|
||||||
```
|
|
||||||
|
|
||||||
Assuming the same number of 950 records, this would result in 10 batches being written instead.
|
|
||||||
Since this also affects the number of `INSERT` statements that occur, make sure you measure the
|
|
||||||
performance impact this might have on your code. There is a trade-off between the number of
|
|
||||||
`INSERT` statements the database has to process and the size and cost of each `INSERT`.
|
|
||||||
|
|
||||||
### Handling duplicate records
|
|
||||||
|
|
||||||
NOTE:
|
|
||||||
This parameter applies only to `bulk_insert!`. If you intend to update existing
|
|
||||||
records, use `bulk_upsert!` instead.
|
|
||||||
|
|
||||||
It may happen that some records you are trying to insert already exist, which would result in
|
|
||||||
primary key conflicts. There are two ways to address this problem: failing fast by raising an
|
|
||||||
error or skipping duplicate records. The default behavior of `bulk_insert!` is to fail fast
|
|
||||||
and raise an `ActiveRecord::RecordNotUnique` error.
|
|
||||||
|
|
||||||
If this is undesirable, you can instead skip duplicate records with the `skip_duplicates` flag:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
MyModel.bulk_insert!(records, skip_duplicates: true)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Requirements for safe bulk insertions
|
|
||||||
|
|
||||||
Large parts of ActiveRecord's persistence API are built around the notion of callbacks. Many
|
|
||||||
of these callbacks fire in response to model life cycle events such as `save` or `create`.
|
|
||||||
These callbacks cannot be used with bulk insertions, since they are meant to be called for
|
|
||||||
every instance that is saved or created. Since these events do not fire when
|
|
||||||
records are inserted in bulk, we currently prevent their use.
|
|
||||||
|
|
||||||
The specifics around which callbacks are explicitly allowed are defined in
|
|
||||||
[`BulkInsertSafe`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb).
|
|
||||||
Consult the module source code for details. If your class uses callbacks that are not explicitly designated
|
|
||||||
safe and you `include BulkInsertSafe` the application fails with an error.
|
|
||||||
|
|
||||||
### `BulkInsertSafe` versus `InsertAll`
|
|
||||||
|
|
||||||
Internally, `BulkInsertSafe` is based on `InsertAll`, and you may wonder when to choose
|
|
||||||
the former over the latter. To help you make the decision,
|
|
||||||
the key differences between these classes are listed in the table below.
|
|
||||||
|
|
||||||
| | Input type | Validates input | Specify batch size | Can bypass callbacks | Transactional |
|
|
||||||
|--------------- | -------------------- | --------------- | ------------------ | --------------------------------- | ------------- |
|
|
||||||
| `bulk_insert!` | ActiveRecord objects | Yes (optional) | Yes (optional) | No (prevents unsafe callback use) | Yes |
|
|
||||||
| `insert_all!` | Attribute hashes | No | No | Yes | Yes |
|
|
||||||
|
|
||||||
To summarize, `BulkInsertSafe` moves bulk inserts closer to how ActiveRecord objects
|
|
||||||
and inserts would normally behave. However, if all you need is to insert raw data in bulk, then
|
|
||||||
`insert_all` is more efficient.
|
|
||||||
|
|
||||||
## Insert `has_many` associations in bulk
|
|
||||||
|
|
||||||
A common use case is to save collections of associated relations through the owner side of the relation,
|
|
||||||
where the owned relation is associated to the owner through the `has_many` class method:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
owner = OwnerModel.new(owned_relations: array_of_owned_relations)
|
|
||||||
# saves all `owned_relations` one-by-one
|
|
||||||
owner.save!
|
|
||||||
```
|
|
||||||
|
|
||||||
This issues a separate `INSERT` statement, and transaction, for every record in `owned_relations`, which is inefficient if
|
|
||||||
`array_of_owned_relations` is large. To remedy this, the `BulkInsertableAssociations` concern can be
|
|
||||||
used to declare that the owner defines associations that are safe for bulk insertion:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
class OwnerModel < ApplicationRecord
|
|
||||||
# other includes here
|
|
||||||
# ...
|
|
||||||
include BulkInsertableAssociations # include this last
|
|
||||||
|
|
||||||
has_many :my_models
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
Here `my_models` must be declared `BulkInsertSafe` (as described previously) for bulk insertions
|
|
||||||
to happen. You can now insert any yet unsaved records as follows:
|
|
||||||
|
|
||||||
```ruby
|
|
||||||
BulkInsertableAssociations.with_bulk_insert do
|
|
||||||
owner = OwnerModel.new(my_models: array_of_my_model_instances)
|
|
||||||
# saves `my_models` using a single bulk insert (possibly via multiple batches)
|
|
||||||
owner.save!
|
|
||||||
end
|
|
||||||
```
|
|
||||||
|
|
||||||
You can still save relations that are not `BulkInsertSafe` in this block; they
|
|
||||||
simply are treated as if you had invoked `save` from outside the block.
|
|
||||||
|
|
||||||
## Known limitations
|
|
||||||
|
|
||||||
There are a few restrictions to how these APIs can be used:
|
|
||||||
|
|
||||||
- `BulkInsertableAssociations`:
|
|
||||||
- It is currently only compatible with `has_many` relations.
|
|
||||||
- It does not yet support `has_many through: ...` relations.
|
|
||||||
|
|
||||||
Moreover, input data should either be limited to around 1000 records at most,
|
|
||||||
or already batched prior to calling bulk insert. The `INSERT` statement runs in a single
|
|
||||||
transaction, so for large amounts of records it may negatively affect database stability.
|
|
||||||
|
|
|
@ -1,193 +1,11 @@
|
||||||
---
|
---
|
||||||
stage: none
|
redirect_to: 'database/namespaces_storage_statistics.md'
|
||||||
group: unassigned
|
remove_date: '2022-11-05'
|
||||||
info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Database case study: Namespaces storage statistics
|
This document was moved to [another location](database/namespaces_storage_statistics.md).
|
||||||
|
|
||||||
## Introduction
|
<!-- This redirect file can be deleted after <2022-11-05>. -->
|
||||||
|
<!-- Redirects that point to other docs in the same project expire in three months. -->
|
||||||
On [Storage and limits management for groups](https://gitlab.com/groups/gitlab-org/-/epics/886),
|
<!-- Redirects that point to docs in a different project or site (for example, link is not relative and starts with `https:`) expire in one year. -->
|
||||||
we want to facilitate a method for easily viewing the amount of
|
<!-- Before deletion, see: https://docs.gitlab.com/ee/development/documentation/redirects.html -->
|
||||||
storage consumed by a group, and allow easy management.
|
|
||||||
|
|
||||||
## Proposal
|
|
||||||
|
|
||||||
1. Create a new ActiveRecord model to hold the namespaces' statistics in an aggregated form (only for root [namespaces](../user/namespace/index.md)).
|
|
||||||
1. Refresh the statistics in this model every time a project belonging to this namespace is changed.
|
|
||||||
|
|
||||||
## Problem
|
|
||||||
|
|
||||||
In GitLab, we update the project storage statistics through a
|
|
||||||
[callback](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/project.rb#L97)
|
|
||||||
every time the project is saved.
|
|
||||||
|
|
||||||
The summary of those statistics per namespace is then retrieved
|
|
||||||
by [`Namespaces#with_statistics`](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/namespace.rb#L70) scope. Analyzing this query we noticed that:
|
|
||||||
|
|
||||||
- It takes up to `1.2` seconds for namespaces with over `15k` projects.
|
|
||||||
- It can't be analyzed with [ChatOps](chatops_on_gitlabcom.md), as it times out.
|
|
||||||
|
|
||||||
Additionally, the pattern that is currently used to update the project statistics
|
|
||||||
(the callback) doesn't scale adequately. It is currently one of the largest
|
|
||||||
[database queries transactions on production](https://gitlab.com/gitlab-org/gitlab/-/issues/29070)
|
|
||||||
that takes the most time overall. We can't add one more query to it as
|
|
||||||
it increases the transaction's length.
|
|
||||||
|
|
||||||
Because of all of the above, we can't apply the same pattern to store
|
|
||||||
and update the namespaces statistics, as the `namespaces` table is one
|
|
||||||
of the largest tables on GitLab.com. Therefore we needed to find a performant
|
|
||||||
alternative method.
|
|
||||||
|
|
||||||
## Attempts
|
|
||||||
|
|
||||||
### Attempt A: PostgreSQL materialized view
|
|
||||||
|
|
||||||
Model can be updated through a refresh strategy based on a project routes SQL and a [materialized view](https://www.postgresql.org/docs/11/rules-materializedviews.html):
|
|
||||||
|
|
||||||
```sql
|
|
||||||
SELECT split_part("rs".path, '/', 1) as root_path,
|
|
||||||
COALESCE(SUM(ps.storage_size), 0) AS storage_size,
|
|
||||||
COALESCE(SUM(ps.repository_size), 0) AS repository_size,
|
|
||||||
COALESCE(SUM(ps.wiki_size), 0) AS wiki_size,
|
|
||||||
COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size,
|
|
||||||
COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size,
|
|
||||||
COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size,
|
|
||||||
COALESCE(SUM(ps.packages_size), 0) AS packages_size,
|
|
||||||
COALESCE(SUM(ps.snippets_size), 0) AS snippets_size,
|
|
||||||
COALESCE(SUM(ps.uploads_size), 0) AS uploads_size
|
|
||||||
FROM "projects"
|
|
||||||
INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project'
|
|
||||||
INNER JOIN project_statistics ps ON ps.project_id = projects.id
|
|
||||||
GROUP BY root_path
|
|
||||||
```
|
|
||||||
|
|
||||||
We could then execute the query with:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
REFRESH MATERIALIZED VIEW root_namespace_storage_statistics;
|
|
||||||
```
|
|
||||||
|
|
||||||
While this implied a single query update (and probably a fast one), it has some downsides:
|
|
||||||
|
|
||||||
- Materialized view syntax varies between PostgreSQL and MySQL. While this feature was being worked on, MySQL was still supported by GitLab.
|
|
||||||
- Rails does not have native support for materialized views. We'd need to use a specialized gem to take care of the management of the database views, which implies additional work.
|
|
||||||
|
|
||||||
### Attempt B: An update through a CTE
|
|
||||||
|
|
||||||
Similar to Attempt A: Model update done through a refresh strategy with a [Common Table Expression](https://www.postgresql.org/docs/9.1/queries-with.html)
|
|
||||||
|
|
||||||
```sql
|
|
||||||
WITH refresh AS (
|
|
||||||
SELECT split_part("rs".path, '/', 1) as root_path,
|
|
||||||
COALESCE(SUM(ps.storage_size), 0) AS storage_size,
|
|
||||||
COALESCE(SUM(ps.repository_size), 0) AS repository_size,
|
|
||||||
COALESCE(SUM(ps.wiki_size), 0) AS wiki_size,
|
|
||||||
COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size,
|
|
||||||
COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size,
|
|
||||||
COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size,
|
|
||||||
COALESCE(SUM(ps.packages_size), 0) AS packages_size,
|
|
||||||
COALESCE(SUM(ps.snippets_size), 0) AS snippets_size,
|
|
||||||
COALESCE(SUM(ps.uploads_size), 0) AS uploads_size
|
|
||||||
FROM "projects"
|
|
||||||
INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project'
|
|
||||||
INNER JOIN project_statistics ps ON ps.project_id = projects.id
|
|
||||||
GROUP BY root_path)
|
|
||||||
UPDATE namespace_storage_statistics
|
|
||||||
SET storage_size = refresh.storage_size,
|
|
||||||
repository_size = refresh.repository_size,
|
|
||||||
wiki_size = refresh.wiki_size,
|
|
||||||
lfs_objects_size = refresh.lfs_objects_size,
|
|
||||||
build_artifacts_size = refresh.build_artifacts_size,
|
|
||||||
pipeline_artifacts_size = refresh.pipeline_artifacts_size,
|
|
||||||
packages_size = refresh.packages_size,
|
|
||||||
snippets_size = refresh.snippets_size,
|
|
||||||
uploads_size = refresh.uploads_size
|
|
||||||
FROM refresh
|
|
||||||
INNER JOIN routes rs ON rs.path = refresh.root_path AND rs.source_type = 'Namespace'
|
|
||||||
WHERE namespace_storage_statistics.namespace_id = rs.source_id
|
|
||||||
```
|
|
||||||
|
|
||||||
Same benefits and downsides as attempt A.
|
|
||||||
|
|
||||||
### Attempt C: Get rid of the model and store the statistics on Redis
|
|
||||||
|
|
||||||
We could get rid of the model that stores the statistics in aggregated form and instead use a Redis Set.
|
|
||||||
This would be the [boring solution](https://about.gitlab.com/handbook/values/#boring-solutions) and the fastest one
|
|
||||||
to implement, as GitLab already includes Redis as part of its [Architecture](architecture.md#redis).
|
|
||||||
|
|
||||||
The downside of this approach is that Redis does not provide the same persistence/consistency guarantees as PostgreSQL,
|
|
||||||
and this is information we can't afford to lose in a Redis failure.
|
|
||||||
|
|
||||||
### Attempt D: Tag the root namespace and its child namespaces
|
|
||||||
|
|
||||||
Directly relate the root namespace to its child namespaces, so
|
|
||||||
whenever a namespace is created without a parent, this one is tagged
|
|
||||||
with the root namespace ID:
|
|
||||||
|
|
||||||
| ID | root ID | parent ID |
|
|
||||||
|:---|:--------|:----------|
|
|
||||||
| 1 | 1 | NULL |
|
|
||||||
| 2 | 1 | 1 |
|
|
||||||
| 3 | 1 | 2 |
|
|
||||||
|
|
||||||
To aggregate the statistics inside a namespace we'd execute something like:
|
|
||||||
|
|
||||||
```sql
|
|
||||||
SELECT COUNT(...)
|
|
||||||
FROM projects
|
|
||||||
WHERE namespace_id IN (
|
|
||||||
SELECT id
|
|
||||||
FROM namespaces
|
|
||||||
WHERE root_id = X
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
Even though this approach would make aggregating much easier, it has some major downsides:
|
|
||||||
|
|
||||||
- We'd have to migrate **all namespaces** by adding and filling a new column. Because of the size of the table, dealing with time/cost would be significant. The background migration would take approximately `153h`, see <https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/29772>.
|
|
||||||
- Background migration has to be shipped one release before, delaying the functionality by another milestone.
|
|
||||||
|
|
||||||
### Attempt E (final): Update the namespace storage statistics asynchronously
|
|
||||||
|
|
||||||
This approach consists of continuing to use the incremental statistics updates we already have,
|
|
||||||
but we refresh them through Sidekiq jobs and in different transactions:
|
|
||||||
|
|
||||||
1. Create a second table (`namespace_aggregation_schedules`) with two columns `id` and `namespace_id`.
|
|
||||||
1. Whenever the statistics of a project changes, insert a row into `namespace_aggregation_schedules`
|
|
||||||
- We don't insert a new row if there's already one related to the root namespace.
|
|
||||||
- Keeping in mind the length of the transaction that involves updating `project_statistics`(<https://gitlab.com/gitlab-org/gitlab/-/issues/29070>), the insertion should be done in a different transaction and through a Sidekiq Job.
|
|
||||||
1. After inserting the row, we schedule another worker to be executed asynchronously at two different moments:
|
|
||||||
  - One enqueued for immediate execution and another one scheduled to run `1.5` hours later.
|
|
||||||
- We only schedule the jobs, if we can obtain a `1.5h` lease on Redis on a key based on the root namespace ID.
|
|
||||||
- If we can't obtain the lease, it indicates there's another aggregation already in progress, or scheduled in no more than `1.5h`.
|
|
||||||
1. This worker will:
|
|
||||||
- Update the root namespace storage statistics by querying all the namespaces through a service.
|
|
||||||
- Delete the related `namespace_aggregation_schedules` after the update.
|
|
||||||
1. Another Sidekiq job is also included to traverse any remaining rows on the `namespace_aggregation_schedules` table and schedule jobs for every pending row.
|
|
||||||
- This job is scheduled with cron to run every night (UTC).
|
|
||||||
|
|
||||||
This implementation has the following benefits:
|
|
||||||
|
|
||||||
- All the updates are done asynchronously, so we're not increasing the length of the transactions for `project_statistics`.
|
|
||||||
- We're doing the update in a single SQL query.
|
|
||||||
- It is compatible with PostgreSQL and MySQL.
|
|
||||||
- No background migration required.
|
|
||||||
|
|
||||||
The only downside of this approach is that namespaces' statistics are updated up to `1.5` hours after the change is done,
|
|
||||||
which means there's a time window in which the statistics are inaccurate. Because we're still not
|
|
||||||
[enforcing storage limits](https://gitlab.com/gitlab-org/gitlab/-/issues/17664), this is not a major problem.
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
Updating the storage statistics asynchronously was the least problematic and
most performant approach to aggregating the root namespaces.
|
|
||||||
|
|
||||||
All the details regarding this use case can be found on:
|
|
||||||
|
|
||||||
- <https://gitlab.com/gitlab-org/gitlab-foss/-/issues/62214>
|
|
||||||
- Merge Request with the implementation: <https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/28996>
|
|
||||||
|
|
||||||
Performance of the namespace storage statistics was measured in staging and production (GitLab.com). All results were posted
|
|
||||||
on <https://gitlab.com/gitlab-org/gitlab-foss/-/issues/64092>: No problem has been reported so far.
|
|
||||||
|
|
|
@ -932,7 +932,7 @@ Models](mass_insert.md) functionality.
|
||||||
|
|
||||||
Assuming you are working with ActiveRecord models, you might also find these links helpful:
|
Assuming you are working with ActiveRecord models, you might also find these links helpful:
|
||||||
|
|
||||||
- [Insert records in batches](insert_into_tables_in_batches.md)
|
- [Insert records in batches](database/insert_into_tables_in_batches.md)
|
||||||
- [BulkInsert gem](https://github.com/jamis/bulk_insert)
|
- [BulkInsert gem](https://github.com/jamis/bulk_insert)
|
||||||
- [ActiveRecord::PgGenerateSeries gem](https://github.com/ryu39/active_record-pg_generate_series)
|
- [ActiveRecord::PgGenerateSeries gem](https://github.com/ryu39/active_record-pg_generate_series)
|
||||||
|
|
||||||
|
|
|
@ -316,6 +316,28 @@ services:
|
||||||
alias: docker
|
alias: docker
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Issues when authenticating to the Dependency Proxy from CI/CD jobs
|
||||||
|
|
||||||
|
GitLab Runner will automatically authenticate to the Dependency Proxy. However, the underlying Docker engine is still subject to its [authorization resolving process](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/main/docs/configuration/advanced-configuration.md#precedence-of-docker-authorization-resolving).
|
||||||
|
|
||||||
|
Misconfigurations in the authentication mechanism may cause `HTTP Basic: Access denied` and `403: Access forbidden` errors.
|
||||||
|
|
||||||
|
You can use the job logs to view the authentication mechanism used to authenticate against the Dependency Proxy:
|
||||||
|
|
||||||
|
```plaintext
|
||||||
|
Authenticating with credentials from $DOCKER_AUTH_CONFIG
|
||||||
|
```
|
||||||
|
|
||||||
|
```plaintext
|
||||||
|
Authenticating with credentials from /root/.docker/config.json
|
||||||
|
```
|
||||||
|
|
||||||
|
```plaintext
|
||||||
|
Authenticating with credentials from job payload (GitLab Registry)
|
||||||
|
```
|
||||||
|
|
||||||
|
Make sure you are using the expected authentication mechanism.
|
||||||
|
|
||||||
### "Not Found" error when pulling image
|
### "Not Found" error when pulling image
|
||||||
|
|
||||||
Docker errors similar to the following may indicate that the user running the build job doesn't have
|
Docker errors similar to the following may indicate that the user running the build job doesn't have
|
||||||
|
|
|
@ -10,6 +10,8 @@ module API
|
||||||
before do
|
before do
|
||||||
require_repository_enabled!
|
require_repository_enabled!
|
||||||
authorize! :download_code, user_project
|
authorize! :download_code, user_project
|
||||||
|
|
||||||
|
verify_pagination_params!
|
||||||
end
|
end
|
||||||
|
|
||||||
helpers do
|
helpers do
|
||||||
|
|
|
@ -20,6 +20,26 @@ module API
|
||||||
optional :page, type: Integer, default: 1, desc: 'Current page number'
|
optional :page, type: Integer, default: 1, desc: 'Current page number'
|
||||||
optional :per_page, type: Integer, default: 20, desc: 'Number of items per page', except_values: [0]
|
optional :per_page, type: Integer, default: 20, desc: 'Number of items per page', except_values: [0]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def verify_pagination_params!
|
||||||
|
return if Feature.disabled?(:only_positive_pagination_values)
|
||||||
|
|
||||||
|
page = begin
|
||||||
|
Integer(params[:page])
|
||||||
|
rescue ArgumentError, TypeError
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
|
return render_structured_api_error!({ error: 'page does not have a valid value' }, 400) if page&.< 1
|
||||||
|
|
||||||
|
per_page = begin
|
||||||
|
Integer(params[:per_page])
|
||||||
|
rescue ArgumentError, TypeError
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
|
return render_structured_api_error!({ error: 'per_page does not have a valid value' }, 400) if per_page&.< 1
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -145,7 +145,7 @@ module RuboCop
|
||||||
return unless node.children[opts_index]
|
return unless node.children[opts_index]
|
||||||
|
|
||||||
node.children[opts_index].each_pair.find do |pair|
|
node.children[opts_index].each_pair.find do |pair|
|
||||||
pair.key.value == :feature_flag
|
pair.key.value == :_deprecated_feature_flag
|
||||||
end&.value
|
end&.value
|
||||||
else
|
else
|
||||||
arg_index = rugged_method?(node) ? 3 : 2
|
arg_index = rugged_method?(node) ? 3 : 2
|
||||||
|
|
|
@ -24,7 +24,7 @@ RSpec.describe 'Graphql Field feature flags' do
|
||||||
|
|
||||||
let(:query_type) do
|
let(:query_type) do
|
||||||
query_factory do |query|
|
query_factory do |query|
|
||||||
query.field :item, type, null: true, feature_flag: feature_flag, resolver: new_resolver(test_object)
|
query.field :item, type, null: true, _deprecated_feature_flag: feature_flag, resolver: new_resolver(test_object)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -209,7 +209,7 @@ RSpec.describe Types::BaseField do
|
||||||
describe '#visible?' do
|
describe '#visible?' do
|
||||||
context 'and has a feature_flag' do
|
context 'and has a feature_flag' do
|
||||||
let(:flag) { :test_feature }
|
let(:flag) { :test_feature }
|
||||||
let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, feature_flag: flag, null: false) }
|
let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, _deprecated_feature_flag: flag, null: false) }
|
||||||
let(:context) { {} }
|
let(:context) { {} }
|
||||||
|
|
||||||
before do
|
before do
|
||||||
|
@ -253,7 +253,7 @@ RSpec.describe Types::BaseField do
|
||||||
|
|
||||||
describe '#description' do
|
describe '#description' do
|
||||||
context 'feature flag given' do
|
context 'feature flag given' do
|
||||||
let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, feature_flag: flag, null: false, description: 'Test description.') }
|
let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, _deprecated_feature_flag: flag, null: false, description: 'Test description.') }
|
||||||
let(:flag) { :test_flag }
|
let(:flag) { :test_flag }
|
||||||
|
|
||||||
it 'prepends the description' do
|
it 'prepends the description' do
|
||||||
|
@ -313,11 +313,11 @@ RSpec.describe Types::BaseField do
|
||||||
described_class.new(**base_args.merge(args))
|
described_class.new(**base_args.merge(args))
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'interacts well with the `feature_flag` property' do
|
it 'interacts well with the `_deprecated_feature_flag` property' do
|
||||||
field = subject(
|
field = subject(
|
||||||
deprecated: { milestone: '1.10', reason: 'Deprecation reason' },
|
deprecated: { milestone: '1.10', reason: 'Deprecation reason' },
|
||||||
description: 'Field description.',
|
description: 'Field description.',
|
||||||
feature_flag: 'foo_flag'
|
_deprecated_feature_flag: 'foo_flag'
|
||||||
)
|
)
|
||||||
|
|
||||||
expect(field.description).to start_with('Field description. Available only when feature flag `foo_flag` is enabled.')
|
expect(field.description).to start_with('Field description. Available only when feature flag `foo_flag` is enabled.')
|
||||||
|
|
|
@ -206,11 +206,13 @@ RSpec.describe API::Commits do
|
||||||
let(:page) { 1 }
|
let(:page) { 1 }
|
||||||
let(:per_page) { 5 }
|
let(:per_page) { 5 }
|
||||||
let(:ref_name) { 'master' }
|
let(:ref_name) { 'master' }
|
||||||
let!(:request) do
|
let(:request) do
|
||||||
get api("/projects/#{project_id}/repository/commits?page=#{page}&per_page=#{per_page}&ref_name=#{ref_name}", user)
|
get api("/projects/#{project_id}/repository/commits?page=#{page}&per_page=#{per_page}&ref_name=#{ref_name}", user)
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'returns correct headers' do
|
it 'returns correct headers' do
|
||||||
|
request
|
||||||
|
|
||||||
expect(response).to include_limited_pagination_headers
|
expect(response).to include_limited_pagination_headers
|
||||||
expect(response.headers['Link']).to match(/page=1&per_page=5/)
|
expect(response.headers['Link']).to match(/page=1&per_page=5/)
|
||||||
expect(response.headers['Link']).to match(/page=2&per_page=5/)
|
expect(response.headers['Link']).to match(/page=2&per_page=5/)
|
||||||
|
@ -218,6 +220,8 @@ RSpec.describe API::Commits do
|
||||||
|
|
||||||
context 'viewing the first page' do
|
context 'viewing the first page' do
|
||||||
it 'returns the first 5 commits' do
|
it 'returns the first 5 commits' do
|
||||||
|
request
|
||||||
|
|
||||||
commit = project.repository.commit
|
commit = project.repository.commit
|
||||||
|
|
||||||
expect(json_response.size).to eq(per_page)
|
expect(json_response.size).to eq(per_page)
|
||||||
|
@ -230,6 +234,8 @@ RSpec.describe API::Commits do
|
||||||
let(:page) { 3 }
|
let(:page) { 3 }
|
||||||
|
|
||||||
it 'returns the third 5 commits' do
|
it 'returns the third 5 commits' do
|
||||||
|
request
|
||||||
|
|
||||||
commit = project.repository.commits('HEAD', limit: per_page, offset: (page - 1) * per_page).first
|
commit = project.repository.commits('HEAD', limit: per_page, offset: (page - 1) * per_page).first
|
||||||
|
|
||||||
expect(json_response.size).to eq(per_page)
|
expect(json_response.size).to eq(per_page)
|
||||||
|
@ -238,10 +244,55 @@ RSpec.describe API::Commits do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'when per_page is 0' do
|
context 'when pagination params are invalid' do
|
||||||
let(:per_page) { 0 }
|
let_it_be(:project) { create(:project, :repository) }
|
||||||
|
|
||||||
it_behaves_like '400 response'
|
using RSpec::Parameterized::TableSyntax
|
||||||
|
|
||||||
|
where(:page, :per_page, :error_message) do
|
||||||
|
0 | nil | 'page does not have a valid value'
|
||||||
|
-1 | nil | 'page does not have a valid value'
|
||||||
|
'a' | nil | 'page is invalid'
|
||||||
|
nil | 0 | 'per_page does not have a valid value'
|
||||||
|
nil | -1 | 'per_page does not have a valid value'
|
||||||
|
nil | 'a' | 'per_page is invalid'
|
||||||
|
end
|
||||||
|
|
||||||
|
with_them do
|
||||||
|
it 'returns 400 response' do
|
||||||
|
request
|
||||||
|
|
||||||
|
expect(response).to have_gitlab_http_status(:bad_request)
|
||||||
|
expect(json_response['error']).to eq(error_message)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'when FF is off' do
|
||||||
|
before do
|
||||||
|
stub_feature_flags(only_positive_pagination_values: false)
|
||||||
|
end
|
||||||
|
|
||||||
|
where(:page, :per_page, :error_message, :status) do
|
||||||
|
0 | nil | nil | :success
|
||||||
|
-10 | nil | nil | :internal_server_error
|
||||||
|
'a' | nil | 'page is invalid' | :bad_request
|
||||||
|
nil | 0 | 'per_page has a value not allowed' | :bad_request
|
||||||
|
nil | -1 | nil | :success
|
||||||
|
nil | 'a' | 'per_page is invalid' | :bad_request
|
||||||
|
end
|
||||||
|
|
||||||
|
with_them do
|
||||||
|
it 'returns a response' do
|
||||||
|
request
|
||||||
|
|
||||||
|
expect(response).to have_gitlab_http_status(status)
|
||||||
|
|
||||||
|
if error_message
|
||||||
|
expect(json_response['error']).to eq(error_message)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -217,8 +217,8 @@ RSpec.describe RuboCop::Cop::Gitlab::MarkUsedFeatureFlags do
|
||||||
allow(cop).to receive(:in_graphql_types?).and_return(true)
|
allow(cop).to receive(:in_graphql_types?).and_return(true)
|
||||||
end
|
end
|
||||||
|
|
||||||
include_examples 'sets flag as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, feature_flag: :foo', 'foo'
|
include_examples 'sets flag as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, _deprecated_feature_flag: :foo', 'foo'
|
||||||
include_examples 'sets flag as used', 'field :runners, null: true, feature_flag: :foo', 'foo'
|
include_examples 'sets flag as used', 'field :runners, null: true, _deprecated_feature_flag: :foo', 'foo'
|
||||||
include_examples 'does not set any flags as used', 'field :solution'
|
include_examples 'does not set any flags as used', 'field :solution'
|
||||||
include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type'
|
include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type'
|
||||||
include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, description: "hello world"'
|
include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, description: "hello world"'
|
||||||
|
|
Loading…
Reference in New Issue