diff --git a/app/graphql/types/base_field.rb b/app/graphql/types/base_field.rb index 6aee9a5c052..1c43432594a 100644 --- a/app/graphql/types/base_field.rb +++ b/app/graphql/types/base_field.rb @@ -17,7 +17,7 @@ module Types @requires_argument = !!kwargs.delete(:requires_argument) @authorize = Array.wrap(kwargs.delete(:authorize)) kwargs[:complexity] = field_complexity(kwargs[:resolver_class], kwargs[:complexity]) - @feature_flag = kwargs[:feature_flag] + @feature_flag = kwargs[:_deprecated_feature_flag] kwargs = check_feature_flag(kwargs) @deprecation = gitlab_deprecation(kwargs) after_connection_extensions = kwargs.delete(:late_extensions) || [] @@ -136,7 +136,7 @@ module Types end def check_feature_flag(args) - ff = args.delete(:feature_flag) + ff = args.delete(:_deprecated_feature_flag) return args unless ff.present? args[:description] = feature_documentation_message(ff, args[:description]) diff --git a/app/graphql/types/group_type.rb b/app/graphql/types/group_type.rb index 9cd6dddca09..d41b7c34302 100644 --- a/app/graphql/types/group_type.rb +++ b/app/graphql/types/group_type.rb @@ -22,7 +22,7 @@ module Types type: Types::CustomEmojiType.connection_type, null: true, description: 'Custom emoji within this namespace.', - feature_flag: :custom_emoji + _deprecated_feature_flag: :custom_emoji field :share_with_group_lock, type: GraphQL::Types::Boolean, diff --git a/app/graphql/types/mutation_type.rb b/app/graphql/types/mutation_type.rb index ba595a11911..db4fb8fb14a 100644 --- a/app/graphql/types/mutation_type.rb +++ b/app/graphql/types/mutation_type.rb @@ -37,8 +37,8 @@ module Types mount_mutation Mutations::Clusters::AgentTokens::Create mount_mutation Mutations::Clusters::AgentTokens::Revoke mount_mutation Mutations::Commits::Create, calls_gitaly: true - mount_mutation Mutations::CustomEmoji::Create, feature_flag: :custom_emoji - mount_mutation Mutations::CustomEmoji::Destroy, feature_flag: :custom_emoji + mount_mutation Mutations::CustomEmoji::Create, _deprecated_feature_flag: :custom_emoji + mount_mutation Mutations::CustomEmoji::Destroy, _deprecated_feature_flag: :custom_emoji mount_mutation Mutations::CustomerRelations::Contacts::Create mount_mutation Mutations::CustomerRelations::Contacts::Update mount_mutation Mutations::CustomerRelations::Organizations::Create diff --git a/app/graphql/types/namespace_type.rb b/app/graphql/types/namespace_type.rb index b90226bd733..8a50dae4a3e 100644 --- a/app/graphql/types/namespace_type.rb +++ b/app/graphql/types/namespace_type.rb @@ -61,7 +61,7 @@ module Types Types::TimeTracking::TimelogCategoryType.connection_type, null: true, description: "Timelog categories for the namespace.", - feature_flag: :timelog_categories + _deprecated_feature_flag: :timelog_categories markdown_field :description_html, null: true diff --git a/app/graphql/types/project_type.rb b/app/graphql/types/project_type.rb index 5bb8e613844..c9223577b07 100644 --- a/app/graphql/types/project_type.rb +++ b/app/graphql/types/project_type.rb @@ -442,7 +442,7 @@ module Types Types::TimeTracking::TimelogCategoryType.connection_type, null: true, description: "Timelog categories for the project.", - feature_flag: :timelog_categories + _deprecated_feature_flag: :timelog_categories def timelog_categories object.project_namespace.timelog_categories diff --git a/config/feature_flags/development/only_positive_pagination_values.yml b/config/feature_flags/development/only_positive_pagination_values.yml new file mode 100644 index 00000000000..9347c628e65 --- /dev/null +++ 
b/config/feature_flags/development/only_positive_pagination_values.yml @@ -0,0 +1,8 @@ +--- +name: only_positive_pagination_values +introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/93571 +rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/369225 +milestone: '15.3' +type: development +group: group::source code +default_enabled: false diff --git a/doc/development/api_graphql_styleguide.md b/doc/development/api_graphql_styleguide.md index 7fe49e0c8f0..3baf5c92da8 100644 --- a/doc/development/api_graphql_styleguide.md +++ b/doc/development/api_graphql_styleguide.md @@ -98,11 +98,8 @@ See the [deprecating schema items](#deprecating-schema-items) section for how to ### Breaking change exemptions -Two scenarios exist where schema items are exempt from the deprecation process, -and can be removed or changed at any time without notice. These are schema items that either: - -- Use the [`feature_flag` property](#feature_flag-property) _and_ the flag is disabled by default. -- Are [marked as alpha](#mark-schema-items-as-alpha). +Schema items [marked as alpha](#mark-schema-items-as-alpha) are exempt from the deprecation process, +and can be removed or changed at any time without notice. ## Global IDs @@ -472,92 +469,37 @@ end ## Feature flags -Developers can add [feature flags](../development/feature_flags/index.md) to GraphQL -fields in the following ways: +You can implement [feature flags](../development/feature_flags/index.md) in GraphQL to toggle: -- Add the [`feature_flag` property](#feature_flag-property) to a field. This allows the field to be _hidden_ - from the GraphQL schema when the flag is disabled. -- [Toggle the return value](#toggle-the-value-of-a-field) when resolving the field. +- The return value of a field. +- The behavior of an argument or mutation. -You can refer to these guidelines to decide which approach to use: - -- If your field is experimental, and its name or type is subject to - change, use the [`feature_flag` property](#feature_flag-property). -- If your field is stable and its definition doesn't change, even after the flag is - removed, [toggle the return value](#toggle-the-value-of-a-field) of the field instead. Note that - [all fields should be nullable](#nullable-fields) anyway. -- If your field will be accessed from frontend using the `@include` or `@skip` directive, [do not use the `feature_flag` property](#frontend-and-backend-feature-flag-strategies). - -### `feature_flag` property - -The `feature_flag` property allows you to toggle the field's -[visibility](https://graphql-ruby.org/authorization/visibility.html) -in the GraphQL schema. This removes the field from the schema -when the flag is disabled. - -A description is [appended](https://gitlab.com/gitlab-org/gitlab/-/blob/497b556/app/graphql/types/base_field.rb#L44-53) -to the field indicating that it is behind a feature flag. - -WARNING: -If a client queries for the field when the feature flag is disabled, the query -fails. Consider this when toggling the visibility of the feature on or off on -production. - -The `feature_flag` property does not allow the use of -[feature gates based on actors](../development/feature_flags/index.md). -This means that the feature flag cannot be toggled only for particular -projects, groups, or users, but instead can only be toggled globally for -everyone. 
- -Example: - -```ruby -field :test_field, type: GraphQL::Types::String, - null: true, - description: 'Some test field.', - feature_flag: :my_feature_flag -``` - -### Frontend and Backend feature flag strategies - -#### Directives - -When feature flags are used in the frontend to control the `@include` and `@skip` directives, do not use the `feature_flag` -property on the server-side. For the accepted backend workaround, see [Toggle the value of a field](#toggle-the-value-of-a-field). It is recommended that the feature flag used in this approach be the same for frontend and backend. - -Even if the frontend directives evaluate to `@include:false` / `@skip:true`, the guarded entity is sent to the backend and matched -against the GraphQL schema. We would then get an exception due to a schema mismatch. See the [frontend GraphQL guide](../development/fe_guide/graphql.md#the-include-directive) for more guidance. - -#### Different versions of a query - -See the guide frontend GraphQL guide for [different versions of a query](../development/fe_guide/graphql.md#different-versions-of-a-query), and [why it is not the preferred approach](../development/fe_guide/graphql.md#avoiding-multiple-query-versions) - -### Toggle the value of a field - -This method of using feature flags for fields is to toggle the -return value of the field. This can be done in the resolver, in the +This can be done in a resolver, in the type, or even in a model method, depending on your preference and situation. -Consider also [marking the field as Alpha](#mark-schema-items-as-alpha) -while the value of the field can be toggled. You can -[change or remove Alpha fields at any time](#breaking-change-exemptions) without needing to deprecate them. -This also signals to consumers of the public GraphQL API that the field is not +NOTE: +It's recommended that you also [mark the item as Alpha](#mark-schema-items-as-alpha) while it is behind a feature flag. +This signals to consumers of the public GraphQL API that the field is not meant to be used yet. +You can also +[change or remove Alpha items at any time](#breaking-change-exemptions) without needing to deprecate them. When the flag is removed, "release" +the schema item by removing its Alpha property to make it public. -When applying a feature flag to toggle the value of a field, the -`description` of the field must: +### Descriptions for feature flagged items -- State that the value of the field can be toggled by a feature flag. +When using a feature flag to toggle the value or behavior of a schema item, the +`description` of the item must: + +- State that the value or behavior can be toggled by a feature flag. - Name the feature flag. -- State what the field returns when the feature flag is disabled (or +- State what the field returns, or behavior is, when the feature flag is disabled (or enabled, if more appropriate). -Example: +Example of a feature-flagged field: ```ruby -field :foo, GraphQL::Types::String, - null: true, +field :foo, GraphQL::Types::String, null: true, alpha: { milestone: '10.0' }, description: 'Some test field. Returns `null`' \ 'if `my_feature_flag` feature flag is disabled.' @@ -567,6 +509,26 @@ def foo end ``` +Example of a feature-flagged argument: + +```ruby +argument :foo, type: GraphQL::Types::String, required: false, + alpha: { milestone: '10.0' }, + description: 'Some test argument. Is ignored if ' \ + '`my_feature_flag` feature flag is disabled.' + +def resolve(args) + args.delete(:foo) unless Feature.enabled?(:my_feature_flag, object) + # ... 
+end +``` + +### `feature_flag` property (deprecated) + +NOTE: +This property is deprecated and should no longer be used. The property +has been temporarily renamed to `_deprecated_feature_flag` and support for it will be removed in [#369202](https://gitlab.com/gitlab-org/gitlab/-/issues/369202). + ## Deprecating schema items The GitLab GraphQL API is versionless, which means we maintain backwards diff --git a/doc/development/database/add_foreign_key_to_existing_column.md b/doc/development/database/add_foreign_key_to_existing_column.md index 7a18da2223f..1827996b731 100644 --- a/doc/development/database/add_foreign_key_to_existing_column.md +++ b/doc/development/database/add_foreign_key_to_existing_column.md @@ -6,7 +6,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w # Add a foreign key constraint to an existing column -Foreign keys ensure consistency between related database tables. The current database review process **always** encourages you to add [foreign keys](../foreign_keys.md) when creating tables that reference records from other tables. +Foreign keys ensure consistency between related database tables. The current database review process **always** encourages you to add [foreign keys](foreign_keys.md) when creating tables that reference records from other tables. Starting with Rails version 4, Rails includes migration helpers to add foreign key constraints to database tables. Before Rails 4, the only way for ensuring some level of consistency was the diff --git a/doc/development/database/filtering_by_label.md b/doc/development/database/filtering_by_label.md new file mode 100644 index 00000000000..29b0c75298e --- /dev/null +++ b/doc/development/database/filtering_by_label.md @@ -0,0 +1,179 @@ +--- +stage: Plan +group: Project Management +info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +--- +# Filtering by label + +## Introduction + +GitLab has [labels](../../user/project/labels.md) that can be assigned to issues, +merge requests, and epics. Labels on those objects are a many-to-many relation +through the polymorphic `label_links` table. + +To filter these objects by multiple labels - for instance, 'all open +issues with the label ~Plan and the label ~backend' - we generate a +query containing a `GROUP BY` clause. In a simple form, this looks like: + +```sql +SELECT + issues.* +FROM + issues + INNER JOIN label_links ON label_links.target_id = issues.id + AND label_links.target_type = 'Issue' + INNER JOIN labels ON labels.id = label_links.label_id +WHERE + issues.project_id = 13083 + AND (issues.state IN ('opened')) + AND labels.title IN ('Plan', + 'backend') +GROUP BY + issues.id +HAVING (COUNT(DISTINCT labels.title) = 2) +ORDER BY + issues.updated_at DESC, + issues.id DESC +LIMIT 20 OFFSET 0 +``` + +In particular, note that: + +1. We `GROUP BY issues.id` so that we can ... +1. Use the `HAVING (COUNT(DISTINCT labels.title) = 2)` condition to ensure that + all matched issues have both labels. + +This is more complicated than is ideal. It makes the query construction more +prone to errors (such as +[issue #15557](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/15557)). 
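As a rough illustration of how this query gets composed at the application level, an ActiveRecord sketch of the statement above might look like the following (the model names and helper are assumptions mirroring the SQL; the real GitLab finders are more involved):

```ruby
# Illustrative only: build the GROUP BY / HAVING form of the label filter.
# Assumes conventional Issue, LabelLink, and Label models as implied by the SQL above.
def issues_with_all_labels(project_id, label_titles)
  Issue
    .joins("INNER JOIN label_links ON label_links.target_id = issues.id " \
           "AND label_links.target_type = 'Issue'")
    .joins("INNER JOIN labels ON labels.id = label_links.label_id")
    .where(project_id: project_id, state: 'opened')
    .where(labels: { title: label_titles })
    .group('issues.id')
    .having('COUNT(DISTINCT labels.title) = ?', label_titles.size)
    .order(updated_at: :desc, id: :desc)
end
```

Every extra filter has to be threaded through the `GROUP BY`/`HAVING` structure, which is where the construction errors tend to creep in.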
+ +## Attempt A: `WHERE EXISTS` + +### Attempt A1: use multiple subqueries with `WHERE EXISTS` + +In [issue #37137](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/37137) +and its associated [merge request](https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/14022), +we tried to replace the `GROUP BY` with multiple uses of `WHERE EXISTS`. For the +example above, this would give: + +```sql +WHERE (EXISTS ( + SELECT + TRUE + FROM + label_links + INNER JOIN labels ON labels.id = label_links.label_id + WHERE + labels.title = 'Plan' + AND target_type = 'Issue' + AND target_id = issues.id)) +AND (EXISTS ( + SELECT + TRUE + FROM + label_links + INNER JOIN labels ON labels.id = label_links.label_id + WHERE + labels.title = 'backend' + AND target_type = 'Issue' + AND target_id = issues.id)) +``` + +While this worked without schema changes, and did improve readability somewhat, +it did not improve query performance. + +### Attempt A2: use label IDs in the `WHERE EXISTS` clause + +In [merge request #34503](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/34503), we followed a similar approach to A1. But this time, we +did a separate query to fetch the IDs of the labels used in the filter so that we avoid the `JOIN` in the `EXISTS` clause and filter directly by +`label_links.label_id`. We also added a new index on `label_links` for the `target_id`, `label_id`, and `target_type` columns to speed up this query. + +Finding the label IDs wasn't straightforward because there could be multiple labels with the same title within a single root namespace. We solved +this by grouping the label IDs by title and then using the array of IDs in the `EXISTS` clauses. + +This resulted in a significant performance improvement. However, this optimization could not be applied to the dashboard pages +where we do not have a project or group context. We could not easily search for the label IDs here because that would mean searching across all +projects and groups that the user has access to. + +## Attempt B: Denormalize using an array column + +Having [removed MySQL support in GitLab 12.1](https://about.gitlab.com/blog/2019/06/27/removing-mysql-support/), +using [PostgreSQL's arrays](https://www.postgresql.org/docs/11/arrays.html) became more +tractable as we didn't have to support two databases. We discussed denormalizing +the `label_links` table for querying in +[issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651), +with two options: label IDs and titles. + +We can think of both of those as array columns on `issues`, `merge_requests`, +and `epics`: `issues.label_ids` would be an array column of label IDs, and +`issues.label_titles` would be an array of label titles. + +These array columns can be complemented with +[GIN indexes](https://www.postgresql.org/docs/11/gin-intro.html) to improve +matching. + +### Attempt B1: store label IDs for each object + +This has some strong advantages over titles: + +1. Unless a label is deleted, or a project is moved, we never need to + bulk-update the denormalized column. +1. It uses less storage than the titles. + +Unfortunately, our application design makes this hard. If we were able to query +just by label ID easily, we wouldn't need the `INNER JOIN labels` in the initial +query at the start of this document. GitLab allows users to filter by label +title across projects and even across groups, so a filter by the label ~Plan may +include labels with multiple distinct IDs. 
+ +We do not want users to have to know about the different IDs, which means that +given this data set: + +| Project | ~Plan label ID | ~backend label ID | +| ------- | -------------- | ----------------- | +| A | 11 | 12 | +| B | 21 | 22 | +| C | 31 | 32 | + +We would need something like: + +```sql +WHERE + label_ids @> ARRAY[11, 12] + OR label_ids @> ARRAY[21, 22] + OR label_ids @> ARRAY[31, 32] +``` + +This can get even more complicated when we consider that in some cases, there +might be two ~backend labels - with different IDs - that could apply to the same +object, so the number of combinations would balloon further. + +### Attempt B2: store label titles for each object + +From the perspective of updating the object, this is the worst +option. We have to bulk update the objects when: + +1. The objects are moved from one project to another. +1. The project is moved from one group to another. +1. The label is renamed. +1. The label is deleted. + +It also uses much more storage. Querying is simple, though: + +```sql +WHERE + label_titles @> ARRAY['Plan', 'backend'] +``` + +And our [tests in issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651#note_188777346) +showed that this could be fast. + +However, at present, the disadvantages outweigh the advantages. + +## Conclusion + +We found a method A2 that does not need denormalization and improves the query performance significantly. This +did not apply to all cases, but we were able to apply method A1 to the rest of the cases so that we remove the +`GROUP BY` and `HAVING` clauses in all scenarios. + +This simplified the query and improved the performance in the most common cases. diff --git a/doc/development/database/foreign_keys.md b/doc/development/database/foreign_keys.md new file mode 100644 index 00000000000..7834e7d53c3 --- /dev/null +++ b/doc/development/database/foreign_keys.md @@ -0,0 +1,199 @@ +--- +stage: Data Stores +group: Database +info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +--- + +# Foreign Keys & Associations + +When adding an association to a model you must also add a foreign key. For +example, say you have the following model: + +```ruby +class User < ActiveRecord::Base + has_many :posts +end +``` + +Add a foreign key here on column `posts.user_id`. This ensures +that data consistency is enforced on database level. Foreign keys also mean that +the database can very quickly remove associated data (for example, when removing a +user), instead of Rails having to do this. + +## Adding Foreign Keys In Migrations + +Foreign keys can be added concurrently using `add_concurrent_foreign_key` as +defined in `Gitlab::Database::MigrationHelpers`. See the +[Migration Style Guide](../migration_style_guide.md) for more information. + +Keep in mind that you can only safely add foreign keys to existing tables after +you have removed any orphaned rows. The method `add_concurrent_foreign_key` +does not take care of this so you must do so manually. See +[adding foreign key constraint to an existing column](add_foreign_key_to_existing_column.md). + +## Updating Foreign Keys In Migrations + +Sometimes a foreign key constraint must be changed, preserving the column +but updating the constraint condition. For example, moving from +`ON DELETE CASCADE` to `ON DELETE SET NULL` or vice-versa. + +PostgreSQL does not prevent you from adding overlapping foreign keys. 
It +honors the most recently added constraint. This allows us to replace foreign keys without +ever losing foreign key protection on a column. + +To replace a foreign key: + +1. [Add the new foreign key without validation](add_foreign_key_to_existing_column.md#prevent-invalid-records) + + The name of the foreign key constraint must be changed to add a new + foreign key before removing the old one. + + ```ruby + class ReplaceFkOnPackagesPackagesProjectId < Gitlab::Database::Migration[2.0] + disable_ddl_transaction! + + NEW_CONSTRAINT_NAME = 'fk_new' + + def up + add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :nullify, validate: false, name: NEW_CONSTRAINT_NAME) + end + + def down + with_lock_retries do + remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :nullify, name: NEW_CONSTRAINT_NAME) + end + end + end + ``` + +1. [Validate the new foreign key](add_foreign_key_to_existing_column.md#validate-the-foreign-key) + + ```ruby + class ValidateFkNew < Gitlab::Database::Migration[2.0] + NEW_CONSTRAINT_NAME = 'fk_new' + + # foreign key added in + def up + validate_foreign_key(:packages_packages, name: NEW_CONSTRAINT_NAME) + end + + def down + # no-op + end + end + ``` + +1. Remove the old foreign key: + + ```ruby + class RemoveFkOld < Gitlab::Database::Migration[2.0] + OLD_CONSTRAINT_NAME = 'fk_old' + + # new foreign key added in + # and validated in + def up + remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :cascade, name: OLD_CONSTRAINT_NAME) + end + + def down + # Validation is skipped here, so if rolled back, this will need to be revalidated in a separate migration + add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :cascade, validate: false, name: OLD_CONSTRAINT_NAME) + end + end + ``` + +## Cascading Deletes + +Every foreign key must define an `ON DELETE` clause, and in 99% of the cases +this should be set to `CASCADE`. + +## Indexes + +When adding a foreign key in PostgreSQL the column is not indexed automatically, +thus you must also add a concurrent index. Not doing so results in cascading +deletes being very slow. + +## Naming foreign keys + +By default Ruby on Rails uses the `_id` suffix for foreign keys. So we should +only use this suffix for associations between two tables. If you want to +reference an ID on a third party platform the `_xid` suffix is recommended. + +The spec `spec/db/schema_spec.rb` tests if all columns with the `_id` suffix +have a foreign key constraint. So if that spec fails, don't add the column to +`IGNORED_FK_COLUMNS`, but instead add the FK constraint, or consider naming it +differently. + +## Dependent Removals + +Don't define options such as `dependent: :destroy` or `dependent: :delete` when +defining an association. Defining these options means Rails handles the +removal of data, instead of letting the database handle this in the most +efficient way possible. + +In other words, this is bad and should be avoided at all costs: + +```ruby +class User < ActiveRecord::Base + has_many :posts, dependent: :destroy +end +``` + +Should you truly have a need for this it should be approved by a database +specialist first. + +You should also not define any `before_destroy` or `after_destroy` callbacks on +your models _unless_ absolutely required and only when approved by database +specialists. For example, if each row in a table has a corresponding file on a +file system it may be tempting to add a `after_destroy` hook. 
This however +introduces non database logic to a model, and means we can no longer rely on +foreign keys to remove the data as this would result in the file system data +being left behind. In such a case you should use a service class instead that +takes care of removing non database data. + +In cases where the relation spans multiple databases you have even +further problems using `dependent: :destroy` or the above hooks. You can +read more about alternatives at +[Avoid `dependent: :nullify` and `dependent: :destroy` across databases](multiple_databases.md#avoid-dependent-nullify-and-dependent-destroy-across-databases). + +## Alternative primary keys with `has_one` associations + +Sometimes a `has_one` association is used to create a one-to-one relationship: + +```ruby +class User < ActiveRecord::Base + has_one :user_config +end + +class UserConfig < ActiveRecord::Base + belongs_to :user +end +``` + +In these cases, there may be an opportunity to remove the unnecessary `id` +column on the associated table, `user_config.id` in this example. Instead, +the originating table ID can be used as the primary key for the associated +table: + +```ruby +create_table :user_configs, id: false do |t| + t.references :users, primary_key: true, default: nil, index: false, foreign_key: { on_delete: :cascade } + ... +end +``` + +Setting `default: nil` ensures a primary key sequence is not created, and since the primary key +automatically gets an index, we set `index: false` to avoid creating a duplicate. +You also need to add the new primary key to the model: + +```ruby +class UserConfig < ActiveRecord::Base + self.primary_key = :user_id + + belongs_to :user +end +``` + +Using a foreign key as primary key saves space but can make +[batch counting](../service_ping/implement.md#batch-counters) in [Service Ping](../service_ping/index.md) less efficient. +Consider using a regular `id` column if the table is relevant for Service Ping. 
diff --git a/doc/development/database/index.md b/doc/development/database/index.md index 553c9811ee9..ac6b75046fb 100644 --- a/doc/development/database/index.md +++ b/doc/development/database/index.md @@ -42,7 +42,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w ## Best practices - [Adding database indexes](adding_database_indexes.md) -- [Foreign keys & associations](../foreign_keys.md) +- [Foreign keys & associations](foreign_keys.md) - [Adding a foreign key constraint to an existing column](add_foreign_key_to_existing_column.md) - [`NOT NULL` constraints](not_null_constraints.md) - [Strings and the Text data type](strings_and_the_text_data_type.md) @@ -52,7 +52,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w - [Hash indexes](../hash_indexes.md) - [Storing SHA1 hashes as binary](../sha1_as_binary.md) - [Iterating tables in batches](../iterating_tables_in_batches.md) -- [Insert into tables in batches](../insert_into_tables_in_batches.md) +- [Insert into tables in batches](insert_into_tables_in_batches.md) - [Ordering table columns](ordering_table_columns.md) - [Verifying database capabilities](../verifying_database_capabilities.md) - [Database Debugging and Troubleshooting](../database_debugging.md) @@ -69,8 +69,8 @@ info: To determine the technical writer assigned to the Stage/Group associated w ## Case studies -- [Database case study: Filtering by label](../filtering_by_label.md) -- [Database case study: Namespaces storage statistics](../namespaces_storage_statistics.md) +- [Database case study: Filtering by label](filtering_by_label.md) +- [Database case study: Namespaces storage statistics](namespaces_storage_statistics.md) ## Miscellaneous diff --git a/doc/development/database/insert_into_tables_in_batches.md b/doc/development/database/insert_into_tables_in_batches.md new file mode 100644 index 00000000000..ebed3d16319 --- /dev/null +++ b/doc/development/database/insert_into_tables_in_batches.md @@ -0,0 +1,196 @@ +--- +stage: Data Stores +group: Database +info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +description: "Sometimes it is necessary to store large amounts of records at once, which can be inefficient +when iterating collections and performing individual `save`s. With the arrival of `insert_all` +in Rails 6, which operates at the row level (that is, using `Hash`es), GitLab has added a set +of APIs that make it safe and simple to insert ActiveRecord objects in bulk." +--- + +# Insert into tables in batches + +Sometimes it is necessary to store large amounts of records at once, which can be inefficient +when iterating collections and saving each record individually. With the arrival of +[`insert_all`](https://apidock.com/rails/ActiveRecord/Persistence/ClassMethods/insert_all) +in Rails 6, which operates at the row level (that is, using `Hash` objects), GitLab has added a set +of APIs that make it safe and simple to insert `ActiveRecord` objects in bulk. + +## Prepare `ApplicationRecord`s for bulk insertion + +In order for a model class to take advantage of the bulk insertion API, it has to include the +`BulkInsertSafe` concern first: + +```ruby +class MyModel < ApplicationRecord + # other includes here + # ... + include BulkInsertSafe # include this last + + # ... 
+end +``` + +The `BulkInsertSafe` concern has two functions: + +- It performs checks against your model class to ensure that it does not use ActiveRecord + APIs that are not safe to use with respect to bulk insertions (more on that below). +- It adds new class methods `bulk_insert!` and `bulk_upsert!`, which you can use to insert many records at once. + +## Insert records with `bulk_insert!` and `bulk_upsert!` + +If the target class passes the checks performed by `BulkInsertSafe`, you can insert an array of +ActiveRecord model objects as follows: + +```ruby +records = [MyModel.new, ...] + +MyModel.bulk_insert!(records) +``` + +Calls to `bulk_insert!` always attempt to insert _new records_. If instead +you would like to replace existing records with new values, while still inserting those +that do not already exist, then you can use `bulk_upsert!`: + +```ruby +records = [MyModel.new, existing_model, ...] + +MyModel.bulk_upsert!(records, unique_by: [:name]) +``` + +In this example, `unique_by` specifies the columns by which records are considered to be +unique and as such are updated if they existed prior to insertion. For example, if +`existing_model` has a `name` attribute, and if a record with the same `name` value already +exists, its fields are updated with those of `existing_model`. + +The `unique_by` parameter can also be passed as a `Symbol`, in which case it specifies +a database index by which a column is considered unique: + +```ruby +MyModel.bulk_insert!(records, unique_by: :index_on_name) +``` + +### Record validation + +The `bulk_insert!` method guarantees that `records` are inserted transactionally, and +runs validations on each record prior to insertion. If any record fails to validate, +an error is raised and the transaction is rolled back. You can turn off validations via +the `:validate` option: + +```ruby +MyModel.bulk_insert!(records, validate: false) +``` + +### Batch size configuration + +In those cases where the number of `records` is above a given threshold, insertions +occur in multiple batches. The default batch size is defined in +[`BulkInsertSafe::DEFAULT_BATCH_SIZE`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb). +Assuming a default threshold of 500, inserting 950 records +would result in two batches being written sequentially (of size 500 and 450 respectively.) +You can override the default batch size via the `:batch_size` option: + +```ruby +MyModel.bulk_insert!(records, batch_size: 100) +``` + +Assuming the same number of 950 records, this would result in 10 batches being written instead. +Since this also affects the number of `INSERT` statements that occur, make sure you measure the +performance impact this might have on your code. There is a trade-off between the number of +`INSERT` statements the database has to process and the size and cost of each `INSERT`. + +### Handling duplicate records + +NOTE: +This parameter applies only to `bulk_insert!`. If you intend to update existing +records, use `bulk_upsert!` instead. + +It may happen that some records you are trying to insert already exist, which would result in +primary key conflicts. There are two ways to address this problem: failing fast by raising an +error or skipping duplicate records. The default behavior of `bulk_insert!` is to fail fast +and raise an `ActiveRecord::RecordNotUnique` error. 
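For instance, a caller that keeps the default fail-fast behavior might rescue the conflict explicitly (illustrative sketch):

```ruby
begin
  MyModel.bulk_insert!(records)
rescue ActiveRecord::RecordNotUnique
  # At least one record collided with an existing row; decide whether to
  # retry without the duplicates or surface the error to the caller.
end
```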
+ +If this is undesirable, you can instead skip duplicate records with the `skip_duplicates` flag: + +```ruby +MyModel.bulk_insert!(records, skip_duplicates: true) +``` + +### Requirements for safe bulk insertions + +Large parts of ActiveRecord's persistence API are built around the notion of callbacks. Many +of these callbacks fire in response to model life cycle events such as `save` or `create`. +These callbacks cannot be used with bulk insertions, since they are meant to be called for +every instance that is saved or created. Since these events do not fire when +records are inserted in bulk, we currently prevent their use. + +The specifics around which callbacks are explicitly allowed are defined in +[`BulkInsertSafe`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb). +Consult the module source code for details. If your class uses callbacks that are not explicitly designated +safe and you `include BulkInsertSafe` the application fails with an error. + +### `BulkInsertSafe` versus `InsertAll` + +Internally, `BulkInsertSafe` is based on `InsertAll`, and you may wonder when to choose +the former over the latter. To help you make the decision, +the key differences between these classes are listed in the table below. + +| | Input type | Validates input | Specify batch size | Can bypass callbacks | Transactional | +|--------------- | -------------------- | --------------- | ------------------ | --------------------------------- | ------------- | +| `bulk_insert!` | ActiveRecord objects | Yes (optional) | Yes (optional) | No (prevents unsafe callback use) | Yes | +| `insert_all!` | Attribute hashes | No | No | Yes | Yes | + +To summarize, `BulkInsertSafe` moves bulk inserts closer to how ActiveRecord objects +and inserts would normally behave. However, if all you need is to insert raw data in bulk, then +`insert_all` is more efficient. + +## Insert `has_many` associations in bulk + +A common use case is to save collections of associated relations through the owner side of the relation, +where the owned relation is associated to the owner through the `has_many` class method: + +```ruby +owner = OwnerModel.new(owned_relations: array_of_owned_relations) +# saves all `owned_relations` one-by-one +owner.save! +``` + +This issues a single `INSERT`, and transaction, for every record in `owned_relations`, which is inefficient if +`array_of_owned_relations` is large. To remedy this, the `BulkInsertableAssociations` concern can be +used to declare that the owner defines associations that are safe for bulk insertion: + +```ruby +class OwnerModel < ApplicationRecord + # other includes here + # ... + include BulkInsertableAssociations # include this last + + has_many :my_models +end +``` + +Here `my_models` must be declared `BulkInsertSafe` (as described previously) for bulk insertions +to happen. You can now insert any yet unsaved records as follows: + +```ruby +BulkInsertableAssociations.with_bulk_insert do + owner = OwnerModel.new(my_models: array_of_my_model_instances) + # saves `my_models` using a single bulk insert (possibly via multiple batches) + owner.save! +end +``` + +You can still save relations that are not `BulkInsertSafe` in this block; they +simply are treated as if you had invoked `save` from outside the block. + +## Known limitations + +There are a few restrictions to how these APIs can be used: + +- `BulkInsertableAssociations`: + - It is currently only compatible with `has_many` relations. 
+ - It does not yet support `has_many through: ...` relations. + +Moreover, input data should either be limited to around 1000 records at most, +or already batched prior to calling bulk insert. The `INSERT` statement runs in a single +transaction, so for large amounts of records it may negatively affect database stability. diff --git a/doc/development/database/loose_foreign_keys.md b/doc/development/database/loose_foreign_keys.md index a390576c4e5..e28237ca5fc 100644 --- a/doc/development/database/loose_foreign_keys.md +++ b/doc/development/database/loose_foreign_keys.md @@ -10,7 +10,7 @@ info: To determine the technical writer assigned to the Stage/Group associated w In relational databases (including PostgreSQL), foreign keys provide a way to link two database tables together, and ensure data-consistency between them. In GitLab, -[foreign keys](../foreign_keys.md) are vital part of the database design process. +[foreign keys](foreign_keys.md) are vital part of the database design process. Most of our database tables have foreign keys. With the ongoing database [decomposition work](https://gitlab.com/groups/gitlab-org/-/epics/6168), diff --git a/doc/development/database/namespaces_storage_statistics.md b/doc/development/database/namespaces_storage_statistics.md new file mode 100644 index 00000000000..702129b9ddb --- /dev/null +++ b/doc/development/database/namespaces_storage_statistics.md @@ -0,0 +1,193 @@ +--- +stage: none +group: unassigned +info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +--- + +# Database case study: Namespaces storage statistics + +## Introduction + +On [Storage and limits management for groups](https://gitlab.com/groups/gitlab-org/-/epics/886), +we want to facilitate a method for easily viewing the amount of +storage consumed by a group, and allow easy management. + +## Proposal + +1. Create a new ActiveRecord model to hold the namespaces' statistics in an aggregated form (only for root [namespaces](../../user/namespace/index.md)). +1. Refresh the statistics in this model every time a project belonging to this namespace is changed. + +## Problem + +In GitLab, we update the project storage statistics through a +[callback](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/project.rb#L97) +every time the project is saved. + +The summary of those statistics per namespace is then retrieved +by [`Namespaces#with_statistics`](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/namespace.rb#L70) scope. Analyzing this query we noticed that: + +- It takes up to `1.2` seconds for namespaces with over `15k` projects. +- It can't be analyzed with [ChatOps](../chatops_on_gitlabcom.md), as it times out. + +Additionally, the pattern that is currently used to update the project statistics +(the callback) doesn't scale adequately. It is currently one of the largest +[database queries transactions on production](https://gitlab.com/gitlab-org/gitlab/-/issues/29070) +that takes the most time overall. We can't add one more query to it as +it increases the transaction's length. + +Because of all of the above, we can't apply the same pattern to store +and update the namespaces statistics, as the `namespaces` table is one +of the largest tables on GitLab.com. Therefore we needed to find a performant and +alternative method. 
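To make the starting point concrete, the per-namespace aggregation we need is roughly the following (a simplified, illustrative sketch; the real `Namespaces#with_statistics` scope linked above selects more columns and differs in detail):

```ruby
# Simplified sketch of the aggregation: sum each namespace's project
# statistics in a single joined, grouped query.
scope :with_statistics, -> {
  joins(projects: :statistics)
    .group('namespaces.id')
    .select('namespaces.*',
            'COALESCE(SUM(project_statistics.storage_size), 0) AS storage_size',
            'COALESCE(SUM(project_statistics.repository_size), 0) AS repository_size')
}
```

For a root namespace with tens of thousands of projects, this join-and-group shape is what makes the query slow, which motivated the attempts below.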
+ +## Attempts + +### Attempt A: PostgreSQL materialized view + +Model can be updated through a refresh strategy based on a project routes SQL and a [materialized view](https://www.postgresql.org/docs/11/rules-materializedviews.html): + +```sql +SELECT split_part("rs".path, '/', 1) as root_path, + COALESCE(SUM(ps.storage_size), 0) AS storage_size, + COALESCE(SUM(ps.repository_size), 0) AS repository_size, + COALESCE(SUM(ps.wiki_size), 0) AS wiki_size, + COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size, + COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size, + COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size, + COALESCE(SUM(ps.packages_size), 0) AS packages_size, + COALESCE(SUM(ps.snippets_size), 0) AS snippets_size, + COALESCE(SUM(ps.uploads_size), 0) AS uploads_size +FROM "projects" + INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project' + INNER JOIN project_statistics ps ON ps.project_id = projects.id +GROUP BY root_path +``` + +We could then execute the query with: + +```sql +REFRESH MATERIALIZED VIEW root_namespace_storage_statistics; +``` + +While this implied a single query update (and probably a fast one), it has some downsides: + +- Materialized views syntax varies from PostgreSQL and MySQL. While this feature was worked on, MySQL was still supported by GitLab. +- Rails does not have native support for materialized views. We'd need to use a specialized gem to take care of the management of the database views, which implies additional work. + +### Attempt B: An update through a CTE + +Similar to Attempt A: Model update done through a refresh strategy with a [Common Table Expression](https://www.postgresql.org/docs/9.1/queries-with.html) + +```sql +WITH refresh AS ( + SELECT split_part("rs".path, '/', 1) as root_path, + COALESCE(SUM(ps.storage_size), 0) AS storage_size, + COALESCE(SUM(ps.repository_size), 0) AS repository_size, + COALESCE(SUM(ps.wiki_size), 0) AS wiki_size, + COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size, + COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size, + COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size, + COALESCE(SUM(ps.packages_size), 0) AS packages_size, + COALESCE(SUM(ps.snippets_size), 0) AS snippets_size, + COALESCE(SUM(ps.uploads_size), 0) AS uploads_size + FROM "projects" + INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project' + INNER JOIN project_statistics ps ON ps.project_id = projects.id + GROUP BY root_path) +UPDATE namespace_storage_statistics +SET storage_size = refresh.storage_size, + repository_size = refresh.repository_size, + wiki_size = refresh.wiki_size, + lfs_objects_size = refresh.lfs_objects_size, + build_artifacts_size = refresh.build_artifacts_size, + pipeline_artifacts_size = refresh.pipeline_artifacts_size, + packages_size = refresh.packages_size, + snippets_size = refresh.snippets_size, + uploads_size = refresh.uploads_size +FROM refresh + INNER JOIN routes rs ON rs.path = refresh.root_path AND rs.source_type = 'Namespace' +WHERE namespace_storage_statistics.namespace_id = rs.source_id +``` + +Same benefits and downsides as attempt A. + +### Attempt C: Get rid of the model and store the statistics on Redis + +We could get rid of the model that stores the statistics in aggregated form and instead use a Redis Set. 
+This would be the [boring solution](https://about.gitlab.com/handbook/values/#boring-solutions) and the fastest one +to implement, as GitLab already includes Redis as part of its [Architecture](../architecture.md#redis). + +The downside of this approach is that Redis does not provide the same persistence/consistency guarantees as PostgreSQL, +and this is information we can't afford to lose in a Redis failure. + +### Attempt D: Tag the root namespace and its child namespaces + +Directly relate the root namespace to its child namespaces, so +whenever a namespace is created without a parent, this one is tagged +with the root namespace ID: + +| ID | root ID | parent ID | +|:---|:--------|:----------| +| 1 | 1 | NULL | +| 2 | 1 | 1 | +| 3 | 1 | 2 | + +To aggregate the statistics inside a namespace we'd execute something like: + +```sql +SELECT COUNT(...) +FROM projects +WHERE namespace_id IN ( + SELECT id + FROM namespaces + WHERE root_id = X +) +``` + +Even though this approach would make aggregating much easier, it has some major downsides: + +- We'd have to migrate **all namespaces** by adding and filling a new column. Because of the size of the table, dealing with time/cost would be significant. The background migration would take approximately `153h`, see . +- Background migration has to be shipped one release before, delaying the functionality by another milestone. + +### Attempt E (final): Update the namespace storage statistics asynchronously + +This approach consists of continuing to use the incremental statistics updates we already have, +but we refresh them through Sidekiq jobs and in different transactions: + +1. Create a second table (`namespace_aggregation_schedules`) with two columns `id` and `namespace_id`. +1. Whenever the statistics of a project changes, insert a row into `namespace_aggregation_schedules` + - We don't insert a new row if there's already one related to the root namespace. + - Keeping in mind the length of the transaction that involves updating `project_statistics`(), the insertion should be done in a different transaction and through a Sidekiq Job. +1. After inserting the row, we schedule another worker to be executed asynchronously at two different moments: + - One enqueued for immediate execution and another one scheduled in `1.5h` hours. + - We only schedule the jobs, if we can obtain a `1.5h` lease on Redis on a key based on the root namespace ID. + - If we can't obtain the lease, it indicates there's another aggregation already in progress, or scheduled in no more than `1.5h`. +1. This worker will: + - Update the root namespace storage statistics by querying all the namespaces through a service. + - Delete the related `namespace_aggregation_schedules` after the update. +1. Another Sidekiq job is also included to traverse any remaining rows on the `namespace_aggregation_schedules` table and schedule jobs for every pending row. + - This job is scheduled with cron to run every night (UTC). + +This implementation has the following benefits: + +- All the updates are done asynchronously, so we're not increasing the length of the transactions for `project_statistics`. +- We're doing the update in a single SQL query. +- It is compatible with PostgreSQL and MySQL. +- No background migration required. + +The only downside of this approach is that namespaces' statistics are updated up to `1.5` hours after the change is done, +which means there's a time window in which the statistics are inaccurate. 
Because we're still not +[enforcing storage limits](https://gitlab.com/gitlab-org/gitlab/-/issues/17664), this is not a major problem. + +## Conclusion + +Updating the storage statistics asynchronously, was the less problematic and +performant approach of aggregating the root namespaces. + +All the details regarding this use case can be found on: + +- +- Merge Request with the implementation: + +Performance of the namespace storage statistics were measured in staging and production (GitLab.com). All results were posted +on : No problem has been reported so far. diff --git a/doc/development/fe_guide/graphql.md b/doc/development/fe_guide/graphql.md index db267a6d5b3..500e9dfe62a 100644 --- a/doc/development/fe_guide/graphql.md +++ b/doc/development/fe_guide/graphql.md @@ -597,7 +597,7 @@ export default { Note that, even if the directive evaluates to `false`, the guarded entity is sent to the backend and matched against the GraphQL schema. So this approach requires that the feature-flagged entity exists in the schema, even if the feature flag is disabled. When the feature flag is turned off, it -is recommended that the resolver returns `null` at the very least using the same feature flag as the frontend. See the [API GraphQL guide](../api_graphql_styleguide.md#frontend-and-backend-feature-flag-strategies). +is recommended that the resolver returns `null` at the very least using the same feature flag as the frontend. See the [API GraphQL guide](../api_graphql_styleguide.md#feature-flags). ##### Different versions of a query diff --git a/doc/development/filtering_by_label.md b/doc/development/filtering_by_label.md index 9e759744a1a..675fe004c22 100644 --- a/doc/development/filtering_by_label.md +++ b/doc/development/filtering_by_label.md @@ -1,179 +1,11 @@ --- -stage: Plan -group: Project Management -info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +redirect_to: 'database/filtering_by_label.md' +remove_date: '2022-11-05' --- -# Filtering by label -## Introduction +This document was moved to [another location](database/filtering_by_label.md). -GitLab has [labels](../user/project/labels.md) that can be assigned to issues, -merge requests, and epics. Labels on those objects are a many-to-many relation -through the polymorphic `label_links` table. - -To filter these objects by multiple labels - for instance, 'all open -issues with the label ~Plan and the label ~backend' - we generate a -query containing a `GROUP BY` clause. In a simple form, this looks like: - -```sql -SELECT - issues.* -FROM - issues - INNER JOIN label_links ON label_links.target_id = issues.id - AND label_links.target_type = 'Issue' - INNER JOIN labels ON labels.id = label_links.label_id -WHERE - issues.project_id = 13083 - AND (issues.state IN ('opened')) - AND labels.title IN ('Plan', - 'backend') -GROUP BY - issues.id -HAVING (COUNT(DISTINCT labels.title) = 2) -ORDER BY - issues.updated_at DESC, - issues.id DESC -LIMIT 20 OFFSET 0 -``` - -In particular, note that: - -1. We `GROUP BY issues.id` so that we can ... -1. Use the `HAVING (COUNT(DISTINCT labels.title) = 2)` condition to ensure that - all matched issues have both labels. - -This is more complicated than is ideal. It makes the query construction more -prone to errors (such as -[issue #15557](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/15557)). 
- -## Attempt A: `WHERE EXISTS` - -### Attempt A1: use multiple subqueries with `WHERE EXISTS` - -In [issue #37137](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/37137) -and its associated [merge request](https://gitlab.com/gitlab-org/gitlab-foss/-/merge_requests/14022), -we tried to replace the `GROUP BY` with multiple uses of `WHERE EXISTS`. For the -example above, this would give: - -```sql -WHERE (EXISTS ( - SELECT - TRUE - FROM - label_links - INNER JOIN labels ON labels.id = label_links.label_id - WHERE - labels.title = 'Plan' - AND target_type = 'Issue' - AND target_id = issues.id)) -AND (EXISTS ( - SELECT - TRUE - FROM - label_links - INNER JOIN labels ON labels.id = label_links.label_id - WHERE - labels.title = 'backend' - AND target_type = 'Issue' - AND target_id = issues.id)) -``` - -While this worked without schema changes, and did improve readability somewhat, -it did not improve query performance. - -### Attempt A2: use label IDs in the `WHERE EXISTS` clause - -In [merge request #34503](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/34503), we followed a similar approach to A1. But this time, we -did a separate query to fetch the IDs of the labels used in the filter so that we avoid the `JOIN` in the `EXISTS` clause and filter directly by -`label_links.label_id`. We also added a new index on `label_links` for the `target_id`, `label_id`, and `target_type` columns to speed up this query. - -Finding the label IDs wasn't straightforward because there could be multiple labels with the same title within a single root namespace. We solved -this by grouping the label IDs by title and then using the array of IDs in the `EXISTS` clauses. - -This resulted in a significant performance improvement. However, this optimization could not be applied to the dashboard pages -where we do not have a project or group context. We could not easily search for the label IDs here because that would mean searching across all -projects and groups that the user has access to. - -## Attempt B: Denormalize using an array column - -Having [removed MySQL support in GitLab 12.1](https://about.gitlab.com/blog/2019/06/27/removing-mysql-support/), -using [PostgreSQL's arrays](https://www.postgresql.org/docs/11/arrays.html) became more -tractable as we didn't have to support two databases. We discussed denormalizing -the `label_links` table for querying in -[issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651), -with two options: label IDs and titles. - -We can think of both of those as array columns on `issues`, `merge_requests`, -and `epics`: `issues.label_ids` would be an array column of label IDs, and -`issues.label_titles` would be an array of label titles. - -These array columns can be complemented with [GIN -indexes](https://www.postgresql.org/docs/11/gin-intro.html) to improve -matching. - -### Attempt B1: store label IDs for each object - -This has some strong advantages over titles: - -1. Unless a label is deleted, or a project is moved, we never need to - bulk-update the denormalized column. -1. It uses less storage than the titles. - -Unfortunately, our application design makes this hard. If we were able to query -just by label ID easily, we wouldn't need the `INNER JOIN labels` in the initial -query at the start of this document. GitLab allows users to filter by label -title across projects and even across groups, so a filter by the label ~Plan may -include labels with multiple distinct IDs. 
- -We do not want users to have to know about the different IDs, which means that -given this data set: - -| Project | ~Plan label ID | ~backend label ID | -| ------- | -------------- | ----------------- | -| A | 11 | 12 | -| B | 21 | 22 | -| C | 31 | 32 | - -We would need something like: - -```sql -WHERE - label_ids @> ARRAY[11, 12] - OR label_ids @> ARRAY[21, 22] - OR label_ids @> ARRAY[31, 32] -``` - -This can get even more complicated when we consider that in some cases, there -might be two ~backend labels - with different IDs - that could apply to the same -object, so the number of combinations would balloon further. - -### Attempt B2: store label titles for each object - -From the perspective of updating the object, this is the worst -option. We have to bulk update the objects when: - -1. The objects are moved from one project to another. -1. The project is moved from one group to another. -1. The label is renamed. -1. The label is deleted. - -It also uses much more storage. Querying is simple, though: - -```sql -WHERE - label_titles @> ARRAY['Plan', 'backend'] -``` - -And our [tests in issue #49651](https://gitlab.com/gitlab-org/gitlab-foss/-/issues/49651#note_188777346) -showed that this could be fast. - -However, at present, the disadvantages outweigh the advantages. - -## Conclusion - -We found a method A2 that does not need denormalization and improves the query performance significantly. This -did not apply to all cases, but we were able to apply method A1 to the rest of the cases so that we remove the -`GROUP BY` and `HAVING` clauses in all scenarios. - -This simplified the query and improved the performance in the most common cases. + + + + diff --git a/doc/development/foreign_keys.md b/doc/development/foreign_keys.md index e0dd0fe8e7c..cdf655bf0bf 100644 --- a/doc/development/foreign_keys.md +++ b/doc/development/foreign_keys.md @@ -1,200 +1,11 @@ --- -stage: Data Stores -group: Database -info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +redirect_to: 'database/foreign_keys.md' +remove_date: '2022-11-05' --- -# Foreign Keys & Associations +This document was moved to [another location](database/foreign_keys.md). -When adding an association to a model you must also add a foreign key. For -example, say you have the following model: - -```ruby -class User < ActiveRecord::Base - has_many :posts -end -``` - -Add a foreign key here on column `posts.user_id`. This ensures -that data consistency is enforced on database level. Foreign keys also mean that -the database can very quickly remove associated data (for example, when removing a -user), instead of Rails having to do this. - -## Adding Foreign Keys In Migrations - -Foreign keys can be added concurrently using `add_concurrent_foreign_key` as -defined in `Gitlab::Database::MigrationHelpers`. See the [Migration Style -Guide](migration_style_guide.md) for more information. - -Keep in mind that you can only safely add foreign keys to existing tables after -you have removed any orphaned rows. The method `add_concurrent_foreign_key` -does not take care of this so you must do so manually. See -[adding foreign key constraint to an existing column](database/add_foreign_key_to_existing_column.md). - -## Updating Foreign Keys In Migrations - -Sometimes a foreign key constraint must be changed, preserving the column -but updating the constraint condition. 
For example, moving from -`ON DELETE CASCADE` to `ON DELETE SET NULL` or vice-versa. - -PostgreSQL does not prevent you from adding overlapping foreign keys. It -honors the most recently added constraint. This allows us to replace foreign keys without -ever losing foreign key protection on a column. - -To replace a foreign key: - -1. [Add the new foreign key without validation](database/add_foreign_key_to_existing_column.md#prevent-invalid-records) - - The name of the foreign key constraint must be changed to add a new - foreign key before removing the old one. - - ```ruby - class ReplaceFkOnPackagesPackagesProjectId < Gitlab::Database::Migration[2.0] - disable_ddl_transaction! - - NEW_CONSTRAINT_NAME = 'fk_new' - - def up - add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :nullify, validate: false, name: NEW_CONSTRAINT_NAME) - end - - def down - with_lock_retries do - remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :nullify, name: NEW_CONSTRAINT_NAME) - end - end - end - ``` - -1. [Validate the new foreign key](database/add_foreign_key_to_existing_column.md#validate-the-foreign-key) - - ```ruby - class ValidateFkNew < Gitlab::Database::Migration[2.0] - NEW_CONSTRAINT_NAME = 'fk_new' - - # foreign key added in - def up - validate_foreign_key(:packages_packages, name: NEW_CONSTRAINT_NAME) - end - - def down - # no-op - end - end - ``` - -1. Remove the old foreign key: - - ```ruby - class RemoveFkOld < Gitlab::Database::Migration[2.0] - OLD_CONSTRAINT_NAME = 'fk_old' - - # new foreign key added in - # and validated in - def up - remove_foreign_key_if_exists(:packages_packages, column: :project_id, on_delete: :cascade, name: OLD_CONSTRAINT_NAME) - end - - def down - # Validation is skipped here, so if rolled back, this will need to be revalidated in a separate migration - add_concurrent_foreign_key(:packages_packages, :projects, column: :project_id, on_delete: :cascade, validate: false, name: OLD_CONSTRAINT_NAME) - end - end - ``` - -## Cascading Deletes - -Every foreign key must define an `ON DELETE` clause, and in 99% of the cases -this should be set to `CASCADE`. - -## Indexes - -When adding a foreign key in PostgreSQL the column is not indexed automatically, -thus you must also add a concurrent index. Not doing so results in cascading -deletes being very slow. - -## Naming foreign keys - -By default Ruby on Rails uses the `_id` suffix for foreign keys. So we should -only use this suffix for associations between two tables. If you want to -reference an ID on a third party platform the `_xid` suffix is recommended. - -The spec `spec/db/schema_spec.rb` tests if all columns with the `_id` suffix -have a foreign key constraint. So if that spec fails, don't add the column to -`IGNORED_FK_COLUMNS`, but instead add the FK constraint, or consider naming it -differently. - -## Dependent Removals - -Don't define options such as `dependent: :destroy` or `dependent: :delete` when -defining an association. Defining these options means Rails handles the -removal of data, instead of letting the database handle this in the most -efficient way possible. - -In other words, this is bad and should be avoided at all costs: - -```ruby -class User < ActiveRecord::Base - has_many :posts, dependent: :destroy -end -``` - -Should you truly have a need for this it should be approved by a database -specialist first. 
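For contrast, a minimal sketch of the preferred pattern, using a hypothetical `posts.user_id` column: the association carries no `dependent:` option, and removal is delegated to a `CASCADE` foreign key added with the migration helpers described above.

```ruby
class User < ActiveRecord::Base
  # No `dependent:` option; the database removes associated posts when a user row is deleted.
  has_many :posts
end

# Hypothetical migration adding the cascading foreign key.
class AddPostsUserIdForeignKey < Gitlab::Database::Migration[2.0]
  disable_ddl_transaction!

  def up
    add_concurrent_foreign_key :posts, :users, column: :user_id, on_delete: :cascade
  end

  def down
    remove_foreign_key_if_exists :posts, column: :user_id
  end
end
```

With the constraint in place, destroying a user still issues a single `DELETE` for the `users` row, and the database cleans up the associated `posts` rows.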
- -You should also not define any `before_destroy` or `after_destroy` callbacks on -your models _unless_ absolutely required and only when approved by database -specialists. For example, if each row in a table has a corresponding file on a -file system it may be tempting to add a `after_destroy` hook. This however -introduces non database logic to a model, and means we can no longer rely on -foreign keys to remove the data as this would result in the file system data -being left behind. In such a case you should use a service class instead that -takes care of removing non database data. - -In cases where the relation spans multiple databases you have even -further problems using `dependent: :destroy` or the above hooks. You can -read more about alternatives at [Avoid `dependent: :nullify` and -`dependent: :destroy` across -databases](database/multiple_databases.md#avoid-dependent-nullify-and-dependent-destroy-across-databases). - -## Alternative primary keys with `has_one` associations - -Sometimes a `has_one` association is used to create a one-to-one relationship: - -```ruby -class User < ActiveRecord::Base - has_one :user_config -end - -class UserConfig < ActiveRecord::Base - belongs_to :user -end -``` - -In these cases, there may be an opportunity to remove the unnecessary `id` -column on the associated table, `user_config.id` in this example. Instead, -the originating table ID can be used as the primary key for the associated -table: - -```ruby -create_table :user_configs, id: false do |t| - t.references :users, primary_key: true, default: nil, index: false, foreign_key: { on_delete: :cascade } - ... -end -``` - -Setting `default: nil` ensures a primary key sequence is not created, and since the primary key -automatically gets an index, we set `index: false` to avoid creating a duplicate. -You also need to add the new primary key to the model: - -```ruby -class UserConfig < ActiveRecord::Base - self.primary_key = :user_id - - belongs_to :user -end -``` - -Using a foreign key as primary key saves space but can make -[batch counting](service_ping/implement.md#batch-counters) in [Service Ping](service_ping/index.md) less efficient. -Consider using a regular `id` column if the table is relevant for Service Ping. + + + + diff --git a/doc/development/insert_into_tables_in_batches.md b/doc/development/insert_into_tables_in_batches.md index ebed3d16319..ced5332e880 100644 --- a/doc/development/insert_into_tables_in_batches.md +++ b/doc/development/insert_into_tables_in_batches.md @@ -1,196 +1,11 @@ --- -stage: Data Stores -group: Database -info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments -description: "Sometimes it is necessary to store large amounts of records at once, which can be inefficient -when iterating collections and performing individual `save`s. With the arrival of `insert_all` -in Rails 6, which operates at the row level (that is, using `Hash`es), GitLab has added a set -of APIs that make it safe and simple to insert ActiveRecord objects in bulk." +redirect_to: 'database/insert_into_tables_in_batches.md' +remove_date: '2022-11-05' --- -# Insert into tables in batches +This document was moved to [another location](database/insert_into_tables_in_batches.md). -Sometimes it is necessary to store large amounts of records at once, which can be inefficient -when iterating collections and saving each record individually. 
With the arrival of -[`insert_all`](https://apidock.com/rails/ActiveRecord/Persistence/ClassMethods/insert_all) -in Rails 6, which operates at the row level (that is, using `Hash` objects), GitLab has added a set -of APIs that make it safe and simple to insert `ActiveRecord` objects in bulk. - -## Prepare `ApplicationRecord`s for bulk insertion - -In order for a model class to take advantage of the bulk insertion API, it has to include the -`BulkInsertSafe` concern first: - -```ruby -class MyModel < ApplicationRecord - # other includes here - # ... - include BulkInsertSafe # include this last - - # ... -end -``` - -The `BulkInsertSafe` concern has two functions: - -- It performs checks against your model class to ensure that it does not use ActiveRecord - APIs that are not safe to use with respect to bulk insertions (more on that below). -- It adds new class methods `bulk_insert!` and `bulk_upsert!`, which you can use to insert many records at once. - -## Insert records with `bulk_insert!` and `bulk_upsert!` - -If the target class passes the checks performed by `BulkInsertSafe`, you can insert an array of -ActiveRecord model objects as follows: - -```ruby -records = [MyModel.new, ...] - -MyModel.bulk_insert!(records) -``` - -Calls to `bulk_insert!` always attempt to insert _new records_. If instead -you would like to replace existing records with new values, while still inserting those -that do not already exist, then you can use `bulk_upsert!`: - -```ruby -records = [MyModel.new, existing_model, ...] - -MyModel.bulk_upsert!(records, unique_by: [:name]) -``` - -In this example, `unique_by` specifies the columns by which records are considered to be -unique and as such are updated if they existed prior to insertion. For example, if -`existing_model` has a `name` attribute, and if a record with the same `name` value already -exists, its fields are updated with those of `existing_model`. - -The `unique_by` parameter can also be passed as a `Symbol`, in which case it specifies -a database index by which a column is considered unique: - -```ruby -MyModel.bulk_insert!(records, unique_by: :index_on_name) -``` - -### Record validation - -The `bulk_insert!` method guarantees that `records` are inserted transactionally, and -runs validations on each record prior to insertion. If any record fails to validate, -an error is raised and the transaction is rolled back. You can turn off validations via -the `:validate` option: - -```ruby -MyModel.bulk_insert!(records, validate: false) -``` - -### Batch size configuration - -In those cases where the number of `records` is above a given threshold, insertions -occur in multiple batches. The default batch size is defined in -[`BulkInsertSafe::DEFAULT_BATCH_SIZE`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb). -Assuming a default threshold of 500, inserting 950 records -would result in two batches being written sequentially (of size 500 and 450 respectively.) -You can override the default batch size via the `:batch_size` option: - -```ruby -MyModel.bulk_insert!(records, batch_size: 100) -``` - -Assuming the same number of 950 records, this would result in 10 batches being written instead. -Since this also affects the number of `INSERT` statements that occur, make sure you measure the -performance impact this might have on your code. There is a trade-off between the number of -`INSERT` statements the database has to process and the size and cost of each `INSERT`. 
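If you are unsure which batch size to pick, a rough way to explore this trade-off is to time the same insert with a couple of candidate sizes. The sketch below is illustrative only; `MyModel` and its `name` attribute are placeholders:

```ruby
require 'benchmark'
require 'securerandom'

[100, 500].each do |batch_size|
  # Build a fresh, throwaway set of records for each run; attributes are illustrative.
  records = Array.new(950) { MyModel.new(name: SecureRandom.hex(8)) }

  seconds = Benchmark.realtime do
    MyModel.bulk_insert!(records, batch_size: batch_size)
  end

  # One INSERT statement is issued per batch.
  batches = (records.size / batch_size.to_f).ceil
  puts "batch_size=#{batch_size}: #{batches} INSERT statements in #{seconds.round(2)}s"
end
```

Run against a database that resembles production, this makes the cost of many small statements versus fewer large ones visible before you commit to a value.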
- -### Handling duplicate records - -NOTE: -This parameter applies only to `bulk_insert!`. If you intend to update existing -records, use `bulk_upsert!` instead. - -It may happen that some records you are trying to insert already exist, which would result in -primary key conflicts. There are two ways to address this problem: failing fast by raising an -error or skipping duplicate records. The default behavior of `bulk_insert!` is to fail fast -and raise an `ActiveRecord::RecordNotUnique` error. - -If this is undesirable, you can instead skip duplicate records with the `skip_duplicates` flag: - -```ruby -MyModel.bulk_insert!(records, skip_duplicates: true) -``` - -### Requirements for safe bulk insertions - -Large parts of ActiveRecord's persistence API are built around the notion of callbacks. Many -of these callbacks fire in response to model life cycle events such as `save` or `create`. -These callbacks cannot be used with bulk insertions, since they are meant to be called for -every instance that is saved or created. Since these events do not fire when -records are inserted in bulk, we currently prevent their use. - -The specifics around which callbacks are explicitly allowed are defined in -[`BulkInsertSafe`](https://gitlab.com/gitlab-org/gitlab/-/blob/master/app/models/concerns/bulk_insert_safe.rb). -Consult the module source code for details. If your class uses callbacks that are not explicitly designated -safe and you `include BulkInsertSafe` the application fails with an error. - -### `BulkInsertSafe` versus `InsertAll` - -Internally, `BulkInsertSafe` is based on `InsertAll`, and you may wonder when to choose -the former over the latter. To help you make the decision, -the key differences between these classes are listed in the table below. - -| | Input type | Validates input | Specify batch size | Can bypass callbacks | Transactional | -|--------------- | -------------------- | --------------- | ------------------ | --------------------------------- | ------------- | -| `bulk_insert!` | ActiveRecord objects | Yes (optional) | Yes (optional) | No (prevents unsafe callback use) | Yes | -| `insert_all!` | Attribute hashes | No | No | Yes | Yes | - -To summarize, `BulkInsertSafe` moves bulk inserts closer to how ActiveRecord objects -and inserts would normally behave. However, if all you need is to insert raw data in bulk, then -`insert_all` is more efficient. - -## Insert `has_many` associations in bulk - -A common use case is to save collections of associated relations through the owner side of the relation, -where the owned relation is associated to the owner through the `has_many` class method: - -```ruby -owner = OwnerModel.new(owned_relations: array_of_owned_relations) -# saves all `owned_relations` one-by-one -owner.save! -``` - -This issues a single `INSERT`, and transaction, for every record in `owned_relations`, which is inefficient if -`array_of_owned_relations` is large. To remedy this, the `BulkInsertableAssociations` concern can be -used to declare that the owner defines associations that are safe for bulk insertion: - -```ruby -class OwnerModel < ApplicationRecord - # other includes here - # ... - include BulkInsertableAssociations # include this last - - has_many :my_models -end -``` - -Here `my_models` must be declared `BulkInsertSafe` (as described previously) for bulk insertions -to happen. 
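As a reminder, a minimal sketch of what such an associated model might look like (the `belongs_to` target is illustrative):

```ruby
class MyModel < ApplicationRecord
  # other includes here
  # ...
  include BulkInsertSafe # include this last

  belongs_to :owner_model
end
```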
You can now insert any yet unsaved records as follows: - -```ruby -BulkInsertableAssociations.with_bulk_insert do - owner = OwnerModel.new(my_models: array_of_my_model_instances) - # saves `my_models` using a single bulk insert (possibly via multiple batches) - owner.save! -end -``` - -You can still save relations that are not `BulkInsertSafe` in this block; they -simply are treated as if you had invoked `save` from outside the block. - -## Known limitations - -There are a few restrictions to how these APIs can be used: - -- `BulkInsertableAssociations`: - - It is currently only compatible with `has_many` relations. - - It does not yet support `has_many through: ...` relations. - -Moreover, input data should either be limited to around 1000 records at most, -or already batched prior to calling bulk insert. The `INSERT` statement runs in a single -transaction, so for large amounts of records it may negatively affect database stability. + + + + diff --git a/doc/development/namespaces_storage_statistics.md b/doc/development/namespaces_storage_statistics.md index 20371c48172..75e79d1f693 100644 --- a/doc/development/namespaces_storage_statistics.md +++ b/doc/development/namespaces_storage_statistics.md @@ -1,193 +1,11 @@ --- -stage: none -group: unassigned -info: To determine the technical writer assigned to the Stage/Group associated with this page, see https://about.gitlab.com/handbook/engineering/ux/technical-writing/#assignments +redirect_to: 'database/namespaces_storage_statistics.md' +remove_date: '2022-11-05' --- -# Database case study: Namespaces storage statistics +This document was moved to [another location](database/namespaces_storage_statistics.md). -## Introduction - -On [Storage and limits management for groups](https://gitlab.com/groups/gitlab-org/-/epics/886), -we want to facilitate a method for easily viewing the amount of -storage consumed by a group, and allow easy management. - -## Proposal - -1. Create a new ActiveRecord model to hold the namespaces' statistics in an aggregated form (only for root [namespaces](../user/namespace/index.md)). -1. Refresh the statistics in this model every time a project belonging to this namespace is changed. - -## Problem - -In GitLab, we update the project storage statistics through a -[callback](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/project.rb#L97) -every time the project is saved. - -The summary of those statistics per namespace is then retrieved -by [`Namespaces#with_statistics`](https://gitlab.com/gitlab-org/gitlab/-/blob/4ab54c2233e91f60a80e5b6fa2181e6899fdcc3e/app/models/namespace.rb#L70) scope. Analyzing this query we noticed that: - -- It takes up to `1.2` seconds for namespaces with over `15k` projects. -- It can't be analyzed with [ChatOps](chatops_on_gitlabcom.md), as it times out. - -Additionally, the pattern that is currently used to update the project statistics -(the callback) doesn't scale adequately. It is currently one of the largest -[database queries transactions on production](https://gitlab.com/gitlab-org/gitlab/-/issues/29070) -that takes the most time overall. We can't add one more query to it as -it increases the transaction's length. - -Because of all of the above, we can't apply the same pattern to store -and update the namespaces statistics, as the `namespaces` table is one -of the largest tables on GitLab.com. Therefore we needed to find a performant and -alternative method. 
- -## Attempts - -### Attempt A: PostgreSQL materialized view - -Model can be updated through a refresh strategy based on a project routes SQL and a [materialized view](https://www.postgresql.org/docs/11/rules-materializedviews.html): - -```sql -SELECT split_part("rs".path, '/', 1) as root_path, - COALESCE(SUM(ps.storage_size), 0) AS storage_size, - COALESCE(SUM(ps.repository_size), 0) AS repository_size, - COALESCE(SUM(ps.wiki_size), 0) AS wiki_size, - COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size, - COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size, - COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size, - COALESCE(SUM(ps.packages_size), 0) AS packages_size, - COALESCE(SUM(ps.snippets_size), 0) AS snippets_size, - COALESCE(SUM(ps.uploads_size), 0) AS uploads_size -FROM "projects" - INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project' - INNER JOIN project_statistics ps ON ps.project_id = projects.id -GROUP BY root_path -``` - -We could then execute the query with: - -```sql -REFRESH MATERIALIZED VIEW root_namespace_storage_statistics; -``` - -While this implied a single query update (and probably a fast one), it has some downsides: - -- Materialized views syntax varies from PostgreSQL and MySQL. While this feature was worked on, MySQL was still supported by GitLab. -- Rails does not have native support for materialized views. We'd need to use a specialized gem to take care of the management of the database views, which implies additional work. - -### Attempt B: An update through a CTE - -Similar to Attempt A: Model update done through a refresh strategy with a [Common Table Expression](https://www.postgresql.org/docs/9.1/queries-with.html) - -```sql -WITH refresh AS ( - SELECT split_part("rs".path, '/', 1) as root_path, - COALESCE(SUM(ps.storage_size), 0) AS storage_size, - COALESCE(SUM(ps.repository_size), 0) AS repository_size, - COALESCE(SUM(ps.wiki_size), 0) AS wiki_size, - COALESCE(SUM(ps.lfs_objects_size), 0) AS lfs_objects_size, - COALESCE(SUM(ps.build_artifacts_size), 0) AS build_artifacts_size, - COALESCE(SUM(ps.pipeline_artifacts_size), 0) AS pipeline_artifacts_size, - COALESCE(SUM(ps.packages_size), 0) AS packages_size, - COALESCE(SUM(ps.snippets_size), 0) AS snippets_size, - COALESCE(SUM(ps.uploads_size), 0) AS uploads_size - FROM "projects" - INNER JOIN routes rs ON rs.source_id = projects.id AND rs.source_type = 'Project' - INNER JOIN project_statistics ps ON ps.project_id = projects.id - GROUP BY root_path) -UPDATE namespace_storage_statistics -SET storage_size = refresh.storage_size, - repository_size = refresh.repository_size, - wiki_size = refresh.wiki_size, - lfs_objects_size = refresh.lfs_objects_size, - build_artifacts_size = refresh.build_artifacts_size, - pipeline_artifacts_size = refresh.pipeline_artifacts_size, - packages_size = refresh.packages_size, - snippets_size = refresh.snippets_size, - uploads_size = refresh.uploads_size -FROM refresh - INNER JOIN routes rs ON rs.path = refresh.root_path AND rs.source_type = 'Namespace' -WHERE namespace_storage_statistics.namespace_id = rs.source_id -``` - -Same benefits and downsides as attempt A. - -### Attempt C: Get rid of the model and store the statistics on Redis - -We could get rid of the model that stores the statistics in aggregated form and instead use a Redis Set. 
This would be the [boring solution](https://about.gitlab.com/handbook/values/#boring-solutions) and the fastest one -to implement, as GitLab already includes Redis as part of its [Architecture](architecture.md#redis). - -The downside of this approach is that Redis does not provide the same persistence/consistency guarantees as PostgreSQL, -and this is information we can't afford to lose in a Redis failure. - -### Attempt D: Tag the root namespace and its child namespaces - -Directly relate the root namespace to its child namespaces, so -whenever a namespace is created without a parent, it is tagged -with the root namespace ID: - -| ID | root ID | parent ID | -|:---|:--------|:----------| -| 1 | 1 | NULL | -| 2 | 1 | 1 | -| 3 | 1 | 2 | - -To aggregate the statistics inside a namespace we'd execute something like: - -```sql -SELECT COUNT(...) -FROM projects -WHERE namespace_id IN ( - SELECT id - FROM namespaces - WHERE root_id = X -) -``` - -Even though this approach would make aggregating much easier, it has some major downsides: - -- We'd have to migrate **all namespaces** by adding and filling a new column. Because of the size of the table, the time and cost involved would be significant. The background migration would take approximately `153h`, see . -- The background migration has to be shipped one release earlier, delaying the functionality by another milestone. - -### Attempt E (final): Update the namespace storage statistics asynchronously - -This approach consists of continuing to use the incremental statistics updates we already have, -but we refresh them through Sidekiq jobs and in different transactions: - -1. Create a second table (`namespace_aggregation_schedules`) with two columns `id` and `namespace_id`. -1. Whenever the statistics of a project change, insert a row into `namespace_aggregation_schedules` - - We don't insert a new row if there's already one related to the root namespace. - - Keeping in mind the length of the transaction that involves updating `project_statistics`(), the insertion should be done in a different transaction and through a Sidekiq Job. -1. After inserting the row, we schedule another worker to be executed asynchronously at two different moments: - - One enqueued for immediate execution and another one scheduled in `1.5` hours. - - We only schedule the jobs if we can obtain a `1.5h` lease on Redis on a key based on the root namespace ID. - - If we can't obtain the lease, it indicates there's another aggregation already in progress, or scheduled in no more than `1.5h`. -1. This worker will: - - Update the root namespace storage statistics by querying all the namespaces through a service. - - Delete the related `namespace_aggregation_schedules` row after the update. -1. Another Sidekiq job is also included to traverse any remaining rows on the `namespace_aggregation_schedules` table and schedule jobs for every pending row. - - This job is scheduled with cron to run every night (UTC). - -This implementation has the following benefits: - -- All the updates are done asynchronously, so we're not increasing the length of the transactions for `project_statistics`. -- We're doing the update in a single SQL query. -- It is compatible with PostgreSQL and MySQL. -- No background migration required. - -The only downside of this approach is that namespaces' statistics are updated up to `1.5` hours after the change is done, -which means there's a time window in which the statistics are inaccurate.
Because we're still not -[enforcing storage limits](https://gitlab.com/gitlab-org/gitlab/-/issues/17664), this is not a major problem. - -## Conclusion - -Updating the storage statistics asynchronously was the least problematic and -most performant approach to aggregating the root namespaces. - -All the details regarding this use case can be found on: - -- -- Merge Request with the implementation: - -Performance of the namespace storage statistics was measured in staging and production (GitLab.com). All results were posted -on : No problems have been reported so far. + + + + diff --git a/doc/development/performance.md b/doc/development/performance.md index d7cbef0a211..d53dd3882c6 100644 --- a/doc/development/performance.md +++ b/doc/development/performance.md @@ -932,7 +932,7 @@ Models](mass_insert.md) functionality. Assuming you are working with ActiveRecord models, you might also find these links helpful: -- [Insert records in batches](insert_into_tables_in_batches.md) +- [Insert records in batches](database/insert_into_tables_in_batches.md) - [BulkInsert gem](https://github.com/jamis/bulk_insert) - [ActiveRecord::PgGenerateSeries gem](https://github.com/ryu39/active_record-pg_generate_series) diff --git a/doc/user/packages/dependency_proxy/index.md b/doc/user/packages/dependency_proxy/index.md index 4770057e4ea..ea9435de12a 100644 --- a/doc/user/packages/dependency_proxy/index.md +++ b/doc/user/packages/dependency_proxy/index.md @@ -316,6 +316,28 @@ services: alias: docker ``` +### Issues when authenticating to the Dependency Proxy from CI/CD jobs + +GitLab Runner automatically authenticates to the Dependency Proxy. However, the underlying Docker engine is still subject to its [authorization resolving process](https://gitlab.com/gitlab-org/gitlab-runner/-/blob/main/docs/configuration/advanced-configuration.md#precedence-of-docker-authorization-resolving). + +Misconfigurations in the authentication mechanism may cause `HTTP Basic: Access denied` and `403: Access forbidden` errors. + +You can use the job logs to view the authentication mechanism used to authenticate against the Dependency Proxy: + +```plaintext +Authenticating with credentials from $DOCKER_AUTH_CONFIG +``` + +```plaintext +Authenticating with credentials from /root/.docker/config.json +``` + +```plaintext +Authenticating with credentials from job payload (GitLab Registry) +``` + +Make sure you are using the expected authentication mechanism. + ### "Not Found" error when pulling image Docker errors similar to the following may indicate that the user running the build job doesn't have diff --git a/lib/api/commits.rb b/lib/api/commits.rb index 5fd9a8e3278..221e044b24d 100644 --- a/lib/api/commits.rb +++ b/lib/api/commits.rb @@ -10,6 +10,8 @@ module API before do require_repository_enabled! authorize! :download_code, user_project + + verify_pagination_params! end helpers do diff --git a/lib/api/pagination_params.rb b/lib/api/pagination_params.rb index 85ac50d5bec..bdb69d0ba44 100644 --- a/lib/api/pagination_params.rb +++ b/lib/api/pagination_params.rb @@ -20,6 +20,26 @@ module API optional :page, type: Integer, default: 1, desc: 'Current page number' optional :per_page, type: Integer, default: 20, desc: 'Number of items per page', except_values: [0] end + + def verify_pagination_params!
+ return if Feature.disabled?(:only_positive_pagination_values) + + page = begin + Integer(params[:page]) + rescue ArgumentError, TypeError + nil + end + + return render_structured_api_error!({ error: 'page does not have a valid value' }, 400) if page&.< 1 + + per_page = begin + Integer(params[:per_page]) + rescue ArgumentError, TypeError + nil + end + + return render_structured_api_error!({ error: 'per_page does not have a valid value' }, 400) if per_page&.< 1 + end end end end diff --git a/rubocop/cop/gitlab/mark_used_feature_flags.rb b/rubocop/cop/gitlab/mark_used_feature_flags.rb index 0bebd7901f3..63bccec31c0 100644 --- a/rubocop/cop/gitlab/mark_used_feature_flags.rb +++ b/rubocop/cop/gitlab/mark_used_feature_flags.rb @@ -145,7 +145,7 @@ module RuboCop return unless node.children[opts_index] node.children[opts_index].each_pair.find do |pair| - pair.key.value == :feature_flag + pair.key.value == :_deprecated_feature_flag end&.value else arg_index = rugged_method?(node) ? 3 : 2 diff --git a/spec/graphql/features/feature_flag_spec.rb b/spec/graphql/features/feature_flag_spec.rb index e5560fccf89..b06718eb16a 100644 --- a/spec/graphql/features/feature_flag_spec.rb +++ b/spec/graphql/features/feature_flag_spec.rb @@ -24,7 +24,7 @@ RSpec.describe 'Graphql Field feature flags' do let(:query_type) do query_factory do |query| - query.field :item, type, null: true, feature_flag: feature_flag, resolver: new_resolver(test_object) + query.field :item, type, null: true, _deprecated_feature_flag: feature_flag, resolver: new_resolver(test_object) end end diff --git a/spec/graphql/types/base_field_spec.rb b/spec/graphql/types/base_field_spec.rb index a3efc60ff5b..b85716e4d21 100644 --- a/spec/graphql/types/base_field_spec.rb +++ b/spec/graphql/types/base_field_spec.rb @@ -209,7 +209,7 @@ RSpec.describe Types::BaseField do describe '#visible?' do context 'and has a feature_flag' do let(:flag) { :test_feature } - let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, feature_flag: flag, null: false) } + let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, _deprecated_feature_flag: flag, null: false) } let(:context) { {} } before do @@ -253,7 +253,7 @@ RSpec.describe Types::BaseField do describe '#description' do context 'feature flag given' do - let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, feature_flag: flag, null: false, description: 'Test description.') } + let(:field) { described_class.new(name: 'test', type: GraphQL::Types::String, _deprecated_feature_flag: flag, null: false, description: 'Test description.') } let(:flag) { :test_flag } it 'prepends the description' do @@ -313,11 +313,11 @@ RSpec.describe Types::BaseField do described_class.new(**base_args.merge(args)) end - it 'interacts well with the `feature_flag` property' do + it 'interacts well with the `_deprecated_feature_flag` property' do field = subject( deprecated: { milestone: '1.10', reason: 'Deprecation reason' }, description: 'Field description.', - feature_flag: 'foo_flag' + _deprecated_feature_flag: 'foo_flag' ) expect(field.description).to start_with('Field description. 
Available only when feature flag `foo_flag` is enabled.') diff --git a/spec/requests/api/commits_spec.rb b/spec/requests/api/commits_spec.rb index 6dd163b908b..edf072e617d 100644 --- a/spec/requests/api/commits_spec.rb +++ b/spec/requests/api/commits_spec.rb @@ -206,11 +206,13 @@ RSpec.describe API::Commits do let(:page) { 1 } let(:per_page) { 5 } let(:ref_name) { 'master' } - let!(:request) do + let(:request) do get api("/projects/#{project_id}/repository/commits?page=#{page}&per_page=#{per_page}&ref_name=#{ref_name}", user) end it 'returns correct headers' do + request + expect(response).to include_limited_pagination_headers expect(response.headers['Link']).to match(/page=1&per_page=5/) expect(response.headers['Link']).to match(/page=2&per_page=5/) @@ -218,6 +220,8 @@ RSpec.describe API::Commits do context 'viewing the first page' do it 'returns the first 5 commits' do + request + commit = project.repository.commit expect(json_response.size).to eq(per_page) @@ -230,6 +234,8 @@ RSpec.describe API::Commits do let(:page) { 3 } it 'returns the third 5 commits' do + request + commit = project.repository.commits('HEAD', limit: per_page, offset: (page - 1) * per_page).first expect(json_response.size).to eq(per_page) @@ -238,10 +244,55 @@ RSpec.describe API::Commits do end end - context 'when per_page is 0' do - let(:per_page) { 0 } + context 'when pagination params are invalid' do + let_it_be(:project) { create(:project, :repository) } - it_behaves_like '400 response' + using RSpec::Parameterized::TableSyntax + + where(:page, :per_page, :error_message) do + 0 | nil | 'page does not have a valid value' + -1 | nil | 'page does not have a valid value' + 'a' | nil | 'page is invalid' + nil | 0 | 'per_page does not have a valid value' + nil | -1 | 'per_page does not have a valid value' + nil | 'a' | 'per_page is invalid' + end + + with_them do + it 'returns 400 response' do + request + + expect(response).to have_gitlab_http_status(:bad_request) + expect(json_response['error']).to eq(error_message) + end + end + + context 'when FF is off' do + before do + stub_feature_flags(only_positive_pagination_values: false) + end + + where(:page, :per_page, :error_message, :status) do + 0 | nil | nil | :success + -10 | nil | nil | :internal_server_error + 'a' | nil | 'page is invalid' | :bad_request + nil | 0 | 'per_page has a value not allowed' | :bad_request + nil | -1 | nil | :success + nil | 'a' | 'per_page is invalid' | :bad_request + end + + with_them do + it 'returns a response' do + request + + expect(response).to have_gitlab_http_status(status) + + if error_message + expect(json_response['error']).to eq(error_message) + end + end + end + end end end diff --git a/spec/rubocop/cop/gitlab/mark_used_feature_flags_spec.rb b/spec/rubocop/cop/gitlab/mark_used_feature_flags_spec.rb index 2ec3ae7aada..9ab5cdc24a4 100644 --- a/spec/rubocop/cop/gitlab/mark_used_feature_flags_spec.rb +++ b/spec/rubocop/cop/gitlab/mark_used_feature_flags_spec.rb @@ -217,8 +217,8 @@ RSpec.describe RuboCop::Cop::Gitlab::MarkUsedFeatureFlags do allow(cop).to receive(:in_graphql_types?).and_return(true) end - include_examples 'sets flag as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, feature_flag: :foo', 'foo' - include_examples 'sets flag as used', 'field :runners, null: true, feature_flag: :foo', 'foo' + include_examples 'sets flag as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, _deprecated_feature_flag: :foo', 'foo' + include_examples 'sets flag as used', 'field :runners, 
null: true, _deprecated_feature_flag: :foo', 'foo' include_examples 'does not set any flags as used', 'field :solution' include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type' include_examples 'does not set any flags as used', 'field :runners, Types::Ci::RunnerType.connection_type, null: true, description: "hello world"'