diff --git a/changelogs/unreleased/34086-es-bulk-incremental-index-updates.yml b/changelogs/unreleased/34086-es-bulk-incremental-index-updates.yml new file mode 100644 index 00000000000..67cceb21af0 --- /dev/null +++ b/changelogs/unreleased/34086-es-bulk-incremental-index-updates.yml @@ -0,0 +1,5 @@ +--- +title: 'Add a bulk processor for elasticsearch incremental updates' +merge_request: 24298 +author: +type: added diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 20c75a6e255..330e5109ed4 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -454,6 +454,11 @@ production: &base pseudonymizer_worker: cron: "0 * * * *" + # Elasticsearch bulk updater for incremental updates. + # NOTE: This will only take effect if elasticsearch is enabled. + elastic_index_bulk_cron_worker: + cron: "*/1 * * * *" + registry: # enabled: true # host: registry.example.com diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 156cf78dfc4..684ccb73603 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -537,6 +537,9 @@ Gitlab.ee do Settings.cron_jobs['update_max_seats_used_for_gitlab_com_subscriptions_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['update_max_seats_used_for_gitlab_com_subscriptions_worker']['cron'] ||= '0 12 * * *' Settings.cron_jobs['update_max_seats_used_for_gitlab_com_subscriptions_worker']['job_class'] = 'UpdateMaxSeatsUsedForGitlabComSubscriptionsWorker' + Settings.cron_jobs['elastic_index_bulk_cron_worker'] ||= Settingslogic.new({}) + Settings.cron_jobs['elastic_index_bulk_cron_worker']['cron'] ||= '*/1 * * * *' + Settings.cron_jobs['elastic_index_bulk_cron_worker']['job_class'] ||= 'ElasticIndexBulkCronWorker' end # diff --git a/doc/development/elasticsearch.md b/doc/development/elasticsearch.md index b8d2a873d8b..69113fe8030 100644 --- a/doc/development/elasticsearch.md +++ b/doc/development/elasticsearch.md @@ -36,7 +36,11 @@ Additionally, if you 
need large repos or multiple forks for testing, please cons The Elasticsearch integration depends on an external indexer. We ship an [indexer written in Go](https://gitlab.com/gitlab-org/gitlab-elasticsearch-indexer). The user must trigger the initial indexing via a rake task but, after this is done, GitLab itself will trigger reindexing when required via `after_` callbacks on create, update, and destroy that are inherited from [/ee/app/models/concerns/elastic/application_versioned_search.rb](https://gitlab.com/gitlab-org/gitlab/blob/master/ee/app/models/concerns/elastic/application_versioned_search.rb). -All indexing after the initial one is done via `ElasticIndexerWorker` (Sidekiq jobs). +After initial indexing is complete, updates proceed in one of two ways, depending on the `:elastic_bulk_incremental_updates` feature flag. + +If the feature flag is disabled, every create, update, or delete operation on an Elasticsearch-tracked model enqueues a new `ElasticIndexerWorker` Sidekiq job, which updates just that document. This is quite inefficient. + +If the feature flag is enabled, create, update, and delete operations for all models except projects (see [#207494](https://gitlab.com/gitlab-org/gitlab/issues/207494)) are tracked in a Redis [`ZSET`](https://redis.io/topics/data-types#sorted-sets) instead. A regularly scheduled `sidekiq-cron` job, `ElasticIndexBulkCronWorker`, processes this queue, updating many Elasticsearch documents at a time with the [Bulk Request API](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html). Search queries are generated by the concerns found in [ee/app/models/concerns/elastic](https://gitlab.com/gitlab-org/gitlab/tree/master/ee/app/models/concerns/elastic). These concerns are also in charge of access control, and have been a historic source of security bugs so please pay close attention to them!