From 271e7a325340551475ae937aaf2ed7a6344be9e8 Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Fri, 19 Jan 2018 12:44:27 +0000 Subject: [PATCH 1/5] Add indexes and change SQL for expired artifacts to deal with artifacts migration efficiently Artifacts are in the middle of being migrated from ci_builds to ci_job_artifacts. The expiration date is currently visible in both of these tables and the test for whether an expired artifact is present for a job is complex as it requires checking both the of the tables. Add two new indexes, one on ci_builds.artifacts_expire_at and one on ci_job_artifacts.expire_at to enable finding expired artifacts efficiently. And until the migration is finished, replace the SQL for finding expired and non-expired artifacts with a hand-crafted UNION ALL based query instead of using OR. This overcomes a database optimizer limitation that prevents it from using these indexes. When the migration is finished the next version should remove this query and replace it with a much simpler query on just ci_job_artifacts. See https://gitlab.com/gitlab-org/gitlab-ce/issues/42561 for followup. --- app/models/ci/build.rb | 40 +++++++++++++++++-- .../unreleased/expired-ci-artifacts.yml | 5 +++ ...0180119160751_optimize_ci_job_artifacts.rb | 23 +++++++++++ db/schema.rb | 2 + spec/factories/ci/builds.rb | 4 +- 5 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 changelogs/unreleased/expired-ci-artifacts.yml create mode 100644 db/migrate/20180119160751_optimize_ci_job_artifacts.rb diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 20534b8eed0..a5f6ec8b022 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -41,12 +41,44 @@ module Ci scope :unstarted, ->() { where(runner_id: nil) } scope :ignore_failures, ->() { where(allow_failure: false) } + + # This convoluted mess is because we need to handle two cases of + # artifact files during the migration. And a simple OR clause + # makes it impossible to optimize. + + # Instead we want to use UNION ALL and do two carefully + # constructed disjoint queries. But Rails cannot handle UNION or + # UNION ALL queries so we do the query in a subquery and wrap it + # in an otherwise redundant WHERE IN query (IN is fine for + # non-null columns). + + # This should all be ripped out when the migration is finished and + # replaced with just the new storage to avoid the extra work. + scope :with_artifacts, ->() do - where('(artifacts_file IS NOT NULL AND artifacts_file <> ?) OR EXISTS (?)', - '', Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) + old = where(%q[artifacts_file <> '']) + new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) + union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) + Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") end - scope :with_artifacts_not_expired, ->() { with_artifacts.where('artifacts_expire_at IS NULL OR artifacts_expire_at > ?', Time.now) } - scope :with_expired_artifacts, ->() { with_artifacts.where('artifacts_expire_at < ?', Time.now) } + + scope :with_artifacts_not_expired, ->() do + old = where(%q[artifacts_file <> '' AND (artifacts_expire_at IS NULL OR artifacts_expire_at > ?)], Time.now) + new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) + union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) + Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + end + + scope :with_expired_artifacts, ->() do + old = where(%q[artifacts_file <> '' AND artifacts_expire_at < ?], Time.now) + new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) + union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) + Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + end + scope :last_month, ->() { where('created_at > ?', Date.today - 1.month) } scope :manual_actions, ->() { where(when: :manual, status: COMPLETED_STATUSES + [:manual]) } scope :ref_protected, -> { where(protected: true) } diff --git a/changelogs/unreleased/expired-ci-artifacts.yml b/changelogs/unreleased/expired-ci-artifacts.yml new file mode 100644 index 00000000000..2fcbdb02f84 --- /dev/null +++ b/changelogs/unreleased/expired-ci-artifacts.yml @@ -0,0 +1,5 @@ +--- +title: Change SQL for expired artifacts to use new ci_job_artifacts.expire_at +merge_request: 16578 +author: +type: performance diff --git a/db/migrate/20180119160751_optimize_ci_job_artifacts.rb b/db/migrate/20180119160751_optimize_ci_job_artifacts.rb new file mode 100644 index 00000000000..9b4340ed7b7 --- /dev/null +++ b/db/migrate/20180119160751_optimize_ci_job_artifacts.rb @@ -0,0 +1,23 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class OptimizeCiJobArtifacts < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + # Set this constant to true if this migration requires downtime. + DOWNTIME = false + + disable_ddl_transaction! + + def up + # job_id is just here to be a covering index for index only scans + # since we'll almost always be joining against ci_builds on job_id + add_concurrent_index(:ci_job_artifacts, [:expire_at, :job_id]) + add_concurrent_index(:ci_builds, [:artifacts_expire_at], where: "artifacts_file <> ''") + end + + def down + remove_concurrent_index(:ci_job_artifacts, [:expire_at, :job_id]) + remove_concurrent_index(:ci_builds, [:artifacts_expire_at], where: "artifacts_file <> ''") + end +end diff --git a/db/schema.rb b/db/schema.rb index d07a4c31618..57b3f4c9988 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -293,6 +293,7 @@ ActiveRecord::Schema.define(version: 20180206200543) do t.integer "failure_reason" end + add_index "ci_builds", ["artifacts_expire_at"], name: "index_ci_builds_on_artifacts_expire_at", where: "(artifacts_file <> ''::text)", using: :btree add_index "ci_builds", ["auto_canceled_by_id"], name: "index_ci_builds_on_auto_canceled_by_id", using: :btree add_index "ci_builds", ["commit_id", "stage_idx", "created_at"], name: "index_ci_builds_on_commit_id_and_stage_idx_and_created_at", using: :btree add_index "ci_builds", ["commit_id", "status", "type"], name: "index_ci_builds_on_commit_id_and_status_and_type", using: :btree @@ -333,6 +334,7 @@ ActiveRecord::Schema.define(version: 20180206200543) do t.string "file" end + add_index "ci_job_artifacts", ["expire_at", "job_id"], name: "index_ci_job_artifacts_on_expire_at_and_job_id", using: :btree add_index "ci_job_artifacts", ["job_id", "file_type"], name: "index_ci_job_artifacts_on_job_id_and_file_type", unique: true, using: :btree add_index "ci_job_artifacts", ["project_id"], name: "index_ci_job_artifacts_on_project_id", using: :btree diff --git a/spec/factories/ci/builds.rb b/spec/factories/ci/builds.rb index 6ba599cdf83..f6ba3a581ca 100644 --- a/spec/factories/ci/builds.rb +++ b/spec/factories/ci/builds.rb @@ -180,8 +180,8 @@ FactoryBot.define do trait :artifacts do after(:create) do |build| - create(:ci_job_artifact, :archive, job: build) - create(:ci_job_artifact, :metadata, job: build) + create(:ci_job_artifact, :archive, job: build, expire_at: build.artifacts_expire_at) + create(:ci_job_artifact, :metadata, job: build, expire_at: build.artifacts_expire_at) build.reload end end From 7b048631c6f53dd02e5c3b7175c30968f9d08db6 Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Thu, 8 Feb 2018 11:27:51 +0100 Subject: [PATCH 2/5] WIP experiment --- app/models/ci/build.rb | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index a5f6ec8b022..ba8d9944199 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -60,7 +60,9 @@ module Ci new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + # XXX + #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) end scope :with_artifacts_not_expired, ->() do @@ -68,7 +70,9 @@ module Ci new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + # XXX + #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) end scope :with_expired_artifacts, ->() do @@ -76,7 +80,9 @@ module Ci new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + # XXX + #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") + where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) end scope :last_month, ->() { where('created_at > ?', Date.today - 1.month) } From d21f41307a531636f5f54c8c3d055cbbf1fa9b21 Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Fri, 9 Feb 2018 16:16:32 +0100 Subject: [PATCH 3/5] Revert to old code style where branches of the union do not have extra where clauses on them --- app/models/ci/build.rb | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index ba8d9944199..3321d98e237 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -56,33 +56,27 @@ module Ci # replaced with just the new storage to avoid the extra work. scope :with_artifacts, ->() do - old = where(%q[artifacts_file <> '']) - new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], - Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) + old = Ci::Build.select(:id).where(%q[artifacts_file <> '']) + new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - # XXX - #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") - where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) end scope :with_artifacts_not_expired, ->() do - old = where(%q[artifacts_file <> '' AND (artifacts_expire_at IS NULL OR artifacts_expire_at > ?)], Time.now) - new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], - Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) + old = Ci::Build.select(:id).where(%q[artifacts_file <> '' AND (artifacts_expire_at IS NULL OR artifacts_expire_at > ?)], Time.now) + new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - # XXX - #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") - where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) end scope :with_expired_artifacts, ->() do - old = where(%q[artifacts_file <> '' AND artifacts_expire_at < ?], Time.now) - new = where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], - Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) + old = Ci::Build.select(:id).where(%q[artifacts_file <> '' AND artifacts_expire_at < ?], Time.now) + new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], + Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - # XXX - #Ci::Build.from("(#{union.to_sql}) #{Ci::Build.table_name}") - where(%Q[ci_builds.id IN (SELECT ci_builds.id FROM (#{union.to_sql}))]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) end scope :last_month, ->() { where('created_at > ?', Date.today - 1.month) } From 82f9c1c429af118b6bcedfb60813e1ba741511d1 Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Fri, 9 Feb 2018 17:35:51 +0100 Subject: [PATCH 4/5] add rubocop whitelist --- app/models/ci/build.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 3321d98e237..836b9699cf5 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -60,7 +60,7 @@ module Ci new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection end scope :with_artifacts_not_expired, ->() do @@ -68,7 +68,7 @@ module Ci new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection end scope :with_expired_artifacts, ->() do @@ -76,7 +76,7 @@ module Ci new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) + where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection end scope :last_month, ->() { where('created_at > ?', Date.today - 1.month) } From 660a9d33d13f6e95d50b76028a22fcc437b86d0b Mon Sep 17 00:00:00 2001 From: Greg Stark Date: Fri, 9 Feb 2018 18:22:05 +0100 Subject: [PATCH 5/5] revert to earlier coding using a hard coded UNION ALL instead of Gitlab::SQL::Union --- app/models/ci/build.rb | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 836b9699cf5..7312e704932 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -59,24 +59,21 @@ module Ci old = Ci::Build.select(:id).where(%q[artifacts_file <> '']) new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id')) - union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection + where('ci_builds.id IN (? UNION ALL ?)', old, new) end scope :with_artifacts_not_expired, ->() do old = Ci::Build.select(:id).where(%q[artifacts_file <> '' AND (artifacts_expire_at IS NULL OR artifacts_expire_at > ?)], Time.now) new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND (expire_at IS NULL OR expire_at > ?)', Time.now)) - union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection + where('ci_builds.id IN (? UNION ALL ?)', old, new) end scope :with_expired_artifacts, ->() do old = Ci::Build.select(:id).where(%q[artifacts_file <> '' AND artifacts_expire_at < ?], Time.now) new = Ci::Build.select(:id).where(%q[(artifacts_file IS NULL OR artifacts_file = '') AND EXISTS (?)], Ci::JobArtifact.select(1).where('ci_builds.id = ci_job_artifacts.job_id AND expire_at < ?', Time.now)) - union = Gitlab::SQL::Union.new([old, new], remove_duplicates: false) - where(%Q[ci_builds.id IN (#{union.to_sql})]) # rubocop:disable GitlabSecurity/SqlInjection + where('ci_builds.id IN (? UNION ALL ?)', old, new) end scope :last_month, ->() { where('created_at > ?', Date.today - 1.month) }