Move strategies in their own files
This improves readability quite a bit.
This commit is contained in:
parent
ff35cb45e9
commit
474fd9138c
|
@ -50,161 +50,6 @@ module Gitlab
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
# This strategy performs an exact count on the model.
|
||||
#
|
||||
# This is guaranteed to be accurate, however it also scans the
|
||||
# whole table. Hence, there are no guarantees with respect
|
||||
# to runtime.
|
||||
#
|
||||
# Note that for very large tables, this may even timeout.
|
||||
class ExactCountStrategy
|
||||
attr_reader :models
|
||||
def initialize(models)
|
||||
@models = models
|
||||
end
|
||||
|
||||
def count
|
||||
models.each_with_object({}) do |model, data|
|
||||
data[model] = model.count
|
||||
end
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
true
|
||||
end
|
||||
end
|
||||
|
||||
class PgClass < ActiveRecord::Base
|
||||
self.table_name = 'pg_class'
|
||||
end
|
||||
|
||||
# This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
|
||||
#
|
||||
# Specifically, it relies on the column reltuples in said table. An additional
|
||||
# check is performed to make sure statistics were updated within the last hour.
|
||||
#
|
||||
# Otherwise, this strategy skips tables with outdated statistics.
|
||||
#
|
||||
# There are no guarantees with respect to the accuracy of this strategy. Runtime
|
||||
# however is guaranteed to be "fast", because it only looks up statistics.
|
||||
class ReltuplesCountStrategy
|
||||
attr_reader :models
|
||||
def initialize(models)
|
||||
@models = models
|
||||
end
|
||||
|
||||
# Returns a hash of the table names that have recently updated tuples.
|
||||
#
|
||||
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
|
||||
def count
|
||||
size_estimates
|
||||
rescue *CONNECTION_ERRORS
|
||||
{}
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
Gitlab::Database.postgresql?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def table_names
|
||||
models.map(&:table_name)
|
||||
end
|
||||
|
||||
def size_estimates(check_statistics: true)
|
||||
table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
|
||||
|
||||
# Querying tuple stats only works on the primary. Due to load balancing, the
|
||||
# easiest way to do this is to start a transaction.
|
||||
ActiveRecord::Base.transaction do
|
||||
get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
|
||||
model = table_to_model[row.table_name]
|
||||
data[model] = row.estimate
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Generates the PostgreSQL query to return the tuples for tables
|
||||
# that have been vacuumed or analyzed in the last hour.
|
||||
#
|
||||
# @param [Array] table names
|
||||
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
|
||||
def get_statistics(table_names, check_statistics: true)
|
||||
time = "to_timestamp(#{1.hour.ago.to_i})"
|
||||
|
||||
query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
|
||||
.where(relname: table_names)
|
||||
.select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
|
||||
|
||||
if check_statistics
|
||||
query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
|
||||
time, time, time, time)
|
||||
end
|
||||
|
||||
query
|
||||
end
|
||||
end
|
||||
|
||||
# A tablesample count executes in two phases:
|
||||
# * Estimate table sizes based on reltuples.
|
||||
# * Based on the estimate:
|
||||
# * If the table is considered 'small', execute an exact relation count.
|
||||
# * Otherwise, count on a sample of the table using TABLESAMPLE.
|
||||
#
|
||||
# The size of the sample is chosen in a way that we always roughly scan
|
||||
# the same amount of rows (see TABLESAMPLE_ROW_TARGET).
|
||||
#
|
||||
# There are no guarantees with respect to the accuracy of the result or runtime.
|
||||
class TablesampleCountStrategy < ReltuplesCountStrategy
|
||||
EXACT_COUNT_THRESHOLD = 100_000
|
||||
TABLESAMPLE_ROW_TARGET = 100_000
|
||||
|
||||
def count
|
||||
estimates = size_estimates(check_statistics: false)
|
||||
|
||||
models.each_with_object({}) do |model, count_by_model|
|
||||
count = perform_count(model, estimates[model])
|
||||
count_by_model[model] = count if count
|
||||
end
|
||||
rescue *CONNECTION_ERRORS
|
||||
{}
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def perform_count(model, estimate)
|
||||
# If we estimate 0, we may not have statistics at all. Don't use them.
|
||||
return nil unless estimate && estimate > 0
|
||||
|
||||
if estimate < EXACT_COUNT_THRESHOLD
|
||||
# The table is considered small, the assumption here is that
|
||||
# the exact count will be fast anyways.
|
||||
model.count
|
||||
else
|
||||
# The table is considered large, let's only count on a sample.
|
||||
tablesample_count(model, estimate)
|
||||
end
|
||||
end
|
||||
|
||||
def tablesample_count(model, estimate)
|
||||
portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
|
||||
inverse = 1 / portion
|
||||
query = <<~SQL
|
||||
SELECT (COUNT(*)*#{inverse})::integer AS count
|
||||
FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
|
||||
SQL
|
||||
|
||||
rows = ActiveRecord::Base.connection.select_all(query)
|
||||
|
||||
Integer(rows.first['count'])
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module Gitlab
|
||||
module Database
|
||||
module Count
|
||||
# This strategy performs an exact count on the model.
|
||||
#
|
||||
# This is guaranteed to be accurate, however it also scans the
|
||||
# whole table. Hence, there are no guarantees with respect
|
||||
# to runtime.
|
||||
#
|
||||
# Note that for very large tables, this may even timeout.
|
||||
class ExactCountStrategy
|
||||
attr_reader :models
|
||||
def initialize(models)
|
||||
@models = models
|
||||
end
|
||||
|
||||
def count
|
||||
models.each_with_object({}) do |model, data|
|
||||
data[model] = model.count
|
||||
end
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
true
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,79 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module Gitlab
|
||||
module Database
|
||||
module Count
|
||||
class PgClass < ActiveRecord::Base
|
||||
self.table_name = 'pg_class'
|
||||
end
|
||||
|
||||
# This strategy counts based on PostgreSQL's statistics in pg_stat_user_tables.
|
||||
#
|
||||
# Specifically, it relies on the column reltuples in said table. An additional
|
||||
# check is performed to make sure statistics were updated within the last hour.
|
||||
#
|
||||
# Otherwise, this strategy skips tables with outdated statistics.
|
||||
#
|
||||
# There are no guarantees with respect to the accuracy of this strategy. Runtime
|
||||
# however is guaranteed to be "fast", because it only looks up statistics.
|
||||
class ReltuplesCountStrategy
|
||||
attr_reader :models
|
||||
def initialize(models)
|
||||
@models = models
|
||||
end
|
||||
|
||||
# Returns a hash of the table names that have recently updated tuples.
|
||||
#
|
||||
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
|
||||
def count
|
||||
size_estimates
|
||||
rescue *CONNECTION_ERRORS
|
||||
{}
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
Gitlab::Database.postgresql?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def table_names
|
||||
models.map(&:table_name)
|
||||
end
|
||||
|
||||
def size_estimates(check_statistics: true)
|
||||
table_to_model = models.each_with_object({}) { |model, h| h[model.table_name] = model }
|
||||
|
||||
# Querying tuple stats only works on the primary. Due to load balancing, the
|
||||
# easiest way to do this is to start a transaction.
|
||||
ActiveRecord::Base.transaction do
|
||||
get_statistics(table_names, check_statistics: check_statistics).each_with_object({}) do |row, data|
|
||||
model = table_to_model[row.table_name]
|
||||
data[model] = row.estimate
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Generates the PostgreSQL query to return the tuples for tables
|
||||
# that have been vacuumed or analyzed in the last hour.
|
||||
#
|
||||
# @param [Array] table names
|
||||
# @returns [Hash] Table name to count mapping (e.g. { 'projects' => 5, 'users' => 100 })
|
||||
def get_statistics(table_names, check_statistics: true)
|
||||
time = 1.hour.ago
|
||||
|
||||
query = PgClass.joins("LEFT JOIN pg_stat_user_tables USING (relname)")
|
||||
.where(relname: table_names)
|
||||
.select('pg_class.relname AS table_name, reltuples::bigint AS estimate')
|
||||
|
||||
if check_statistics
|
||||
query = query.where('last_vacuum > ? OR last_autovacuum > ? OR last_analyze > ? OR last_autoanalyze > ?',
|
||||
time, time, time, time)
|
||||
end
|
||||
|
||||
query
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,66 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module Gitlab
|
||||
module Database
|
||||
module Count
|
||||
# A tablesample count executes in two phases:
|
||||
# * Estimate table sizes based on reltuples.
|
||||
# * Based on the estimate:
|
||||
# * If the table is considered 'small', execute an exact relation count.
|
||||
# * Otherwise, count on a sample of the table using TABLESAMPLE.
|
||||
#
|
||||
# The size of the sample is chosen in a way that we always roughly scan
|
||||
# the same amount of rows (see TABLESAMPLE_ROW_TARGET).
|
||||
#
|
||||
# There are no guarantees with respect to the accuracy of the result or runtime.
|
||||
class TablesampleCountStrategy < ReltuplesCountStrategy
|
||||
EXACT_COUNT_THRESHOLD = 100_000
|
||||
TABLESAMPLE_ROW_TARGET = 100_000
|
||||
|
||||
def count
|
||||
estimates = size_estimates(check_statistics: false)
|
||||
|
||||
models.each_with_object({}) do |model, count_by_model|
|
||||
count = perform_count(model, estimates[model])
|
||||
count_by_model[model] = count if count
|
||||
end
|
||||
rescue *CONNECTION_ERRORS
|
||||
{}
|
||||
end
|
||||
|
||||
def self.enabled?
|
||||
Gitlab::Database.postgresql? && Feature.enabled?(:tablesample_counts)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def perform_count(model, estimate)
|
||||
# If we estimate 0, we may not have statistics at all. Don't use them.
|
||||
return nil unless estimate && estimate > 0
|
||||
|
||||
if estimate < EXACT_COUNT_THRESHOLD
|
||||
# The table is considered small, the assumption here is that
|
||||
# the exact count will be fast anyways.
|
||||
model.count
|
||||
else
|
||||
# The table is considered large, let's only count on a sample.
|
||||
tablesample_count(model, estimate)
|
||||
end
|
||||
end
|
||||
|
||||
def tablesample_count(model, estimate)
|
||||
portion = (TABLESAMPLE_ROW_TARGET.to_f / estimate).round(4)
|
||||
inverse = 1 / portion
|
||||
query = <<~SQL
|
||||
SELECT (COUNT(*)*#{inverse})::integer AS count
|
||||
FROM #{model.table_name} TABLESAMPLE SYSTEM (#{portion * 100})
|
||||
SQL
|
||||
|
||||
rows = ActiveRecord::Base.connection.select_all(query)
|
||||
|
||||
Integer(rows.first['count'])
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,34 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Database::Count::ExactCountStrategy do
|
||||
before do
|
||||
create_list(:project, 3)
|
||||
create(:identity)
|
||||
end
|
||||
|
||||
let(:models) { [Project, Identity] }
|
||||
|
||||
subject { described_class.new(models).count }
|
||||
|
||||
describe '#count' do
|
||||
it 'counts all models' do
|
||||
expect(models).to all(receive(:count).and_call_original)
|
||||
|
||||
expect(subject).to eq({ Project => 3, Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is enabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,48 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Database::Count::ReltuplesCountStrategy do
|
||||
before do
|
||||
create_list(:project, 3)
|
||||
create(:identity)
|
||||
end
|
||||
|
||||
let(:models) { [Project, Identity] }
|
||||
subject { described_class.new(models).count }
|
||||
|
||||
describe '#count', :postgresql do
|
||||
context 'when reltuples is up to date' do
|
||||
before do
|
||||
ActiveRecord::Base.connection.execute('ANALYZE projects')
|
||||
ActiveRecord::Base.connection.execute('ANALYZE identities')
|
||||
end
|
||||
|
||||
it 'uses statistics to do the count' do
|
||||
models.each { |model| expect(model).not_to receive(:count) }
|
||||
|
||||
expect(subject).to eq({ Project => 3, Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
context 'insufficient permissions' do
|
||||
it 'returns an empty hash' do
|
||||
allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
|
||||
|
||||
expect(subject).to eq({})
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is disabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_falsey
|
||||
end
|
||||
end
|
||||
end
|
|
@ -0,0 +1,65 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Gitlab::Database::Count::TablesampleCountStrategy do
|
||||
before do
|
||||
create_list(:project, 3)
|
||||
create(:identity)
|
||||
end
|
||||
|
||||
let(:models) { [Project, Identity] }
|
||||
let(:strategy) { described_class.new(models) }
|
||||
|
||||
subject { strategy.count }
|
||||
|
||||
describe '#count', :postgresql do
|
||||
let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
|
||||
let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
|
||||
|
||||
before do
|
||||
allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
|
||||
end
|
||||
|
||||
context 'for tables with an estimated small size' do
|
||||
it 'performs an exact count' do
|
||||
expect(Identity).to receive(:count).and_call_original
|
||||
|
||||
expect(subject).to include({ Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
context 'for tables with an estimated large size' do
|
||||
it 'performs a tablesample count' do
|
||||
expect(Project).not_to receive(:count)
|
||||
|
||||
result = subject
|
||||
expect(result[Project]).to eq(3)
|
||||
end
|
||||
end
|
||||
|
||||
context 'insufficient permissions' do
|
||||
it 'returns an empty hash' do
|
||||
allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
|
||||
|
||||
expect(subject).to eq({})
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
before do
|
||||
stub_feature_flags(tablesample_counts: true)
|
||||
end
|
||||
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is disabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_falsey
|
||||
end
|
||||
end
|
||||
end
|
|
@ -56,128 +56,4 @@ describe Gitlab::Database::Count do
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe Gitlab::Database::Count::ExactCountStrategy do
|
||||
subject { described_class.new(models).count }
|
||||
|
||||
describe '#count' do
|
||||
it 'counts all models' do
|
||||
expect(models).to all(receive(:count).and_call_original)
|
||||
|
||||
expect(subject).to eq({ Project => 3, Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is enabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe Gitlab::Database::Count::ReltuplesCountStrategy do
|
||||
subject { described_class.new(models).count }
|
||||
|
||||
describe '#count', :postgresql do
|
||||
context 'when reltuples is up to date' do
|
||||
before do
|
||||
ActiveRecord::Base.connection.execute('ANALYZE projects')
|
||||
ActiveRecord::Base.connection.execute('ANALYZE identities')
|
||||
end
|
||||
|
||||
it 'uses statistics to do the count' do
|
||||
models.each { |model| expect(model).not_to receive(:count) }
|
||||
|
||||
expect(subject).to eq({ Project => 3, Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
context 'insufficient permissions' do
|
||||
it 'returns an empty hash' do
|
||||
allow(ActiveRecord::Base).to receive(:transaction).and_raise(PG::InsufficientPrivilege)
|
||||
|
||||
expect(subject).to eq({})
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is disabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_falsey
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe Gitlab::Database::Count::TablesampleCountStrategy do
|
||||
subject { strategy.count }
|
||||
let(:strategy) { described_class.new(models) }
|
||||
|
||||
describe '#count', :postgresql do
|
||||
let(:estimates) { { Project => threshold + 1, Identity => threshold - 1 } }
|
||||
let(:threshold) { Gitlab::Database::Count::TablesampleCountStrategy::EXACT_COUNT_THRESHOLD }
|
||||
|
||||
before do
|
||||
allow(strategy).to receive(:size_estimates).with(check_statistics: false).and_return(estimates)
|
||||
end
|
||||
|
||||
context 'for tables with an estimated small size' do
|
||||
it 'performs an exact count' do
|
||||
expect(Identity).to receive(:count).and_call_original
|
||||
|
||||
expect(subject).to include({ Identity => 1 })
|
||||
end
|
||||
end
|
||||
|
||||
context 'for tables with an estimated large size' do
|
||||
it 'performs a tablesample count' do
|
||||
expect(Project).not_to receive(:count)
|
||||
|
||||
result = subject
|
||||
expect(result[Project]).to eq(3)
|
||||
end
|
||||
end
|
||||
|
||||
context 'insufficient permissions' do
|
||||
it 'returns an empty hash' do
|
||||
allow(strategy).to receive(:size_estimates).and_raise(PG::InsufficientPrivilege)
|
||||
|
||||
expect(subject).to eq({})
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe '.enabled?' do
|
||||
before do
|
||||
stub_feature_flags(tablesample_counts: true)
|
||||
end
|
||||
|
||||
it 'is enabled for PostgreSQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(true)
|
||||
|
||||
expect(described_class.enabled?).to be_truthy
|
||||
end
|
||||
|
||||
it 'is disabled for MySQL' do
|
||||
allow(Gitlab::Database).to receive(:postgresql?).and_return(false)
|
||||
|
||||
expect(described_class.enabled?).to be_falsey
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue