gitlab-org--gitlab-foss/lib/gitlab/database/median.rb

119 lines
4.4 KiB
Ruby

# https://www.periscopedata.com/blog/medians-in-sql.html
module Gitlab
module Database
module Median
def median_datetime(arel_table, query_so_far, column_sym)
median_queries =
if Gitlab::Database.postgresql?
pg_median_datetime_sql(arel_table, query_so_far, column_sym)
elsif Gitlab::Database.mysql?
mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
end
results = Array.wrap(median_queries).map do |query|
ActiveRecord::Base.connection.execute(query)
end
extract_median(results).presence
end
def extract_median(results)
result = results.compact.first
if Gitlab::Database.postgresql?
result = result.first.presence
median = result['median'] if result
median.to_f if median
elsif Gitlab::Database.mysql?
result.to_a.flatten.first
end
end
def mysql_median_datetime_sql(arel_table, query_so_far, column_sym)
query = arel_table
.from(arel_table.project(Arel.sql('*')).order(arel_table[column_sym]).as(arel_table.table_name))
.project(average([arel_table[column_sym]], 'median'))
.where(
Arel::Nodes::Between.new(
Arel.sql("(select @row_id := @row_id + 1)"),
Arel::Nodes::And.new(
[Arel.sql('@ct/2.0'),
Arel.sql('@ct/2.0 + 1')]
)
)
).
# Disallow negative values
where(arel_table[column_sym].gteq(0))
[
Arel.sql("CREATE TEMPORARY TABLE IF NOT EXISTS #{query_so_far.to_sql}"),
Arel.sql("set @ct := (select count(1) from #{arel_table.table_name});"),
Arel.sql("set @row_id := 0;"),
query.to_sql,
Arel.sql("DROP TEMPORARY TABLE IF EXISTS #{arel_table.table_name};")
]
end
def pg_median_datetime_sql(arel_table, query_so_far, column_sym)
# Create a CTE with the column we're operating on, row number (after sorting by the column
# we're operating on), and count of the table we're operating on (duplicated across) all rows
# of the CTE. For example, if we're looking to find the median of the `projects.star_count`
# column, the CTE might look like this:
#
# star_count | row_id | ct
# ------------+--------+----
# 5 | 1 | 3
# 9 | 2 | 3
# 15 | 3 | 3
cte_table = Arel::Table.new("ordered_records")
cte = Arel::Nodes::As.new(
cte_table,
arel_table
.project(
arel_table[column_sym].as(column_sym.to_s),
Arel::Nodes::Over.new(Arel::Nodes::NamedFunction.new("row_number", []),
Arel::Nodes::Window.new.order(arel_table[column_sym])).as('row_id'),
arel_table.project("COUNT(1)").as('ct')).
# Disallow negative values
where(arel_table[column_sym].gteq(zero_interval)))
# From the CTE, select either the middle row or the middle two rows (this is accomplished
# by 'where cte.row_id between cte.ct / 2.0 AND cte.ct / 2.0 + 1'). Find the average of the
# selected rows, and this is the median value.
cte_table.project(average([extract_epoch(cte_table[column_sym])], "median"))
.where(
Arel::Nodes::Between.new(
cte_table[:row_id],
Arel::Nodes::And.new(
[(cte_table[:ct] / Arel.sql('2.0')),
(cte_table[:ct] / Arel.sql('2.0') + 1)]
)
)
)
.with(query_so_far, cte)
.to_sql
end
private
def average(args, as)
Arel::Nodes::NamedFunction.new("AVG", args, as)
end
def extract_epoch(arel_attribute)
Arel.sql(%Q{EXTRACT(EPOCH FROM "#{arel_attribute.relation.name}"."#{arel_attribute.name}")})
end
def extract_diff_epoch(diff)
return diff unless Gitlab::Database.postgresql?
Arel.sql(%Q{EXTRACT(EPOCH FROM (#{diff.to_sql}))})
end
# Need to cast '0' to an INTERVAL before we can check if the interval is positive
def zero_interval
Arel::Nodes::NamedFunction.new("CAST", [Arel.sql("'0' AS INTERVAL")])
end
end
end
end