2020-07-23 05:09:18 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module Gitlab
|
|
|
|
module Database
|
|
|
|
class SimilarityScore
|
|
|
|
EMPTY_STRING = Arel.sql("''").freeze
|
|
|
|
EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
|
|
|
|
DEFAULT_MULTIPLIER = 1
|
2020-09-24 05:09:35 -04:00
|
|
|
DISPLAY_NAME = self.name.underscore.freeze
|
|
|
|
|
|
|
|
# Adds a "magic" comment in the generated SQL expression in order to be able to tell if we're sorting by similarity.
|
|
|
|
# Example: /* gitlab/database/similarity_score */ SIMILARITY(COALESCE...
|
2021-03-25 23:09:21 -04:00
|
|
|
SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION = "/* #{DISPLAY_NAME} */ SIMILARITY"
|
2020-07-23 05:09:18 -04:00
|
|
|
|
|
|
|
# This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity.
|
|
|
|
#
|
|
|
|
# Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller
|
|
|
|
# resultset which is already filtered by other conditions (< 10_000 records).
|
|
|
|
#
|
|
|
|
# ==== Parameters
|
|
|
|
# * +search+ - [String] the user provided search string
|
|
|
|
# * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
|
|
|
|
# * +column+ - Arel column expression, example: Project.arel_table["name"]
|
|
|
|
# * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
|
|
|
|
#
|
|
|
|
# ==== Use case
|
|
|
|
#
|
|
|
|
# We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
|
|
|
|
# it's more likely that the user was remembering the path or the name of the project.
|
|
|
|
#
|
|
|
|
# Rules:
|
|
|
|
# [
|
|
|
|
# { column: Project.arel_table['path'], multiplier: 1 },
|
|
|
|
# { column: Project.arel_table['name'], multiplier: 1 },
|
|
|
|
# { column: Project.arel_table['description'], multiplier: 0.5 }
|
|
|
|
# ]
|
|
|
|
#
|
|
|
|
# ==== Examples
|
|
|
|
#
|
|
|
|
# Similarity calculation based on one column:
|
|
|
|
#
|
|
|
|
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
|
|
|
|
#
|
|
|
|
# Similarity calculation based on two column, where the second column has lower priority:
|
|
|
|
#
|
|
|
|
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [
|
|
|
|
# { column: Project.arel_table['name'], multiplier: 1 },
|
|
|
|
# { column: Project.arel_table['description'], multiplier: 0.5 }
|
|
|
|
# ])
|
|
|
|
#
|
|
|
|
# Integration with an ActiveRecord query:
|
|
|
|
#
|
|
|
|
# table = Project.arel_table
|
|
|
|
#
|
|
|
|
# order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [
|
|
|
|
# { column: table['name'], multiplier: 1 },
|
|
|
|
# { column: table['description'], multiplier: 0.5 }
|
|
|
|
# ])
|
|
|
|
#
|
|
|
|
# Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
|
|
|
|
#
|
|
|
|
# The expression can be also used in SELECT:
|
|
|
|
#
|
|
|
|
# results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
|
|
|
|
# puts results.map(&:similarity)
|
|
|
|
#
|
|
|
|
def self.build_expression(search:, rules:)
|
|
|
|
return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty?
|
|
|
|
|
2021-07-23 08:09:05 -04:00
|
|
|
quoted_search = ApplicationRecord.connection.quote(search.to_s)
|
2020-07-23 05:09:18 -04:00
|
|
|
|
|
|
|
first_expression, *expressions = rules.map do |rule|
|
|
|
|
rule_to_arel(quoted_search, rule)
|
|
|
|
end
|
|
|
|
|
|
|
|
# (SIMILARITY ...) + (SIMILARITY ...)
|
2021-03-10 10:09:11 -05:00
|
|
|
additions = expressions.inject(first_expression) do |expression1, expression2|
|
2020-07-23 05:09:18 -04:00
|
|
|
Arel::Nodes::Addition.new(expression1, expression2)
|
|
|
|
end
|
2021-03-10 10:09:11 -05:00
|
|
|
|
|
|
|
score_as_numeric = Arel::Nodes::NamedFunction.new('CAST', [Arel::Nodes::Grouping.new(additions).as('numeric')])
|
|
|
|
|
|
|
|
# Rounding the score to two decimals
|
|
|
|
Arel::Nodes::NamedFunction.new('ROUND', [score_as_numeric, 2])
|
2020-07-23 05:09:18 -04:00
|
|
|
end
|
|
|
|
|
2020-09-24 05:09:35 -04:00
|
|
|
def self.order_by_similarity?(arel_query)
|
|
|
|
arel_query.to_sql.include?(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION)
|
|
|
|
end
|
|
|
|
|
2020-07-23 05:09:18 -04:00
|
|
|
# (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
|
|
|
|
def self.rule_to_arel(search, rule)
|
|
|
|
Arel::Nodes::Grouping.new(
|
|
|
|
Arel::Nodes::Multiplication.new(
|
|
|
|
similarity_function_call(search, column_expression(rule)),
|
|
|
|
multiplier_expression(rule)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
end
|
|
|
|
|
|
|
|
# COALESCE(column, '')
|
|
|
|
def self.column_expression(rule)
|
|
|
|
Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
|
|
|
|
end
|
|
|
|
|
|
|
|
# SIMILARITY(COALESCE(column, ''), 'search_string')
|
|
|
|
def self.similarity_function_call(search, column)
|
2020-09-24 05:09:35 -04:00
|
|
|
Arel::Nodes::NamedFunction.new(SIMILARITY_FUNCTION_CALL_WITH_ANNOTATION, [column, Arel.sql(search)])
|
2020-07-23 05:09:18 -04:00
|
|
|
end
|
|
|
|
|
|
|
|
# CAST(multiplier AS numeric)
|
|
|
|
def self.multiplier_expression(rule)
|
2021-07-23 08:09:05 -04:00
|
|
|
quoted_multiplier = ApplicationRecord.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)
|
2020-07-23 05:09:18 -04:00
|
|
|
|
|
|
|
Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
|
|
|
|
end
|
|
|
|
|
|
|
|
private_class_method :rule_to_arel
|
|
|
|
private_class_method :column_expression
|
|
|
|
private_class_method :similarity_function_call
|
|
|
|
private_class_method :multiplier_expression
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|