gitlab-org--gitlab-foss/lib/gitlab/search/found_blob.rb
Jan Provaznik 58bfd73331 Optimized file search to work without limits
* removed the 100-result limit on file search because we
  load all results anyway
* expensive processing (parsing match content, UTF-8 encoding)
  is done only for the selected page of paginated output
2018-12-06 09:25:09 +01:00


# frozen_string_literal: true

module Gitlab
  module Search
    class FoundBlob
      include EncodingHelper
      include Presentable
      include BlobLanguageFromGitAttributes
      include Gitlab::Utils::StrongMemoize

      attr_reader :project, :content_match, :blob_filename

      FILENAME_REGEXP = /\A(?<ref>[^:]*):(?<filename>[^\x00]*)\x00/.freeze
      CONTENT_REGEXP = /^(?<ref>[^:]*):(?<filename>[^\x00]*)\x00(?<startline>\d+)\x00/.freeze
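
      # Both regexps parse grep-style matches in which the ref, the path and
      # the starting line number are NUL-separated. An illustrative (not
      # captured from a real repository) content_match value, where \x00 is
      # the NUL byte:
      #
      #   "master:app/models/user.rb\x0042\x00  def full_name\n"
      #
      # FILENAME_REGEXP reads only up to the path, which is cheaper when the
      # snippet itself is not needed.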

      # Warm the BatchLoader cache for blobs that will need a repository
      # fetch, so a page of results triggers one batched blob lookup
      # instead of one lookup per result.
      def self.preload_blobs(blobs)
        to_fetch = blobs.select { |blob| blob.is_a?(self) && blob.blob_filename }

        to_fetch.each { |blob| blob.fetch_blob }
      end
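
      # Minimal usage sketch (names illustrative): for blobs built with only
      # a :blob_filename, preloading means the first attribute access below
      # resolves the whole batch in one go instead of blob-by-blob.
      #
      #   blobs = paths.map { |p| new(blob_filename: p, ref: ref, project: project) }
      #   preload_blobs(blobs)
      #   blobs.first.data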

      def initialize(opts = {})
        @id = opts.fetch(:id, nil)
        @binary_filename = opts.fetch(:filename, nil)
        @binary_basename = opts.fetch(:basename, nil)
        @ref = opts.fetch(:ref, nil)
        @startline = opts.fetch(:startline, nil)
        @binary_data = opts.fetch(:data, nil)
        @per_page = opts.fetch(:per_page, 20)
        @project = opts.fetch(:project, nil)
        # Some callers (e.g. Elasticsearch) do not have a project object,
        # yet they can trigger many of these calls in one go, causing
        # duplicated queries. Allow them to pass a project_id instead.
        @project_id = opts.fetch(:project_id, nil)
        @content_match = opts.fetch(:content_match, nil)
        @blob_filename = opts.fetch(:blob_filename, nil)
        @repository = opts.fetch(:repository, nil)
      end
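
      # Construction happens in one of three modes (arguments illustrative):
      #
      #   # 1. Eager: attributes supplied up front, nothing parsed later
      #   FoundBlob.new(id: blob_id, filename: 'a.rb', data: '...', ref: 'master')
      #
      #   # 2. From a raw grep result: attributes are parsed lazily out of
      #   #    content_match on first access
      #   FoundBlob.new(content_match: raw_match, project: project)
      #
      #   # 3. By filename only: the blob itself is batch-fetched on demand
      #   FoundBlob.new(blob_filename: 'a.rb', ref: 'master', project: project)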

      def id
        @id ||= parsed_content[:id]
      end

      def ref
        @ref ||= parsed_content[:ref]
      end

      def startline
        @startline ||= parsed_content[:startline]
      end

      # binary_filename is used for running filters on all matches. For
      # grepped results (which use content_match) we take the filename from
      # the beginning of the grepped result, which is faster than parsing
      # the whole snippet.
      def binary_filename
        @binary_filename ||= content_match ? search_result_filename : parsed_content[:binary_filename]
      end

      def filename
        @filename ||= encode_utf8(@binary_filename || parsed_content[:binary_filename])
      end

      def basename
        @basename ||= encode_utf8(@binary_basename || parsed_content[:binary_basename])
      end

      def data
        @data ||= encode_utf8(@binary_data || parsed_content[:binary_data])
      end

      def path
        filename
      end

      def project_id
        @project_id || @project&.id
      end

      def present
        super(presenter_class: BlobPresenter)
      end

      def fetch_blob
        path = [ref, blob_filename]
        # If the blob couldn't be fetched for some reason, fall back to a
        # result that shows at least the blob filename.
        missing_blob = { binary_filename: blob_filename }

        BatchLoader.for(path).batch(default_value: missing_blob) do |refs, loader|
          Gitlab::Git::Blob.batch(repository, refs, blob_size_limit: 1024).each do |blob|
            data = {
              id: blob.id,
              binary_filename: blob.path,
              binary_basename: File.basename(blob.path, File.extname(blob.path)),
              ref: ref,
              startline: 1,
              binary_data: blob.data,
              project: project
            }

            loader.call([ref, blob.path], data)
          end
        end
      end
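
      # Note that fetch_blob returns a lazy BatchLoader value, not a Hash:
      # the Gitlab::Git::Blob.batch call above runs only when a loaded
      # attribute is first dereferenced, and it then resolves every path
      # registered so far (e.g. via preload_blobs) in a single batch.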

      private

      def search_result_filename
        content_match.match(FILENAME_REGEXP) { |matches| matches[:filename] }
      end

      def parsed_content
        strong_memoize(:parsed_content) do
          if content_match
            parse_search_result
          elsif blob_filename
            fetch_blob
          else
            {}
          end
        end
      end

      def parse_search_result
        ref = nil
        filename = nil
        basename = nil

        data = []
        startline = 0

        content_match.each_line.each_with_index do |line, index|
          # Only the first line carries the "ref:path\x00startline\x00"
          # prefix; once matched, `prefix ||=` keeps it so later lines
          # skip the regexp.
          prefix ||= line.match(CONTENT_REGEXP)&.tap do |matches|
            ref = matches[:ref]
            filename = matches[:filename]
            startline = matches[:startline]
            startline = startline.to_i - index
            extname = Regexp.escape(File.extname(filename))
            basename = filename.sub(/#{extname}$/, '')
          end

          data << line.sub(prefix.to_s, '')
        end

        {
          binary_filename: filename,
          binary_basename: basename,
          ref: ref,
          startline: startline,
          binary_data: data.join,
          project: project
        }
      end
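
      # Illustrative round trip (values invented, \x00 is the NUL separator):
      #
      #   content_match = "master:app/user.rb\x007\x00def name\n"
      #
      # parses to:
      #
      #   { binary_filename: 'app/user.rb', binary_basename: 'app/user',
      #     ref: 'master', startline: 7, binary_data: "def name\n",
      #     project: project }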

      def repository
        @repository ||= project.repository
      end
    end
  end
end