gitlab-org--gitlab-foss/lib/gitlab/background_migration/recalculate_vulnerabilities_occurrences_uuid.rb

218 lines
7.6 KiB
Ruby

# frozen_string_literal: true
# rubocop: disable Style/Documentation
class Gitlab::BackgroundMigration::RecalculateVulnerabilitiesOccurrencesUuid # rubocop:disable Metrics/ClassLength
# rubocop: disable Gitlab/NamespacedClass
# Migration-local model mapped onto the "vulnerability_identifiers" table.
# It only declares the association back to the findings that use a row as
# their primary identifier.
class VulnerabilitiesIdentifier < ActiveRecord::Base
  self.table_name = 'vulnerability_identifiers'

  has_many :primary_findings,
           class_name: 'VulnerabilitiesFinding',
           foreign_key: 'primary_identifier_id',
           inverse_of: :primary_identifier
end
# Migration-local model mapped onto the "vulnerability_occurrences" table.
# Declares the batching helper, the report type enum, the SHA-backed
# attributes and the associations used while recalculating UUIDs.
class VulnerabilitiesFinding < ActiveRecord::Base
  include EachBatch
  include ShaAttribute

  self.table_name = 'vulnerability_occurrences'

  # Integer values backing the `report_type` enum.
  REPORT_TYPES = {
    sast: 0,
    dependency_scanning: 1,
    container_scanning: 2,
    dast: 3,
    secret_detection: 4,
    coverage_fuzzing: 5,
    api_fuzzing: 6,
    cluster_image_scanning: 7,
    generic: 99
  }.with_indifferent_access.freeze

  belongs_to :primary_identifier,
             class_name: 'VulnerabilitiesIdentifier',
             foreign_key: 'primary_identifier_id',
             inverse_of: :primary_findings

  has_many :signatures,
           class_name: 'VulnerabilityFindingSignature',
           foreign_key: 'finding_id',
           inverse_of: :finding

  enum report_type: REPORT_TYPES

  sha_attribute :fingerprint
  sha_attribute :location_fingerprint
end
# Migration-local model mapped onto the "vulnerability_finding_signatures"
# table; each row belongs to one finding and exposes a SHA-backed
# `signature_sha` attribute.
class VulnerabilityFindingSignature < ActiveRecord::Base
  include ShaAttribute

  self.table_name = 'vulnerability_finding_signatures'

  belongs_to :finding,
             class_name: 'VulnerabilitiesFinding',
             foreign_key: 'finding_id',
             inverse_of: :signatures

  sha_attribute :signature_sha
end
# Migration-local model mapped onto the "vulnerability_occurrence_pipelines"
# table, with batching support.
class VulnerabilitiesFindingPipeline < ActiveRecord::Base
  include EachBatch

  self.table_name = 'vulnerability_occurrence_pipelines'
end
# Migration-local model mapped onto the "vulnerabilities" table, with
# batching support.
class Vulnerability < ActiveRecord::Base
  include EachBatch

  self.table_name = 'vulnerabilities'
end
# Builds version 5 UUIDs for findings inside a namespace that is chosen
# from the current Rails environment.
class CalculateFindingUUID
  # Namespace UUID per Rails environment.
  FINDING_NAMESPACES_IDS = {
    development: "a143e9e2-41b3-47bc-9a19-081d089229f4",
    test: "a143e9e2-41b3-47bc-9a19-081d089229f4",
    staging: "a6930898-a1b2-4365-ab18-12aa474d9b26",
    production: "58dc0f06-936c-43b3-93bb-71693f1b6570"
  }.freeze

  # Splits a textual UUID into six hex groups of 8, 4, 4, 4, 4 and 8 digits.
  NAMESPACE_REGEX = /(\h{8})-(\h{4})-(\h{4})-(\h{4})-(\h{4})(\h{8})/.freeze

  # Big-endian widths (32, 16, 16, 16, 16, 32 bits) matching the groups above.
  PACK_PATTERN = "NnnnnN"

  class << self
    # Returns the v5 UUID of +value+ within the environment's namespace.
    def call(value)
      Digest::UUID.uuid_v5(namespace_id, value)
    end

    # Returns the environment's namespace UUID packed into a 16-byte binary
    # string. Raises KeyError when the environment has no configured namespace.
    def namespace_id
      textual_uuid = FINDING_NAMESPACES_IDS.fetch(Rails.env.to_sym)

      # Digest::UUID is broken when using an UUID in namespace_id
      # https://github.com/rails/rails/issues/37681#issue-520718028
      hex_groups = textual_uuid.scan(NAMESPACE_REGEX)
      numbers = hex_groups.flat_map { |groups| groups.map { |hex| hex.to_i(16) } }
      numbers.pack(PACK_PATTERN)
    end
  end
end
# rubocop: enable Gitlab/NamespacedClass
# rubocop: disable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength
# Recalculates the `uuid` column of the findings whose ids fall in
# start_id..end_id, working in batches of 50 records.
#
# For every batch it:
#   1. looks up duplicates of the batch records (see find_duplicates) and
#      removes them together with their associated records,
#   2. computes the UUIDv5 of each remaining finding, removing findings whose
#      computed UUID was already produced by an earlier finding of the batch,
#   3. writes the new UUIDs with a single bulk update.
#
# When the bulk update raises ActiveRecord::RecordNotUnique, the conflicting
# UUID is parsed from the error message, the finding currently holding that
# UUID is removed and the update step is retried.
#
# After all batches are processed the job is marked as succeeded. Any
# StandardError is logged and handed to
# Gitlab::ErrorTracking.track_and_raise_exception.
#
# @param start_id [Integer] first finding id of the range (inclusive)
# @param end_id [Integer] last finding id of the range (inclusive)
def perform(start_id, end_id)
log_info('Migration started', start_id: start_id, end_id: end_id)
# NOTE(review): `:fingerprint` is selected here and later read as the primary
# identifier fingerprint; presumably it resolves to the joined identifier's
# column -- confirm against the schema.
VulnerabilitiesFinding
.joins(:primary_identifier)
.includes(:signatures)
.select(:id, :report_type, :primary_identifier_id, :fingerprint, :location_fingerprint, :project_id, :created_at, :vulnerability_id, :uuid)
.where(id: start_id..end_id)
.each_batch(of: 50) do |relation|
# Drop duplicates first so that only the surviving findings get a new UUID.
duplicates = find_duplicates(relation)
remove_findings(ids: duplicates) if duplicates.present?
to_update = relation.reject { |finding| duplicates.include?(finding.id) }
# This begin block is the unit re-executed by `retry` below; the state
# built inside it (known_uuids, to_be_deleted, mappings) is rebuilt each time.
begin
known_uuids = Set.new
to_be_deleted = []
# Maps each finding to its new attributes; Set#add? returns nil when the
# UUID was already seen, which routes the finding to the deletion list.
mappings = to_update.each_with_object({}) do |finding, hash|
uuid = calculate_uuid_v5_for_finding(finding)
if known_uuids.add?(uuid)
hash[finding] = { uuid: uuid }
else
to_be_deleted << finding.id
end
end
# It is technically still possible to have duplicate uuids
# if the data integrity is broken somehow and the primary identifiers of
# the findings are pointing to different projects with the same fingerprint values.
if to_be_deleted.present?
log_info('Conflicting UUIDs found within the batch', finding_ids: to_be_deleted)
remove_findings(ids: to_be_deleted)
end
::Gitlab::Database::BulkUpdate.execute(%i[uuid], mappings) if mappings.present?
log_info('Recalculation is done', finding_ids: mappings.keys.pluck(:id))
rescue ActiveRecord::RecordNotUnique => error
log_info('RecordNotUnique error received')
# Extracts the 36-character UUID from a "(uuid)=(...)" fragment of the message.
match_data = /\(uuid\)=\((?<uuid>\S{36})\)/.match(error.message)
# This exception returns the **correct** UUIDv5 which probably comes from a later record
# and it's the one we can drop in the easiest way before retrying the UPDATE query
if match_data
uuid = match_data[:uuid]
log_info('Conflicting UUID found', uuid: uuid)
id = VulnerabilitiesFinding.find_by(uuid: uuid)&.id
remove_findings(ids: id) if id
# NOTE(review): this retry has no attempt counter; it also runs when no
# finding with the conflicting UUID was found -- confirm this cannot loop.
retry
else
log_error('Couldnt find conflicting uuid')
Gitlab::ErrorTracking.track_and_raise_exception(error)
end
end
end
mark_job_as_succeeded(start_id, end_id)
rescue StandardError => error
log_error('An exception happened')
Gitlab::ErrorTracking.track_and_raise_exception(error)
end
# rubocop: enable Metrics/AbcSize,Metrics/MethodLength,Metrics/BlockLength
private
# Returns the ids of findings that share report type, location fingerprint,
# primary identifier and project with a record of +relation+.
#
# Ids of records that were already visited are excluded from later lookups,
# so one finding of each duplicate group is never reported. The same id can
# appear more than once in the result.
def find_duplicates(relation)
  visited_ids = []

  relation.flat_map do |finding|
    # Assuming we're scanning id 31 and the duplicate is id 40
    # first we'd process 31 and add 40 to the list of ids to remove
    # then we would process record 40 and add 31 to the list of removals
    # so we would drop both records
    visited_ids << finding.id

    same_finding = VulnerabilitiesFinding.where(
      report_type: finding.report_type,
      location_fingerprint: finding.location_fingerprint,
      primary_identifier_id: finding.primary_identifier_id,
      project_id: finding.project_id
    )

    same_finding.where.not(id: visited_ids).pluck(:id)
  end
end
# Deletes the findings with the given ids together with their pipeline rows
# and the vulnerabilities they reference.
#
# @param ids [Integer, Array<Integer>] a single finding id or a list of ids
def remove_findings(ids:)
  finding_ids = Array(ids)
  log_info('Removing Findings and associated records', ids: finding_ids)

  # Collected before anything is deleted, as the findings are removed last.
  vulnerability_ids = VulnerabilitiesFinding
    .where(id: finding_ids)
    .pluck(:vulnerability_id)
    .compact
    .uniq

  VulnerabilitiesFindingPipeline.where(occurrence_id: finding_ids).each_batch do |pipelines|
    pipelines.delete_all
  end

  Vulnerability.where(id: vulnerability_ids).each_batch do |vulnerabilities|
    vulnerabilities.delete_all
  end

  VulnerabilitiesFinding.where(id: finding_ids).delete_all
end
# Computes the UUIDv5 of a finding from its report type, primary identifier
# fingerprint, location fingerprint and project id, joined with "-".
#
# The location component is the `signature_sha` of the signature with the
# highest raw algorithm type; it falls back to the finding's own
# `location_fingerprint` when there is no signature or that SHA is blank (nil).
#
# Returns nil when no finding is given.
def calculate_uuid_v5_for_finding(vulnerability_finding)
  return unless vulnerability_finding

  ordered_signatures = vulnerability_finding.signatures.sort_by do |signature|
    signature.algorithm_type_before_type_cast
  end
  preferred_signature = ordered_signatures.last
  location = preferred_signature&.signature_sha || vulnerability_finding.location_fingerprint

  components = [
    vulnerability_finding.report_type,
    vulnerability_finding.fingerprint,
    location,
    vulnerability_finding.project_id
  ]

  CalculateFindingUUID.call(components.join('-'))
end
# Writes an info-level entry tagged with this migration's name; any extra
# keyword arguments are added to the logged payload.
def log_info(message, **extra)
  base_payload = { migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message }

  logger.info(**base_payload, **extra)
end
# Writes an error-level entry tagged with this migration's name; any extra
# keyword arguments are added to the logged payload.
def log_error(message, **extra)
  base_payload = { migrator: 'RecalculateVulnerabilitiesOccurrencesUuid', message: message }

  logger.error(**base_payload, **extra)
end
# Returns the background migration logger, building it on first use and
# reusing the same instance afterwards.
def logger
  return @logger if @logger

  @logger = Gitlab::BackgroundMigration::Logger.build
end
# Flags the tracked background migration job(s) for this migration and the
# given arguments as succeeded.
#
# @param arguments [Array] the job arguments, passed on as a single array
def mark_job_as_succeeded(*arguments)
  migration_name = 'RecalculateVulnerabilitiesOccurrencesUuid'

  Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(migration_name, arguments)
end
end