# frozen_string_literal: true

module Gitlab
  module Sanitizers
    # Strips EXIF metadata from JPEG/TIFF images by shelling out to
    # `exiftool` through Gitlab::Popen.
    #
    # Tags listed in WHITELISTED_TAGS are copied back into the rewritten
    # image, and the IPTC / XMP-iptcExt groups are excluded from inspection
    # and preserved on rewrite. An image is only rewritten when it carries at
    # least one tag that is in neither WHITELISTED_TAGS nor IGNORED_TAGS.
    class Exif
      # these tags are not removed from the image
      WHITELISTED_TAGS = %w[
        ResolutionUnit
        XResolution
        YResolution
        YCbCrSubSampling
        YCbCrPositioning
        BitsPerSample
        ImageHeight
        ImageWidth
        ImageSize
        Copyright
        CopyrightNotice
        Orientation
      ].freeze

      # these tags are common in exiftool output, these
      # do not contain any sensitive information, but
      # we don't need to preserve them when removing
      # exif tags
      IGNORED_TAGS = %w[
        ColorComponents
        EncodingProcess
        ExifByteOrder
        ExifToolVersion
        JFIFVersion
        Directory
        FileAccessDate
        FileInodeChangeDate
        FileModifyDate
        FileName
        FilePermissions
        FileSize
        SourceFile
        Megapixels
        FileType
        FileTypeExtension
        MIMEType
      ].freeze

      # Tags whose presence alone does not trigger a rewrite.
      ALLOWED_TAGS = (WHITELISTED_TAGS + IGNORED_TAGS).freeze

      # exiftool `-TAG` arguments that copy each whitelisted tag back from
      # the source file after `-all=` has wiped everything.
      EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" }.freeze

      # Mime types (as detected from file content) that may be sanitized.
      ALLOWED_MIME_TYPES = %w[image/jpeg image/tiff].freeze

      attr_reader :logger

      # logger - object responding to #info, #error and #debug.
      def initialize(logger: Gitlab::AppLogger)
        @logger = logger
      end

      # Sanitizes every Upload whose path ends in .jpg, .jpeg or .tiff
      # (case-insensitive).
      #
      # start_id   - first Upload id to process (passed to find_each as :start)
      # stop_id    - last Upload id to process; defaults to the id of
      #              Upload.last when blank
      # dry_run    - when true only logs what would be removed
      # sleep_time - if given, passed to `sleep` after each processed upload
      # uploader   - if given, restricts the relation to this uploader value
      # since      - if given, restricts to uploads with created_at > since
      #
      # A StandardError raised while processing one upload is logged and
      # does not stop the iteration.
      #
      # rubocop: disable CodeReuse/ActiveRecord
      def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil, uploader: nil, since: nil)
        relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?',
                                '%.jpg', '%.jpeg', '%.tiff')
        relation = relation.where(uploader: uploader) if uploader
        relation = relation.where('created_at > ?', since) if since

        logger.info "running in dry run mode, no images will be rewritten" if dry_run

        find_params = {
          start: start_id.present? ? start_id.to_i : nil,
          finish: stop_id.present? ? stop_id.to_i : Upload.last&.id,
          batch_size: 1000
        }

        relation.find_each(**find_params) do |upload|
          clean(upload.retrieve_uploader, dry_run: dry_run)
          sleep sleep_time if sleep_time
        rescue StandardError => err
          logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}"
          logger.debug err.backtrace.join("\n ")
        end
      end
      # rubocop: enable CodeReuse/ActiveRecord

      # Sanitizes the file behind a single uploader.
      #
      # The upload is copied into a temporary directory and inspected; if it
      # carries tags outside ALLOWED_TAGS and dry_run is false, the tags are
      # removed and the result is stored back through `uploader.store!`.
      #
      # Raises if the content's mime type is not in ALLOWED_MIME_TYPES or if
      # exiftool fails.
      def clean(uploader, dry_run: true)
        Dir.mktmpdir('gitlab-exif') do |tmpdir|
          src_path = fetch_upload_to_file(uploader, tmpdir)

          to_remove = extra_tags(src_path)

          if to_remove.empty?
            logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping"
            break
          end

          logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}"

          break if dry_run

          remove_and_store(tmpdir, src_path, uploader)
        end
      end

      # Sanitizes a file that already exists on disk, rewriting it in place.
      #
      # src_path             - path of the image to sanitize
      # dry_run              - when true only logs what would be removed
      # content              - file content used for mime type detection;
      #                        read from src_path when not given
      # skip_unallowed_types - when true, files with an unsupported mime type
      #                        are silently skipped instead of raising
      def clean_existing_path(src_path, dry_run: false, content: nil, skip_unallowed_types: false)
        content ||= File.read(src_path)

        if skip_unallowed_types
          return unless check_for_allowed_types(content, raise_error: false)
        else
          check_for_allowed_types(content)
        end

        to_remove = extra_tags(src_path)

        if to_remove.empty?
          logger.info "#{src_path}: only whitelisted tags present, skipping"
          return
        end

        logger.info "#{src_path}: found exif tags to remove: #{to_remove}"
        return if dry_run

        exec_remove_exif!(src_path)
      end

      private

      # Names of the tags reported by exiftool that are not in ALLOWED_TAGS.
      def extra_tags(path)
        exif_tags(path).keys - ALLOWED_TAGS
      end

      # Strips the tags from src_path and hands the rewritten file back to
      # the uploader for storage.
      def remove_and_store(tmpdir, src_path, uploader)
        exec_remove_exif!(src_path)
        logger.info "#{upload_ref(uploader.upload)}: exif removed, storing"
        File.open(src_path, 'r') { |f| uploader.store!(f) }
      end

      # Runs exiftool to rewrite `path` without its metadata and sanity
      # checks the result.
      #
      # Raises if exiftool exits non-zero, if the rewritten file is empty, or
      # if it has exactly the same size as the backup exiftool left behind.
      def exec_remove_exif!(path)
        # IPTC and XMP-iptcExt groups may keep copyright information so
        # we always preserve them
        cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "exiftool return code is #{status}: #{output}" if status != 0

        raise "size of file is 0" if File.size(path).zero?

        # exiftool creates backup of the original file in filename_original
        old_path = "#{path}_original"
        # NOTE(review): presumably an unchanged size means nothing was
        # actually stripped - confirm the intent of this check.
        raise "size of sanitized file is same as original size" if File.size(path) == File.size(old_path)
      end

      # Writes the uploader's content into `dir` and returns the file path.
      #
      # Raises if the content's mime type is not in ALLOWED_MIME_TYPES.
      def fetch_upload_to_file(uploader, dir)
        # upload is stored into the file with the original name - this filename
        # is used by carrierwave when storing the file back to the storage
        filename = File.join(dir, uploader.filename)
        contents = uploader.read

        check_for_allowed_types(contents)

        File.binwrite(filename, contents)

        filename
      end

      # Returns whether the mime type detected from `contents` is in
      # ALLOWED_MIME_TYPES. Raises instead of returning false when
      # raise_error is true.
      def check_for_allowed_types(contents, raise_error: true)
        mime_type = Gitlab::Utils::MimeType.from_string(contents)

        allowed = ALLOWED_MIME_TYPES.include?(mime_type)
        if !allowed && raise_error
          raise "File type #{mime_type} not supported. Only supports #{ALLOWED_MIME_TYPES.join(", ")}."
        end

        allowed
      end

      # Short "id:path" identifier used in log messages.
      def upload_ref(upload)
        "#{upload.id}:#{upload.path}"
      end

      # Returns the tag name => value hash exiftool reports for `path`
      # (IPTC and XMP-iptcExt groups excluded).
      #
      # Raises if exiftool exits non-zero or reports no entry for the file.
      def exif_tags(path)
        cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path]
        output, status = Gitlab::Popen.popen(cmd)

        raise "failed to get exif tags: #{output}" if status != 0

        # `-j` output is a JSON array with one object per file; an empty
        # array would otherwise surface later as NoMethodError on nil.
        tags = Gitlab::Json.parse(output).first
        raise "failed to get exif tags: exiftool returned no entries" unless tags

        tags
      end
    end
  end
end