gitlab-org--gitlab-foss/lib/gitlab/blob_helper.rb

# frozen_string_literal: true

# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
module Gitlab
  module BlobHelper
    include Gitlab::Utils::StrongMemoize

    # Returns the extension of the blob's name as reported by File.extname
    # (leading dot included), or "" when there is none. The name is coerced
    # with to_s first, so a nil name yields "".
    def extname
      filename = name.to_s
      File.extname(filename)
    end

    # Returns true when this blob's extension (as returned by extname) is
    # among the extensions listed by LanguageData.
    def known_extension?
      recognized = LanguageData.extensions
      recognized.include?(extname)
    end

    # A blob is viewable when it is not large and its repository content is
    # text. The text check is skipped entirely for large blobs.
    def viewable?
      return false if large?

      text_in_repo?
    end

    # Size threshold in bytes (1024 * 1024) used by large?.
    MEGABYTE = 1024**2

    # Returns true when the blob's size, coerced with to_i, is strictly
    # greater than MEGABYTE. A nil size coerces to 0 and is never large.
    def large?
      byte_count = size.to_i
      byte_count > MEGABYTE
    end

    # Decides whether the blob's repository content should be treated as
    # binary.
    #
    # Returns true or false.
    def binary_in_repo?
      # No data at all (large blobs aren't even loaded into memory).
      return true if data.nil?

      # Blank files are treated as text.
      return false if data == ""

      # Charlock doesn't know what to think, so assume binary.
      return true if encoding.nil?

      # Otherwise trust the detected type.
      find_encoding[:type] == :binary
    end

    # Logical inverse of binary_in_repo?.
    def text_in_repo?
      return false if binary_in_repo?

      true
    end

    # Returns true when the blob's extension, compared case-insensitively via
    # downcase, is one of the listed image extensions.
    def image?
      extension = extname.downcase
      %w[.png .jpg .jpeg .gif .svg .webp].include?(extension)
    end

    # Internal: Look up the MIME type for the blob's extension.
    #
    # The lookup runs once; its result is cached in @_mime_type, including a
    # nil result when no type matches the extension.
    #
    # Returns a MIME::Type, or nil when there are no guesses.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def _mime_type
      return @_mime_type if defined?(@_mime_type)

      candidates = ::MIME::Types.type_for(extname.to_s)

      # Prefer text mime types over binary; fall back to the first guess.
      @_mime_type = candidates.find(&:ascii?) || candidates.first
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Public: Get the blob's mime type as a String.
    #
    # Uses the type found by _mime_type, converted with to_s; falls back to
    # 'text/plain' when no type was found.
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String.
    def mime_type
      detected = _mime_type
      return 'text/plain' unless detected

      detected.to_s
    end

    # Returns whatever the looked-up mime type reports for binary?, or false
    # when no mime type was found for the extension.
    def binary_mime_type?
      detected = _mime_type
      return false unless detected

      detected.binary?
    end

    # Splits the blob's content into lines and memoizes the result in @lines.
    #
    # Returns an Array of Strings: empty when the blob is not viewable or has
    # no data, otherwise the pieces of `data` between newline sequences
    # (the -1 limit keeps trailing empty strings).
    def lines
      return @lines if @lines
      return @lines = [] unless viewable? && data

      # `data` is usually tagged ASCII-8BIT even when its content was detected
      # as some other encoding, and every entry of `lines` is implicitly
      # promised to carry the same encoding as `data`, so `data` itself must
      # not be re-encoded.
      #
      # Rather than converting `data`, the newline sequences are converted:
      # encoded_newlines_re encodes each one in the detected encoding and then
      # forces it back to the encoding of `data`. The bytes therefore match
      # how newlines are likely stored in the file, while `data` keeps its
      # encoding and no (potentially large) string has to be duplicated.
      @lines =
        begin
          data.split(encoded_newlines_re, -1)
        rescue Encoding::ConverterNotFoundError
          # The data is not splittable in the detected encoding. Assume it's
          # one big line.
          [data]
        end
    end

    # Computes the content type string for the blob and memoizes it in
    # @content_type.
    #
    # Returns mime_type when either the extension's mime type or the
    # repository content is binary. Otherwise returns "text/plain", with a
    # "; charset=<encoding>" suffix (encoding lower-cased) when an encoding
    # was detected.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary_in_repo?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Builds, once per object via strong_memoize, the Regexp used to split
    # `data` into lines.
    #
    # When both a detected ruby encoding and the encoding of `data` are
    # available, each newline sequence is encoded from ASCII-8BIT into the
    # detected encoding and then re-tagged with the encoding of `data`;
    # otherwise the plain "\r\n", "\r" and "\n" sequences are used.
    #
    # Returns a Regexp matching any of the three newline sequences.
    def encoded_newlines_re
      strong_memoize(:encoded_newlines_re) do
        source_encoding = data&.encoding
        separators = ["\r\n", "\r", "\n"]

        if ruby_encoding && source_encoding
          separators = separators.map do |separator|
            separator.encode(ruby_encoding, "ASCII-8BIT").force_encoding(source_encoding)
          end
        end

        Regexp.union(separators)
      end
    end

    # Returns the :ruby_encoding entry of the encoding detection result, or
    # nil when there is no detection result.
    def ruby_encoding
      detection = find_encoding
      detection[:ruby_encoding] if detection
    end

    # Returns the :encoding entry of the encoding detection result, or nil
    # when there is no detection result.
    def encoding
      detection = find_encoding
      detection[:encoding] if detection
    end

    # Returns true when the blob has no data: either nil or an empty string.
    def empty?
      return true if data.nil?

      data == ""
    end

    private

    # Runs encoding detection on `data` and caches a truthy result in
    # @find_encoding. Returns nil without consulting the cache when there is
    # no data. A nil detection result is not cached, so detection is retried
    # on the next call.
    def find_encoding
      return unless data

      @find_encoding ||= Gitlab::EncodingHelper.detect_encoding(data) # rubocop:disable Gitlab/ModuleWithInstanceVariables
    end
  end
end
Enable frozen string for lib/gitlab/*.rb 2018-10-22 07:00:50 +00:00			`# frozen_string_literal: true`

Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb`
			`module Gitlab`
			`module BlobHelper`
Add latest changes from gitlab-org/gitlab@master 2020-04-23 09:09:46 +00:00			`include Gitlab::Utils::StrongMemoize`

Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`def extname`
			`File.extname(name.to_s)`
			`end`

			`def known_extension?`
			`LanguageData.extensions.include?(extname)`
			`end`

			`def viewable?`
Fixing image lfs bug and also displaying text lfs This commit, introduced in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/23812, fixes a problem creating and displaying image diff notes when the image is stored in LFS. The main problem was that `Gitlab::Diff::File` was returning an invalid value in `text?` for this kind of file. It also fixes a rendering problem with other LFS files, like text ones. The LFS pointers shouldn't be shown when LFS is enabled for the project, but they were. 2018-12-13 17:49:05 +00:00			`!large? && text_in_repo?`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`

			`MEGABYTE = 1024 * 1024`

			`def large?`
			`size.to_i > MEGABYTE`
			`end`

Fixing image lfs bug and also displaying text lfs This commit, introduced in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/23812, fixes a problem creating and displaying image diff notes when the image is stored in LFS. The main problem was that `Gitlab::Diff::File` was returning an invalid value in `text?` for this kind of file. It also fixes a rendering problem with other LFS files, like text ones. The LFS pointers shouldn't be shown when LFS is enabled for the project, but they were. 2018-12-13 17:49:05 +00:00			`def binary_in_repo?`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`# Large blobs aren't even loaded into memory`
			`if data.nil?`
			`true`

			`# Treat blank files as text`
			`elsif data == ""`
			`false`

			`# Charlock doesn't know what to think`
			`elsif encoding.nil?`
			`true`

			`# If Charlock says its binary`
			`else`
Add latest changes from gitlab-org/gitlab@master 2021-05-06 12:10:38 +00:00			`find_encoding[:type] == :binary`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`
			`end`

Fixing image lfs bug and also displaying text lfs This commit, introduced in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/23812, fixes a problem creating and displaying image diff notes when the image is stored in LFS. The main problem was that `Gitlab::Diff::File` was returning an invalid value in `text?` for this kind of file. It also fixes a rendering problem with other LFS files, like text ones. The LFS pointers shouldn't be shown when LFS is enabled for the project, but they were. 2018-12-13 17:49:05 +00:00			`def text_in_repo?`
			`!binary_in_repo?`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`

			`def image?`
Add latest changes from gitlab-org/gitlab@master 2021-11-03 03:13:15 +00:00			`['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'].include?(extname.downcase)`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`

			`# Internal: Lookup mime type for extension.`
			`#`
			`# Returns a MIME::Type`
			`# rubocop:disable Gitlab/ModuleWithInstanceVariables`
			`def _mime_type`
			`if defined? @_mime_type`
			`@_mime_type`
			`else`
			`guesses = ::MIME::Types.type_for(extname.to_s)`

			`# Prefer text mime types over binary`
			`@_mime_type = guesses.detect { \|type\| type.ascii? } \|\| guesses.first`
			`end`
			`end`
			`# rubocop:enable Gitlab/ModuleWithInstanceVariables`

			`# Public: Get the actual blob mime type`
			`#`
			`# Examples`
			`#`
			`# # => 'text/plain'`
			`# # => 'text/html'`
			`#`
			`# Returns a mime type String.`
			`def mime_type`
			`_mime_type ? _mime_type.to_s : 'text/plain'`
			`end`

			`def binary_mime_type?`
			`_mime_type ? _mime_type.binary? : false`
			`end`

			`def lines`
			`@lines \|\|=`
			`if viewable? && data`
			# `data` is usually encoded as ASCII-8BIT even when the content has
			`# been detected as a different encoding. However, we are not allowed`
			# to change the encoding of `data` because we've made the implicit
			# guarantee that each entry in `lines` is encoded the same way as
			# `data`.
			`#`
			`# Instead, we re-encode each possible newline sequence as the`
			# detected encoding, then force them back to the encoding of `data`
			`# (usually a binary encoding like ASCII-8BIT). This means that the`
			`# byte sequence will match how newlines are likely encoded in the`
			# file, but we don't have to change the encoding of `data` as far as
			`# Ruby is concerned. This allows us to correctly parse out each line`
			# without changing the encoding of `data`, and
			`# also--importantly--without having to duplicate many (potentially`
			`# large) strings.`
			`begin`
			`data.split(encoded_newlines_re, -1)`
			`rescue Encoding::ConverterNotFoundError`
			`# The data is not splittable in the detected encoding. Assume it's`
			`# one big line.`
			`[data]`
			`end`
			`else`
			`[]`
			`end`
			`end`

			`def content_type`
			`# rubocop:disable Style/MultilineTernaryOperator`
			`# rubocop:disable Style/NestedTernaryOperator`
Fixing image lfs bug and also displaying text lfs This commit, introduced in https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/23812, fixes a problem creating and displaying image diff notes when the image is stored in LFS. The main problem was that `Gitlab::Diff::File` was returning an invalid value in `text?` for this kind of file. It also fixes a rendering problem with other LFS files, like text ones. The LFS pointers shouldn't be shown when LFS is enabled for the project, but they were. 2018-12-13 17:49:05 +00:00			`@content_type \|\|= binary_mime_type? \|\| binary_in_repo? ? mime_type :`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`(encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")`
			`# rubocop:enable Style/NestedTernaryOperator`
			`# rubocop:enable Style/MultilineTernaryOperator`
			`end`

			`def encoded_newlines_re`
Add latest changes from gitlab-org/gitlab@master 2020-04-23 09:09:46 +00:00			`strong_memoize(:encoded_newlines_re) do`
			`newlines = ["\r\n", "\r", "\n"]`
			`data_encoding = data&.encoding`

			`if ruby_encoding && data_encoding`
			`newlines.map! do \|nl\|`
			`nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data_encoding)`
			`end`
			`end`

			`Regexp.union(newlines)`
			`end`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`

			`def ruby_encoding`
Add latest changes from gitlab-org/gitlab@master 2021-05-06 12:10:38 +00:00			`if hash = find_encoding`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`hash[:ruby_encoding]`
			`end`
			`end`

			`def encoding`
Add latest changes from gitlab-org/gitlab@master 2021-05-06 12:10:38 +00:00			`if hash = find_encoding`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`hash[:encoding]`
			`end`
			`end`

			`def empty?`
			`data.nil? \|\| data == ""`
			`end`
Add latest changes from gitlab-org/gitlab@master 2021-05-06 12:10:38 +00:00
			`private`

			`def find_encoding`
			`@find_encoding \|\|= Gitlab::EncodingHelper.detect_encoding(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables`
			`end`
Remove dependencies on Linguist This saves about 128 MB of baseline RAM usage per Unicorn and Sidekiq process (!). Linguist wasn't detecting languages anymore from CE/EE since 9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper was still being depended on by BlobLike and others. This removes the Linguist gem, given it isn't required anymore. EscapeUtils were pulled in as dependency, but given Banzai depends on it, it is now added explicitly. Previously, Linguist was used to detect the best ACE mode. Instead, we rely on ACE to guess the best mode based on the file extension. 2018-08-03 13:24:26 +00:00			`end`
			`end`