gitlab-org--gitlab-foss/spec/lib/gitlab/path_regex_spec.rb

# frozen_string_literal: true

require 'spec_helper'

RSpec.describe Gitlab::PathRegex do
  let(:starting_with_namespace) { %r{^/\*namespace_id/:(project_)?id} }
  let(:non_param_parts) { %r{[^:*][a-z\-_/]*} }
  let(:any_other_path_part) { %r{[a-z\-_/:]*} }
  let(:wildcard_segment) { /\*/ }

  # Pass in a full path to remove the format segment:
  # `/ci/lint(.:format)` -> `/ci/lint`
  def without_format(path)
    path.split('(', 2)[0]
  end

  # Pass in a full path and get the last segment before a wildcard
  # That's not a parameter
  # `/*namespace_id/:project_id/builds/artifacts/*ref_name_and_path`
  #    -> 'builds/artifacts'
  def path_before_wildcard(path)
    path = path.gsub(starting_with_namespace, "")
    path_segments = path.split('/').reject(&:empty?)
    wildcard_index = path_segments.index { |segment| parameter?(segment) }

    segments_before_wildcard = path_segments[0..wildcard_index - 1]

    segments_before_wildcard.join('/')
  end

  def parameter?(segment)
    segment =~ /[*:]/
  end

  # If the path is reserved. Then no conflicting paths can# be created for any
  # route using this reserved word.
  #
  # Both `builds/artifacts` & `build` are covered by reserving the word
  # `build`
  def wildcards_include?(path)
    described_class::PROJECT_WILDCARD_ROUTES.include?(path) ||
      described_class::PROJECT_WILDCARD_ROUTES.include?(path.split('/').first)
  end

  def failure_message(constant_name, migration_helper, missing_words: [], additional_words: [])
    missing_words = Array(missing_words)
    additional_words = Array(additional_words)
    message = ""
    if missing_words.any?
      message += <<-MISSING
      Found new routes that could cause conflicts with existing namespaced routes
      for groups or projects.

      Nest <#{missing_words.join(', ')}> in a route containing `-`, that way
      we know there will be no conflicts with groups or projects created with those
      paths.

      MISSING
    end

    if additional_words.any?
      message += <<-ADDITIONAL
      Is <#{additional_words.join(', ')}> in `#{constant_name}` required?
      If they are really required, update these specs to reflect that.

      ADDITIONAL
    end

    message
  end

  let(:all_non_legacy_routes) do
    route_set = Rails.application.routes
    routes_collection = route_set.routes
    routes_array = routes_collection.routes

    non_legacy_routes = routes_array.reject do |route|
      route.name.to_s =~ /legacy_(\w*)_redirect/
    end

    non_deprecated_redirect_routes = non_legacy_routes.reject do |route|
      app = route.app
      # `app.app` is either another app, or `self`. We want to find the final app.
      app = app.app while app.try(:app) && app.app != app

      app.is_a?(ActionDispatch::Routing::PathRedirect) && app.block.include?('/-/')
    end

    non_deprecated_redirect_routes.map { |route| route.path.spec.to_s }
  end

  let(:routes_without_format) { all_non_legacy_routes.map { |path| without_format(path) } }

  # Routes not starting with `/:` or `/*`
  # all routes not starting with a param
  let(:routes_not_starting_in_wildcard) { routes_without_format.select { |p| p !~ %r{^/[:*]} } }

  let(:top_level_words) do
    routes_not_starting_in_wildcard
      .map { |route| route.split('/')[1] }
      .concat(ee_top_level_words)
      .concat(files_in_public)
      .concat(Array(API::API.prefix.to_s))
      .concat(sitemap_words)
      .concat(deprecated_routes)
      .compact
      .uniq
  end

  let(:sitemap_words) do
    %w(sitemap sitemap.xml sitemap.xml.gz)
  end

  let(:deprecated_routes) do
    # profile was deprecated in https://gitlab.com/gitlab-org/gitlab/-/merge_requests/51646
    %w(profile)
  end

  let(:ee_top_level_words) do
    %w(unsubscribes v2)
  end

  let(:files_in_public) do
    git = Gitlab.config.git.bin_path
    tracked = `cd #{Rails.root} && #{git} ls-files public`
      .split("\n")
      .map { |entry| entry.start_with?('public/-/') ? '-' : entry.gsub('public/', '') }
      .uniq
    tracked + %w(assets uploads)
  end

  # All routes that start with a namespaced path, that have 1 or more
  # path-segments before having another wildcard parameter.
  # - Starting with paths:
  #   - `/*namespace_id/:project_id/`
  #   - `/*namespace_id/:id/`
  # - Followed by one or more path-parts not starting with `:` or `*`
  # - Followed by a path-part that includes a wildcard parameter `*`
  # At the time of writing these routes match: http://rubular.com/r/Rv2pDE5Dvw
  let(:namespaced_wildcard_routes) do
    routes_without_format.select do |p|
      p =~ %r{#{starting_with_namespace}/#{non_param_parts}/#{any_other_path_part}#{wildcard_segment}}
    end
  end

  # This will return all paths that are used in a namespaced route
  # before another wildcard path:
  #
  # /*namespace_id/:project_id/builds/artifacts/*ref_name_and_path
  # /*namespace_id/:project_id/info/lfs/objects/*oid
  # /*namespace_id/:project_id/commits/*id
  # /*namespace_id/:project_id/builds/:build_id/artifacts/file/*path
  # -> ['builds/artifacts', 'info/lfs/objects', 'commits', 'artifacts/file']
  let(:all_wildcard_paths) do
    namespaced_wildcard_routes.map do |route|
      path_before_wildcard(route)
    end.uniq
  end

  let(:starting_with_group) { %r{^/groups/\*(group_)?id/} }
  let(:group_routes) do
    routes_without_format.grep(starting_with_group)
  end

  let(:paths_after_group_id) do
    group_routes.map do |route|
      route.gsub(starting_with_group, '').split('/').first
    end.uniq
  end

  describe 'TOP_LEVEL_ROUTES' do
    it 'includes all the top level namespaces' do
      failure_block = lambda do
        missing_words = top_level_words - described_class::TOP_LEVEL_ROUTES
        additional_words = described_class::TOP_LEVEL_ROUTES - top_level_words
        failure_message('TOP_LEVEL_ROUTES', 'rename_root_paths',
                        missing_words: missing_words, additional_words: additional_words)
      end

      expect(described_class::TOP_LEVEL_ROUTES)
        .to contain_exactly(*top_level_words), failure_block
    end

    # We ban new items in this list, see https://gitlab.com/gitlab-org/gitlab/-/issues/215362
    it 'does not allow expansion' do
      expect(described_class::TOP_LEVEL_ROUTES.size).to eq(40)
    end
  end

  describe 'GROUP_ROUTES' do
    it "don't contain a second wildcard" do
      failure_block = lambda do
        missing_words = paths_after_group_id - described_class::GROUP_ROUTES
        additional_words = described_class::GROUP_ROUTES - paths_after_group_id
        failure_message('GROUP_ROUTES', 'rename_child_paths',
                        missing_words: missing_words, additional_words: additional_words)
      end

      expect(described_class::GROUP_ROUTES)
        .to contain_exactly(*paths_after_group_id), failure_block
    end

    # We ban new items in this list, see https://gitlab.com/gitlab-org/gitlab/-/issues/215362
    it 'does not allow expansion' do
      expect(described_class::GROUP_ROUTES.size).to eq(1)
    end
  end

  describe 'PROJECT_WILDCARD_ROUTES' do
    it 'includes all paths that can be used after a namespace/project path' do
      aggregate_failures do
        all_wildcard_paths.each do |path|
          expect(wildcards_include?(path))
            .to be(true), failure_message('PROJECT_WILDCARD_ROUTES', 'rename_wildcard_paths', missing_words: path)
        end
      end
    end

    # We ban new items in this list, see https://gitlab.com/gitlab-org/gitlab/-/issues/215362
    it 'does not allow expansion' do
      expect(described_class::PROJECT_WILDCARD_ROUTES.size).to eq(21)
    end
  end

  describe '.root_namespace_route_regex' do
    subject { %r{\A#{described_class.root_namespace_route_regex}/\z} }

    it 'rejects top level routes' do
      expect(subject).not_to match('admin/')
      expect(subject).not_to match('api/')
      expect(subject).not_to match('.well-known/')
      expect(subject).not_to match('sitemap.xml/')
      expect(subject).not_to match('sitemap.xml.gz/')
    end

    it 'accepts project wildcard routes' do
      expect(subject).to match('blob/')
      expect(subject).to match('edit/')
      expect(subject).to match('wikis/')
    end

    it 'accepts group routes' do
      expect(subject).to match('activity/')
    end

    it 'is not case sensitive' do
      expect(subject).not_to match('Users/')
    end

    it 'does not allow extra slashes' do
      expect(subject).not_to match('/blob/')
      expect(subject).not_to match('blob//')
    end
  end

  describe '.full_namespace_path_regex' do
    subject { described_class.full_namespace_path_regex }

    context 'at the top level' do
      context 'when the final level' do
        it 'rejects top level routes' do
          expect(subject).not_to match('admin/')
          expect(subject).not_to match('api/')
          expect(subject).not_to match('.well-known/')
        end

        it 'accepts project wildcard routes' do
          expect(subject).to match('blob/')
          expect(subject).to match('edit/')
          expect(subject).to match('wikis/')
        end

        it 'accepts group routes' do
          expect(subject).to match('activity/')
        end
      end

      context 'when more levels follow' do
        it 'rejects top level routes' do
          expect(subject).not_to match('admin/more/')
          expect(subject).not_to match('api/more/')
          expect(subject).not_to match('.well-known/more/')
        end

        it 'accepts project wildcard routes' do
          expect(subject).to match('blob/more/')
          expect(subject).to match('edit/more/')
          expect(subject).to match('wikis/more/')
          expect(subject).to match('environments/folders/')
          expect(subject).to match('info/lfs/objects/')
        end

        it 'accepts group routes' do
          expect(subject).to match('activity/more/')
        end
      end
    end

    context 'at the second level' do
      context 'when the final level' do
        it 'accepts top level routes' do
          expect(subject).to match('root/admin/')
          expect(subject).to match('root/api/')
          expect(subject).to match('root/.well-known/')
        end

        it 'rejects project wildcard routes' do
          expect(subject).not_to match('root/blob/')
          expect(subject).not_to match('root/edit/')
          expect(subject).not_to match('root/wikis/')
          expect(subject).not_to match('root/environments/folders/')
          expect(subject).not_to match('root/info/lfs/objects/')
        end

        it 'rejects group routes' do
          expect(subject).not_to match('root/-/')
        end
      end

      context 'when more levels follow' do
        it 'accepts top level routes' do
          expect(subject).to match('root/admin/more/')
          expect(subject).to match('root/api/more/')
          expect(subject).to match('root/.well-known/more/')
        end

        it 'rejects project wildcard routes' do
          expect(subject).not_to match('root/blob/more/')
          expect(subject).not_to match('root/edit/more/')
          expect(subject).not_to match('root/wikis/more/')
          expect(subject).not_to match('root/environments/folders/more/')
          expect(subject).not_to match('root/info/lfs/objects/more/')
        end

        it 'rejects group routes' do
          expect(subject).not_to match('root/-/')
        end
      end
    end

    it 'is not case sensitive' do
      expect(subject).not_to match('root/Blob/')
    end

    it 'does not allow extra slashes' do
      expect(subject).not_to match('/root/admin/')
      expect(subject).not_to match('root/admin//')
    end
  end

  describe '.project_route_regex' do
    subject { %r{\A#{described_class.project_route_regex}/\z} }

    it 'accepts top level routes' do
      expect(subject).to match('admin/')
      expect(subject).to match('api/')
      expect(subject).to match('.well-known/')
    end

    it 'rejects project wildcard routes' do
      expect(subject).not_to match('blob/')
      expect(subject).not_to match('edit/')
      expect(subject).not_to match('wikis/')
      expect(subject).not_to match('environments/folders/')
      expect(subject).not_to match('info/lfs/objects/')
    end

    it 'accepts group routes' do
      expect(subject).to match('analytics/')
    end

    it 'is not case sensitive' do
      expect(subject).not_to match('Blob/')
    end

    it 'does not allow extra slashes' do
      expect(subject).not_to match('/admin/')
      expect(subject).not_to match('admin//')
    end
  end

  describe '.full_project_path_regex' do
    subject { described_class.full_project_path_regex }

    it 'accepts top level routes' do
      expect(subject).to match('root/admin/')
      expect(subject).to match('root/api/')
      expect(subject).to match('root/.well-known/')
    end

    it 'rejects project wildcard routes' do
      expect(subject).not_to match('root/blob/')
      expect(subject).not_to match('root/edit/')
      expect(subject).not_to match('root/wikis/')
      expect(subject).not_to match('root/environments/folders/')
      expect(subject).not_to match('root/info/lfs/objects/')
    end

    it 'accepts group routes' do
      expect(subject).to match('root/analytics/')
    end

    it 'is not case sensitive' do
      expect(subject).not_to match('root/Blob/')
    end

    it 'does not allow extra slashes' do
      expect(subject).not_to match('/root/admin/')
      expect(subject).not_to match('root/admin//')
    end
  end

  describe '.namespace_format_regex' do
    subject { described_class.namespace_format_regex }

    it { is_expected.to match('gitlab-ce') }
    it { is_expected.to match('gitlab_git') }
    it { is_expected.to match('_underscore.js') }
    it { is_expected.to match('100px.com') }
    it { is_expected.to match('gitlab.org') }
    it { is_expected.not_to match('?gitlab') }
    it { is_expected.not_to match('git lab') }
    it { is_expected.not_to match('gitlab.git') }
    it { is_expected.not_to match('gitlab.org.') }
    it { is_expected.not_to match('gitlab.org/') }
    it { is_expected.not_to match('/gitlab.org') }
    it { is_expected.not_to match('gitlab git') }
  end

  describe '.project_path_format_regex' do
    subject { described_class.project_path_format_regex }

    it { is_expected.to match('gitlab-ce') }
    it { is_expected.to match('gitlab_git') }
    it { is_expected.to match('_underscore.js') }
    it { is_expected.to match('100px.com') }
    it { is_expected.not_to match('?gitlab') }
    it { is_expected.not_to match('git lab') }
    it { is_expected.not_to match('gitlab.git') }
  end

  context 'repository routes' do
    # Paths that match a known container
    let_it_be(:container_paths) do
      [
        'gitlab-org',
        'gitlab-org/gitlab-test',
        'gitlab-org/gitlab-test/snippets/1',
        'gitlab-org/gitlab-test/snippets/foo', # ambiguous, we allow creating a sub-group called 'snippets'
        'snippets/1'
      ]
    end

    # Paths that never match a container
    let_it_be(:invalid_paths) do
      [
        'gitlab/',
        '/gitlab',
        'gitlab/foo/',
        '?gitlab',
        'git lab',
        '/snippets/1',
        'snippets/foo',
        'gitlab-org/gitlab/snippets/'
      ]
    end

    let_it_be(:git_paths) { container_paths.map { |path| path + '.git' } }
    let_it_be(:snippet_paths) { container_paths.grep(%r{snippets/\d}) }
    let_it_be(:wiki_git_paths) { (container_paths - snippet_paths).map { |path| path + '.wiki.git' } }
    let_it_be(:invalid_git_paths) { invalid_paths.map { |path| path + '.git' } }

    def expect_route_match(paths)
      paths.each { |path| is_expected.to match(path) }
    end

    def expect_no_route_match(paths)
      paths.each { |path| is_expected.not_to match(path) }
    end

    describe '.repository_route_regex' do
      subject { %r{\A#{described_class.repository_route_regex}\z} }

      it 'matches the expected paths' do
        expect_route_match(container_paths)
        expect_no_route_match(invalid_paths + git_paths)
      end
    end

    describe '.repository_git_route_regex' do
      subject { %r{\A#{described_class.repository_git_route_regex}\z} }

      it 'matches the expected paths' do
        expect_route_match(git_paths + wiki_git_paths)
        expect_no_route_match(container_paths + invalid_paths + invalid_git_paths)
      end
    end

    describe '.repository_wiki_git_route_regex' do
      subject { %r{\A#{described_class.repository_wiki_git_route_regex}\z} }

      it 'matches the expected paths' do
        expect_route_match(wiki_git_paths)
        expect_no_route_match(git_paths + invalid_git_paths)
      end

      it { is_expected.not_to match('snippets/1.wiki.git') }
    end

    describe '.full_snippets_repository_path_regex' do
      subject { described_class.full_snippets_repository_path_regex }

      it 'matches the expected paths' do
        expect_route_match(snippet_paths)
        expect_no_route_match(container_paths - snippet_paths + git_paths + invalid_paths)
      end

      it { is_expected.not_to match('root/snippets/1') }
      it { is_expected.not_to match('gitlab-org/gitlab-test/snippets/foo') }
    end
  end

  describe '.container_image_regex' do
    subject { described_class.container_image_regex }

    it { is_expected.to match('gitlab-foss') }
    it { is_expected.to match('gitlab_foss') }
    it { is_expected.to match('gitlab-org/gitlab-foss') }
    it { is_expected.to match('100px.com/100px.ruby') }

    it 'only matches at most one slash' do
      expect(subject.match('foo/bar/baz')[0]).to eq('foo/bar')
    end

    it 'does not match other non-word characters' do
      expect(subject.match('ruby:2.7.0')[0]).to eq('ruby')
    end
  end

  describe '.container_image_blob_sha_regex' do
    subject { described_class.container_image_blob_sha_regex }

    it { is_expected.to match('sha256:asdf1234567890ASDF') }
    it { is_expected.to match('foo:123') }
    it { is_expected.to match('a12bc3f590szp') }
    it { is_expected.not_to match('') }

    it 'does not match malicious characters' do
      expect(subject.match('sha256:asdf1234%2f')[0]).to eq('sha256:asdf1234')
    end
  end
end