# frozen_string_literal: true

module Gitlab
  module GithubImport
    # HTTP client for interacting with the GitHub API.
    #
    # This class is basically a fancy wrapper around Octokit while adding some
    # functionality to deal with rate limiting and parallel imports. Usage is
    # mostly the same as Octokit, for example:
    #
    #     client = GithubImport::Client.new('hunter2')
    #
    #     client.labels.each do |label|
    #       puts label.name
    #     end
    class Client
      include ::Gitlab::Utils::StrongMemoize

      attr_reader :octokit

      SEARCH_MAX_REQUESTS_PER_MINUTE = 30
      DEFAULT_PER_PAGE = 100
      LOWER_PER_PAGE = 50
      CLIENT_CONNECTION_ERROR = ::Faraday::ConnectionFailed # used/set in sawyer agent which octokit uses

      # A single page of data and the corresponding page number.
      Page = Struct.new(:objects, :number)

      # The minimum number of requests we want to keep available.
      #
      # We don't use a value of 0 as multiple threads may be using the same
      # token in parallel. This could result in all of them hitting the GitHub
      # rate limit at once. The threshold is put in place to not hit the limit
      # in most cases.
      RATE_LIMIT_THRESHOLD = 50
      SEARCH_RATE_LIMIT_THRESHOLD = 3

      # token - The GitHub API token to use.
      #
      # host - The GitHub hostname. If nil, github.com will be used.
      #
      # per_page - The number of objects that should be displayed per page.
      #
      # parallel - When set to true hitting the rate limit will result in a
      #            dedicated error being raised. When set to `false` we will
      #            instead just `sleep()` until the rate limit is reset. Setting
      #            this value to `true` for parallel importing is crucial as
      #            otherwise hitting the rate limit will result in a thread
      #            being blocked in a `sleep()` call for up to an hour.
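      #
      # A hypothetical example (the host and token below are placeholders)
      # running the importer sequentially against a GitHub Enterprise instance:
      #
      #     client = GithubImport::Client.new(
      #       'hunter2',
      #       host: 'https://github.example.com/api/v3',
      #       parallel: false
      #     )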
      def initialize(token, host: nil, per_page: DEFAULT_PER_PAGE, parallel: true)
        @host = host
        @octokit = ::Octokit::Client.new(
          access_token: token,
          per_page: per_page,
          api_endpoint: api_endpoint,
          web_endpoint: web_endpoint
        )

        @octokit.connection_options[:ssl] = { verify: verify_ssl }

        @parallel = parallel
      end

      def parallel?
        @parallel
      end

      # Returns the details of a GitHub user.
      #
      # username - The username of the user.
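      #
      # For example (using a hypothetical username):
      #
      #     client.user('alice').login # => 'alice'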
      def user(username)
        with_rate_limit { octokit.user(username) }
      end

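      # Iterates over the reviews of the given pull request.
      #
      # repo_name - The path (in the form `owner/repository`) of the repository.
      # iid - The number of the pull request on GitHub.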
      def pull_request_reviews(repo_name, iid)
        each_object(:pull_request_reviews, repo_name, iid)
      end

      # Returns the details of a GitHub repository.
      #
      # name - The path (in the form `owner/repository`) of the repository.
      def repository(name)
        with_rate_limit { octokit.repo(name) }
      end

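      # Returns the details of a single pull request.
      #
      # repo_name - The path (in the form `owner/repository`) of the repository.
      # iid - The number of the pull request on GitHub.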
      def pull_request(repo_name, iid)
        with_rate_limit { octokit.pull_request(repo_name, iid) }
      end

      def labels(*args)
        each_object(:labels, *args)
      end

      def milestones(*args)
        each_object(:milestones, *args)
      end

      def releases(*args)
        each_object(:releases, *args)
      end

      # Fetches data from the GitHub API and yields a Page object for every page
      # of data, without loading all of them into memory.
      #
      # method - The Octokit method to use for getting the data.
      # args - Arguments to pass to the Octokit method.
      #
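      # For example (the repository path below is just a placeholder):
      #
      #     client.each_page(:labels, 'owner/repo') do |page|
      #       puts "page #{page.number}: #{page.objects.size} labels"
      #     end
      #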
      # rubocop: disable GitlabSecurity/PublicSend
      def each_page(method, *args, &block)
        return to_enum(__method__, method, *args) unless block_given?

        page =
          if args.last.is_a?(Hash) && args.last[:page]
            args.last[:page]
          else
            1
          end

        collection = with_rate_limit { octokit.public_send(method, *args) }
        next_url = octokit.last_response.rels[:next]

        yield Page.new(collection, page)

        while next_url
          response = with_rate_limit { next_url.get }
          next_url = response.rels[:next]

          yield Page.new(response.data, page += 1)
        end
      end

      # Iterates over all of the objects for the given method (e.g. `:labels`).
      #
      # method - The method to send to Octokit for querying data.
      # args - Any arguments to pass to the Octokit method.
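      #
      # For example (again with a placeholder repository path):
      #
      #     client.each_object(:milestones, 'owner/repo') do |milestone|
      #       puts milestone.title
      #     end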
      def each_object(method, *args, &block)
        return to_enum(__method__, method, *args) unless block_given?

        each_page(method, *args) do |page|
          page.objects.each do |object|
            yield object
          end
        end
      end

      # Yields the supplied block, responding to any rate limit errors.
      #
      # The exact strategy used for handling rate limiting errors depends on
      # whether we are running in parallel mode or not. For more information see
      # `#raise_or_wait_for_rate_limit`.
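      #
      # For example (placeholder repository path; the block's return value is
      # passed through):
      #
      #     repo = client.with_rate_limit { client.octokit.repo('owner/repo') }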
      def with_rate_limit
        return with_retry { yield } unless rate_limiting_enabled?

        request_count_counter.increment

        raise_or_wait_for_rate_limit unless requests_remaining?

        begin
          with_retry { yield }
        rescue ::Octokit::TooManyRequests
          raise_or_wait_for_rate_limit

          # This retry will only happen when running in sequential mode as we'll
          # raise an error in parallel mode.
          retry
        end
      end

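      # Searches the repositories of the authenticated user (including
      # collaborations and organization repositories) by name.
      #
      # For example (assuming the token's user has access to matching repositories):
      #
      #     client.search_repos_by_name('gitlab', page: 1)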
      def search_repos_by_name(name, options = {})
        with_retry { octokit.search_repositories(search_query(str: name, type: :name), options) }
      end

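      # Builds the search query passed to the GitHub Search API. With the
      # defaults this produces a string along the lines of (values are
      # illustrative only):
      #
      #     "gitlab in:name is:public,private user:alice repo:alice/foo org:acme"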
      def search_query(str:, type:, include_collaborations: true, include_orgs: true)
        query = "#{str} in:#{type} is:public,private user:#{octokit.user.login}"

        query = [query, collaborations_subquery].join(' ') if include_collaborations
        query = [query, organizations_subquery].join(' ') if include_orgs

        query
      end

      # Returns `true` if we're still allowed to perform API calls.
      # The Search API has a rate limit of 30, so a lowered threshold is used when searching.
      def requests_remaining?
        if requests_limit == SEARCH_MAX_REQUESTS_PER_MINUTE
          return remaining_requests > SEARCH_RATE_LIMIT_THRESHOLD
        end

        remaining_requests > RATE_LIMIT_THRESHOLD
      end

      def remaining_requests
        octokit.rate_limit.remaining
      end

      def requests_limit
        octokit.rate_limit.limit
      end

      def raise_or_wait_for_rate_limit
        rate_limit_counter.increment

        if parallel?
          raise RateLimitError
        else
          sleep(rate_limit_resets_in)
        end
      end

      def rate_limit_resets_in
        # We add a few seconds to the rate limit so we don't _immediately_
        # resume when the rate limit resets as this may result in us performing
        # a request before GitHub has a chance to reset the limit.
        octokit.rate_limit.resets_in + 5
      end

      def rate_limiting_enabled?
        strong_memoize(:rate_limiting_enabled) do
          api_endpoint.include?('.github.com')
        end
      end

      def api_endpoint
        @host || custom_api_endpoint || default_api_endpoint
      end

      def web_endpoint
        @host || custom_api_endpoint || ::Octokit::Default.web_endpoint
      end

      def custom_api_endpoint
        github_omniauth_provider.dig('args', 'client_options', 'site')
      end

      def default_api_endpoint
        OmniAuth::Strategies::GitHub.default_options[:client_options][:site] || ::Octokit::Default.api_endpoint
      end

      def verify_ssl
        github_omniauth_provider.fetch('verify_ssl', true)
      end

      def github_omniauth_provider
        @github_omniauth_provider ||= Gitlab::Auth::OAuth::Provider.config_for('github').to_h
      end

      def rate_limit_counter
        @rate_limit_counter ||= Gitlab::Metrics.counter(
          :github_importer_rate_limit_hits,
          'The number of times we hit the GitHub rate limit when importing projects'
        )
      end

      def request_count_counter
        @request_counter ||= Gitlab::Metrics.counter(
          :github_importer_request_count,
          'The number of GitHub API calls performed when importing projects'
        )
      end

      private

      def collaborations_subquery
        each_object(:repos, nil, { affiliation: 'collaborator' })
          .map { |repo| "repo:#{repo.full_name}" }
          .join(' ')
      end

      def organizations_subquery
        each_object(:organizations)
          .map { |org| "org:#{org.login}" }
          .join(' ')
      end

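      # Retries the given block on transient connection failures
      # (CLIENT_CONNECTION_ERROR), logging each attempt via `#on_retry`.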
      def with_retry
        Retriable.retriable(on: CLIENT_CONNECTION_ERROR, on_retry: on_retry) do
          yield
        end
      end

      def on_retry
        proc do |exception, try, elapsed_time, next_interval|
          Gitlab::Import::Logger.info(
            message: "GitHub connection retry triggered",
            'error.class': exception.class,
            'error.message': exception.message,
            try_count: try,
            elapsed_time_s: elapsed_time,
            wait_to_retry_s: next_interval
          )
        end
      end
    end
  end
end