gitlab-org--gitlab-foss/lib/gitlab/memory/watchdog.rb

# frozen_string_literal: true

module Gitlab
  module Memory
    # A background thread that monitors Ruby memory and calls
    # into a handler when the Ruby process violates defined limits
    # for an extended period of time.
    class Watchdog
      # This handler does nothing. It returns `false` to indicate to the
      # caller that the situation has not been dealt with so it will
      # receive calls repeatedly if fragmentation remains high.
      #
      # This is useful for "dress rehearsals" in production since it allows
      # us to observe how frequently the handler is invoked before taking action.
      class NullHandler
        include Singleton

        def call
          # NOP
          false
        end
      end

      # This handler sends SIGTERM and considers the situation handled.
      class TermProcessHandler
        def initialize(pid = $$)
          @pid = pid
        end

        def call
          Process.kill(:TERM, @pid)
          true
        end
      end

      # This handler invokes Puma's graceful termination handler, which takes
      # into account a configurable grace period during which a process may
      # remain unresponsive to a SIGTERM.
      class PumaHandler
        def initialize(puma_options = ::Puma.cli_config.options)
          @worker = ::Puma::Cluster::WorkerHandle.new(0, $$, 0, puma_options)
        end

        def call
          @worker.term
          true
        end
      end
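
      # A handler only needs to respond to `call` and return `true` when it
      # considers the violation handled (which stops the watchdog loop), or
      # `false` to keep receiving calls on subsequent violations. A minimal
      # sketch of a custom handler following that contract; `LogOnlyHandler`
      # is an illustrative name, not an existing class:
      #
      #   class LogOnlyHandler
      #     def initialize(logger = Gitlab::AppLogger)
      #       @logger = logger
      #     end
      #
      #     def call
      #       @logger.warn(message: 'memory limit exceeded, not terminating')
      #       false # not handled, so the watchdog keeps monitoring
      #     end
      #   end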

      def initialize
        @configuration = Configuration.new
        @alive = true

        init_prometheus_metrics
      end

      ##
      # Configuration for Watchdog, use like:
      #
      #   watchdog.configure do |config|
      #     config.handler = Gitlab::Memory::Watchdog::TermProcessHandler
      #     config.sleep_time_seconds = 60
      #     config.logger = Gitlab::AppLogger
      #     config.monitors do |stack|
      #       stack.push MyMonitorClass, args*, max_strikes:, kwargs**, &block
      #     end
      #   end
      def configure
        yield @configuration
      end
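
      # A more concrete configuration sketch, assuming an initializer that
      # owns the watchdog instance; `HeapFragmentationMonitor` and
      # `max_strikes: 5` are placeholder values, not defaults defined here:
      #
      #   watchdog = Gitlab::Memory::Watchdog.new
      #   watchdog.configure do |config|
      #     config.handler = Gitlab::Memory::Watchdog::NullHandler.instance
      #     config.sleep_time_seconds = 60
      #     config.logger = Gitlab::AppLogger
      #     config.monitors do |stack|
      #       stack.push HeapFragmentationMonitor, max_strikes: 5
      #     end
      #   end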

      def call
        logger.info(log_labels.merge(message: 'started'))

        while @alive
          sleep(sleep_time_seconds)

          monitor if Feature.enabled?(:gitlab_memory_watchdog, type: :ops)
        end

        logger.info(log_labels.merge(message: 'stopped'))
      end

      def stop
        @alive = false
      end
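
      # The run loop blocks, so callers typically host it in a dedicated
      # thread and signal shutdown with `stop`. A minimal lifecycle sketch,
      # assuming the instance was configured as above:
      #
      #   thread = Thread.new { watchdog.call }
      #   # ... on process shutdown:
      #   watchdog.stop
      #   thread.join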

      private

      def monitor
        @configuration.monitors.call_each do |result|
          break unless @alive

          next unless result.threshold_violated?

          @counter_violations.increment(reason: result.monitor_name)

          next unless result.strikes_exceeded?

          @alive = !memory_limit_exceeded_callback(result.monitor_name, result.payload)
        end
      end
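
      # `call_each` is expected to yield one result object per monitor. From
      # the loop above, a result must respond to `threshold_violated?`,
      # `strikes_exceeded?`, `monitor_name`, and `payload`. A hypothetical
      # result honouring that contract (names are illustrative only):
      #
      #   MonitorResult = Struct.new(:monitor_name, :payload, :strikes, :max_strikes) do
      #     def threshold_violated?
      #       strikes > 0
      #     end
      #
      #     def strikes_exceeded?
      #       strikes > max_strikes
      #     end
      #   end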

      def memory_limit_exceeded_callback(monitor_name, monitor_payload)
        all_labels = log_labels.merge(monitor_payload)
        logger.warn(all_labels)
        @counter_violations_handled.increment(reason: monitor_name)

        handler.call
      end

      def handler
        # This allows us to keep the watchdog running but turn it into "friendly mode" where
        # all that happens is we collect logs and Prometheus events for fragmentation violations.
        return NullHandler.instance unless Feature.enabled?(:enforce_memory_watchdog, type: :ops)

        @configuration.handler
      end
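
      # Both feature flags referenced here are `ops` flags; assuming the
      # standard Feature API, enforcement can be toggled at runtime, for
      # example from a Rails console:
      #
      #   Feature.enable(:enforce_memory_watchdog)  # configured handler takes real action
      #   Feature.disable(:enforce_memory_watchdog) # back to NullHandler ("friendly mode")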

      def logger
        @configuration.logger
      end

      def sleep_time_seconds
        @configuration.sleep_time_seconds
      end

      def log_labels
        {
          pid: $$,
          worker_id: worker_id,
          memwd_handler_class: handler.class.name,
          memwd_sleep_time_s: sleep_time_seconds,
          memwd_rss_bytes: process_rss_bytes
        }
      end

      def process_rss_bytes
        Gitlab::Metrics::System.memory_usage_rss[:total]
      end

      def worker_id
        ::Prometheus::PidProvider.worker_id
      end

      def init_prometheus_metrics
        default_labels = { pid: worker_id }

        @counter_violations = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_total,
          'Total number of times a Ruby process violated a memory threshold',
          default_labels
        )
        @counter_violations_handled = Gitlab::Metrics.counter(
          :gitlab_memwd_violations_handled_total,
          'Total number of times Ruby process memory violations were handled',
          default_labels
        )
      end
    end
  end
end