From dc14c91d065d869b77b0ec0db47b8b36c96f15be Mon Sep 17 00:00:00 2001 From: Andrew Newdigate Date: Tue, 16 Jul 2019 22:10:44 +0200 Subject: [PATCH] Adds chaos endpoints to Sidekiq This allows the chaos endpoints to be invoked in Sidekiq so that this environment can be tested for resilience. --- app/controllers/chaos_controller.rb | 54 ++++----- app/workers/all_queues.yml | 6 + app/workers/chaos/cpu_spin_worker.rb | 12 ++ app/workers/chaos/db_spin_worker.rb | 12 ++ app/workers/chaos/kill_worker.rb | 12 ++ app/workers/chaos/leak_mem_worker.rb | 12 ++ app/workers/chaos/sleep_worker.rb | 12 ++ app/workers/concerns/chaos_queue.rb | 9 ++ changelogs/unreleased/an-sidekiq-chaos.yml | 5 + config/routes.rb | 2 +- config/sidekiq_queues.yml | 1 + doc/development/chaos_endpoints.md | 31 +++-- lib/gitlab/chaos.rb | 49 ++++++++ spec/controllers/chaos_controller_spec.rb | 127 +++++++++++++++++++++ 14 files changed, 302 insertions(+), 42 deletions(-) create mode 100644 app/workers/chaos/cpu_spin_worker.rb create mode 100644 app/workers/chaos/db_spin_worker.rb create mode 100644 app/workers/chaos/kill_worker.rb create mode 100644 app/workers/chaos/leak_mem_worker.rb create mode 100644 app/workers/chaos/sleep_worker.rb create mode 100644 app/workers/concerns/chaos_queue.rb create mode 100644 changelogs/unreleased/an-sidekiq-chaos.yml create mode 100644 lib/gitlab/chaos.rb create mode 100644 spec/controllers/chaos_controller_spec.rb diff --git a/app/controllers/chaos_controller.rb b/app/controllers/chaos_controller.rb index 2985da35d83..ac008165c16 100644 --- a/app/controllers/chaos_controller.rb +++ b/app/controllers/chaos_controller.rb @@ -1,57 +1,38 @@ # frozen_string_literal: true class ChaosController < ActionController::Base - before_action :validate_chaos_secret, unless: :development? - before_action :request_start_time + before_action :validate_chaos_secret, unless: :development_or_test? def leakmem - retainer = [] - # Add `n` 1mb chunks of memory to the retainer array - memory_mb.times { retainer << "x" * 1.megabyte } - - Kernel.sleep(duration_left) - - render plain: "OK" + do_chaos :leak_mem, Chaos::LeakMemWorker, memory_mb, duration_s end def cpu_spin - rand while Time.now < expected_end_time - - render plain: "OK" + do_chaos :cpu_spin, Chaos::CpuSpinWorker, duration_s end def db_spin - while Time.now < expected_end_time - ActiveRecord::Base.connection.execute("SELECT 1") - - end_interval_time = Time.now + [duration_s, interval_s].min - rand while Time.now < end_interval_time - end + do_chaos :db_spin, Chaos::DbSpinWorker, duration_s, interval_s end def sleep - Kernel.sleep(duration_left) - - render plain: "OK" + do_chaos :sleep, Chaos::SleepWorker, duration_s end def kill - Process.kill("KILL", Process.pid) + do_chaos :kill, Chaos::KillWorker end private - def request_start_time - @start_time ||= Time.now - end + def do_chaos(method, worker, *args) + if async + worker.perform_async(*args) + else + Gitlab::Chaos.public_send(method, *args) # rubocop: disable GitlabSecurity/PublicSend + end - def expected_end_time - request_start_time + duration_s - end - - def duration_left - # returns 0 if over time - [expected_end_time - Time.now, 0].max + render plain: "OK" end def validate_chaos_secret @@ -91,7 +72,12 @@ class ChaosController < ActionController::Base memory_mb.to_i end - def development? - Rails.env.development? + def async + async = params[:async] || false + Gitlab::Utils.to_boolean(async) + end + + def development_or_test? + Rails.env.development? || Rails.env.test? end end diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml index 3d34bfc05c7..991a177018e 100644 --- a/app/workers/all_queues.yml +++ b/app/workers/all_queues.yml @@ -3,6 +3,12 @@ - auto_merge:auto_merge_process +- chaos:chaos_cpu_spin +- chaos:chaos_db_spin +- chaos:chaos_kill +- chaos:chaos_leak_mem +- chaos:chaos_sleep + - cronjob:admin_email - cronjob:expire_build_artifacts - cronjob:gitlab_usage_ping diff --git a/app/workers/chaos/cpu_spin_worker.rb b/app/workers/chaos/cpu_spin_worker.rb new file mode 100644 index 00000000000..43a32c3274f --- /dev/null +++ b/app/workers/chaos/cpu_spin_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Chaos + class CpuSpinWorker + include ApplicationWorker + include ChaosQueue + + def perform(duration_s) + Gitlab::Chaos.cpu_spin(duration_s) + end + end +end diff --git a/app/workers/chaos/db_spin_worker.rb b/app/workers/chaos/db_spin_worker.rb new file mode 100644 index 00000000000..217ddabbcb6 --- /dev/null +++ b/app/workers/chaos/db_spin_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Chaos + class DbSpinWorker + include ApplicationWorker + include ChaosQueue + + def perform(duration_s, interval_s) + Gitlab::Chaos.db_spin(duration_s, interval_s) + end + end +end diff --git a/app/workers/chaos/kill_worker.rb b/app/workers/chaos/kill_worker.rb new file mode 100644 index 00000000000..bbad53c9b86 --- /dev/null +++ b/app/workers/chaos/kill_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Chaos + class KillWorker + include ApplicationWorker + include ChaosQueue + + def perform + Gitlab::Chaos.kill + end + end +end diff --git a/app/workers/chaos/leak_mem_worker.rb b/app/workers/chaos/leak_mem_worker.rb new file mode 100644 index 00000000000..0caa99e0de9 --- /dev/null +++ b/app/workers/chaos/leak_mem_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Chaos + class LeakMemWorker + include ApplicationWorker + include ChaosQueue + + def perform(memory_mb, duration_s) + Gitlab::Chaos.leak_mem(memory_mb, duration_s) + end + end +end diff --git a/app/workers/chaos/sleep_worker.rb b/app/workers/chaos/sleep_worker.rb new file mode 100644 index 00000000000..7c724c4cb4e --- /dev/null +++ b/app/workers/chaos/sleep_worker.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Chaos + class SleepWorker + include ApplicationWorker + include ChaosQueue + + def perform(duration_s) + Gitlab::Chaos.sleep(duration_s) + end + end +end diff --git a/app/workers/concerns/chaos_queue.rb b/app/workers/concerns/chaos_queue.rb new file mode 100644 index 00000000000..e406509d12d --- /dev/null +++ b/app/workers/concerns/chaos_queue.rb @@ -0,0 +1,9 @@ +# frozen_string_literal: true +# +module ChaosQueue + extend ActiveSupport::Concern + + included do + queue_namespace :chaos + end +end diff --git a/changelogs/unreleased/an-sidekiq-chaos.yml b/changelogs/unreleased/an-sidekiq-chaos.yml new file mode 100644 index 00000000000..cede35c95cc --- /dev/null +++ b/changelogs/unreleased/an-sidekiq-chaos.yml @@ -0,0 +1,5 @@ +--- +title: Adds chaos endpoints to Sidekiq +merge_request: 30814 +author: +type: other diff --git a/config/routes.rb b/config/routes.rb index 641807203bf..459f2b22bf0 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -116,7 +116,7 @@ Rails.application.routes.draw do end end - if ENV['GITLAB_CHAOS_SECRET'] || Rails.env.development? + if ENV['GITLAB_CHAOS_SECRET'] || Rails.env.development? || Rails.env.test? resource :chaos, only: [] do get :leakmem get :cpu_spin diff --git a/config/sidekiq_queues.yml b/config/sidekiq_queues.yml index 80791795390..c7586aa1e38 100644 --- a/config/sidekiq_queues.yml +++ b/config/sidekiq_queues.yml @@ -95,6 +95,7 @@ - [update_project_statistics, 1] - [phabricator_import_import_tasks, 1] - [update_namespace_statistics, 1] + - [chaos, 2] # EE-specific queues - [ldap_group_sync, 2] diff --git a/doc/development/chaos_endpoints.md b/doc/development/chaos_endpoints.md index b3406275937..eb6dde2d24e 100644 --- a/doc/development/chaos_endpoints.md +++ b/doc/development/chaos_endpoints.md @@ -36,6 +36,10 @@ Replace `secret` with your own secret token. Once you have enabled the chaos endpoints and restarted the application, you can start testing using the endpoints. +By default, when invoking a chaos endpoint, the web worker process which receives the request will handle it. This means, for example, that if the Kill +operation is invoked, the Puma or Unicorn worker process handling the request will be killed. To test these operations in Sidekiq, the `async` parameter on +each endpoint can be set to `true`. This will run the chaos process in a Sidekiq worker. + ## Memory leaks To simulate a memory leak in your application, use the `/-/chaos/leakmem` endpoint. @@ -47,12 +51,14 @@ The memory is not retained after the request finishes. Once the request has comp GET /-/chaos/leakmem GET /-/chaos/leakmem?memory_mb=1024 GET /-/chaos/leakmem?memory_mb=1024&duration_s=50 +GET /-/chaos/leakmem?memory_mb=1024&duration_s=50&async=true ``` -| Attribute | Type | Required | Description | -| ------------ | ------- | -------- | ---------------------------------------------------------------------------------- | -| `memory_mb` | integer | no | How much memory, in MB, should be leaked. Defaults to 100MB. | +| Attribute | Type | Required | Description | +| ------------ | ------- | -------- | ------------------------------------------------------------------------------------ | +| `memory_mb` | integer | no | How much memory, in MB, should be leaked. Defaults to 100MB. | | `duration_s` | integer | no | Minimum duration_s, in seconds, that the memory should be retained. Defaults to 30s. | +| `async` | boolean | no | Set to true to leak memory in a Sidekiq background worker process | ```bash curl http://localhost:3000/-/chaos/leakmem?memory_mb=1024&duration_s=10 --header 'X-Chaos-Secret: secret' @@ -69,11 +75,13 @@ If you're using Unicorn, this is done by killing the worker process. ``` GET /-/chaos/cpu_spin GET /-/chaos/cpu_spin?duration_s=50 +GET /-/chaos/cpu_spin?duration_s=50&async=true ``` | Attribute | Type | Required | Description | | ------------ | ------- | -------- | --------------------------------------------------------------------- | | `duration_s` | integer | no | Duration, in seconds, that the core will be utilised. Defaults to 30s | +| `async` | boolean | no | Set to true to consume CPU in a Sidekiq background worker process | ```bash curl http://localhost:3000/-/chaos/cpu_spin?duration_s=60 --header 'X-Chaos-Secret: secret' @@ -91,12 +99,14 @@ If you're using Unicorn, this is done by killing the worker process. ``` GET /-/chaos/db_spin GET /-/chaos/db_spin?duration_s=50 +GET /-/chaos/db_spin?duration_s=50&async=true ``` -| Attribute | Type | Required | Description | -| ------------ | ------- | -------- | --------------------------------------------------------------------- | -| `interval_s` | float | no | Interval, in seconds, for every DB request. Defaults to 1s | -| `duration_s` | integer | no | Duration, in seconds, that the core will be utilised. Defaults to 30s | +| Attribute | Type | Required | Description | +| ------------ | ------- | -------- | --------------------------------------------------------------------------- | +| `interval_s` | float | no | Interval, in seconds, for every DB request. Defaults to 1s | +| `duration_s` | integer | no | Duration, in seconds, that the core will be utilised. Defaults to 30s | +| `async` | boolean | no | Set to true to perform the operation in a Sidekiq background worker process | ```bash curl http://localhost:3000/-/chaos/db_spin?interval_s=1&duration_s=60 --header 'X-Chaos-Secret: secret' @@ -112,11 +122,13 @@ As with the CPU Spin endpoint, this may lead to your request timing out if durat ``` GET /-/chaos/sleep GET /-/chaos/sleep?duration_s=50 +GET /-/chaos/sleep?duration_s=50&async=true ``` | Attribute | Type | Required | Description | | ------------ | ------- | -------- | ---------------------------------------------------------------------- | | `duration_s` | integer | no | Duration, in seconds, that the request will sleep for. Defaults to 30s | +| `async` | boolean | no | Set to true to sleep in a Sidekiq background worker process | ```bash curl http://localhost:3000/-/chaos/sleep?duration_s=60 --header 'X-Chaos-Secret: secret' @@ -132,8 +144,13 @@ Since this endpoint uses the `KILL` signal, the worker is not given a chance to ``` GET /-/chaos/kill +GET /-/chaos/kill?async=true ``` +| Attribute | Type | Required | Description | +| ------------ | ------- | -------- | ---------------------------------------------------------------------- | +| `async` | boolean | no | Set to true to kill a Sidekiq background worker process | + ```bash curl http://localhost:3000/-/chaos/kill --header 'X-Chaos-Secret: secret' curl http://localhost:3000/-/chaos/kill?token=secret diff --git a/lib/gitlab/chaos.rb b/lib/gitlab/chaos.rb new file mode 100644 index 00000000000..4f47cdef971 --- /dev/null +++ b/lib/gitlab/chaos.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +module Gitlab + # Chaos methods for GitLab. + # See https://docs.gitlab.com/ee/development/chaos_endpoints.html for more details. + class Chaos + # leak_mem will retain the specified amount of memory and sleep. + # On return, the memory will be released. + def self.leak_mem(memory_mb, duration_s) + start_time = Time.now + + retainer = [] + # Add `n` 1mb chunks of memory to the retainer array + memory_mb.times { retainer << "x" * 1.megabyte } + + duration_left = [start_time + duration_s - Time.now, 0].max + Kernel.sleep(duration_left) + end + + # cpu_spin will consume all CPU on a single core for the specified duration + def self.cpu_spin(duration_s) + expected_end_time = Time.now + duration_s + + rand while Time.now < expected_end_time + end + + # db_spin will query the database in a tight loop for the specified duration + def self.db_spin(duration_s, interval_s) + expected_end_time = Time.now + duration_s + + while Time.now < expected_end_time + ActiveRecord::Base.connection.execute("SELECT 1") + + end_interval_time = Time.now + [duration_s, interval_s].min + rand while Time.now < end_interval_time + end + end + + # sleep will sleep for the specified duration + def self.sleep(duration_s) + Kernel.sleep(duration_s) + end + + # Kill will send a SIGKILL signal to the current process + def self.kill + Process.kill("KILL", Process.pid) + end + end +end diff --git a/spec/controllers/chaos_controller_spec.rb b/spec/controllers/chaos_controller_spec.rb new file mode 100644 index 00000000000..bafd4a70862 --- /dev/null +++ b/spec/controllers/chaos_controller_spec.rb @@ -0,0 +1,127 @@ +# frozen_string_literal: true + +require 'spec_helper' + +describe ChaosController do + describe '#leakmem' do + it 'calls synchronously' do + expect(Gitlab::Chaos).to receive(:leak_mem).with(100, 30.seconds) + + get :leakmem + + expect(response).to have_gitlab_http_status(200) + end + + it 'call synchronously with params' do + expect(Gitlab::Chaos).to receive(:leak_mem).with(1, 2.seconds) + + get :leakmem, params: { memory_mb: 1, duration_s: 2 } + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls asynchronously' do + expect(Chaos::LeakMemWorker).to receive(:perform_async).with(100, 30.seconds) + + get :leakmem, params: { async: 1 } + + expect(response).to have_gitlab_http_status(200) + end + end + + describe '#cpu_spin' do + it 'calls synchronously' do + expect(Gitlab::Chaos).to receive(:cpu_spin).with(30.seconds) + + get :cpu_spin + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls synchronously with params' do + expect(Gitlab::Chaos).to receive(:cpu_spin).with(3.seconds) + + get :cpu_spin, params: { duration_s: 3 } + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls asynchronously' do + expect(Chaos::CpuSpinWorker).to receive(:perform_async).with(30.seconds) + + get :cpu_spin, params: { async: 1 } + + expect(response).to have_gitlab_http_status(200) + end + end + + describe '#db_spin' do + it 'calls synchronously' do + expect(Gitlab::Chaos).to receive(:db_spin).with(30.seconds, 1.second) + + get :db_spin + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls synchronously with params' do + expect(Gitlab::Chaos).to receive(:db_spin).with(4.seconds, 5.seconds) + + get :db_spin, params: { duration_s: 4, interval_s: 5 } + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls asynchronously' do + expect(Chaos::DbSpinWorker).to receive(:perform_async).with(30.seconds, 1.second) + + get :db_spin, params: { async: 1 } + + expect(response).to have_gitlab_http_status(200) + end + end + + describe '#sleep' do + it 'calls synchronously' do + expect(Gitlab::Chaos).to receive(:sleep).with(30.seconds) + + get :sleep + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls synchronously with params' do + expect(Gitlab::Chaos).to receive(:sleep).with(5.seconds) + + get :sleep, params: { duration_s: 5 } + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls asynchronously' do + expect(Chaos::SleepWorker).to receive(:perform_async).with(30.seconds) + + get :sleep, params: { async: 1 } + + expect(response).to have_gitlab_http_status(200) + end + end + + describe '#kill' do + it 'calls synchronously' do + expect(Gitlab::Chaos).to receive(:kill).with(no_args) + + get :kill + + expect(response).to have_gitlab_http_status(200) + end + + it 'calls asynchronously' do + expect(Chaos::KillWorker).to receive(:perform_async).with(no_args) + + get :kill, params: { async: 1 } + + expect(response).to have_gitlab_http_status(200) + end + end +end