diff --git a/app/controllers/projects/environments_controller.rb b/app/controllers/projects/environments_controller.rb
index e35f34be23c..41fe70052ce 100644
--- a/app/controllers/projects/environments_controller.rb
+++ b/app/controllers/projects/environments_controller.rb
@@ -12,6 +12,8 @@ class Projects::EnvironmentsController < Projects::ApplicationController
   before_action :expire_etag_cache, only: [:index]
   before_action only: [:metrics, :additional_metrics] do
     push_frontend_feature_flag(:metrics_time_window)
+    push_frontend_feature_flag(:environment_metrics_use_prometheus_endpoint)
+    push_frontend_feature_flag(:environment_metrics_show_multiple_dashboards)
   end
 
   def index
@@ -156,6 +158,18 @@ class Projects::EnvironmentsController < Projects::ApplicationController
     end
   end
 
+  def metrics_dashboard
+    access_denied! unless Feature.enabled?(:environment_metrics_use_prometheus_endpoint, project)
+
+    respond_to do |format|
+      format.json do
+        dashboard = MetricsDashboardService.new(@project).find(params[:dashboard])
+
+        render json: dashboard, status: :ok
+      end
+    end
+  end
+
   def search
     respond_to do |format|
       format.json do
diff --git a/app/services/metrics_dashboard_service.rb b/app/services/metrics_dashboard_service.rb
new file mode 100644
index 00000000000..532f5697be8
--- /dev/null
+++ b/app/services/metrics_dashboard_service.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+# Searches a project's repository for a metrics dashboard and formats the output.
+# Expects any custom dashboards to be located in `.gitlab/dashboards`
+class MetricsDashboardService
+  DASHBOARD_ROOT = ".gitlab/dashboards"
+  DASHBOARD_EXTENSION = '.yml'
+
+  SYSTEM_DASHBOARD_NAME = 'system_dashboard'
+  SYSTEM_DASHBOARD_ROOT = "config/prometheus"
+  SYSTEM_DASHBOARD_PATH = Rails.root.join(SYSTEM_DASHBOARD_ROOT, "#{SYSTEM_DASHBOARD_NAME}#{DASHBOARD_EXTENSION}")
+
+  def initialize(project)
+    @project = project
+  end
+
+  # Returns a DB-supplemented JSON representation of a dashboard config file.
+  #
+  # param: dashboard_name [String] Filename of dashboard w/o an extension.
+  #   If not provided, the system dashboard will be returned.
+  def find(dashboard_name = nil)
+    unless Feature.enabled?(:environment_metrics_show_multiple_dashboards, @project)
+      return process_dashboard(system_dashboard)
+    end
+
+    dashboard = Rails.cache.fetch(cache_key(dashboard_name)) do
+      dashboard_name ? project_dashboard(dashboard_name) : system_dashboard
+    end
+
+    process_dashboard(dashboard)
+  end
+
+  private
+
+  # Returns the base metrics shipped with every GitLab service.
+  def system_dashboard
+    YAML.load_file(SYSTEM_DASHBOARD_PATH)
+  end
+
+  # Searches the project repo for a custom-defined dashboard.
+  def project_dashboard(dashboard_name)
+    Gitlab::Template::Finders::RepoTemplateFinder.new(
+      @project,
+      DASHBOARD_ROOT,
+      DASHBOARD_EXTENSION
+    ).find(dashboard_name).read
+  end
+
+  def cache_key(dashboard_name)
+    return "metrics_dashboard_#{SYSTEM_DASHBOARD_NAME}" unless dashboard_name
+
+    "project_#{@project.id}_metrics_dashboard_#{dashboard_name}"
+  end
+
+  # TODO: "Processing" the dashboard needs to include several steps such as
+  # inserting metric ids and alert information.
+  def process_dashboard(dashboard)
+    dashboard.to_json
+  end
+end
diff --git a/config/prometheus/system_dashboard.yml b/config/prometheus/system_dashboard.yml
new file mode 100644
index 00000000000..694d6531034
--- /dev/null
+++ b/config/prometheus/system_dashboard.yml
@@ -0,0 +1,274 @@
+dashboard: 'System Metrics'
+order: 0
+panel_groups:
+  # NGINX Ingress metrics for pre-0.16.0 versions
+  - group: Response metrics (NGINX Ingress VTS)
+    priority: 10
+    panels:
+      - type: area-chart
+        title: "Throughput"
+        y_label: "Requests / Sec"
+        metrics:
+          - id: response_metrics_nginx_ingress_throughput_status_code
+            query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
+            unit: req / sec
+            label: Status Code
+            required_metrics:
+              - nginx_upstream_responses_total
+            series:
+              - label: status_code
+                when:
+                  - value: 2xx
+                    color: green
+                  - value: 4xx
+                    color: orange
+                  - value: 5xx
+                    color: red
+      - type: area-chart
+        title: "Latency"
+        y_label: "Latency (ms)"
+        metrics:
+          - id: response_metrics_nginx_ingress_latency_pod_average
+            query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
+            label: Pod average
+            unit: ms
+            required_metrics:
+              - nginx_upstream_response_msecs_avg
+      - type: area-chart
+        title: "HTTP Error Rate"
+        y_label: "HTTP Errors"
+        metrics:
+          - id: response_metrics_nginx_ingress_http_error_rate
+            query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
+            label: 5xx Errors
+            unit: "%"
+            required_metrics:
+              - nginx_upstream_responses_total
+  # NGINX Ingress metrics for post-0.16.0 versions
+  - group: Response metrics (NGINX Ingress)
+    priority: 10
+    panels:
+      - type: area-chart
+        title: "Throughput"
+        y_label: "Requests / Sec"
+        metrics:
+          - id: response_metrics_nginx_ingress_16_throughput_status_code
+            query_range: 'sum(label_replace(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m]), "status_code", "${1}xx", "status", "(.)..")) by (status_code)'
+            unit: req / sec
+            required_metrics:
+              - nginx_ingress_controller_requests
+            label: Status Code
+            series:
+              - label: status_code
+                when:
+                  - value: 2xx
+                    color: green
+                  - value: 3xx
+                    color: blue
+                  - value: 4xx
+                    color: orange
+                  - value: 5xx
+                    color: red
+      - type: area-chart
+        title: "Latency"
+        y_label: "Latency (ms)"
+        metrics:
+          - id: response_metrics_nginx_ingress_16_latency_pod_average
+            query_range: 'sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 1000'
+            label: Pod average
+            unit: ms
+            required_metrics:
+              - nginx_ingress_controller_ingress_upstream_latency_seconds_sum
+      - type: area-chart
+        title: "HTTP Error Rate"
+        y_label: "HTTP Errors"
+        metrics:
+          - id: response_metrics_nginx_ingress_16_http_error_rate
+            query_range: 'sum(rate(nginx_ingress_controller_requests{status=~"5.*",namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 100'
+            label: 5xx Errors
+            unit: "%"
+            required_metrics:
+              - nginx_ingress_controller_requests
+  - group: Response metrics (HA Proxy)
+    priority: 10
+    panels:
+      - type: area-chart
+        title: "Throughput"
+        y_label: "Requests / Sec"
+        metrics:
+          - id: response_metrics_ha_proxy_throughput_status_code
+            query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
+            unit: req / sec
+            label: Status Code
+            required_metrics:
+              - haproxy_frontend_http_requests_total
+            series:
+              - label: status_code
+                when:
+                  - value: 2xx
+                    color: green
+                  - value: 4xx
+                    color: yellow
+                  - value: 5xx
+                    color: red
+      - type: area-chart
+        title: "HTTP Error Rate"
+        y_label: "Error Rate (%)"
+        metrics:
+          - id: response_metrics_ha_proxy_http_error_rate
+            query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))'
+            label: HTTP Errors
+            unit: "%"
+            required_metrics:
+              - haproxy_frontend_http_responses_total
+  - group: Response metrics (AWS ELB)
+    priority: 10
+    panels:
+      - type: area-chart
+        title: "Throughput"
+        y_label: "Requests / Sec"
+        metrics:
+          - id: response_metrics_aws_elb_throughput_requests
+            query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
+            label: Total
+            unit: req / sec
+            required_metrics:
+              - aws_elb_request_count_sum
+      - type: area-chart
+        title: "Latency"
+        y_label: "Latency (ms)"
+        metrics:
+          - id: response_metrics_aws_elb_latency_average
+            query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
+            label: Average
+            unit: ms
+            required_metrics:
+              - aws_elb_latency_average
+      - type: area-chart
+        title: "HTTP Error Rate"
+        y_label: "Error Rate (%)"
+        metrics:
+          - id: response_metrics_aws_elb_http_error_rate
+            query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})'
+            label: HTTP Errors
+            unit: "%"
+            required_metrics:
+              - aws_elb_request_count_sum
+              - aws_elb_httpcode_backend_5_xx_sum
+  - group: Response metrics (NGINX)
+    priority: 10
+    panels:
+      - type: area-chart
+        title: "Throughput"
+        y_label: "Requests / Sec"
+        metrics:
+          - id: response_metrics_nginx_throughput_status_code
+            query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
+            unit: req / sec
+            required_metrics:
+              - nginx_server_requests
+            label: Status Code
+            series:
+              - label: status_code
+                when:
+                  - value: 2xx
+                    color: green
+                  - value: 4xx
+                    color: orange
+                  - value: 5xx
+                    color: red
+      - type: area-chart
+        title: "Latency"
+        y_label: "Latency (ms)"
+        metrics:
+          - id: response_metrics_nginx_latency
+            query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
+            label: Upstream
+            unit: ms
+            required_metrics:
+              - nginx_server_requestMsec
+      - type: area-chart
+        title: "HTTP Error Rate"
+        y_label: "HTTP 500 Errors / Sec"
+        metrics:
+          - id: response_metrics_nginx_http_error_rate
+            query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
+            label: HTTP Errors
+            unit: "errors / sec"
+            required_metrics:
+              - nginx_server_requests
+  - group: System metrics (Kubernetes)
+    priority: 5
+    panels:
+      - type: area-chart
+        title: "Memory Usage (Total)"
+        y_label: "Total Memory Used"
+        metrics:
+          - id: system_metrics_kubernetes_container_memory_total
+            query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
+            label: Total
+            unit: GB
+            required_metrics:
+              - container_memory_usage_bytes
+      - type: area-chart
+        title: "Core Usage (Total)"
+        y_label: "Total Cores"
+        metrics:
+          - id: system_metrics_kubernetes_container_cores_total
+            query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
+            label: Total
+            unit: "cores"
+            required_metrics:
+              - container_cpu_usage_seconds_total
+      - type: area-chart
+        title: "Memory Usage (Pod Average)"
+        y_label: "Memory Used per Pod"
+        metrics:
+          - id: system_metrics_kubernetes_container_memory_average
+            query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
+            label: Pod average
+            unit: MB
+            required_metrics:
+              - container_memory_usage_bytes
+      - type: area-chart
+        title: "Canary: Memory Usage (Pod Average)"
+        y_label: "Memory Used per Pod"
+        metrics:
+          - id: system_metrics_kubernetes_container_memory_average_canary
+            query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
+            label: Pod average
+            unit: MB
+            required_metrics:
+              - container_memory_usage_bytes
+            track: canary
+      - type: area-chart
+        title: "Core Usage (Pod Average)"
+        y_label: "Cores per Pod"
+        metrics:
+          - id: system_metrics_kubernetes_container_core_usage
+            query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
+            label: Pod average
+            unit: "cores"
+            required_metrics:
+              - container_cpu_usage_seconds_total
+      - type: area-chart
+        title: "Canary: Core Usage (Pod Average)"
+        y_label: "Cores per Pod"
+        metrics:
+          - id: system_metrics_kubernetes_container_core_usage_canary
+            query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
+            label: Pod average
+            unit: "cores"
+            track: canary
+            required_metrics:
+              - container_cpu_usage_seconds_total
+      - type: area-chart
+        title: "Knative function invocations"
+        y_label: "Invocations"
+        metrics:
+          - id: system_metrics_knative_function_invocation_count
+            query_range: 'floor(sum(rate(istio_revision_request_count{destination_configuration="%{function_name}", destination_namespace="%{kube_namespace}"}[1m])*30))'
+            label: invocations / minute
+            unit: requests
+            required_metrics:
+              - istio_revision_request_count
diff --git a/config/routes/project.rb b/config/routes/project.rb
index 93d168fc595..f7841bbe595 100644
--- a/config/routes/project.rb
+++ b/config/routes/project.rb
@@ -218,6 +218,7 @@ constraints(::Constraints::ProjectUrlConstrainer.new) do
           get :terminal
           get :metrics
           get :additional_metrics
+          get :metrics_dashboard
           get '/terminal.ws/authorize', to: 'environments#terminal_websocket_authorize', constraints: { format: nil }
 
           get '/prometheus/api/v1/*proxy_path', to: 'environments/prometheus_api#proxy'
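For reference, a minimal sketch of how the new service and endpoint might be exercised. This is not part of the diff; the `pod_metrics` dashboard name and the exact request path are illustrative assumptions.

# Assumes `project` is a Project instance with the
# :environment_metrics_show_multiple_dashboards flag enabled.

# Fetch the bundled system dashboard (no name given); returns a JSON string
# built from config/prometheus/system_dashboard.yml.
MetricsDashboardService.new(project).find

# Fetch a project-defined dashboard by filename (without the .yml extension),
# looked up under .gitlab/dashboards/ in the project repository.
# 'pod_metrics' is a hypothetical dashboard name.
MetricsDashboardService.new(project).find('pod_metrics')

# Over HTTP, the new member route is served by
# Projects::EnvironmentsController#metrics_dashboard, roughly:
#   GET /:namespace/:project/environments/:id/metrics_dashboard.json?dashboard=pod_metrics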