From 3859db34df5393cdc9f985e7003ffb36f4fb4058 Mon Sep 17 00:00:00 2001 From: shehbaz-pathan <46107507+shehbaz-pathan@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:03:15 +0530 Subject: [PATCH] Restructured nopo11y helm chart (#12) * Removed appLabel dependency from the metrics * Restructured nopo11y helm chart to support multiple services in single deployment * Values file comment cleanup and added global value for thresholds * Added global defaults for SLO objectives and alert thresholds * Used service name instead of deployment name * Updated latency to latencyMS in alertThresholds * Replaced .deployment with .service in alerts template * Updated prepend release name key * changed service name with deployment name --------- Co-authored-by: Shehbaz Pathan (Consultant) --- charts/nopo11y/Chart.yaml | 2 +- charts/nopo11y/templates/_helpers.tpl | 82 ++++++-- charts/nopo11y/templates/alerts.yaml | 68 ++++++ ...{defaultDashboard.yaml => dashboards.yaml} | 61 +++--- charts/nopo11y/templates/defaultAlerts.yaml | 60 ------ charts/nopo11y/templates/defaultSLOs.yaml | 199 ------------------ charts/nopo11y/templates/slos.yaml | 196 +++++++++++++++++ charts/nopo11y/values.yaml | 40 ++-- 8 files changed, 386 insertions(+), 322 deletions(-) create mode 100644 charts/nopo11y/templates/alerts.yaml rename charts/nopo11y/templates/{defaultDashboard.yaml => dashboards.yaml} (80%) delete mode 100644 charts/nopo11y/templates/defaultAlerts.yaml delete mode 100644 charts/nopo11y/templates/defaultSLOs.yaml create mode 100644 charts/nopo11y/templates/slos.yaml diff --git a/charts/nopo11y/Chart.yaml b/charts/nopo11y/Chart.yaml index e0c047a..bac13b9 100644 --- a/charts/nopo11y/Chart.yaml +++ b/charts/nopo11y/Chart.yaml @@ -21,4 +21,4 @@ version: 1.0.2 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.1.0" +appVersion: "2.0.0" diff --git a/charts/nopo11y/templates/_helpers.tpl b/charts/nopo11y/templates/_helpers.tpl index c70bb28..3288c04 100644 --- a/charts/nopo11y/templates/_helpers.tpl +++ b/charts/nopo11y/templates/_helpers.tpl @@ -1,20 +1,72 @@ -{{- define "dashboard-uid" -}} -{{- printf "%s-%s" .Release.Name .Release.Namespace | trunc 40 -}} +{{- define "nopo11y.services" -}} +{{- $servicesList:= list }} +{{- $defaulAvailability:= .Values.defaults.slo.availability -}} +{{- $defaulLatency:= .Values.defaults.slo.latency -}} +{{- $defaulLatencyThreshold:= .Values.defaults.alertThresholds.latencyMS -}} +{{- $defaul5xx:= .Values.defaults.alertThresholds.rate5xx -}} +{{- $default4xx:= .Values.defaults.alertThresholds.rate4xx -}} +{{- $release:= "" }} +{{- if .Values.prependReleaseName -}} +{{- $release = printf "%s-" .Release.Name }} +{{- end }} +{{- $namespace:= .Release.Namespace }} +{{- range .Values.services }} +{{- $service:= dict }} +{{- if or (not (hasKey . "deploymentName")) (not (hasKey . "serviceName")) -}} +{{- fail "deploymentName and serviceName are required for each service" -}} +{{- else if and (eq .deploymentName "") (eq .serviceName "") -}} +{{- fail "deploymentName and ServiceName are required for each service" -}} +{{- end -}} +{{ $service = set $service "deployment" (printf "%s%s" $release .deploymentName) }} +{{ $service = set $service "service" (printf "%s%s" $release .serviceName) }} +{{- if not (hasKey . 
"slo") }} +{{ $service = set $service "availability" $defaulAvailability }} +{{ $service = set $service "latency" $defaulLatency }} +{{- else if hasKey . "slo" }} +{{- if not (hasKey .slo "availability") }} +{{ $service = set $service "availability" $defaulAvailability }} +{{- else if not .slo.availability }} +{{ $service = set $service "availability" $defaulAvailability }} +{{- else }} +{{ $service = set $service "availability" .slo.availability }} +{{- end -}} +{{- if not (hasKey .slo "latency") }} +{{ $service = set $service "latency" $defaulLatency }} +{{- else if not .slo.latency }} +{{ $service = set $service "latency" $defaulLatency }} +{{- else }} +{{ $service = set $service "latency" .slo.latency }} +{{- end }} +{{- end }} +{{- if not (hasKey . "alertThresholds") }} +{{ $service = set $service "rate5xx" $defaul5xx }} +{{ $service = set $service "rate4xx" $default4xx }} +{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} +{{- else if hasKey . "alertThresholds" }} +{{- if not (hasKey .alertThresholds "rate5xx") }} +{{ $service = set $service "rate5xx" $defaul5xx }} +{{- else if not .alertThresholds.rate5xx }} +{{ $service = set $service "rate5xx" $defaul5xx }} +{{- else }} +{{ $service = set $service "rate5xx" .alertThresholds.rate5xx }} +{{- end -}} +{{- if not (hasKey .alertThresholds "rate4xx") }} +{{ $service = set $service "rate4xx" $default4xx }} +{{- else if not .alertThresholds.rate5xx }} +{{ $service = set $service "rate4xx" $default4xx }} +{{- else }} +{{ $service = set $service "rate4xx" .alertThresholds.rate4xx }} {{- end -}} - - -{{- define "app.label" -}} -{{- if .Values.includeReleaseNameInMetricsLabels }} -{{- printf "%s-%s" .Release.Name .Values.appLabel -}} -{{- else -}} -{{ printf "%s" .Values.appLabel }} +{{- if not (hasKey .alertThresholds "latencyMS") }} +{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} +{{- else if not .alertThresholds.latencyMS }} +{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} +{{- else }} +{{ $service = set $service "latencyThreshold" .alertThresholds.latencyMS }} {{- end }} {{- end }} - -{{- define "deployment.name" -}} -{{- if .Values.includeReleaseNameInMetricsLabels }} -{{- printf "%s-%s" .Release.Name .Values.deploymentName -}} -{{- else -}} -{{ printf "%s" .Values.deploymentName }} +{{ $service = set $service "dashboarduid" (printf "%s-%s" .serviceName $namespace) }} +{{ $servicesList = append $servicesList $service }} {{- end }} +{{- toJson $servicesList }} {{- end }} \ No newline at end of file diff --git a/charts/nopo11y/templates/alerts.yaml b/charts/nopo11y/templates/alerts.yaml new file mode 100644 index 0000000..4ce0eeb --- /dev/null +++ b/charts/nopo11y/templates/alerts.yaml @@ -0,0 +1,68 @@ +{{- if .Values.enabled }} +{{- if eq .Values.namespace ""}} +{{ fail "values.namespace is required" }} +{{- end }} +{{- if and (not .Values.istioMetrics.enabled) (not .Values.nginxIngressMetrics.enabled ) }} +{{ fail "Enabling either istioMetrics or nginxIngresMetrics is required" }} +{{- end }} +{{- range (include "nopo11y.services" . |fromJsonArray) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + namespace: {{ $.Values.namespace }} + name: {{ .service }}-nopo11y-alert-rules + labels: + managedby: nopo11y +spec: + groups: + - name: {{ .service }}-nopo11y-alert-rules + rules: + {{- if $.Values.istioMetrics.enabled }} + - alert: {{ .service }}High5xxErrorRate + expr: sum(rate(istio_requests_total{ {{- if hasKey . 
"cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate5xx }} + annotations: + description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes. + summary: {{ .service }} service is experiencing high 5xx error rate. + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} + {{- end }} + labels: + severity: critical + - alert: {{ .service }}High4xxErrorRate + expr: sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate4xx }} + for: 5m + annotations: + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} + {{- end }} + description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes. + summary: {{ .service }} service is experiencing high 4xx error rate. + labels: + severity: warning + {{- end }} + {{- if $.Values.nginxIngressMetrics.enabled }} + - alert: {{ .service }}High5xxErrorRate-NginxIngress + expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}status=~"5..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate5xx }} + annotations: + description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes. + summary: {{ .service }} is experiencing high 5xx error rate. + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} + {{- end }} + labels: + severity: critical + - alert: {{ .service }}High4xxErrorRate-NginxIngress + expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}status=~"4..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate4xx }} + for: 10m + annotations: + description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes. + summary: {{ .service }} service is experiencing high 4xx error rate. 
+ {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} + {{- end }} + labels: + severity: warning + {{- end }} +--- +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/nopo11y/templates/defaultDashboard.yaml b/charts/nopo11y/templates/dashboards.yaml similarity index 80% rename from charts/nopo11y/templates/defaultDashboard.yaml rename to charts/nopo11y/templates/dashboards.yaml index 76de762..87248b3 100644 --- a/charts/nopo11y/templates/defaultDashboard.yaml +++ b/charts/nopo11y/templates/dashboards.yaml @@ -1,23 +1,14 @@ -{{- if or (eq .Values.namespace "") (eq .Values.deploymentName "") (eq .Values.appLabel "")}} -{{ fail "namespace, deploymentName and appLabel are required" }} -{{- end }} -{{- if and (not .Values.istioMetrics.enabled) (not .Values.nginxIngressMetrics.enabled ) }} -{{ fail "Enabling either istioMetrics or nginxIngresMetrics is required" }} -{{- end }} -{{- if and (eq .Values.nginxIngressMetrics.ingressName "") .Values.nginxIngressMetrics.enabled }} -{{ fail "Required nginxIngressMetrics.ingressName" }} -{{- end }} - {{- if .Values.enabled }} +{{- range (include "nopo11y.services" . |fromJsonArray) }} apiVersion: v1 kind: ConfigMap metadata: - name: "{{ include "app.label" . }}-service-overview-dashboard" - namespace: {{ .Values.namespace }} + name: "{{ .service }}-service-overview-dashboard" + namespace: {{ $.Values.namespace }} labels: grafana_dashboard: "1" data: - {{ include "app.label" . }}-overview-dashboard.json: |- + {{ .service }}-overview-dashboard.json: |- { "annotations": { "list": [ @@ -48,7 +39,7 @@ data: "links": [], "liveNow": false, "panels": [ - {{- if .Values.nginxIngressMetrics.enabled }} + {{- if $.Values.nginxIngressMetrics.enabled }} { "collapsed": false, "gridPos": { @@ -147,7 +138,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))", + "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Requests/sec", "range": true, "refId": "A" @@ -241,7 +232,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_request_duration_seconds_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))", + "expr": "sum(rate(nginx_ingress_controller_request_duration_seconds_sum{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{ {{- if hasKey . 
"cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Latency", "range": true, "refId": "A" @@ -335,7 +326,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}status=~\"5..\", exported_service=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval])) * 100", "legendFormat": "5xx error rate", "range": true, "refId": "A" @@ -429,7 +420,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"2..|4..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}status=~\"2..|4..\", exported_service=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval])) * 100", "legendFormat": "Success ", "range": true, "refId": "A" @@ -440,7 +431,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\",status=~\"5..\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}status=~\"5..\", exported_service=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . 
"cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval])) * 100", "hide": false, "legendFormat": "Error", "range": true, @@ -535,7 +526,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum (rate(nginx_ingress_controller_response_size_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))/sum(rate(nginx_ingress_controller_response_size_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}ingress=~\"{{ .Values.nginxIngressMetrics.ingressName }}\", path=\"{{ .Values.nginxIngressMetrics.path }}\"}[$__rate_interval]))", + "expr": "sum (rate(nginx_ingress_controller_response_size_sum{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval]))/sum(rate(nginx_ingress_controller_response_size_count{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}exported_service=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Response Size", "range": true, "refId": "A" @@ -545,7 +536,7 @@ data: "type": "timeseries" }, {{- end }} - {{- if .Values.istioMetrics.enabled }} + {{- if $.Values.istioMetrics.enabled }} { "collapsed": false, "gridPos": { @@ -639,7 +630,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))", + "expr": "sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Requests/sec", "range": true, "refId": "A" @@ -733,7 +724,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_request_duration_milliseconds_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))", + "expr": "sum(rate(istio_request_duration_milliseconds_sum{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Latency", "range": true, "refId": "A" @@ -827,7 +818,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . 
}}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval])) * 100", "legendFormat": "5xx error rate", "range": true, "refId": "A" @@ -921,7 +912,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"2..|4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\", response_code=~\"2..|4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval])) * 100", "legendFormat": "Success ", "range": true, "refId": "A" @@ -932,7 +923,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=~\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval])) * 100", "hide": false, "legendFormat": "Error", "range": true, @@ -1028,7 +1019,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_response_bytes_sum{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{app=\"{{ include "app.label" . }}\", destination_app=~\"{{ include "app.label" . }}\"}[$__rate_interval]))", + "expr": "sum(rate(istio_response_bytes_sum{ {{- if hasKey . 
"cluster" }}cluster=\"{{ .cluster }}\", {{- end }}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ .service }}\"}[$__rate_interval]))", "legendFormat": "Response Size", "range": true, "refId": "A" @@ -1136,7 +1127,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", + "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ $.Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1230,7 +1221,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . 
}}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", + "expr": "sum(\r\n container_memory_working_set_bytes{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", container!=\"\", image!=\"\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ $.Release.Namespace }}\", resource=\"memory\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1324,7 +1315,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", + "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_requests{ {{- if hasKey . 
"cluster" }}cluster=\"{{ .cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ $.Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1418,7 +1409,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ .Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}namespace=\"{{ .Release.Namespace }}\", workload=\"{{ include "deployment.name" . }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", + "expr": "sum(\r\n rate(container_cpu_usage_seconds_total{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\"}[5m])\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod)\r\n/sum(\r\n kube_pod_container_resource_limits{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}job=\"kube-state-metrics\", namespace=\"{{ $.Release.Namespace }}\", resource=\"cpu\"}\r\n * on(namespace,pod)\r\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }}namespace=\"{{ $.Release.Namespace }}\", workload=\"{{ .deployment }}\", workload_type=\"deployment\"}\r\n) by (pod) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1471,7 +1462,7 @@ data: "uid": "P8E80F9AEF21F6940" }, "editorMode": "code", - "expr": "{ {{- if hasKey .Values "cluster" }}cluster=\"{{ .Values.cluster }}\", {{- end }}{{- if and .Values.logLabel .Values.logLabelValue }} {{ .Values.logLabel }}=\"{{ .Values.logLabelValue }}\"{{- else }} app=\"{{ include "app.label" . }}\"{{- end }}, container!=\"istio-proxy\"} |= ``", + "expr": "{ {{- if hasKey . "cluster" }}cluster=\"{{ .cluster }}\", {{- end }} pod=~\"{{ .deployment }}.*\", container!=\"istio-proxy\"} |= ``", "queryType": "range", "refId": "A" } @@ -1495,9 +1486,11 @@ data: }, "timepicker": {}, "timezone": "", - "title": "{{ title ( include "app.label" . 
) }} Overview - Dashboard", - "uid": "{{ include "dashboard-uid" .}}", + "title": "{{ .service }} Overview - Dashboard", + "uid": "{{ .dashboarduid }}", "version": 10, "weekStart": "" } +--- +{{- end }} {{- end }} \ No newline at end of file diff --git a/charts/nopo11y/templates/defaultAlerts.yaml b/charts/nopo11y/templates/defaultAlerts.yaml deleted file mode 100644 index cd69fe8..0000000 --- a/charts/nopo11y/templates/defaultAlerts.yaml +++ /dev/null @@ -1,60 +0,0 @@ -{{- if .Values.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - namespace: {{ .Values.namespace }} - name: {{ include "app.label" . }}-default-alert-rules - labels: - release: {{ .Values.prometheusReleaseLabel }} - managedby: nopo11y -spec: - groups: - - name: {{ include "app.label" . }}-default-alert-rules - rules: - {{- if .Values.istioMetrics.enabled }} - - alert: {{ include "app.label" . }}High5xxErrorRate - expr: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate5xx }} - annotations: - description: {{ include "app.label" . }} service is experiencing high 5xx errors rate from last 5 minutes. - summary: {{ include "app.label" . }} service is experiencing high 5xx error rate. - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/{{ include "dashboard-uid" .}} - {{- end }} - labels: - severity: critical - - alert: {{ include "app.label" . }}High4xxErrorRate - expr: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[5m])) by (instance) * 100 > {{ .Values.errorRate4xx }} - for: 5m - annotations: - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/{{ include "dashboard-uid" .}} - {{- end }} - description: {{ include "app.label" . }} service is experiencing high 4xx errors rate from last 5 minutes. - summary: {{ include "app.label" . }} service is experiencing high 4xx error rate. - labels: - severity: warning - {{- end }} - {{- if .Values.nginxIngressMetrics.enabled }} - - alert: {{ include "app.label" . }}IngressHigh5xxErrorRate - expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5..", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) * 100 > {{ .Values.errorRate5xx }} - annotations: - description: {{ include "app.label" . }} service is experiencing high 5xx errors rate from last 5 minutes. - summary: {{ include "app.label" . }} is experiencing high 5xx error rate. 
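The default* templates being removed here (defaultAlerts.yaml, defaultSLOs.yaml, and the old single-service dashboard) were driven by one flat set of values (appLabel, deploymentName, errorRate5xx, and so on), so the chart could only describe a single service. After the restructuring the same intent is expressed per entry in the services list. A rough mapping using the old sample defaults; note that the old ingressName/path selectors have no direct replacement, since the new nginx queries select on exported_service and the new Istio queries use destination_service_name instead of the app label:

```yaml
# Before: flat values consumed by the removed default* templates
appLabel: "sample"
deploymentName: "sample"
availabilitySLO: 99.9
latencySLO: 99
latency: 1000        # ms, used as the latency SLO bucket boundary
errorRate5xx: 0.05
errorRate4xx: 5

# After: roughly equivalent entry in the new services list
services:
  - serviceName: "sample"
    deploymentName: "sample"
    slo:
      availability: 99.9
      latency: 99
    alertThresholds:
      latencyMS: 1000
      rate5xx: 0.05
      rate4xx: 5
```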
- {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/{{ include "dashboard-uid" .}} - {{- end }} - labels: - severity: critical - - alert: {{ include "app.label" . }}IngressHigh4xxErrorRate - expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"4..", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[5m])) * 100 > {{ .Values.errorRate4xx }} - for: 10m - annotations: - description: {{ include "app.label" . }} service is experiencing high 4xx errors rate from last 5 minutes. - summary: {{ include "app.label" . }} service is experiencing high 4xx error rate. - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/{{ include "dashboard-uid" .}} - {{- end }} - labels: - severity: warning - {{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/nopo11y/templates/defaultSLOs.yaml b/charts/nopo11y/templates/defaultSLOs.yaml deleted file mode 100644 index 1172b77..0000000 --- a/charts/nopo11y/templates/defaultSLOs.yaml +++ /dev/null @@ -1,199 +0,0 @@ -{{- if .Values.enabled }} -{{- if .Values.istioMetrics.enabled }} -apiVersion: sloth.slok.dev/v1 -kind: PrometheusServiceLevel -metadata: - labels: - release: {{ .Values.prometheusReleaseLabel }} - managedby: nopo11y - name: {{ include "app.label" . }}-availability-slo - namespace: {{ .Values.namespace }} -spec: - labels: - app: sloth - role: alert-rules - component: {{ include "app.label" . }}-availability-SLO-rules - service: {{ include "app.label" . }} - slos: - - alerting: - annotations: - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/slo-detail?var-service={{ include "app.label" . }} - {{- end }} - summary: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - description: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that the - SLO is at risk as your error budget is getting exhausted. To know more about - ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: {{ include "app.label" . }} - availability SLO is at RISK - pageAlert: - labels: - alert_type: symptom - severity: critical - ticketAlert: - labels: - alert_type: symptom - severity: warning - description: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that the - SLO is at risk as your error budget is getting exhausted. To know more about - ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: availability-{{- if .Values.includeReleaseNameInMetricsLabels }}{{ .Release.Name }}-{{ include "app.label" . }}{{- else }}{{ include "app.label" . 
}}{{- end }} - objective: {{ .Values.availabilitySLO }} - sli: - events: - errorQuery: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", response_code=~"5.."}[{{ printf "{{.window}}" }}])) - totalQuery: sum(rate(istio_requests_total{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}"}[{{ printf "{{.window}}" }}])) ---- -apiVersion: sloth.slok.dev/v1 -kind: PrometheusServiceLevel -metadata: - labels: - release: {{ .Values.prometheusReleaseLabel }} - managedby: nopo11y - name: {{ include "app.label" . }}-latency-slo - namespace: {{ .Values.namespace }} -spec: - labels: - app: sloth - role: alert-rules - component: {{ include "app.label" . }}-latency-SLO-rules - service: {{ include "app.label" . }} - slos: - - alerting: - annotations: - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/slo-detail?var-service={{ include "app.label" . }} - {{- end }} - summary: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: {{ include "app.label" . }} - latency SLO is at RISK - pageAlert: - labels: - alert_type: symptom - severity: critical - ticketAlert: - labels: - alert_type: symptom - severity: warning - description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that the - SLO is at risk as your error budget is getting exhausted. To know more about - ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: latency-{{- if .Values.includeReleaseNameInMetricsLabels }}{{ .Release.Name }}-{{ include "app.label" . }}{{- else }}{{ include "app.label" . }}{{- end }} - objective: {{ .Values.latencySLO }} - sli: - events: - errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="+Inf"}[{{ printf "{{.window}}" }}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . }}", le="{{ .Values.latency }}"}[{{ printf "{{.window}}" }}]))) - totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}app="{{ include "app.label" . }}", destination_app=~"{{ include "app.label" . 
}}", le="+Inf"}[{{ printf "{{.window}}" }}])) -{{- end }} -{{- end }} ---- -{{- if .Values.enabled }} -{{- if .Values.nginxIngressMetrics.enabled }} -apiVersion: sloth.slok.dev/v1 -kind: PrometheusServiceLevel -metadata: - labels: - release: {{ .Values.prometheusReleaseLabel }} - managedby: nopo11y - name: {{ include "app.label" . }}-ingress-availability-slo - namespace: {{ .Values.namespace }} -spec: - labels: - app: sloth - role: alert-rules - component: {{ include "app.label" . }}-ingress-availability-SLO-rules - service: {{ include "app.label" . }}-ingress - slos: - - alerting: - annotations: - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/slo-detail?var-service={{ include "app.label" . }} - {{- end }} - summary: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - description: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: {{ include "app.label" . }}-ingress - availability SLO is at RISK - pageAlert: - labels: - alert_type: symptom - severity: critical - ticketAlert: - labels: - alert_type: symptom - severity: warning - description: SLO to measure success vs errors - {{ .Values.availabilitySLO }}% of the time requests should - be succesfully served (non 5xx). When you receive this alert it means that the - SLO is at risk as your error budget is getting exhausted. To know more about - ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: availability-{{ include "app.label" . }}-ingress - objective: {{ .Values.availabilitySLO }} - sli: - events: - errorQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",status=~"5..", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])) - totalQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])) ---- -apiVersion: sloth.slok.dev/v1 -kind: PrometheusServiceLevel -metadata: - labels: - release: {{ .Values.prometheusReleaseLabel }} - managedby: nopo11y - name: {{ include "app.label" . }}-ingress-latency-slo - namespace: {{ .Values.namespace }} -spec: - labels: - app: sloth - role: alert-rules - component: {{ include "app.label" . }}-ingress-latency-SLO-rules - service: {{ include "app.label" . }}-ingress - slos: - - alerting: - annotations: - {{- if .Values.grafanaURL }} - dashboard: {{ .Values.grafanaURL }}/d/slo-detail?var-service={{ include "app.label" . }} - {{- end }} - summary: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. 
To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that - the SLO is at risk as your error budget is getting exhausted. To know more - about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: {{ include "app.label" . }}-ingress - latency SLO is at RISK - pageAlert: - labels: - alert_type: symptom - severity: critical - ticketAlert: - labels: - alert_type: symptom - severity: warning - description: SLO to measure response time - {{ .Values.latencySLO }}% of the time requests should - be succesfully served in < {{ .Values.latency }}ms. When you receive this alert it means that the - SLO is at risk as your error budget is getting exhausted. To know more about - ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ - name: latency-{{ include "app.label" . }}-ingress - objective: {{ .Values.latencySLO }} - sli: - events: - errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="{{ divf .Values.latency 1000 }}", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}]))) - totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey .Values "cluster" }}cluster="{{ .Values.cluster }}", {{- end }}ingress=~"{{ .Values.nginxIngressMetrics.ingressName }}",le="+Inf", path="{{ .Values.nginxIngressMetrics.path }}"}[{{ printf "{{.window}}" }}])) -{{- end }} -{{- end }} diff --git a/charts/nopo11y/templates/slos.yaml b/charts/nopo11y/templates/slos.yaml new file mode 100644 index 0000000..d469164 --- /dev/null +++ b/charts/nopo11y/templates/slos.yaml @@ -0,0 +1,196 @@ +{{- if .Values.enabled }} +{{- range (include "nopo11y.services" . |fromJsonArray) }} +{{- if $.Values.istioMetrics.enabled }} +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + labels: + managedby: nopo11y + name: {{ .service }}-availability-slo + namespace: {{ $.Values.namespace }} +spec: + labels: + app: sloth + role: alert-rules + component: {{ .service }}-availability-SLO-rules + service: {{ .service }} + slos: + - alerting: + annotations: + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/slo-detail?var-service={{ .service }} + {{- end }} + summary: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + description: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that the + SLO is at risk as your error budget is getting exhausted. 
To know more about + ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: {{ .service }} - availability SLO is at RISK + pageAlert: + labels: + alert_type: symptom + severity: critical + ticketAlert: + labels: + alert_type: symptom + severity: warning + description: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that the + SLO is at risk as your error budget is getting exhausted. To know more about + ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: availability-{{ .service }} + objective: {{ .availability }} + sli: + events: + errorQuery: sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"5.."}[{{ printf "{{.window}}" }}])) + totalQuery: sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[{{ printf "{{.window}}" }}])) +--- +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + labels: + managedby: nopo11y + name: {{ .service }}-latency-slo + namespace: {{ $.Values.namespace }} +spec: + labels: + app: sloth + role: alert-rules + component: {{ .service }}-latency-SLO-rules + service: {{ .service }} + slos: + - alerting: + annotations: + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/slo-detail?var-service={{ .service }} + {{- end }} + summary: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + description: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: {{ .service }} - latency SLO is at RISK + pageAlert: + labels: + alert_type: symptom + severity: critical + ticketAlert: + labels: + alert_type: symptom + severity: warning + description: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that the + SLO is at risk as your error budget is getting exhausted. To know more about + ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: latency-{{ .service }} + objective: {{ .latency }} + sli: + events: + errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", le="+Inf"}[{{ printf "{{.window}}" }}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey . 
"cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", le="{{ .latencyThreshold }}"}[{{ printf "{{.window}}" }}]))) + totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", le="+Inf"}[{{ printf "{{.window}}" }}])) +--- +{{- end }} +{{- if $.Values.nginxIngressMetrics.enabled }} +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + labels: + managedby: nopo11y + name: {{ .service }}-ingress-availability-slo + namespace: {{ $.Values.namespace }} +spec: + labels: + app: sloth + role: alert-rules + component: {{ .service }}-ingress-availability-SLO-rules + service: {{ .service }} + slos: + - alerting: + annotations: + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/slo-detail?var-service={{ .service }} + {{- end }} + summary: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + description: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: {{ .service }}-ingress - availability SLO is at RISK + pageAlert: + labels: + alert_type: symptom + severity: critical + ticketAlert: + labels: + alert_type: symptom + severity: warning + description: SLO to measure success vs errors - {{ .availability }}% of the time requests should + be succesfully served (non 5xx). When you receive this alert it means that the + SLO is at risk as your error budget is getting exhausted. To know more about + ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: availability-{{ .service }}-ingress + objective: {{ .availability }} + sli: + events: + errorQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}status=~"5..", exported_service="{{ .service }}"}[{{ printf "{{.window}}" }}])) + totalQuery: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}exported_service="{{ .service }}"}[{{ printf "{{.window}}" }}])) +--- +apiVersion: sloth.slok.dev/v1 +kind: PrometheusServiceLevel +metadata: + labels: + managedby: nopo11y + name: {{ .service }}-ingress-latency-slo + namespace: {{ $.Values.namespace }} +spec: + labels: + app: sloth + role: alert-rules + component: {{ .service }}-ingress-latency-SLO-rules + service: {{ .service }} + slos: + - alerting: + annotations: + {{- if $.Values.grafanaURL }} + dashboard: {{ $.Values.grafanaURL }}/d/slo-detail?var-service={{ .service }} + {{- end }} + summary: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. 
To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + description: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that + the SLO is at risk as your error budget is getting exhausted. To know more + about ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: {{ .service }} - latency SLO is at RISK + pageAlert: + labels: + alert_type: symptom + severity: critical + ticketAlert: + labels: + alert_type: symptom + severity: warning + description: SLO to measure response time - {{ .latency }}% of the time requests should + be succesfully served in < {{ .latencyThreshold }}ms. When you receive this alert it means that the + SLO is at risk as your error budget is getting exhausted. To know more about + ErrorBudgets and SLOs read https://sre.google/workbook/implementing-slos/ + name: latency-{{ .service }}-ingress + objective: {{ .latency }} + sli: + events: + errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}le="+Inf", exported_service="{{ .service }}"}[{{ printf "{{.window}}" }}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}le="{{ divf .latencyThreshold 1000 }}", exported_service="{{ .service }}"}[{{ printf "{{.window}}" }}]))) + totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}le="+Inf", exported_service="{{ .service }}"}[{{ printf "{{.window}}" }}])) +--- +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/nopo11y/values.yaml b/charts/nopo11y/values.yaml index 48e6933..6496629 100644 --- a/charts/nopo11y/values.yaml +++ b/charts/nopo11y/values.yaml @@ -1,20 +1,34 @@ enabled: false + +defaults: + slo: + availability: 99.9 + latency: 99 + alertThresholds: + latencyMS: 100 + rate5xx: 0.05 + rate4xx: 5 + namespace: observability -appLabel: "sample" -includeReleaseNameInMetricsLabels: false -deploymentName: "sample" -prometheusReleaseLabel: "nopo11y-stack" -availabilitySLO: 99.9 -latencySLO: 99 -latency: 1000 -errorRate5xx: 0.05 -errorRate4xx: 5 + +prependReleaseName: false + grafanaURL: "" -logLabel: "" -logLabelValue: "" + istioMetrics: enabled: true + nginxIngressMetrics: enabled: false - ingressName: "sample-ingress" - path: "/" \ No newline at end of file +services: [] +# - serviceName: "sample" +# deploymentName: "sample" + +# slo: {} +# # availability: 99.9 +# # latency: 99 + +# alertThresholds: {} +# # latencyMS: 100 +# # rate4xx: 5 +# # rate5xx: 0.05 \ No newline at end of file
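Putting the new schema together, a multi-service configuration for one release might look like the sketch below (the service names, Grafana URL, and overridden thresholds are hypothetical; any field omitted for a service is filled in from the defaults block by the nopo11y.services helper):

```yaml
enabled: true
namespace: observability
grafanaURL: "https://grafana.example.com"   # hypothetical
prependReleaseName: true
istioMetrics:
  enabled: true
nginxIngressMetrics:
  enabled: false
defaults:
  slo:
    availability: 99.9
    latency: 99
  alertThresholds:
    latencyMS: 100
    rate5xx: 0.05
    rate4xx: 5
services:
  - serviceName: "orders"          # hypothetical; relies entirely on defaults
    deploymentName: "orders"
  - serviceName: "payments"        # hypothetical; tighter availability and 5xx threshold
    deploymentName: "payments-api"
    slo:
      availability: 99.99
    alertThresholds:
      rate5xx: 0.01
```

With prependReleaseName: true the helper prefixes the release name, so for a hypothetical release called myrelease the entries render as myrelease-orders and myrelease-payments, and each one gets its own PrometheusRule, Grafana dashboard ConfigMap, and pair of Sloth PrometheusServiceLevel objects in the namespace set by .Values.namespace.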