-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Restructured nopo11y helm chart (#12)
* Removed appLabel dependency from the metrics * Restructured nopo11y helm chart to support multiple services in single deployment * Values file comment cleanup and added global value for thresholds * Added global defaults for SLO objectives and alert thresholds * Used service name instead of deployment name * Updated latency to latencyMS in alertThresholds * Replaced .deployment with .service in alerts template * Updated prepend release name key * changed service name with deployment name --------- Co-authored-by: Shehbaz Pathan (Consultant) <[email protected]>
- Loading branch information
1 parent
6eeb7b5
commit 3859db3
Showing
8 changed files
with
386 additions
and
322 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,72 @@ | ||
{{- define "dashboard-uid" -}} | ||
{{- printf "%s-%s" .Release.Name .Release.Namespace | trunc 40 -}} | ||
{{- define "nopo11y.services" -}} | ||
{{- $servicesList:= list }} | ||
{{- $defaulAvailability:= .Values.defaults.slo.availability -}} | ||
{{- $defaulLatency:= .Values.defaults.slo.latency -}} | ||
{{- $defaulLatencyThreshold:= .Values.defaults.alertThresholds.latencyMS -}} | ||
{{- $defaul5xx:= .Values.defaults.alertThresholds.rate5xx -}} | ||
{{- $default4xx:= .Values.defaults.alertThresholds.rate4xx -}} | ||
{{- $release:= "" }} | ||
{{- if .Values.prependReleaseName -}} | ||
{{- $release = printf "%s-" .Release.Name }} | ||
{{- end }} | ||
{{- $namespace:= .Release.Namespace }} | ||
{{- range .Values.services }} | ||
{{- $service:= dict }} | ||
{{- /* Both names are mandatory: reject the entry when either key is missing OR either value is empty. */ -}} | ||
{{- if or (not (hasKey . "deploymentName")) (not (hasKey . "serviceName")) -}} | ||
{{- fail "deploymentName and serviceName are required for each service" -}} | ||
{{- else if or (not .deploymentName) (not .serviceName) -}} | ||
{{- fail "deploymentName and serviceName are required for each service" -}} | ||
{{- end -}} | ||
{{ $service = set $service "deployment" (printf "%s%s" $release .deploymentName) }} | ||
{{ $service = set $service "service" (printf "%s%s" $release .serviceName) }} | ||
{{- if not (hasKey . "slo") }} | ||
{{ $service = set $service "availability" $defaulAvailability }} | ||
{{ $service = set $service "latency" $defaulLatency }} | ||
{{- else if hasKey . "slo" }} | ||
{{- if not (hasKey .slo "availability") }} | ||
{{ $service = set $service "availability" $defaulAvailability }} | ||
{{- else if not .slo.availability }} | ||
{{ $service = set $service "availability" $defaulAvailability }} | ||
{{- else }} | ||
{{ $service = set $service "availability" .slo.availability }} | ||
{{- end -}} | ||
{{- if not (hasKey .slo "latency") }} | ||
{{ $service = set $service "latency" $defaulLatency }} | ||
{{- else if not .slo.latency }} | ||
{{ $service = set $service "latency" $defaulLatency }} | ||
{{- else }} | ||
{{ $service = set $service "latency" .slo.latency }} | ||
{{- end }} | ||
{{- end }} | ||
{{- if not (hasKey . "alertThresholds") }} | ||
{{ $service = set $service "rate5xx" $defaul5xx }} | ||
{{ $service = set $service "rate4xx" $default4xx }} | ||
{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} | ||
{{- else if hasKey . "alertThresholds" }} | ||
{{- if not (hasKey .alertThresholds "rate5xx") }} | ||
{{ $service = set $service "rate5xx" $defaul5xx }} | ||
{{- else if not .alertThresholds.rate5xx }} | ||
{{ $service = set $service "rate5xx" $defaul5xx }} | ||
{{- else }} | ||
{{ $service = set $service "rate5xx" .alertThresholds.rate5xx }} | ||
{{- end -}} | ||
{{- /* rate4xx: use the per-service value when it is set and non-empty, otherwise fall back to the global default. */ -}} | ||
{{- if not (hasKey .alertThresholds "rate4xx") }} | ||
{{ $service = set $service "rate4xx" $default4xx }} | ||
{{- else if not .alertThresholds.rate4xx }} | ||
{{ $service = set $service "rate4xx" $default4xx }} | ||
{{- else }} | ||
{{ $service = set $service "rate4xx" .alertThresholds.rate4xx }} | ||
{{- end -}} | ||
|
||
|
||
{{- define "app.label" -}} | ||
{{- if .Values.includeReleaseNameInMetricsLabels }} | ||
{{- printf "%s-%s" .Release.Name .Values.appLabel -}} | ||
{{- else -}} | ||
{{ printf "%s" .Values.appLabel }} | ||
{{- if not (hasKey .alertThresholds "latencyMS") }} | ||
{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} | ||
{{- else if not .alertThresholds.latencyMS }} | ||
{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }} | ||
{{- else }} | ||
{{ $service = set $service "latencyThreshold" .alertThresholds.latencyMS }} | ||
{{- end }} | ||
{{- end }} | ||
|
||
{{- define "deployment.name" -}} | ||
{{- if .Values.includeReleaseNameInMetricsLabels }} | ||
{{- printf "%s-%s" .Release.Name .Values.deploymentName -}} | ||
{{- else -}} | ||
{{ printf "%s" .Values.deploymentName }} | ||
{{- /* Dashboard uid is "<serviceName>-<namespace>", truncated to 40 characters (Grafana's uid length limit), as the previous dashboard-uid helper did. */}} | ||
{{ $service = set $service "dashboarduid" (printf "%s-%s" .serviceName $namespace | trunc 40) }} | ||
{{ $servicesList = append $servicesList $service }} | ||
{{- end }} | ||
{{- toJson $servicesList }} | ||
{{- end }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
{{- if .Values.enabled }} | ||
{{- /* Fail fast on invalid configuration: a target namespace (non-empty, not unset) and at least one metrics source are required. */}} | ||
{{- if not .Values.namespace }} | ||
{{ fail "values.namespace is required" }} | ||
{{- end }} | ||
{{- if and (not .Values.istioMetrics.enabled) (not .Values.nginxIngressMetrics.enabled ) }} | ||
{{ fail "Enabling either istioMetrics or nginxIngressMetrics is required" }} | ||
{{- end }} | ||
{{- /* | ||
Emits one PrometheusRule document per entry returned by the "nopo11y.services" helper. | ||
Inside the loop "." is the per-service dict (.service, .rate5xx, .rate4xx, .dashboarduid); | ||
chart-level values are reached through "$". | ||
Each expression is a ratio of error requests to all requests over 5m, scaled to a percentage | ||
and compared against the service's threshold. | ||
A cluster="..." matcher is added only when the entry has a "cluster" key. | ||
NOTE(review): the visible part of "nopo11y.services" never sets "cluster" - confirm where it is meant to come from. | ||
*/}} | ||
{{- range (include "nopo11y.services" . |fromJsonArray) }} | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: PrometheusRule | ||
metadata: | ||
namespace: {{ $.Values.namespace }} | ||
name: {{ .service }}-nopo11y-alert-rules | ||
labels: | ||
managedby: nopo11y | ||
spec: | ||
groups: | ||
- name: {{ .service }}-nopo11y-alert-rules | ||
rules: | ||
{{- /* Istio-based alerts: built from istio_requests_total filtered by destination_service_name. */}} | ||
{{- if $.Values.istioMetrics.enabled }} | ||
- alert: {{ .service }}High5xxErrorRate | ||
expr: sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate5xx }} | ||
annotations: | ||
description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes. | ||
summary: {{ .service }} service is experiencing high 5xx error rate. | ||
{{- if $.Values.grafanaURL }} | ||
dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} | ||
{{- end }} | ||
labels: | ||
severity: critical | ||
- alert: {{ .service }}High4xxErrorRate | ||
expr: sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate4xx }} | ||
for: 5m | ||
annotations: | ||
{{- if $.Values.grafanaURL }} | ||
dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} | ||
{{- end }} | ||
description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes. | ||
summary: {{ .service }} service is experiencing high 4xx error rate. | ||
labels: | ||
severity: warning | ||
{{- end }} | ||
{{- /* NGINX-ingress-based alerts: built from nginx_ingress_controller_requests filtered by exported_service. */}} | ||
{{- /* NOTE(review): the 4xx alert here uses "for: 10m" while the Istio 4xx alert uses "for: 5m" - confirm the difference is intended. */}} | ||
{{- if $.Values.nginxIngressMetrics.enabled }} | ||
- alert: {{ .service }}High5xxErrorRate-NginxIngress | ||
expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}status=~"5..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate5xx }} | ||
annotations: | ||
description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes. | ||
summary: {{ .service }} is experiencing high 5xx error rate. | ||
{{- if $.Values.grafanaURL }} | ||
dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} | ||
{{- end }} | ||
labels: | ||
severity: critical | ||
- alert: {{ .service }}High4xxErrorRate-NginxIngress | ||
expr: sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}status=~"4..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- if hasKey . "cluster" }}cluster="{{ .cluster }}", {{- end }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate4xx }} | ||
for: 10m | ||
annotations: | ||
description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes. | ||
summary: {{ .service }} service is experiencing high 4xx error rate. | ||
{{- if $.Values.grafanaURL }} | ||
dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }} | ||
{{- end }} | ||
labels: | ||
severity: warning | ||
{{- end }} | ||
--- | ||
{{- end }} | ||
{{- end }} |
Oops, something went wrong.