Skip to content

Commit

Permalink
Restructured nopo11y helm chart (#12)
Browse files Browse the repository at this point in the history
* Removed appLabel dependency from the metrics

* Restructured nopo11y helm chart to support multiple services in single deployment

* Values file comment cleanup and added global value for thresholds

* Added global defaults for SLO objectives and alert thresholds

* Used service name instead of deployment name

* Updated latency to latencyMS in alertThresholds

* Replaced .deployment with .service in alerts template

* Updated prepend release name key

* changed service name with deployment name

---------

Co-authored-by: Shehbaz Pathan (Consultant) <[email protected]>
  • Loading branch information
shehbaz-pathan and Shehbaz Pathan (Consultant) authored Jun 27, 2024
1 parent 6eeb7b5 commit 3859db3
Show file tree
Hide file tree
Showing 8 changed files with 386 additions and 322 deletions.
2 changes: 1 addition & 1 deletion charts/nopo11y/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ version: 1.0.2
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.1.0"
appVersion: "2.0.0"
82 changes: 67 additions & 15 deletions charts/nopo11y/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -1,20 +1,72 @@
{{- define "dashboard-uid" -}}
{{- printf "%s-%s" .Release.Name .Release.Namespace | trunc 40 -}}
{{- /*
nopo11y.services

Builds one dict per entry in .Values.services and emits the resulting list as
JSON (callers decode it with fromJsonArray). Each dict carries the keys
deployment, service, availability, latency, rate5xx, rate4xx,
latencyThreshold and dashboarduid. Per-service values fall back to
.Values.defaults when they are not set on the entry.
*/ -}}
{{- define "nopo11y.services" -}}
{{- /* Accumulator for the per-service dicts. */ -}}
{{- $servicesList:= list }}
{{- /* Chart-wide fallbacks read once from .Values.defaults. */ -}}
{{- $defaulAvailability:= .Values.defaults.slo.availability -}}
{{- $defaulLatency:= .Values.defaults.slo.latency -}}
{{- $defaulLatencyThreshold:= .Values.defaults.alertThresholds.latencyMS -}}
{{- $defaul5xx:= .Values.defaults.alertThresholds.rate5xx -}}
{{- $default4xx:= .Values.defaults.alertThresholds.rate4xx -}}
{{- /* Optional "<release>-" prefix applied to deployment and service names. */ -}}
{{- $release:= "" }}
{{- if .Values.prependReleaseName -}}
{{- $release = printf "%s-" .Release.Name }}
{{- end }}
{{- /* Captured here because "." is rebound to each service inside the range below. */ -}}
{{- $namespace:= .Release.Namespace }}
{{- range .Values.services }}
{{- $service:= dict }}
{{- /*
Both names are mandatory. A single truthiness check per field rejects a
missing key, a null value and an empty string alike, so an entry with only
one of the two names filled in can no longer slip through.
*/ -}}
{{- if or (not .deploymentName) (not .serviceName) -}}
{{- fail "deploymentName and serviceName are required for each service" -}}
{{- end -}}
{{ $service = set $service "deployment" (printf "%s%s" $release .deploymentName) }}
{{ $service = set $service "service" (printf "%s%s" $release .serviceName) }}
{{- /*
SLO objectives: take the per-service value when it is set and non-empty,
otherwise fall back to the chart-wide default. An absent "slo" key is treated
as an empty set of overrides.
*/ -}}
{{- $sloOverrides := dict }}
{{- if hasKey . "slo" }}
{{- $sloOverrides = .slo }}
{{- end }}
{{ $service = set $service "availability" (default $defaulAvailability $sloOverrides.availability) }}
{{ $service = set $service "latency" (default $defaulLatency $sloOverrides.latency) }}
{{- if not (hasKey . "alertThresholds") }}
{{ $service = set $service "rate5xx" $defaul5xx }}
{{ $service = set $service "rate4xx" $default4xx }}
{{ $service = set $service "latencyThreshold" $defaulLatencyThreshold }}
{{- else if hasKey . "alertThresholds" }}
{{- /*
Error-rate thresholds. Each threshold is decided by its OWN key: a missing,
null or zero value falls back to the chart default, anything else overrides
it. The rate4xx fallback must test rate4xx (not rate5xx), otherwise a
service-level rate4xx is ignored whenever rate5xx is left unset.
*/ -}}
{{- if not .alertThresholds.rate5xx }}
{{ $service = set $service "rate5xx" $defaul5xx }}
{{- else }}
{{ $service = set $service "rate5xx" .alertThresholds.rate5xx }}
{{- end -}}
{{- if not .alertThresholds.rate4xx }}
{{ $service = set $service "rate4xx" $default4xx }}
{{- else }}
{{ $service = set $service "rate4xx" .alertThresholds.rate4xx }}
{{- end -}}


{{- define "app.label" -}}
{{- if .Values.includeReleaseNameInMetricsLabels }}
{{- printf "%s-%s" .Release.Name .Values.appLabel -}}
{{- else -}}
{{ printf "%s" .Values.appLabel }}
{{- /* Latency alert threshold: per-service latencyMS when set and non-empty, else the chart default. */ -}}
{{ $service = set $service "latencyThreshold" (default $defaulLatencyThreshold .alertThresholds.latencyMS) }}
{{- end }}

{{- define "deployment.name" -}}
{{- if .Values.includeReleaseNameInMetricsLabels }}
{{- printf "%s-%s" .Release.Name .Values.deploymentName -}}
{{- else -}}
{{ printf "%s" .Values.deploymentName }}
{{- /*
Dashboard UID is "<serviceName>-<namespace>" (the raw serviceName, without the
release prefix). Grafana limits dashboard UIDs to 40 characters, so the value
is capped to keep long service/namespace names usable.
*/ -}}
{{ $service = set $service "dashboarduid" (printf "%s-%s" .serviceName $namespace | trunc 40) }}
{{ $servicesList = append $servicesList $service }}
{{- end }}
{{- toJson $servicesList }}
{{- end }}
68 changes: 68 additions & 0 deletions charts/nopo11y/templates/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{{- /*
Renders one PrometheusRule per entry returned by the "nopo11y.services"
helper (decoded from JSON with fromJsonArray). Inside the range "." is the
per-service dict, so chart values are reached through "$".

Rules are emitted for Istio and/or NGINX ingress metrics depending on
.Values.istioMetrics.enabled and .Values.nginxIngressMetrics.enabled; at
least one of the two must be enabled. Thresholds come from the per-service
rate5xx / rate4xx values.
*/ -}}
{{- if .Values.enabled }}
{{- /* "not" also catches an unset/null namespace, which a comparison against "" lets through. */ -}}
{{- if not .Values.namespace }}
{{- fail "values.namespace is required" }}
{{- end }}
{{- if and (not .Values.istioMetrics.enabled) (not .Values.nginxIngressMetrics.enabled) }}
{{- fail "Enabling either istioMetrics or nginxIngressMetrics is required" }}
{{- end }}
{{- range (include "nopo11y.services" . | fromJsonArray) }}
{{- /* Optional cluster label matcher, prepended to every selector when the service dict has a "cluster" key. */ -}}
{{- $clusterSelector := "" }}
{{- if hasKey . "cluster" }}
{{- $clusterSelector = printf `cluster="%v",` .cluster }}
{{- end }}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  namespace: {{ $.Values.namespace | quote }}
  name: {{ .service }}-nopo11y-alert-rules
  labels:
    managedby: nopo11y
spec:
  groups:
    - name: {{ .service }}-nopo11y-alert-rules
      rules:
        {{- if $.Values.istioMetrics.enabled }}
        - alert: {{ .service }}High5xxErrorRate
          expr: sum(rate(istio_requests_total{ {{- $clusterSelector }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- $clusterSelector }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate5xx }}
          annotations:
            description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes.
            summary: {{ .service }} service is experiencing high 5xx error rate.
            {{- if $.Values.grafanaURL }}
            dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }}
            {{- end }}
          labels:
            severity: critical
        - alert: {{ .service }}High4xxErrorRate
          expr: sum(rate(istio_requests_total{ {{- $clusterSelector }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {{- $clusterSelector }}source_workload!~"unknown", reporter="source", destination_service_name="{{ .service }}"}[5m])) by (instance) * 100 > {{ .rate4xx }}
          for: 5m
          annotations:
            description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes.
            summary: {{ .service }} service is experiencing high 4xx error rate.
            {{- if $.Values.grafanaURL }}
            dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }}
            {{- end }}
          labels:
            severity: warning
        {{- end }}
        {{- if $.Values.nginxIngressMetrics.enabled }}
        - alert: {{ .service }}High5xxErrorRate-NginxIngress
          expr: sum(rate(nginx_ingress_controller_requests{ {{- $clusterSelector }}status=~"5..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- $clusterSelector }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate5xx }}
          annotations:
            description: {{ .service }} service is experiencing high 5xx errors rate from last 5 minutes.
            summary: {{ .service }} is experiencing high 5xx error rate.
            {{- if $.Values.grafanaURL }}
            dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }}
            {{- end }}
          labels:
            severity: critical
        - alert: {{ .service }}High4xxErrorRate-NginxIngress
          expr: sum(rate(nginx_ingress_controller_requests{ {{- $clusterSelector }}status=~"4..", exported_service="{{ .service }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {{- $clusterSelector }}exported_service="{{ .service }}"}[5m])) * 100 > {{ .rate4xx }}
          {{- /* NOTE(review): this rule waits 10m while the Istio 4xx rule waits 5m and the description says "5 minutes" - confirm which is intended. */}}
          for: 10m
          annotations:
            description: {{ .service }} service is experiencing high 4xx errors rate from last 5 minutes.
            summary: {{ .service }} service is experiencing high 4xx error rate.
            {{- if $.Values.grafanaURL }}
            dashboard: {{ $.Values.grafanaURL }}/d/{{ .dashboarduid }}
            {{- end }}
          labels:
            severity: warning
        {{- end }}
{{- end }}
{{- end }}
Loading

0 comments on commit 3859db3

Please sign in to comment.