From d59d484717b77e4205377f975ade8b06cbcaa869 Mon Sep 17 00:00:00 2001 From: Philip Gough Date: Wed, 29 Nov 2023 10:55:22 +0000 Subject: [PATCH] Add an alert to catch mismatch of sts replicas vs expected/ready --- docs/sop/observatorium.md | 34 +++++++++++++++++++ observability/prometheusrules.jsonnet | 14 ++++++++ ...om-metrics-production.prometheusrules.yaml | 13 +++++++ ...-custom-metrics-stage.prometheusrules.yaml | 13 +++++++ 4 files changed, 74 insertions(+) diff --git a/docs/sop/observatorium.md b/docs/sop/observatorium.md index d04a7c213c..63e1d06c3f 100644 --- a/docs/sop/observatorium.md +++ b/docs/sop/observatorium.md @@ -31,6 +31,7 @@ * [ObservatoriumNoRulesLoaded](#observatoriumnorulesloaded) * [ObservatoriumPersistentVolumeUsageHigh](#observatoriumpersistentvolumeusagehigh) * [ObservatoriumPersistentVolumeUsageCritical](#observatoriumpersistentvolumeusagecritical) + * [ObservatoriumExpectedReplicasUnavailable](#observatoriumexpectedreplicasunavailable) * [Observatorium Gubernator Alerts](#observatorium-gubernator-alerts) * [GubernatorIsDown](#gubernatorisdown) * [Observatorium Obsctl Reloader Alerts](#observatorium-obsctl-reloader-alerts) @@ -866,6 +867,39 @@ One or more PVCs are filled to more than 95%. The remaining free space does not - Locate the affected deployment in the [AppSRE Interface](https://gitlab.cee.redhat.com/service/app-interface/-/tree/master/data/services/rhobs), depending on which namespace the alert is coming from - Increase the size of the PVC by adjusting the relevant parameter in one of the `saas.yaml` files +## ObservatoriumExpectedReplicasUnavailable + +### Impact + +A StatefulSet belonging to the RHOBS service is not running the expected number of replicas for a prolonged period of time. +This may impact the metric query or ingest performance of the system. + +### Summary + +A StatefulSet has an undesired number of replicas. This may be caused by a number of reasons, including: + +1. Pod stuck in a terminating state. +2. 
Pod unable to be scheduled due to constraints on the cluster such as node capacity or resource limits. + +### Severity + +`critical` + +### Access Required + +- Console access to the cluster that runs Observatorium. +- Edit access to the Observatorium namespaces: + - `observatorium-metrics-stage` + - `observatorium-metrics-production` + - `observatorium-mst-stage` + - `observatorium-mst-production` + +### Steps + +- Check the alert and establish which component is the one affected. +- Determine the reason for the missing replica(s). +- Act on the above information to address the issue. + # Observatorium Gubernator Alerts ## GubernatorIsDown diff --git a/observability/prometheusrules.jsonnet b/observability/prometheusrules.jsonnet index 74b56d54e9..ead8a142be 100644 --- a/observability/prometheusrules.jsonnet +++ b/observability/prometheusrules.jsonnet @@ -357,6 +357,20 @@ local renderAlerts(name, environment, mixin) = { severity: 'critical', }, }, + { + alert: 'ObservatoriumExpectedReplicasUnavailable', + annotations: { + description: 'The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.', + summary: 'One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period.', + }, + expr: ||| + kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0 + |||, + 'for': '20m', + labels: { + severity: 'critical', + }, + }, ], }, ], diff --git a/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml index b796663a52..dc7cc589e1 100644 --- a/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml @@ -67,3 +67,16 @@ spec: labels: 
service: telemeter severity: critical + - alert: ObservatoriumExpectedReplicasUnavailable + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-metrics?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + description: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas. + message: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas. + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumexpectedreplicasunavailable + summary: One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period. + expr: | + kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0 + for: 20m + labels: + service: telemeter + severity: critical diff --git a/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml index f4540d8eab..7937af1ba9 100644 --- a/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml @@ -67,3 +67,16 @@ spec: labels: service: telemeter severity: high + - alert: ObservatoriumExpectedReplicasUnavailable + annotations: + dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-metrics?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m + description: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and 
ready replicas. + message: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas. + runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumexpectedreplicasunavailable + summary: One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period. + expr: | + kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0 + for: 20m + labels: + service: telemeter + severity: high