From d59d484717b77e4205377f975ade8b06cbcaa869 Mon Sep 17 00:00:00 2001
From: Philip Gough <philip.p.gough@gmail.com>
Date: Wed, 29 Nov 2023 10:55:22 +0000
Subject: [PATCH] Add an alert to catch mismatch of sts replicas vs
 expected/ready

---
 docs/sop/observatorium.md                     | 34 +++++++++++++++++++
 observability/prometheusrules.jsonnet         | 14 ++++++++
 ...om-metrics-production.prometheusrules.yaml | 13 +++++++
 ...-custom-metrics-stage.prometheusrules.yaml | 13 +++++++
 4 files changed, 74 insertions(+)

diff --git a/docs/sop/observatorium.md b/docs/sop/observatorium.md
index d04a7c213c..63e1d06c3f 100644
--- a/docs/sop/observatorium.md
+++ b/docs/sop/observatorium.md
@@ -31,6 +31,7 @@
   * [ObservatoriumNoRulesLoaded](#observatoriumnorulesloaded)
   * [ObservatoriumPersistentVolumeUsageHigh](#observatoriumpersistentvolumeusagehigh)
   * [ObservatoriumPersistentVolumeUsageCritical](#observatoriumpersistentvolumeusagecritical)
+  * [ObservatoriumExpectedReplicasUnavailable](#observatoriumexpectedreplicasunavailable)
 * [Observatorium Gubernator Alerts](#observatorium-gubernator-alerts)
   * [GubernatorIsDown](#gubernatorisdown)
 * [Observatorium Obsctl Reloader Alerts](#observatorium-obsctl-reloader-alerts)
@@ -866,6 +867,39 @@ One or more PVCs are filled to more than 95%. The remaining free space does not
 - Locate the affected deployment in the [AppSRE Interface](https://gitlab.cee.redhat.com/service/app-interface/-/tree/master/data/services/rhobs), depending on which namespace the alert is coming from
 - Increase the size of the PVC by adjusting the relevant parameter in one of the `saas.yaml` files
 
+## ObservatoriumExpectedReplicasUnavailable
+
+### Impact
+
+A StatefulSet belonging to the RHOBS service is not running the expected number of replicas for a prolonged period of time.
+This may impact the metric query or ingest performance of the system.
+
+### Summary
+
+A StatefulSet has fewer ready replicas than expected. Possible causes include:
+
+1. Pod stuck in a terminating state.
+2. Pod unable to be scheduled due to constraints on the cluster such as node capacity or resource limits.
+
+### Severity
+
+`critical` in production, `high` in stage
+
+### Access Required
+
+- Console access to the cluster that runs Observatorium.
+- Edit access to the Observatorium namespaces:
+  - `observatorium-metrics-stage`
+  - `observatorium-metrics-production`
+  - `observatorium-mst-stage`
+  - `observatorium-mst-production`
+
+### Steps
+
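+- Check the alert's `statefulset` and `namespace` labels to establish which component is affected.
+- Determine the reason for the missing replica(s), e.g. by inspecting the StatefulSet's pods and their events for the causes listed in the summary above.
+- Act on the above information to address the issue.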
+
 # Observatorium Gubernator Alerts
 
 ## GubernatorIsDown
diff --git a/observability/prometheusrules.jsonnet b/observability/prometheusrules.jsonnet
index 74b56d54e9..ead8a142be 100644
--- a/observability/prometheusrules.jsonnet
+++ b/observability/prometheusrules.jsonnet
@@ -357,6 +357,20 @@ local renderAlerts(name, environment, mixin) = {
                 severity: 'critical',
               },
             },
+            {  // Alert rule: a StatefulSet reports fewer ready replicas than kube_statefulset_replicas for a sustained period.
+              alert: 'ObservatoriumExpectedReplicasUnavailable',
+              annotations: {
+                description: 'The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.',
+                summary: 'One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period.',
+              },
+              expr: |||
+                kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0
+              |||,  // NOTE(review): no namespace selector in this expression; it matches every StatefulSet series available unless scoped elsewhere. Confirm this is intended.
+              'for': '20m',  // The expression must stay true for 20 minutes before the alert fires.
+              labels: {
+                severity: 'critical',
+              },
+            },
           ],
         },
       ],
diff --git a/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml
index b796663a52..dc7cc589e1 100644
--- a/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml
+++ b/resources/observability/prometheusrules/observatorium-custom-metrics-production.prometheusrules.yaml
@@ -67,3 +67,16 @@ spec:
       labels:
         service: telemeter
         severity: critical
+    - alert: ObservatoriumExpectedReplicasUnavailable  # NOTE(review): mirrors the rule in observability/prometheusrules.jsonnet; presumably generated, confirm before hand-editing
+      annotations:
+        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-metrics?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
+        description: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.
+        message: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.
+        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumexpectedreplicasunavailable
+        summary: One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period.
+      expr: |
+        kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0
+      for: 20m  # the expression must stay true for 20 minutes before the alert fires
+      labels:
+        service: telemeter
+        severity: critical
diff --git a/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml
index f4540d8eab..7937af1ba9 100644
--- a/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml
+++ b/resources/observability/prometheusrules/observatorium-custom-metrics-stage.prometheusrules.yaml
@@ -67,3 +67,16 @@ spec:
       labels:
         service: telemeter
         severity: high
+    - alert: ObservatoriumExpectedReplicasUnavailable  # NOTE(review): mirrors the rule in observability/prometheusrules.jsonnet; presumably generated, confirm before hand-editing
+      annotations:
+        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-metrics?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
+        description: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.
+        message: The StatefulSet {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has a mismatch between the expected and ready replicas.
+        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumexpectedreplicasunavailable
+        summary: One or more workloads in Observatorium persistently have less replicas in a ready state than expected for an extended period.
+      expr: |
+        kube_statefulset_replicas - kube_statefulset_status_replicas_ready > 0
+      for: 20m  # the expression must stay true for 20 minutes before the alert fires
+      labels:
+        service: telemeter
+        severity: high