From f99e588c315a68173bda213d65cd658f9368a853 Mon Sep 17 00:00:00 2001
From: Simon Pasquier
Date: Mon, 18 Nov 2024 14:48:48 +0100
Subject: [PATCH] feat: deploy PrometheusRule resource

This change deploys a PrometheusRule resource containing a single
alerting rule. The alert fires when more than 10% of a controller's
reconciliations have failed over the last 5 minutes and the condition
has persisted for more than 15 minutes.

Signed-off-by: Simon Pasquier
---
 ...bility-operator.clusterserviceversion.yaml |  3 +-
 .../observability-operator-cluster-role.yaml  |  1 +
 pkg/controllers/operator/components.go        | 46 +++++++++++++++++++
 pkg/controllers/operator/controller.go        |  8 ++--
 4 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/bundle/manifests/observability-operator.clusterserviceversion.yaml b/bundle/manifests/observability-operator.clusterserviceversion.yaml
index 2e0a95b2..a5ad36a0 100644
--- a/bundle/manifests/observability-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/observability-operator.clusterserviceversion.yaml
@@ -42,7 +42,7 @@ metadata:
     categories: Monitoring
     certified: "false"
     containerImage: observability-operator:0.4.2
-    createdAt: "2024-11-18T09:45:14Z"
+    createdAt: "2024-11-18T14:49:04Z"
     description: A Go based Kubernetes operator to setup and manage highly available
       Monitoring Stack using Prometheus, Alertmanager and Thanos Querier.
     operatorframework.io/cluster-monitoring: "true"
@@ -414,6 +414,7 @@ spec:
       - apiGroups:
         - monitoring.coreos.com
         resources:
+        - prometheusrules
        - servicemonitors
        verbs:
        - create
diff --git a/deploy/operator/observability-operator-cluster-role.yaml b/deploy/operator/observability-operator-cluster-role.yaml
index 16c472ff..867a06fd 100644
--- a/deploy/operator/observability-operator-cluster-role.yaml
+++ b/deploy/operator/observability-operator-cluster-role.yaml
@@ -140,6 +140,7 @@ rules:
 - apiGroups:
   - monitoring.coreos.com
   resources:
+  - prometheusrules
   - servicemonitors
   verbs:
   - create
diff --git a/pkg/controllers/operator/components.go b/pkg/controllers/operator/components.go
index 4066e0c9..89841380 100644
--- a/pkg/controllers/operator/components.go
+++ b/pkg/controllers/operator/components.go
@@ -7,6 +7,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 
 	"github.com/rhobs/observability-operator/pkg/reconciler"
@@ -21,9 +22,54 @@ func operatorComponentReconcilers(owner metav1.Object, namespace string) []recon
 		reconciler.NewUpdater(newServiceMonitor(namespace), owner),
 		reconciler.NewUpdater(newPrometheusRole(namespace), owner),
 		reconciler.NewUpdater(newRoleBindingForPrometheusRole(namespace), owner),
+		reconciler.NewUpdater(newPrometheusRule(namespace), owner),
 	}
 }
 
+func newPrometheusRule(namespace string) *monv1.PrometheusRule {
+	return &monv1.PrometheusRule{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: monv1.SchemeGroupVersion.String(),
+			Kind:       "PrometheusRule",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+			Labels: map[string]string{
+				"app.kubernetes.io/component":  "operator",
+				"app.kubernetes.io/name":       name,
+				"app.kubernetes.io/part-of":    name,
+				"openshift.io/user-monitoring": "false",
+			},
+		},
+		Spec: monv1.PrometheusRuleSpec{
+			Groups: []monv1.RuleGroup{
+				{
+					Name: "operator",
+					Rules: []monv1.Rule{
+						{
+							Alert: "ClusterObservabilityOperatorReconciliationsFailed",
+							Expr: intstr.FromString(
+								fmt.Sprintf(`sum by(controller,pod,namespace)
+(rate(controller_runtime_reconcile_total{result="error",job="observability-operator",namespace="%s"}[5m]))
+/
+sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{job="observability-operator",namespace="%s"}[5m])) > 0.1`, namespace, namespace),
+							),
+							For: ptr.To(monv1.Duration("15m")),
+							Labels: map[string]string{
+								"severity": "warning",
+							},
+							Annotations: map[string]string{
+								"description": "{{ $value | humanizePercentage }} of reconciliation requests are failing for the '{{ $labels.controller }}' controller. Check the logs of the {{ $labels.namespace }}/{{ $labels.pod }} pod to investigate further.",
+								"summary":     "Cluster observability operator fails to reconcile resources",
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
 func newServiceMonitor(namespace string) *monv1.ServiceMonitor {
 	return &monv1.ServiceMonitor{
 		TypeMeta: metav1.TypeMeta{
diff --git a/pkg/controllers/operator/controller.go b/pkg/controllers/operator/controller.go
index d699d308..ab28025b 100644
--- a/pkg/controllers/operator/controller.go
+++ b/pkg/controllers/operator/controller.go
@@ -45,6 +45,7 @@ type resourceManager struct {
 // subresource to set the owner reference with blockOwnerDeletion=true on the
 // ServiceMonitor resource.
 //+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=list;watch;create;update;delete;patch
+//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=prometheusrules,verbs=list;watch;create;update;delete;patch
 //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=list;create;update;patch
 //+kubebuilder:rbac:groups="",resources=services/finalizers,verbs=update;patch
 
@@ -72,6 +73,7 @@ func RegisterWithManager(mgr ctrl.Manager, namespace string) error {
 		).
 		Named(name).
 		Owns(&monv1.ServiceMonitor{}, generationChanged).
+		Owns(&monv1.PrometheusRule{}, generationChanged).
 		Build(rm)
 
 	if err != nil {
@@ -85,8 +87,8 @@ func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
 	logger := rm.logger.WithValues("operator", req.NamespacedName)
 	logger.Info("Reconciling operator resources")
 
-	op := &corev1.Service{}
-	err := rm.k8sClient.Get(ctx, req.NamespacedName, op)
+	svc := &corev1.Service{}
+	err := rm.k8sClient.Get(ctx, req.NamespacedName, svc)
 	if errors.IsNotFound(err) {
 		return ctrl.Result{}, nil
 	}
@@ -94,7 +96,7 @@ func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
 		return ctrl.Result{}, err
 	}
 
-	reconcilers := operatorComponentReconcilers(op, rm.namespace)
+	reconcilers := operatorComponentReconcilers(svc, rm.namespace)
 	for _, reconciler := range reconcilers {
 		err := reconciler.Reconcile(ctx, rm.k8sClient, rm.scheme)
 		// handle create / update errors that can happen due to a stale cache by
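
A quick way to sanity-check the generated rule is a unit test in the same
package. This is a minimal sketch only: the test file, the
TestNewPrometheusRule name, and the "operators" namespace value are
illustrative and not part of the change above.

package operator

import (
	"strings"
	"testing"
)

func TestNewPrometheusRule(t *testing.T) {
	rule := newPrometheusRule("operators")

	// The resource carries exactly one group with one alerting rule.
	if len(rule.Spec.Groups) != 1 || len(rule.Spec.Groups[0].Rules) != 1 {
		t.Fatalf("expected one rule group with one rule, got %+v", rule.Spec.Groups)
	}

	r := rule.Spec.Groups[0].Rules[0]
	if r.Alert != "ClusterObservabilityOperatorReconciliationsFailed" {
		t.Errorf("unexpected alert name %q", r.Alert)
	}
	// The namespace is interpolated into both halves of the error ratio.
	if expr := r.Expr.String(); strings.Count(expr, `namespace="operators"`) != 2 {
		t.Errorf("expected the namespace selector twice in the expression, got %q", expr)
	}
	if r.For == nil || *r.For != "15m" {
		t.Error("expected a 15m 'for' duration")
	}
}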