From f99e588c315a68173bda213d65cd658f9368a853 Mon Sep 17 00:00:00 2001
From: Simon Pasquier
Date: Mon, 18 Nov 2024 14:48:48 +0100
Subject: [PATCH] feat: deploy PrometheusRule resource

This change deploys a PrometheusRule resource containing a single
alerting rule. The alert fires when more than 10% of a controller's
reconciliations have failed over the last 5 minutes and the condition
has persisted for more than 15 minutes.

Signed-off-by: Simon Pasquier
---
 ...bility-operator.clusterserviceversion.yaml |  3 +-
 .../observability-operator-cluster-role.yaml  |  1 +
 pkg/controllers/operator/components.go        | 46 +++++++++++++++++++
 pkg/controllers/operator/controller.go        |  8 ++--
 4 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/bundle/manifests/observability-operator.clusterserviceversion.yaml b/bundle/manifests/observability-operator.clusterserviceversion.yaml
index 2e0a95b2..a5ad36a0 100644
--- a/bundle/manifests/observability-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/observability-operator.clusterserviceversion.yaml
@@ -42,7 +42,7 @@ metadata:
     categories: Monitoring
     certified: "false"
     containerImage: observability-operator:0.4.2
-    createdAt: "2024-11-18T09:45:14Z"
+    createdAt: "2024-11-18T14:49:04Z"
     description: A Go based Kubernetes operator to setup and manage highly available
       Monitoring Stack using Prometheus, Alertmanager and Thanos Querier.
     operatorframework.io/cluster-monitoring: "true"
@@ -414,6 +414,7 @@ spec:
       - apiGroups:
         - monitoring.coreos.com
         resources:
+        - prometheusrules
        - servicemonitors
        verbs:
        - create
diff --git a/deploy/operator/observability-operator-cluster-role.yaml b/deploy/operator/observability-operator-cluster-role.yaml
index 16c472ff..867a06fd 100644
--- a/deploy/operator/observability-operator-cluster-role.yaml
+++ b/deploy/operator/observability-operator-cluster-role.yaml
@@ -140,6 +140,7 @@ rules:
 - apiGroups:
   - monitoring.coreos.com
   resources:
+  - prometheusrules
   - servicemonitors
   verbs:
   - create
diff --git a/pkg/controllers/operator/components.go b/pkg/controllers/operator/components.go
index 4066e0c9..89841380 100644
--- a/pkg/controllers/operator/components.go
+++ b/pkg/controllers/operator/components.go
@@ -7,6 +7,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 
 	"github.com/rhobs/observability-operator/pkg/reconciler"
@@ -21,9 +22,54 @@ func operatorComponentReconcilers(owner metav1.Object, namespace string) []recon
 		reconciler.NewUpdater(newServiceMonitor(namespace), owner),
 		reconciler.NewUpdater(newPrometheusRole(namespace), owner),
 		reconciler.NewUpdater(newRoleBindingForPrometheusRole(namespace), owner),
+		reconciler.NewUpdater(newPrometheusRule(namespace), owner),
 	}
 }
 
+func newPrometheusRule(namespace string) *monv1.PrometheusRule {
+	return &monv1.PrometheusRule{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: monv1.SchemeGroupVersion.String(),
+			Kind:       "PrometheusRule",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+			Labels: map[string]string{
+				"app.kubernetes.io/component":  "operator",
+				"app.kubernetes.io/name":       name,
+				"app.kubernetes.io/part-of":    name,
+				"openshift.io/user-monitoring": "false",
+			},
+		},
+		Spec: monv1.PrometheusRuleSpec{
+			Groups: []monv1.RuleGroup{
+				{
+					Name: "operator",
+					Rules: []monv1.Rule{
+						{
+							Alert: "ClusterObservabilityOperatorReconciliationsFailed",
+							Expr: intstr.FromString(
+								fmt.Sprintf(`sum by(controller,pod,namespace)
+(rate(controller_runtime_reconcile_total{result="error",job="observability-operator",namespace="%s"}[5m]))
+/
+sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{job="observability-operator",namespace="%s"}[5m])) > 0.1`, namespace, namespace),
+							),
+							For: ptr.To(monv1.Duration("15m")),
+							Labels: map[string]string{
+								"severity": "warning",
+							},
+							Annotations: map[string]string{
+								"description": "{{ $value | humanizePercentage }} of reconciliation requests are failing for the '{{ $labels.controller }}' controller. Check the logs of the {{ $labels.namespace }}/{{ $labels.pod }} pod to investigate further.",
+								"summary":     "Cluster observability operator fails to reconcile resources",
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
 func newServiceMonitor(namespace string) *monv1.ServiceMonitor {
 	return &monv1.ServiceMonitor{
 		TypeMeta: metav1.TypeMeta{
diff --git a/pkg/controllers/operator/controller.go b/pkg/controllers/operator/controller.go
index d699d308..ab28025b 100644
--- a/pkg/controllers/operator/controller.go
+++ b/pkg/controllers/operator/controller.go
@@ -45,6 +45,7 @@ type resourceManager struct {
 // subresource to set the owner reference with blockOwnerDeletion=true on the
 // ServiceMonitor resource.
 //+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=list;watch;create;update;delete;patch
+//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=prometheusrules,verbs=list;watch;create;update;delete;patch
 //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=list;create;update;patch
 //+kubebuilder:rbac:groups="",resources=services/finalizers,verbs=update;patch
 
@@ -72,6 +73,7 @@ func RegisterWithManager(mgr ctrl.Manager, namespace string) error {
 		).
 		Named(name).
 		Owns(&monv1.ServiceMonitor{}, generationChanged).
+		Owns(&monv1.PrometheusRule{}, generationChanged).
 		Build(rm)
 
 	if err != nil {
@@ -85,8 +87,8 @@ func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
 	logger := rm.logger.WithValues("operator", req.NamespacedName)
 	logger.Info("Reconciling operator resources")
 
-	op := &corev1.Service{}
-	err := rm.k8sClient.Get(ctx, req.NamespacedName, op)
+	svc := &corev1.Service{}
+	err := rm.k8sClient.Get(ctx, req.NamespacedName, svc)
 	if errors.IsNotFound(err) {
 		return ctrl.Result{}, nil
 	}
@@ -94,7 +96,7 @@ func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
 		return ctrl.Result{}, err
 	}
 
-	reconcilers := operatorComponentReconcilers(op, rm.namespace)
+	reconcilers := operatorComponentReconcilers(svc, rm.namespace)
 	for _, reconciler := range reconcilers {
 		err := reconciler.Reconcile(ctx, rm.k8sClient, rm.scheme)
 		// handle create / update errors that can happen due to a stale cache by
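
A quick way to sanity-check the generated rule is a unit test in the same
package. This is a minimal sketch only: the test file, the
TestNewPrometheusRule name, and the "operators" namespace value are
illustrative and not part of the change above.

package operator

import (
	"strings"
	"testing"
)

func TestNewPrometheusRule(t *testing.T) {
	rule := newPrometheusRule("operators")

	// The resource carries exactly one group with one alerting rule.
	if len(rule.Spec.Groups) != 1 || len(rule.Spec.Groups[0].Rules) != 1 {
		t.Fatalf("expected one rule group with one rule, got %+v", rule.Spec.Groups)
	}

	r := rule.Spec.Groups[0].Rules[0]
	if r.Alert != "ClusterObservabilityOperatorReconciliationsFailed" {
		t.Errorf("unexpected alert name %q", r.Alert)
	}
	// The namespace is interpolated into both halves of the error ratio.
	if expr := r.Expr.String(); strings.Count(expr, `namespace="operators"`) != 2 {
		t.Errorf("expected the namespace selector twice in the expression, got %q", expr)
	}
	if r.For == nil || *r.For != "15m" {
		t.Error("expected a 15m 'for' duration")
	}
}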