Skip to content

Commit

Permalink
feat: deploy PrometheusRule resource
Browse files Browse the repository at this point in the history
This change deploys a PrometheusRule resource with a single alerting rule
that fires when more than 10% of a controller's reconciliations, measured
over a 5-minute window, have been failing for more than 15 minutes.

Signed-off-by: Simon Pasquier <[email protected]>
  • Loading branch information
simonpasquier committed Nov 18, 2024
1 parent 490a91d commit f99e588
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ metadata:
categories: Monitoring
certified: "false"
containerImage: observability-operator:0.4.2
createdAt: "2024-11-18T09:45:14Z"
createdAt: "2024-11-18T14:49:04Z"
description: A Go based Kubernetes operator to setup and manage highly available
Monitoring Stack using Prometheus, Alertmanager and Thanos Querier.
operatorframework.io/cluster-monitoring: "true"
Expand Down Expand Up @@ -414,6 +414,7 @@ spec:
- apiGroups:
- monitoring.coreos.com
resources:
- prometheusrules
- servicemonitors
verbs:
- create
Expand Down
1 change: 1 addition & 0 deletions deploy/operator/observability-operator-cluster-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ rules:
- apiGroups:
- monitoring.coreos.com
resources:
- prometheusrules
- servicemonitors
verbs:
- create
Expand Down
46 changes: 46 additions & 0 deletions pkg/controllers/operator/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"

"github.com/rhobs/observability-operator/pkg/reconciler"
Expand All @@ -21,9 +22,54 @@ func operatorComponentReconcilers(owner metav1.Object, namespace string) []recon
reconciler.NewUpdater(newServiceMonitor(namespace), owner),
reconciler.NewUpdater(newPrometheusRole(namespace), owner),
reconciler.NewUpdater(newRoleBindingForPrometheusRole(namespace), owner),
reconciler.NewUpdater(newPrometheusRule(namespace), owner),
}
}

// newPrometheusRule returns the PrometheusRule that alerts on failing
// reconciliations of the operator's controllers running in namespace.
//
// The resource carries a single rule group ("operator") with one alert,
// ClusterObservabilityOperatorReconciliationsFailed. Its expression divides
// the 5-minute rate of reconciliations that ended with result="error" by the
// 5-minute rate of all reconciliations, per (controller, pod, namespace), and
// selects the series for which that ratio exceeds 0.1. The alert fires with
// severity "warning" once the condition has held for 15 minutes.
func newPrometheusRule(namespace string) *monv1.PrometheusRule {
	// Both selectors are scoped to the operator's own job and namespace; the
	// namespace is therefore substituted twice in the expression.
	expr := fmt.Sprintf(`sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{result="error",job="observability-operator",namespace="%s"}[5m]))
/
sum by(controller,pod,namespace) (rate(controller_runtime_reconcile_total{job="observability-operator",namespace="%s"}[5m])) > 0.1`, namespace, namespace)

	return &monv1.PrometheusRule{
		TypeMeta: metav1.TypeMeta{
			APIVersion: monv1.SchemeGroupVersion.String(),
			Kind:       "PrometheusRule",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
			Labels: map[string]string{
				"app.kubernetes.io/component": "operator",
				"app.kubernetes.io/name":      name,
				"app.kubernetes.io/part-of":   name,
				// NOTE(review): presumably keeps the rule out of OpenShift
				// user-workload monitoring — confirm against platform docs.
				"openshift.io/user-monitoring": "false",
			},
		},

		Spec: monv1.PrometheusRuleSpec{
			Groups: []monv1.RuleGroup{
				{
					Name: "operator",
					Rules: []monv1.Rule{
						{
							Alert: "ClusterObservabilityOperatorReconciliationsFailed",
							Expr:  intstr.FromString(expr),
							For:   ptr.To(monv1.Duration("15m")),
							Labels: map[string]string{
								"severity": "warning",
							},
							Annotations: map[string]string{
								// $value is a ratio in [0, 1]. humanizePercentage
								// renders it as a percentage (0.15 -> "15%"),
								// whereas humanize would apply SI prefixes
								// (0.15 -> "150m").
								"description": "{{$value | humanizePercentage}} of reconciliation requests are failing for the '{{ $labels.controller}}' controller. Check the logs of the {{$labels.namespace}}/{{$labels.pod}} pod to investigate further.",
								"summary":     "Cluster observability operator fails to reconcile resources",
							},
						},
					},
				},
			},
		},
	}
}
func newServiceMonitor(namespace string) *monv1.ServiceMonitor {
return &monv1.ServiceMonitor{
TypeMeta: metav1.TypeMeta{
Expand Down
8 changes: 5 additions & 3 deletions pkg/controllers/operator/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type resourceManager struct {
// subresource to set the owner reference with blockOwnerDeletion=true on the
// ServiceMonitor resource.
//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=list;watch;create;update;delete;patch
//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=prometheusrules,verbs=list;watch;create;update;delete;patch
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=list;create;update;patch
//+kubebuilder:rbac:groups="",resources=services/finalizers,verbs=update;patch

Expand Down Expand Up @@ -72,6 +73,7 @@ func RegisterWithManager(mgr ctrl.Manager, namespace string) error {
).
Named(name).
Owns(&monv1.ServiceMonitor{}, generationChanged).
Owns(&monv1.PrometheusRule{}, generationChanged).
Build(rm)

if err != nil {
Expand All @@ -85,16 +87,16 @@ func (rm resourceManager) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
logger := rm.logger.WithValues("operator", req.NamespacedName)
logger.Info("Reconciling operator resources")

op := &corev1.Service{}
err := rm.k8sClient.Get(ctx, req.NamespacedName, op)
svc := &corev1.Service{}
err := rm.k8sClient.Get(ctx, req.NamespacedName, svc)
if errors.IsNotFound(err) {
return ctrl.Result{}, nil
}
if err != nil {
return ctrl.Result{}, err
}

reconcilers := operatorComponentReconcilers(op, rm.namespace)
reconcilers := operatorComponentReconcilers(svc, rm.namespace)
for _, reconciler := range reconcilers {
err := reconciler.Reconcile(ctx, rm.k8sClient, rm.scheme)
// handle create / update errors that can happen due to a stale cache by
Expand Down

0 comments on commit f99e588

Please sign in to comment.