From c4ea3b1a1fd29e5f2afa63a6d274ef93cad97435 Mon Sep 17 00:00:00 2001 From: Sebastian Widmer Date: Wed, 16 Oct 2024 11:40:08 +0200 Subject: [PATCH] Improve logging of MCP pause manager --- controllers/upgradeconfig_controller.go | 2 +- controllers/upgradejob_controller.go | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/controllers/upgradeconfig_controller.go b/controllers/upgradeconfig_controller.go index 6c69cde..aefd8af 100644 --- a/controllers/upgradeconfig_controller.go +++ b/controllers/upgradeconfig_controller.go @@ -86,7 +86,7 @@ func (r *UpgradeConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques jobSchedRequeue, schedErr := r.scheduleJob(ctx, &uc, sched, location) statusRequeue, stErr := r.updateNextPossibleSchedulesStatus(ctx, &uc, sched, location) - if err := multierr.Append(schedErr, stErr); err != nil { + if err := multierr.Combine(schedErr, stErr); err != nil { return ctrl.Result{}, err } diff --git a/controllers/upgradejob_controller.go b/controllers/upgradejob_controller.go index 481bff8..90f7f94 100644 --- a/controllers/upgradejob_controller.go +++ b/controllers/upgradejob_controller.go @@ -877,13 +877,19 @@ func findTrackedHookJob(ujhookName, event string, uj managedupgradev1beta1.Upgra // It sets a timeout condition and returns an error if the delay is expired. // It also returns an error if the machine config pools cannot be listed or updated. func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Context, uj *managedupgradev1beta1.UpgradeJob, ensureUnpause bool) error { + l := log.FromContext(ctx).WithName("UpgradeJobReconciler.pauseUnpauseMachineConfigPools") + var controllerManagesPools bool var controllerPausedPools bool for _, pool := range uj.Spec.MachineConfigPools { if pool.DelayUpgrade == (managedupgradev1beta1.UpgradeJobMachineConfigPoolDelayUpgradeSpec{}) { continue } - shouldPause := !ensureUnpause && r.timeSinceStartAfter(uj) < pool.DelayUpgrade.DelayMin.Duration + timeSinceStart := r.timeSinceStartAfter(uj) + beforeMinDelay := timeSinceStart < pool.DelayUpgrade.DelayMin.Duration + shouldPause := !ensureUnpause && beforeMinDelay + l = l.WithValues("poolconfig_matchLabels", pool.MatchLabels, "shouldPause", shouldPause, "beforeMinDelay", beforeMinDelay, "ensureUnpause", ensureUnpause, "timeSinceStart", timeSinceStart) + sel, err := metav1.LabelSelectorAsSelector(pool.MatchLabels) if err != nil { return fmt.Errorf("failed to parse machine config pool selector: %w", err) @@ -895,10 +901,11 @@ func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Contex return fmt.Errorf("failed to list machine config pools %q: %w", pool.MatchLabels, err) } for _, mcp := range mcpl.Items { + l = l.WithValues("pool", mcp.Name, "paused", mcp.Spec.Paused) controllerManagesPools = true controllerPausedPools = controllerPausedPools || shouldPause // optional delay expiry - if mcp.Spec.Paused && !shouldPause && pool.DelayUpgrade.DelayMax.Duration > 0 && r.timeSinceStartAfter(uj) >= pool.DelayUpgrade.DelayMax.Duration { + if mcp.Spec.Paused && !shouldPause && pool.DelayUpgrade.DelayMax.Duration > 0 && timeSinceStart >= pool.DelayUpgrade.DelayMax.Duration { r.setStatusCondition(&uj.Status.Conditions, metav1.Condition{ Type: managedupgradev1beta1.UpgradeJobConditionFailed, Status: metav1.ConditionTrue, @@ -906,9 +913,12 @@ func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Contex Message: fmt.Sprintf("unpausing of pool %q expired", mcp.Name), }) // always return an error so the current reconcile is stopped and requeued - return multierr.Append(fmt.Errorf("unpausing of pool %q expired", mcp.Name), r.Status().Update(ctx, uj)) + err := fmt.Errorf("unpausing of pool %q expired", mcp.Name) + l.Error(err, "unpausing of pool expired", "maxDelay", pool.DelayUpgrade.DelayMax.Duration) + return multierr.Combine(err, r.Status().Update(ctx, uj)) } if mcp.Spec.Paused != shouldPause { + l.Info("Updating machine config pools pause field", "from", mcp.Spec.Paused, "to", shouldPause) mcp.Spec.Paused = shouldPause if err := r.Update(ctx, &mcp); err != nil { return fmt.Errorf("failed to pause/unpause machine config pool %q: %w", mcp.Name, err)