Retry **every** database error #698

Merged · 15 commits · Apr 11, 2024
Changes from all commits
8 changes: 4 additions & 4 deletions pkg/config/redis.go
@@ -85,16 +85,16 @@ func dialWithLogging(dialer ctxDialerFunc, logger *logging.Logger) ctxDialerFunc
retry.Retryable,
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
retry.Settings{
Timeout: 5 * time.Minute,
OnError: func(_ time.Duration, _ uint64, err, lastErr error) {
Timeout: retry.DefaultTimeout,
OnRetryableError: func(_ time.Duration, _ uint64, err, lastErr error) {
if lastErr == nil || err.Error() != lastErr.Error() {
logger.Warnw("Can't connect to Redis. Retrying", zap.Error(err))
}
},
OnSuccess: func(elapsed time.Duration, attempt uint64, _ error) {
if attempt > 0 {
if attempt > 1 {
logger.Infow("Reconnected to Redis",
zap.Duration("after", elapsed), zap.Uint64("attempts", attempt+1))
zap.Duration("after", elapsed), zap.Uint64("attempts", attempt))
}
},
},
42 changes: 29 additions & 13 deletions pkg/icingadb/cleanup.go
@@ -4,7 +4,9 @@ import (
"context"
"fmt"
"github.com/icinga/icingadb/internal"
"github.com/icinga/icingadb/pkg/backoff"
"github.com/icinga/icingadb/pkg/com"
"github.com/icinga/icingadb/pkg/retry"
"github.com/icinga/icingadb/pkg/types"
"time"
)
@@ -40,32 +42,46 @@ func (db *DB) CleanupOlderThan(
count uint64, olderThan time.Time, onSuccess ...OnSuccess[struct{}],
) (uint64, error) {
var counter com.Counter
defer db.log(ctx, stmt.Build(db.DriverName(), 0), &counter).Stop()

q := db.Rebind(stmt.Build(db.DriverName(), count))

defer db.log(ctx, q, &counter).Stop()

for {
q := db.Rebind(stmt.Build(db.DriverName(), count))
rs, err := db.NamedExecContext(ctx, q, cleanupWhere{
EnvironmentId: envId,
Time: types.UnixMilli(olderThan),
})
if err != nil {
return 0, internal.CantPerformQuery(err, q)
}
var rowsDeleted int64

err := retry.WithBackoff(
ctx,
func(ctx context.Context) error {
rs, err := db.NamedExecContext(ctx, q, cleanupWhere{
EnvironmentId: envId,
Time: types.UnixMilli(olderThan),
})
if err != nil {
return internal.CantPerformQuery(err, q)
}

rowsDeleted, err = rs.RowsAffected()

n, err := rs.RowsAffected()
return err
},
retry.Retryable,
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
db.getDefaultRetrySettings(),
)
if err != nil {
return 0, err
}

counter.Add(uint64(n))
counter.Add(uint64(rowsDeleted))

for _, onSuccess := range onSuccess {
if err := onSuccess(ctx, make([]struct{}, n)); err != nil {
if err := onSuccess(ctx, make([]struct{}, rowsDeleted)); err != nil {
return 0, err
}
}

if n < int64(count) {
if rowsDeleted < int64(count) {
break
}
}
26 changes: 23 additions & 3 deletions pkg/icingadb/db.go
@@ -13,6 +13,7 @@ import (
"github.com/icinga/icingadb/pkg/utils"
"github.com/jmoiron/sqlx"
"github.com/pkg/errors"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"golang.org/x/sync/semaphore"
"reflect"
@@ -346,7 +347,7 @@ func (db *DB) BulkExec(
},
retry.Retryable,
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
retry.Settings{},
db.getDefaultRetrySettings(),
)
}
}(b))
@@ -411,7 +412,7 @@ func (db *DB) NamedBulkExec(
},
retry.Retryable,
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
retry.Settings{},
db.getDefaultRetrySettings(),
)
}
}(b))
@@ -484,7 +485,7 @@ func (db *DB) NamedBulkExecTx(
},
retry.Retryable,
backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
retry.Settings{},
db.getDefaultRetrySettings(),
)
}
}(b))
@@ -670,6 +671,25 @@ func (db *DB) GetSemaphoreForTable(table string) *semaphore.Weighted {
}
}

func (db *DB) getDefaultRetrySettings() retry.Settings {
return retry.Settings{
Timeout: retry.DefaultTimeout,
OnRetryableError: func(_ time.Duration, _ uint64, err, lastErr error) {
if lastErr == nil || err.Error() != lastErr.Error() {
db.logger.Warnw("Can't execute query. Retrying", zap.Error(err))
}
},
OnSuccess: func(elapsed time.Duration, attempt uint64, lastErr error) {
if attempt > 1 {
db.logger.Infow("Query retried successfully after error",
zap.Duration("after", elapsed),
zap.Uint64("attempts", attempt),
zap.NamedError("recovered_error", lastErr))
}
},
}
}

func (db *DB) log(ctx context.Context, query string, counter *com.Counter) periodic.Stopper {
return periodic.Start(ctx, db.logger.Interval(), func(tick periodic.Tick) {
if count := counter.Reset(); count > 0 {
8 changes: 4 additions & 4 deletions pkg/icingadb/driver.go
@@ -55,8 +55,8 @@ func (c RetryConnector) Connect(ctx context.Context) (driver.Conn, error) {
shouldRetry,
backoff.NewExponentialWithJitter(time.Millisecond*128, time.Minute*1),
Contributor commented:

With this retry.WithBackoff() nested inside the retry loops of the other places that execute SQL queries, and with these backoff settings, some strange effects can be observed: if you stop your database for something like 4m30s, you may see some attempts recovering and others, quite some time after that, failing fatally because they exceeded the 5 minutes. I believe this is due to the maximal backoff of a minute given here, i.e. there may be a minute without an attempt, and after that both the inner and the outer 5-minute timeout are exceeded.

Also, if we're adding retry.WithBackoff(), do we even still need it here? Just for fun, I removed all the retrying from this function (so that it just does Connect() and initConn() with no loops or anything around it) and it seemed to work more reliably; only the logging became very annoying, with queries and stack traces within a single line.
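
A minimal, self-contained sketch of the nesting described above (not code from this PR; it only mirrors the structure of the hunks in this diff and uses an always-retry predicate plus a dummy error purely for illustration):

package main

import (
	"context"
	"errors"
	"time"

	"github.com/icinga/icingadb/pkg/backoff"
	"github.com/icinga/icingadb/pkg/retry"
)

func main() {
	ctx := context.Background()

	// Outer loop: stands in for a query-level retry bounded by the default timeout.
	_ = retry.WithBackoff(
		ctx,
		func(ctx context.Context) error {
			// Inner loop: stands in for the driver-level connect retry. Its backoff grows
			// up to a minute, so a single sleep between attempts may end only after both
			// the inner and the outer timeout have already been exceeded.
			return retry.WithBackoff(
				ctx,
				func(ctx context.Context) error { return errors.New("database still down") },
				func(error) bool { return true }, // always retry, for illustration only
				backoff.NewExponentialWithJitter(128*time.Millisecond, 1*time.Minute),
				retry.Settings{Timeout: retry.DefaultTimeout},
			)
		},
		func(error) bool { return true }, // always retry, for illustration only
		backoff.NewExponentialWithJitter(1*time.Millisecond, 1*time.Second),
		retry.Settings{Timeout: retry.DefaultTimeout},
	)
}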

Member Author commented:

The following changes address your considerations:

HA now uses a Timer instead of a Ticker; the timer is reset properly even if it has already expired (and has not yet been drained): 81085c0. A sketch of such a reset follows below.

If the timeout expires during the sleep phase between attempts, one final retry attempt will be made: 3a3baaf.
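
A minimal sketch of a drain-safe reset (hypothetical; the actual retry.ResetTimeout helper used further down in this diff may be implemented differently):

package main

import "time"

// resetTimeout re-arms t with d. If the timer has already fired and its tick has
// not been consumed, the channel is drained first; otherwise a stale tick could be
// received immediately after the reset. The non-blocking drain also covers the case
// where the tick was already read elsewhere, e.g. in a select.
func resetTimeout(t *time.Timer, d time.Duration) {
	if !t.Stop() {
		select {
		case <-t.C:
		default:
		}
	}

	t.Reset(d)
}

func main() {
	timeout := time.NewTimer(5 * time.Minute)
	defer timeout.Stop()

	// ... after handling an event, give the next iteration the full timeout again:
	resetTimeout(timeout, 5*time.Minute)
}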

> only the logging became very annoying, with queries and stack traces within a single line

I have something in my stash which fixes this: zap adds the errorVerbose field if the error implements fmt.Formatter, which is true for all our wrapped errors. A simple solution would be to replace zap.Error() calls with a custom implementation that returns a "silent" error, e.g.

package logging

import (
	"github.com/pkg/errors"
	"go.uber.org/zap"
)

...

type stackTracer interface {
	StackTrace() errors.StackTrace
}

// errNoStackTrace hides the wrapped error's stack trace from zap: it does not
// implement fmt.Formatter, so no errorVerbose field is emitted.
type errNoStackTrace struct {
	e error
}

func (e errNoStackTrace) Error() string {
	return e.e.Error()
}

// Error is a drop-in replacement for zap.Error that suppresses the stack trace
// of errors wrapped with github.com/pkg/errors.
func Error(e error) zap.Field {
	if _, ok := e.(stackTracer); ok {
		return zap.Error(errNoStackTrace{e})
	}

	return zap.Error(e)
}
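
A hypothetical call site, matching the Redis reconnect log in the hunk above, would then look like this:

logger.Warnw("Can't connect to Redis. Retrying", logging.Error(err))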

Contributor commented:

> only the logging became very annoying, with queries and stack traces within a single line

> I have something in my stash which fixes this: zap adds the errorVerbose field if the error implements fmt.Formatter, which is true for all our wrapped errors. A simple solution would be to replace zap.Error() calls with a custom implementation that returns a "silent" error, e.g.

However, the PR just keeps the nested retrying for now, thereby avoiding this issue.

retry.Settings{
Timeout: time.Minute * 5,
OnError: func(_ time.Duration, _ uint64, err, lastErr error) {
Timeout: retry.DefaultTimeout,
OnRetryableError: func(_ time.Duration, _ uint64, err, lastErr error) {
telemetry.UpdateCurrentDbConnErr(err)

if lastErr == nil || err.Error() != lastErr.Error() {
@@ -66,9 +66,9 @@ func (c RetryConnector) Connect(ctx context.Context) (driver.Conn, error) {
OnSuccess: func(elapsed time.Duration, attempt uint64, _ error) {
telemetry.UpdateCurrentDbConnErr(nil)

if attempt > 0 {
if attempt > 1 {
c.logger.Infow("Reconnected to database",
zap.Duration("after", elapsed), zap.Uint64("attempts", attempt+1))
zap.Duration("after", elapsed), zap.Uint64("attempts", attempt))
}
},
},
67 changes: 56 additions & 11 deletions pkg/icingadb/ha.go
@@ -148,8 +148,19 @@ func (h *HA) controller() {
defer routineLogTicker.Stop()
shouldLogRoutineEvents := true

// The retry logic in HA is twofold:
//
// 1) Updating or inserting the instance row based on the current heartbeat must be done within the heartbeat's
// expiration time. Therefore, we use a deadline ctx to retry.WithBackoff() in realize() which expires earlier
// than our default timeout.
// 2) Since we do not want to exit before our default timeout expires, we have to repeat step 1 until it does.
retryTimeout := time.NewTimer(retry.DefaultTimeout)
defer retryTimeout.Stop()

for {
select {
case <-retryTimeout.C:
h.abort(errors.New("retry deadline exceeded"))
case m := <-h.heartbeat.Events():
if m != nil {
now := time.Now()
@@ -163,8 +174,13 @@ }
}
if tt.Before(now.Add(-1 * peerTimeout)) {
h.logger.Errorw("Received heartbeat from the past", zap.Time("time", tt))

h.signalHandover("received heartbeat from the past")
h.realizeLostHeartbeat()

// Reset retry timeout so that the next iterations have the full amount of time available again.
retry.ResetTimeout(retryTimeout, retry.DefaultTimeout)

continue
}
s, err := m.Stats().IcingaStatus()
@@ -200,17 +216,17 @@ func (h *HA) controller() {
default:
}

var realizeCtx context.Context
var cancelRealizeCtx context.CancelFunc
if h.responsible {
realizeCtx, cancelRealizeCtx = context.WithDeadline(h.ctx, m.ExpiryTime())
} else {
realizeCtx, cancelRealizeCtx = context.WithCancel(h.ctx)
}
// Ensure that updating/inserting the instance row is completed by the current heartbeat's expiry time.
realizeCtx, cancelRealizeCtx := context.WithDeadline(h.ctx, m.ExpiryTime())
err = h.realize(realizeCtx, s, t, envId, shouldLogRoutineEvents)
cancelRealizeCtx()
if errors.Is(err, context.DeadlineExceeded) {
h.signalHandover("context deadline exceeded")
h.signalHandover("instance update/insert deadline exceeded heartbeat expiry time")

// Instance insert/update was not completed by the expiration time of the current heartbeat.
// Pass control back to the loop to try again with the next heartbeat,
// or exit the loop when the retry timeout has expired. Therefore, the
// retry timeout is **not** reset here so that retries continue until the timeout has expired.
continue
}
if err != nil {
Expand All @@ -228,6 +244,14 @@ func (h *HA) controller() {
h.signalHandover("lost heartbeat")
h.realizeLostHeartbeat()
}

// Reset retry timeout so that the next iterations have the full amount of time available again.
// Don't be surprised by the location of this code: admittedly, the timer is also reset
// after an error that ends the loop anyway, which looks redundant.
// But this is the best place to catch all scenarios where the timeout needs to be reset.
// And since HA needs quite a bit of refactoring anyway, e.g. to return immediately after calling h.abort(),
// it's fine to have it here for now.
retry.ResetTimeout(retryTimeout, retry.DefaultTimeout)
case <-h.heartbeat.Done():
if err := h.heartbeat.Err(); err != nil {
h.abort(err)
Expand All @@ -253,6 +277,10 @@ func (h *HA) realize(
otherResponsible bool
)

if _, ok := ctx.Deadline(); !ok {
panic("can't use context w/o deadline in realize()")
}

err := retry.WithBackoff(
ctx,
func(ctx context.Context) error {
@@ -358,14 +386,31 @@
retry.Retryable,
backoff.NewExponentialWithJitter(time.Millisecond*256, time.Second*3),
retry.Settings{
OnError: func(_ time.Duration, attempt uint64, err, lastErr error) {
// Intentionally no timeout is set, as we use a context with a deadline.
OnRetryableError: func(_ time.Duration, attempt uint64, err, lastErr error) {
if lastErr == nil || err.Error() != lastErr.Error() {
log := h.logger.Debugw
if attempt > 2 {
if attempt > 3 {
log = h.logger.Infow
}

log("Can't update or insert instance. Retrying", zap.Error(err))
}
},
OnSuccess: func(elapsed time.Duration, attempt uint64, lastErr error) {
if attempt > 1 {
log := h.logger.Debugw

if attempt > 4 {
// We log errors with severity info starting from the fourth attempt (see above),
// so we need to log success with severity info from the fifth attempt.
log = h.logger.Infow
}

log("Can't update or insert instance. Retrying", zap.Error(err), zap.Uint64("retry count", attempt))
log("Instance updated/inserted successfully after error",
zap.Duration("after", elapsed),
zap.Uint64("attempts", attempt),
zap.NamedError("recovered_error", lastErr))
}
},
},