From c337c9bc7a42d81e5e07452ccd3d31880c9b046b Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Wed, 27 Mar 2024 13:26:55 +0200 Subject: [PATCH] controlplane/control: Change heartbeat strategy This commit changes the heartbeat timing strategy to be more similar to the Envoy health-checking strategy: a single request is sent per interval, and the peer state changes only after a threshold of consecutive results is reached. Signed-off-by: Or Ozeri --- go.mod | 1 - go.sum | 2 -- pkg/controlplane/control/peer.go | 62 +++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/go.mod b/go.mod index 85be0486..cab539f6 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.20 require ( github.com/bombsimon/logrusr/v4 v4.1.0 - github.com/cenkalti/backoff/v4 v4.2.1 github.com/envoyproxy/go-control-plane v0.12.0 github.com/go-chi/chi v4.1.2+incompatible github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index c9cd5224..4c5a236a 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bombsimon/logrusr/v4 v4.1.0 h1:uZNPbwusB0eUXlO8hIUwStE6Lr5bLN6IgYgG+75kuh4= github.com/bombsimon/logrusr/v4 v4.1.0/go.mod h1:pjfHC5e59CvjTBIU3V3sGhFWFAnsnhOR03TRc6im0l8= -github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= -github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMrBo8f1j86j5WHzznCCQxV/b8g= github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= diff --git a/pkg/controlplane/control/peer.go b/pkg/controlplane/control/peer.go index e34ad2fc..a2bf3737 100644 --- a/pkg/controlplane/control/peer.go +++ b/pkg/controlplane/control/peer.go @@ -18,7 +18,6 @@ import ( 
"sync" "time" - "github.com/cenkalti/backoff/v4" "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -30,7 +29,14 @@ import ( ) const ( - heartbeatInterval = 10 * time.Second + // time interval between health check requests when peer has recently responded. + healthyInterval = 1 * time.Second + // time interval between health check requests when peer has not recently responded. + unhealthyInterval = 10 * time.Second + // number of consecutive successful healthchecks for a peer to be declared reachable. + healthyThreshold = 3 + // number of consecutive unsuccessful healthchecks for a peer to be declared unreachable. + unhealthyThreshold = 5 ) // peerMonitor monitors a single peer. @@ -81,12 +87,12 @@ func (m *peerMonitor) SetPeer(pr *v1alpha1.Peer) { func (m *peerMonitor) Start() { defer m.wg.Done() - ticker := time.NewTicker(heartbeatInterval) + ticker := time.NewTicker(healthyInterval) defer ticker.Stop() - backoffConfig := backoff.NewExponentialBackOff() - - reachable := false + healthy := false + strikeCount := 0 + threshold := 1 // require only a single heartbeat on startup reachableCond := metav1.Condition{ Type: v1alpha1.PeerReachable, Status: metav1.ConditionFalse, @@ -101,28 +107,42 @@ func (m *peerMonitor) Start() { break } - err := backoff.Retry(m.client.GetHeartbeat, backoffConfig) - if heartbeatOK := err == nil; heartbeatOK != reachable { - m.logger.Infof("Heartbeat result: %v", heartbeatOK) - + heartbeatOK := m.client.GetHeartbeat() == nil + if healthy == heartbeatOK { + strikeCount = 0 + } else { if heartbeatOK { - reachableCond.Status = metav1.ConditionTrue - backoffConfig.MaxElapsedTime = heartbeatInterval - } else { - reachableCond.Status = metav1.ConditionFalse - backoffConfig.MaxElapsedTime = 0 + // switch to healthy interval (even though not yet declared healthy) + ticker.Reset(healthyInterval) } + strikeCount++ + } - reachable = heartbeatOK + if strikeCount < threshold { + <-ticker.C 
+ continue + } - m.lock.Lock() - meta.SetStatusCondition(&m.pr.Status.Conditions, reachableCond) - m.lock.Unlock() + m.logger.Infof("Peer reachable status changed to: %v", heartbeatOK) - // callback for non-CRD mode, which does not watch peers/status - m.statusCallback(m.pr) + if heartbeatOK { + reachableCond.Status = metav1.ConditionTrue + threshold = unhealthyThreshold + } else { + reachableCond.Status = metav1.ConditionFalse + threshold = healthyThreshold + ticker.Reset(unhealthyInterval) } + strikeCount = 0 + healthy = heartbeatOK + + m.lock.Lock() + meta.SetStatusCondition(&m.pr.Status.Conditions, reachableCond) + m.lock.Unlock() + + m.statusCallback(m.pr) + // wait till it's time for next heartbeat round <-ticker.C }