Skip to content

Commit

Permalink
base: decrease store liveness durations
Browse files Browse the repository at this point in the history
Previously, store liveness used a heartbeat interval and support
duration of 3s and 6s, respectively. This matched the lease extension
and lease duration, respectively. However, these values were not well
aligned with Raft's election timeout (4s) and jitter (up to 2s), so
when a follower had to campaign after withdrawing support from the
leader, the store liveness durations were added on top of the Raft
timeout and jitter, instead of being subsumed by them.

This commit reduces the store liveness heartbeat interval and support
duration to 1s and 3s, respectively.

Fixes: #133613

Release note: None
  • Loading branch information
miraradeva committed Dec 20, 2024
1 parent bbfd027 commit ffa4eb1
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 18 deletions.
42 changes: 32 additions & 10 deletions pkg/base/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ import (
// Base config defaults.
//
// When changing these, TestDefaultRaftConfig must also be updated via -rewrite,
// and the result copied to the defaultRangeLeaseRaftElectionTimeoutMultiplier
// comment with any adjustments to the surrounding reasoning.
// and the result copied to the defaultRangeLeaseDuration comment with any
// adjustments to the surrounding reasoning.
const (
defaultInsecure = false
defaultUser = username.RootUser
Expand Down Expand Up @@ -186,13 +186,13 @@ var (
// Total latency [ 3.03s - 7.20s]
//
// Leader lease acquisition (including raft election):
// - Store Liveness heartbeat offset (0-1 heartbeat interval) [-3.00s - 0.00s]
// - Store Liveness expiration (constant) [ 6.00s - 6.00s]
// - Store Liveness heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s]
// - Store Liveness expiration (constant) [ 3.00s - 3.00s]
// - Store Liveness withdrawal (0-1 withdrawal interval) [ 0.00s - 0.10s]
// - Raft election timeout jitter (random 0x-1x timeout) [ 0.00s - 2.00s]
// - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s]
// - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s]
// Total latency [ 3.04s - 9.70s]
// Total latency [ 2.04s - 6.70s]
//
// (generated by TestDefaultRaftConfig)
//
Expand Down Expand Up @@ -228,6 +228,16 @@ var (
DefaultRPCHeartbeatTimeout = envutil.EnvOrDefaultDuration(
"COCKROACH_RPC_HEARTBEAT_TIMEOUT", 3*NetworkTimeout)

// defaultStoreLivenessHeartbeatInterval is the default value for
// StoreLivenessHeartbeatInterval.
defaultStoreLivenessHeartbeatInterval = envutil.EnvOrDefaultDuration(
"COCKROACH_STORE_LIVENESS_HEARTBEAT_INTERVAL", time.Second)

// defaultStoreLivenessSupportDuration is the default value for
// StoreLivenessSupportDuration.
defaultStoreLivenessSupportDuration = envutil.EnvOrDefaultDuration(
"COCKROACH_STORE_LIVENESS_SUPPORT_DURATION", 3*time.Second)

// defaultRaftTickInterval is the default resolution of the Raft timer.
defaultRaftTickInterval = envutil.EnvOrDefaultDuration(
"COCKROACH_RAFT_TICK_INTERVAL", 500*time.Millisecond)
Expand Down Expand Up @@ -549,6 +559,14 @@ type RaftConfig struct {
// RaftHeartbeatIntervalTicks is the number of ticks that pass between heartbeats.
RaftHeartbeatIntervalTicks int64

// StoreLivenessHeartbeatInterval determines how often stores request and
// extend store liveness support.
StoreLivenessHeartbeatInterval time.Duration

// StoreLivenessSupportDuration is the duration of store liveness support that
// stores request and extend.
StoreLivenessSupportDuration time.Duration

// RangeLeaseDuration specifies the range lease duration.
RangeLeaseDuration time.Duration
// RangeLeaseRenewalFraction specifies what fraction the range lease renewal
Expand Down Expand Up @@ -658,6 +676,12 @@ func (cfg *RaftConfig) SetDefaults() {
if cfg.RaftHeartbeatIntervalTicks == 0 {
cfg.RaftHeartbeatIntervalTicks = defaultRaftHeartbeatIntervalTicks
}
if cfg.StoreLivenessHeartbeatInterval == 0 {
cfg.StoreLivenessHeartbeatInterval = defaultStoreLivenessHeartbeatInterval
}
if cfg.StoreLivenessSupportDuration == 0 {
cfg.StoreLivenessSupportDuration = defaultStoreLivenessSupportDuration
}
if cfg.RangeLeaseDuration == 0 {
cfg.RangeLeaseDuration = defaultRangeLeaseDuration
}
Expand Down Expand Up @@ -769,11 +793,9 @@ func (cfg RaftConfig) NodeLivenessDurations() (livenessActive, livenessRenewal t
}

// StoreLivenessDurations computes durations for store liveness heartbeat
// interval and liveness interval.
func (cfg RaftConfig) StoreLivenessDurations() (livenessInterval, heartbeatInterval time.Duration) {
livenessInterval = cfg.RangeLeaseDuration
heartbeatInterval = time.Duration(float64(livenessInterval) * livenessRenewalFraction)
return
// interval and support duration.
func (cfg RaftConfig) StoreLivenessDurations() (supportDuration, heartbeatInterval time.Duration) {
return cfg.StoreLivenessSupportDuration, cfg.StoreLivenessHeartbeatInterval
}

// SentinelGossipTTL is time-to-live for the gossip sentinel, which is gossiped
Expand Down
4 changes: 3 additions & 1 deletion pkg/base/testdata/raft_config
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ echo
RaftElectionTimeoutTicks: (int64) 4,
RaftReproposalTimeoutTicks: (int64) 6,
RaftHeartbeatIntervalTicks: (int64) 2,
StoreLivenessHeartbeatInterval: (time.Duration) 1s,
StoreLivenessSupportDuration: (time.Duration) 3s,
RangeLeaseDuration: (time.Duration) 6s,
RangeLeaseRenewalFraction: (float64) 0.5,
RaftEnableCheckQuorum: (bool) true,
Expand All @@ -24,5 +26,5 @@ RaftReproposalTimeout: 3s
RangeLeaseDurations: active=6s renewal=3s
RangeLeaseAcquireTimeout: 4s
NodeLivenessDurations: active=6s renewal=3s
StoreLivenessDurations: active=6s renewal=3s
StoreLivenessDurations: active=3s renewal=1s
SentinelGossipTTL: 3s
9 changes: 4 additions & 5 deletions pkg/base/testdata/raft_config_recovery
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Any changes in this result should be copied to the comment on
# defaultRangeLeaseRaftElectionTimeoutMultiplier, and the corresponding
# reasoning should be adjusted.
# defaultRangeLeaseDuration, and the corresponding reasoning should be adjusted.
echo
----
// Raft election (fortification disabled):
Expand All @@ -23,10 +22,10 @@ echo
// Total latency [ 3.03s - 7.20s]
//
// Leader lease acquisition (including raft election):
// - Store Liveness heartbeat offset (0-1 heartbeat interval) [-3.00s - 0.00s]
// - Store Liveness expiration (constant) [ 6.00s - 6.00s]
// - Store Liveness heartbeat offset (0-1 heartbeat interval) [-1.00s - 0.00s]
// - Store Liveness expiration (constant) [ 3.00s - 3.00s]
// - Store Liveness withdrawal (0-1 withdrawal interval) [ 0.00s - 0.10s]
// - Raft election timeout jitter (random 0x-1x timeout) [ 0.00s - 2.00s]
// - Election (3x RTT: prevote, vote, append) [ 0.03s - 1.20s]
// - Lease acquisition (1x RTT: append) [ 0.01s - 0.40s]
// Total latency [ 3.04s - 9.70s]
// Total latency [ 2.04s - 6.70s]
6 changes: 4 additions & 2 deletions pkg/kv/kvserver/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6603,7 +6603,9 @@ func TestRaftCheckQuorum(t *testing.T) {
}, 10*time.Second, 500*time.Millisecond)
t.Logf("n1 stepped down as a leader")

// n2 or n3 should elect a new leader.
// n2 or n3 should elect a new leader. At this point, the store liveness
// SupportWithdrawalGracePeriod may not have expired yet, so this step waits
// a little longer.
var leaderStatus *raft.Status
require.Eventually(t, func() bool {
for _, status := range []*raft.Status{repl2.RaftStatus(), repl3.RaftStatus()} {
Expand All @@ -6614,7 +6616,7 @@ func TestRaftCheckQuorum(t *testing.T) {
}
}
return false
}, 10*time.Second, 500*time.Millisecond)
}, 20*time.Second, 500*time.Millisecond)
t.Logf("n%d became leader", leaderStatus.ID)

// n1 shouldn't become a leader.
Expand Down

0 comments on commit ffa4eb1

Please sign in to comment.