diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 14de4b2ccdd2..61bd86c82878 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -1136,8 +1136,10 @@ func (r *raft) tickHeartbeat() { if r.heartbeatElapsed >= r.heartbeatTimeout { r.heartbeatElapsed = 0 - if err := r.Step(pb.Message{From: r.id, Type: pb.MsgBeat}); err != nil { - r.logger.Debugf("error occurred during checking sending heartbeat: %v", err) + if !r.fortificationTracker.FortificationEnabled() { + if err := r.Step(pb.Message{From: r.id, Type: pb.MsgBeat}); err != nil { + r.logger.Debugf("error occurred during checking sending heartbeat: %v", err) + } } // Mark fortifying followers as recently active. We disable heartbeats when diff --git a/pkg/raft/raft_flow_control_test.go b/pkg/raft/raft_flow_control_test.go index d0bf705c144f..b15163b2105b 100644 --- a/pkg/raft/raft_flow_control_test.go +++ b/pkg/raft/raft_flow_control_test.go @@ -145,11 +145,10 @@ func TestMsgAppFlowControl(t *testing.T) { r.tick() } ms := r.readMessages() - require.Len(t, ms, 3) - require.Equal(t, ms[0].Type, pb.MsgHeartbeat) - require.Equal(t, ms[1].Type, pb.MsgFortifyLeader) - require.Equal(t, ms[2].Type, pb.MsgApp) - require.Empty(t, ms[0].Entries) + require.Len(t, ms, 2) + require.Equal(t, ms[0].Type, pb.MsgFortifyLeader) + require.Equal(t, ms[1].Type, pb.MsgApp) + require.Empty(t, ms[1].Entries) } else { r.Step(pb.Message{From: 2, To: 1, Type: pb.MsgHeartbeatResp}) ms := r.readMessages() diff --git a/pkg/raft/raft_paper_test.go b/pkg/raft/raft_paper_test.go index 206fbfe2db25..095c69cc128f 100644 --- a/pkg/raft/raft_paper_test.go +++ b/pkg/raft/raft_paper_test.go @@ -104,9 +104,9 @@ func TestStartAsFollower(t *testing.T) { // TestLeaderBcastBeat tests that if the leader receives a heartbeat tick, // it will send a MsgHeartbeat with m.Index = 0, m.LogTerm=0 and empty entries // as heartbeat to all followers. -// Note that if store liveness is enabled, the leader will also send a MsgApp -// on every heartbeat interval. // Reference: section 5.2 +// Note that if store liveness is enabled, the leader might send a MsgApp on +// every heartbeat interval, but it won't send a MsgHeartbeat. func TestLeaderBcastBeat(t *testing.T) { // heartbeat interval hi := 3 @@ -141,8 +141,6 @@ func TestLeaderBcastBeat(t *testing.T) { {From: 1, To: 3, Term: 1, Type: pb.MsgApp, Entries: r.raftLog.allEntries()}, {From: 1, To: 2, Term: 1, Type: pb.MsgFortifyLeader}, {From: 1, To: 3, Term: 1, Type: pb.MsgFortifyLeader}, - {From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat}, - {From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat}, }, msgs) } else { assert.Equal(t, []pb.Message{ diff --git a/pkg/raft/raft_test.go b/pkg/raft/raft_test.go index 7a76bbf8ed08..aeb2404bbf16 100644 --- a/pkg/raft/raft_test.go +++ b/pkg/raft/raft_test.go @@ -1231,24 +1231,22 @@ func TestHandleHeatbeatTimeoutStoreLivenessEnabled(t *testing.T) { } msgs := sm.readMessages() - require.Len(t, msgs, 2) - assert.Equal(t, pb.MsgHeartbeat, msgs[0].Type) - assert.Equal(t, pb.MsgApp, msgs[1].Type) + require.Len(t, msgs, 1) + assert.Equal(t, pb.MsgApp, msgs[0].Type) // On another heartbeat timeout, the leader sends a MsgApp. for ticks := sm.heartbeatTimeout; ticks > 0; ticks-- { sm.tick() } msgs = sm.readMessages() - require.Len(t, msgs, 2) - assert.Equal(t, pb.MsgHeartbeat, msgs[0].Type) - assert.Equal(t, pb.MsgApp, msgs[1].Type) + require.Len(t, msgs, 1) + assert.Equal(t, pb.MsgApp, msgs[0].Type) // Once the leader receives a MsgAppResp, it doesn't send MsgApp. sm.Step(pb.Message{ From: 2, Type: pb.MsgAppResp, - Index: msgs[1].Index + uint64(len(msgs[1].Entries)), + Index: msgs[0].Index + uint64(len(msgs[0].Entries)), Commit: sm.raftLog.lastIndex(), }) @@ -1261,8 +1259,7 @@ func TestHandleHeatbeatTimeoutStoreLivenessEnabled(t *testing.T) { sm.tick() } msgs = sm.readMessages() - require.Len(t, msgs, 1) - assert.Equal(t, pb.MsgHeartbeat, msgs[0].Type) + require.Len(t, msgs, 0) } // TestMsgAppRespWaitReset verifies the resume behavior of a leader @@ -2036,7 +2033,10 @@ func TestLeaderAppResp(t *testing.T) { } // TestBcastBeat is when the leader receives a heartbeat tick, it should -// send a MsgHeartbeat with m.Index = 0, m.LogTerm=0 and empty entries. +// send a MsgHeartbeat with m.Index = 0, m.LogTerm=0 and empty entries if +// store liveness is disabled. On the other hand, if store liveness is enabled, +// the leader doesn't send a MsgHeartbeat but sends a MsgApp if the follower +// needs it to catch up. func TestBcastBeat(t *testing.T) { testutils.RunTrueAndFalse(t, "store-liveness-enabled", func(t *testing.T, storeLivenessEnabled bool) { @@ -2082,14 +2082,12 @@ func TestBcastBeat(t *testing.T) { sm.tick() } msgs := sm.readMessages() - // If storeliveness is enabled, the heartbeat timeout will also send a - // MsgApp if it needs to. In this case since follower 2 is slow, we will - // send a MsgApp to it. + // If storeliveness is enabled, the heartbeat timeout will send a MsgApp + // if it needs to. In this case since follower 2 is slow, we will send a + // MsgApp to it. if storeLivenessEnabled { - require.Len(t, msgs, 5) + require.Len(t, msgs, 3) assert.Equal(t, []pb.Message{ - {From: 1, To: 2, Term: 2, Type: pb.MsgHeartbeat, Match: 5}, - {From: 1, To: 3, Term: 2, Type: pb.MsgHeartbeat, Match: 1011}, {From: 1, To: 2, Term: 2, Type: pb.MsgFortifyLeader}, {From: 1, To: 3, Term: 2, Type: pb.MsgFortifyLeader}, {From: 1, To: 3, Term: 2, Type: pb.MsgApp, LogTerm: 2, Index: 1011, Commit: 1000, @@ -2101,14 +2099,14 @@ func TestBcastBeat(t *testing.T) { {From: 1, To: 2, Term: 2, Type: pb.MsgHeartbeat, Match: 5}, {From: 1, To: 3, Term: 2, Type: pb.MsgHeartbeat, Match: 1011}, }, msgs) - } - // Make sure that the heartbeat messages contain the expected fields. - for i, m := range msgs[:2] { - require.Equal(t, pb.MsgHeartbeat, m.Type, "#%d", i) - require.Zero(t, m.Index, "#%d", i) - require.Zero(t, m.LogTerm, "#%d", i) - require.Empty(t, m.Entries, "#%d", i) + // Make sure that the heartbeat messages contain the expected fields. + for i, m := range msgs { + require.Equal(t, pb.MsgHeartbeat, m.Type, "#%d", i) + require.Zero(t, m.Index, "#%d", i) + require.Zero(t, m.LogTerm, "#%d", i) + require.Empty(t, m.Entries, "#%d", i) + } } }) } @@ -2280,11 +2278,10 @@ func TestSendAppendForProgressProbeStoreLivenessEnabled(t *testing.T) { assert.True(t, r.trk.Progress(2).MsgAppProbesPaused) msg := r.readMessages() - assert.Len(t, msg, 3) - assert.Equal(t, pb.MsgHeartbeat, msg[0].Type) - assert.Equal(t, pb.MsgFortifyLeader, msg[1].Type) - assert.Equal(t, pb.MsgApp, msg[2].Type) - assert.Equal(t, msg[2].Index, uint64(1)) + assert.Len(t, msg, 2) + assert.Equal(t, pb.MsgFortifyLeader, msg[0].Type) + assert.Equal(t, pb.MsgApp, msg[1].Type) + assert.Equal(t, msg[1].Index, uint64(1)) assert.True(t, r.trk.Progress(2).MsgAppProbesPaused) } } diff --git a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go index e1037d688bc2..51d4167546a0 100644 --- a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go +++ b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go @@ -18,6 +18,7 @@ package rafttest import ( + "context" "errors" "fmt" "reflect" @@ -26,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/raft" pb "github.com/cockroachdb/cockroach/pkg/raft/raftpb" + "github.com/cockroachdb/cockroach/pkg/raft/raftstoreliveness" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/datadriven" @@ -138,14 +140,19 @@ func (env *InteractionEnv) AddNodes(n int, cfg raft.Config, snap pb.Snapshot) er cfg := cfg // fork the config stub cfg.ID, cfg.Storage = id, s - cfg.StoreLiveness = newStoreLiveness(env.Fabric, id) - // If the node creating command hasn't specified the CRDBVersion, use the // latest one. if cfg.CRDBVersion == nil { cfg.CRDBVersion = cluster.MakeTestingClusterSettings().Version } + // Disable store liveness if the CRDB version is less than 24.3. + if cfg.CRDBVersion.IsActive(context.TODO(), clusterversion.V24_3_StoreLivenessEnabled) { + cfg.StoreLiveness = newStoreLiveness(env.Fabric, id) + } else { + cfg.StoreLiveness = raftstoreliveness.Disabled{} + } + if env.Options.OnConfig != nil { env.Options.OnConfig(&cfg) if cfg.ID != id { diff --git a/pkg/raft/testdata/async_storage_writes_append_aba_race.txt b/pkg/raft/testdata/async_storage_writes_append_aba_race.txt index ec65a539b1db..99900880af2c 100644 --- a/pkg/raft/testdata/async_storage_writes_append_aba_race.txt +++ b/pkg/raft/testdata/async_storage_writes_append_aba_race.txt @@ -408,12 +408,6 @@ process-ready 4 ---- Ready MustSync=false: Messages: -4->1 MsgHeartbeat Term:3 Log:0/0 -4->2 MsgHeartbeat Term:3 Log:0/0 -4->3 MsgHeartbeat Term:3 Log:0/0 -4->5 MsgHeartbeat Term:3 Log:0/0 -4->6 MsgHeartbeat Term:3 Log:0/0 -4->7 MsgHeartbeat Term:3 Log:0/0 4->1 MsgFortifyLeader Term:3 Log:0/0 4->2 MsgFortifyLeader Term:3 Log:0/0 4->3 MsgFortifyLeader Term:3 Log:0/0 @@ -432,10 +426,9 @@ Messages: deliver-msgs 1 ---- -4->1 MsgHeartbeat Term:3 Log:0/0 -INFO 1 [term: 2] received a MsgHeartbeat message with higher term from 4 [term: 3] -INFO 1 became follower at term 3 4->1 MsgFortifyLeader Term:3 Log:0/0 +INFO 1 [term: 2] received a MsgFortifyLeader message with higher term from 4 [term: 3] +INFO 1 became follower at term 3 4->1 MsgApp Term:3 Log:1/11 Commit:11 Entries:[3/12 EntryNormal ""] INFO found conflict at index 12 [existing term: 2, conflicting term: 3] INFO replace the unstable entries from index 12 @@ -447,7 +440,6 @@ HardState Term:3 Commit:11 Lead:4 LeadEpoch:1 Entries: 3/12 EntryNormal "" Messages: -1->4 MsgHeartbeatResp Term:3 Log:0/0 1->AppendThread MsgStorageAppend Term:3 Log:3/12 Commit:11 Lead:4 LeadEpoch:1 Entries:[3/12 EntryNormal ""] Responses:[ 1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 1->4 MsgAppResp Term:3 Log:0/12 Commit:11 diff --git a/pkg/raft/testdata/checkquorum.txt b/pkg/raft/testdata/checkquorum.txt index 0224784413bd..2ecd54f9306a 100644 --- a/pkg/raft/testdata/checkquorum.txt +++ b/pkg/raft/testdata/checkquorum.txt @@ -77,47 +77,27 @@ stabilize State:StateFollower HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 handling Ready Ready MustSync=false: @@ -127,20 +107,10 @@ stabilize 2->1 MsgAppResp Term:2 Log:0/0 2->1 MsgAppResp Term:2 Log:0/0 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 @@ -154,21 +124,6 @@ stabilize 2->1 MsgAppResp Term:2 Log:0/0 2->1 MsgAppResp Term:2 Log:0/0 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 2->1 MsgAppResp Term:2 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1] 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 diff --git a/pkg/raft/testdata/forget_leader.txt b/pkg/raft/testdata/forget_leader.txt index cdac64e8499b..754c53977891 100644 --- a/pkg/raft/testdata/forget_leader.txt +++ b/pkg/raft/testdata/forget_leader.txt @@ -93,7 +93,24 @@ raft-state 3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 4: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:0 -# When receiving a heartbeat from the leader, they revert to followers. +# Give store liveness support to allow MsgFortifyLeader to be sent. +grant-support 2 1 +---- + 1 2 3 4 +1 2 1 1 1 +2 2 1 1 1 +3 1 1 1 1 +4 x 1 1 1 + +grant-support 4 1 +---- + 1 2 3 4 +1 2 1 1 1 +2 2 1 1 1 +3 1 1 1 1 +4 2 1 1 1 + +# When receiving MsgFortifyLeader from the leader, they revert to followers. tick-heartbeat 1 ---- ok @@ -101,11 +118,11 @@ ok stabilize ---- > 1 handling Ready - Ready MustSync=false: + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 - 1->4 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->4 MsgFortifyLeader Term:1 Log:0/0 > 2 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:11 Lead:0 LeadEpoch:0 @@ -113,41 +130,50 @@ stabilize Ready MustSync=true: HardState Term:1 Commit:11 Lead:0 LeadEpoch:0 > 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 > 4 receiving messages - 1->4 MsgHeartbeat Term:1 Log:0/0 + 1->4 MsgFortifyLeader Term:1 Log:0/0 > 2 handling Ready Ready MustSync=true: - HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:0 - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 3 handling Ready - Ready MustSync=false: + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 > 4 handling Ready Ready MustSync=true: - HardState Term:1 Commit:11 Lead:1 LeadEpoch:0 + HardState Term:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 4->1 MsgHeartbeatResp Term:1 Log:0/0 + 4->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 4->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 4->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 raft-state ---- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:2 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:2 3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:2 + +withdraw-support 2 1 +---- + 1 2 3 4 +1 2 1 1 1 +2 x 1 1 1 +3 1 1 1 1 +4 2 1 1 1 withdraw-support 3 1 ---- 1 2 3 4 -1 1 1 1 1 +1 2 1 1 1 +2 x 1 1 1 +3 x 1 1 1 +4 2 1 1 1 + +withdraw-support 4 1 +---- + 1 2 3 4 +1 2 1 1 1 2 x 1 1 1 3 x 1 1 1 4 x 1 1 1 @@ -162,10 +188,10 @@ INFO 3 [logterm: 1, index: 11] sent MsgVote request to 2 at term 2 raft-state ---- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:2 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:2 3: StateCandidate (Voter) Term:2 Lead:0 LeadEpoch:0 -4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:2 forget-leader 3 ---- @@ -173,10 +199,10 @@ ok raft-state ---- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:2 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:2 3: StateCandidate (Voter) Term:2 Lead:0 LeadEpoch:0 -4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:2 stabilize log-level=none ---- @@ -202,7 +228,7 @@ ok withdraw-support 2 3 ---- 1 2 3 4 -1 1 1 1 1 +1 2 1 1 1 2 x 1 x 1 3 x 1 1 1 4 x 1 1 1 @@ -224,7 +250,7 @@ INFO 2 forgetting leader 3 at term 2 withdraw-support 1 3 ---- 1 2 3 4 -1 1 1 x 1 +1 2 1 x 1 2 x 1 x 1 3 x 1 1 1 4 x 1 1 1 @@ -232,7 +258,7 @@ withdraw-support 1 3 withdraw-support 3 3 ---- 1 2 3 4 -1 1 1 x 1 +1 2 1 x 1 2 x 1 x 1 3 x 1 x 1 4 x 1 1 1 @@ -240,7 +266,7 @@ withdraw-support 3 3 withdraw-support 4 3 ---- 1 2 3 4 -1 1 1 x 1 +1 2 1 x 1 2 x 1 x 1 3 x 1 x 1 4 x 1 x 1 diff --git a/pkg/raft/testdata/forget_leader_prevote_checkquorum.txt b/pkg/raft/testdata/forget_leader_prevote_checkquorum.txt index 4dfadf98e8ff..770f040a98fb 100644 --- a/pkg/raft/testdata/forget_leader_prevote_checkquorum.txt +++ b/pkg/raft/testdata/forget_leader_prevote_checkquorum.txt @@ -57,7 +57,14 @@ INFO 1 [logterm: 1, index: 11, vote: 1] ignored MsgPreVote from 3 [logterm: 1, i 3->2 MsgPreVote Term:2 Log:1/11 INFO 2 [logterm: 1, index: 11, vote: 1] ignored MsgPreVote from 3 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) and supporting fortified leader 1 at epoch 1 -# Make 1 assert leadership over 3 again. +grant-support 3 1 +---- + 1 2 3 +1 2 1 1 +2 1 1 1 +3 2 1 1 + +# Make 1 assert leadership over 3 again by sending MsgFortifyLeader. tick-heartbeat 1 ---- ok @@ -65,34 +72,34 @@ ok stabilize ---- > 1 handling Ready - Ready MustSync=false: + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 INFO 3 became follower at term 1 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: State:StateFollower - HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:0 + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 raft-state ---- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:2 2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:2 + +withdraw-support 3 1 +---- + 1 2 3 +1 2 1 1 +2 1 1 1 +3 x 1 1 # ForgetLeader is ignored if the follower is supporting the leader's store # liveness epoch. @@ -103,7 +110,7 @@ INFO 2 [term 1] ignored MsgForgetLeader from 0 due to leader fortification withdraw-support 2 1 ---- 1 2 3 -1 1 1 1 +1 2 1 1 2 x 1 1 3 x 1 1 @@ -115,9 +122,9 @@ INFO 2 forgetting leader 1 at term 1 raft-state ---- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:2 2: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:0 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:2 campaign 3 ---- @@ -210,7 +217,7 @@ stabilize 2 withdraw-support 2 3 ---- 1 2 3 -1 1 1 1 +1 2 1 1 2 x 1 x 3 x 1 1 @@ -227,7 +234,7 @@ raft-log 1 withdraw-support 1 3 ---- 1 2 3 -1 1 1 x +1 2 1 x 2 x 1 x 3 x 1 1 diff --git a/pkg/raft/testdata/heartbeat_timeout_recovers_from_probing.txt b/pkg/raft/testdata/heartbeat_timeout_recovers_from_probing.txt new file mode 100644 index 000000000000..f8e10e61dd3d --- /dev/null +++ b/pkg/raft/testdata/heartbeat_timeout_recovers_from_probing.txt @@ -0,0 +1,70 @@ +# This test checks that if a fully caught-up follower transitions +# into StateProbe (for example due to a call to ReportUnreachable), the +# leader will react to a subsequent heartbeat response from the probing +# follower by sending an empty MsgApp, the response of which restores +# StateReplicate for the follower. In other words, we don't end up in +# a stable state with a fully caught up follower in StateProbe. + +# Turn off output during the setup of the test. +log-level none +---- +ok + +add-nodes 3 voters=(1,2,3) index=10 +---- +ok + +campaign 1 +---- +ok + +stabilize +---- +ok + +log-level debug +---- +ok + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 + +# On the first replica, report the second one as not reachable. +report-unreachable 1 2 +---- +DEBUG 1 failed to send message to 2 because it is unreachable [StateProbe match=11 next=12 sentCommit=11 matchCommit=11] + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateProbe match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 + +tick-heartbeat 1 +---- +ok + +# HeartbeatTimeout -> MsgApp -> MsgAppResp -> StateReplicate. +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgApp Term:1 Log:1/11 Commit:11 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/11 Commit:11 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgAppResp Term:1 Log:0/11 Commit:11 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/11 Commit:11 + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 diff --git a/pkg/raft/testdata/lagging_commit.txt b/pkg/raft/testdata/lagging_commit.txt index 15e6a1844337..7b88d9061d9a 100644 --- a/pkg/raft/testdata/lagging_commit.txt +++ b/pkg/raft/testdata/lagging_commit.txt @@ -132,8 +132,6 @@ process-ready 1 ---- Ready MustSync=false: Messages: -1->2 MsgHeartbeat Term:1 Log:0/0 -1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 # Since the leader sends a MsgApp on the heartbeat timeout, it takes this long @@ -146,7 +144,6 @@ Messages: stabilize 1 3 ---- > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 > 3 handling Ready Ready MustSync=true: @@ -155,20 +152,6 @@ stabilize 1 3 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 receiving messages - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 handling Ready - Ready MustSync=false: - Messages: 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 > 1 receiving messages 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 diff --git a/pkg/raft/testdata/lagging_commit_no_store_liveness_support.txt b/pkg/raft/testdata/lagging_commit_no_store_liveness_support.txt index 03e7a41ca5dc..2b0c5f35a3b8 100644 --- a/pkg/raft/testdata/lagging_commit_no_store_liveness_support.txt +++ b/pkg/raft/testdata/lagging_commit_no_store_liveness_support.txt @@ -132,10 +132,7 @@ ok # send a MsgApp because follower 3 doesn't support the leader. process-ready 1 ---- -Ready MustSync=false: -Messages: -1->2 MsgHeartbeat Term:1 Log:0/0 -1->3 MsgHeartbeat Term:1 Log:0/0 + # Now that follower 3's store supports the leader's store, will send a MsgApp on # the next heartbeat timeout. @@ -155,26 +152,14 @@ process-ready 1 Ready MustSync=true: HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:2 Messages: -1->2 MsgHeartbeat Term:1 Log:0/0 -1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 stabilize ---- -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:2 @@ -182,30 +167,8 @@ stabilize 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->3 MsgApp Term:1 Log:1/13 Commit:13 - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/13 Commit:13 - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 handling Ready - Ready MustSync=false: - Messages: - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 receiving messages - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 diff --git a/pkg/raft/testdata/mixedversions/checkquorum.txt b/pkg/raft/testdata/mixedversions/checkquorum.txt new file mode 100644 index 000000000000..e1a82d7c6b44 --- /dev/null +++ b/pkg/raft/testdata/mixedversions/checkquorum.txt @@ -0,0 +1,251 @@ +# Tests that CheckQuorum causes a leader to step down if it hasn't heard from a +# quorum of followers in the past election timeout interval. +# +# Also tests that votes are rejected when there is a current leader. In the Raft +# thesis this is part of PreVote, but etcd/raft enables this via CheckQuorum. + +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test checkquorum outside the mixedversions directory that assumes +# that assumes that store liveness is enabled. + +log-level none +---- +ok + +# Start 3 nodes, two of them has the older version 24.2. +add-nodes 2 voters=(1,2,3) index=10 checkquorum=true crdb-version=24.2 +---- +ok + +add-nodes 1 voters=(1,2,3) index=10 checkquorum=true crdb-version=24.3 +---- +ok + +campaign 1 +---- +ok + +stabilize +---- +ok + +bump-epoch 1 +---- +ok + +log-level debug +---- +ok + +# Campaigning will fail when there is an active leader. +campaign 2 +---- +INFO 2 is starting a new election at term 1 +INFO 2 became candidate at term 2 +INFO 2 [logterm: 1, index: 11] sent MsgVote request to 1 at term 2 +INFO 2 [logterm: 1, index: 11] sent MsgVote request to 3 at term 2 + +stabilize +---- +> 2 handling Ready + Ready MustSync=true: + State:StateCandidate + HardState Term:2 Vote:2 Commit:11 Lead:0 LeadEpoch:0 + Messages: + 2->1 MsgVote Term:2 Log:1/11 + 2->3 MsgVote Term:2 Log:1/11 + INFO 2 received MsgVoteResp from 2 at term 2 + INFO 2 has received 1 MsgVoteResp votes and 0 vote rejections +> 1 receiving messages + 2->1 MsgVote Term:2 Log:1/11 + INFO 1 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) +> 3 receiving messages + 2->3 MsgVote Term:2 Log:1/11 + INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) + +# Tick the leader without processing any messages from followers. We have to +# tick 2 election timeouts, since the followers were active in the current +# interval (see messages above). +tick-election 1 +---- +DEBUG 1 does not have store liveness support from a quorum of peers + +tick-election 1 +---- +DEBUG 1 has not received messages from a quorum of peers in the last election timeout +DEBUG 1 does not have store liveness support from a quorum of peers +WARN 1 stepped down to follower since quorum is not active +INFO 1 became follower at term 1 + +# We'll now send all of the heartbeats that were buffered during the ticks +# above. Conceptually, "the network was slow". +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + State:StateFollower + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgAppResp Term:2 Log:0/0 + INFO 1 [term: 1] received a MsgAppResp message with higher term from 2 [term: 2] + INFO 1 became follower at term 2 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 2->1 MsgAppResp Term:2 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] +> 1 handling Ready + Ready MustSync=true: + HardState Term:2 Commit:11 Lead:0 LeadEpoch:0 + +# Other nodes can now successfully campaign. Note that we haven't ticked 3, so +# it won't grant votes. +campaign 2 +---- +INFO 2 is starting a new election at term 2 +INFO 2 became candidate at term 3 +INFO 2 [logterm: 1, index: 11] sent MsgVote request to 1 at term 3 +INFO 2 [logterm: 1, index: 11] sent MsgVote request to 3 at term 3 + +process-ready 2 +---- +Ready MustSync=true: +HardState Term:3 Vote:2 Commit:11 Lead:0 LeadEpoch:0 +Messages: +2->1 MsgVote Term:3 Log:1/11 +2->3 MsgVote Term:3 Log:1/11 +INFO 2 received MsgVoteResp from 2 at term 3 +INFO 2 has received 1 MsgVoteResp votes and 0 vote rejections + +deliver-msgs 1 +---- +2->1 MsgVote Term:3 Log:1/11 +INFO 1 [term: 2] received a MsgVote message with higher term from 2 [term: 3] +INFO 1 became follower at term 3 +INFO 1 [logterm: 1, index: 11, vote: 0] cast MsgVote for 2 [logterm: 1, index: 11] at term 3 + +deliver-msgs 3 +---- +2->3 MsgVote Term:3 Log:1/11 +INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) + +stabilize +---- +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:2 Commit:11 Lead:0 LeadEpoch:0 + Messages: + 1->2 MsgVoteResp Term:3 Log:0/0 +> 2 receiving messages + 1->2 MsgVoteResp Term:3 Log:0/0 + INFO 2 received MsgVoteResp from 1 at term 3 + INFO 2 has received 2 MsgVoteResp votes and 0 vote rejections + INFO 2 became leader at term 3 +> 2 handling Ready + Ready MustSync=true: + State:StateLeader + HardState Term:3 Vote:2 Commit:11 Lead:2 LeadEpoch:0 + Entries: + 3/12 EntryNormal "" + Messages: + 2->1 MsgApp Term:3 Log:1/11 Commit:11 Entries:[3/12 EntryNormal ""] + 2->3 MsgApp Term:3 Log:1/11 Commit:11 Entries:[3/12 EntryNormal ""] +> 1 receiving messages + 2->1 MsgApp Term:3 Log:1/11 Commit:11 Entries:[3/12 EntryNormal ""] +> 3 receiving messages + 2->3 MsgApp Term:3 Log:1/11 Commit:11 Entries:[3/12 EntryNormal ""] + INFO 3 [term: 1] received a MsgApp message with higher term from 2 [term: 3] + INFO 3 became follower at term 3 +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:2 Commit:11 Lead:2 LeadEpoch:0 + Entries: + 3/12 EntryNormal "" + Messages: + 1->2 MsgAppResp Term:3 Log:0/12 Commit:11 +> 3 handling Ready + Ready MustSync=true: + HardState Term:3 Commit:11 Lead:2 LeadEpoch:0 + Entries: + 3/12 EntryNormal "" + Messages: + 3->2 MsgAppResp Term:3 Log:0/12 Commit:11 +> 2 receiving messages + 1->2 MsgAppResp Term:3 Log:0/12 Commit:11 + 3->2 MsgAppResp Term:3 Log:0/12 Commit:11 +> 2 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:2 Commit:12 Lead:2 LeadEpoch:0 + CommittedEntries: + 3/12 EntryNormal "" + Messages: + 2->1 MsgApp Term:3 Log:3/12 Commit:12 + 2->3 MsgApp Term:3 Log:3/12 Commit:12 +> 1 receiving messages + 2->1 MsgApp Term:3 Log:3/12 Commit:12 +> 3 receiving messages + 2->3 MsgApp Term:3 Log:3/12 Commit:12 +> 1 handling Ready + Ready MustSync=true: + HardState Term:3 Vote:2 Commit:12 Lead:2 LeadEpoch:0 + CommittedEntries: + 3/12 EntryNormal "" + Messages: + 1->2 MsgAppResp Term:3 Log:0/12 Commit:12 +> 3 handling Ready + Ready MustSync=true: + HardState Term:3 Commit:12 Lead:2 LeadEpoch:0 + CommittedEntries: + 3/12 EntryNormal "" + Messages: + 3->2 MsgAppResp Term:3 Log:0/12 Commit:12 +> 2 receiving messages + 1->2 MsgAppResp Term:3 Log:0/12 Commit:12 + 3->2 MsgAppResp Term:3 Log:0/12 Commit:12 diff --git a/pkg/raft/testdata/mixedversions/forget_leader.txt b/pkg/raft/testdata/mixedversions/forget_leader.txt new file mode 100644 index 000000000000..57e4fedb51ca --- /dev/null +++ b/pkg/raft/testdata/mixedversions/forget_leader.txt @@ -0,0 +1,202 @@ +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test forget_leader outside the mixedversions directory that assumes +# that store liveness is enabled. + +log-level none +---- +ok + +# Start 4 nodes, two of them has the older version 24.2. +add-nodes 2 voters=(1,2,3) learners=(4) index=10 crdb-version=24.2 +---- +ok + +add-nodes 2 voters=(1,2,3) learners=(4) index=10 crdb-version=24.3 +---- +ok + + +campaign 1 +---- +ok + +stabilize +---- +ok + +log-level debug +---- +ok + +# ForgetLeader is a noop on the leader. +forget-leader 1 +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +# ForgetLeader causes a follower to forget its leader, but remain in the current +# term. It's a noop if it's called again. +forget-leader 2 +---- +INFO 2 forgetting leader 1 at term 1 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +forget-leader 2 +---- +INFO 2 no leader at term 1; dropping forget leader msg + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +# ForgetLeader also works on learners. +forget-leader 4 +---- +INFO 4 forgetting leader 1 at term 1 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:0 + +# When receiving a heartbeat from the leader, they revert to followers. +tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->4 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:0 LeadEpoch:0 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:11 Lead:0 LeadEpoch:0 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 4 receiving messages + 1->4 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:0 + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:11 Lead:1 LeadEpoch:0 + Messages: + 4->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 4->1 MsgHeartbeatResp Term:1 Log:0/0 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +# ForgetLeader is a noop on candidates. +campaign 3 +---- +INFO 3 is starting a new election at term 1 +INFO 3 became candidate at term 2 +INFO 3 [logterm: 1, index: 11] sent MsgVote request to 1 at term 2 +INFO 3 [logterm: 1, index: 11] sent MsgVote request to 2 at term 2 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateCandidate (Voter) Term:2 Lead:0 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +forget-leader 3 +---- +ok + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateCandidate (Voter) Term:2 Lead:0 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:1 Lead:1 LeadEpoch:0 + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +2: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +3: StateLeader (Voter) Term:2 Lead:3 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:2 Lead:3 LeadEpoch:0 + +# ForgetLeader shouldn't affect the election timeout: if a follower +# forgets the leader 1 tick before the election timeout fires, it +# will still campaign on the next tick. +set-randomized-election-timeout 2 timeout=3 +---- +ok + +tick-heartbeat 2 +---- +ok + +tick-heartbeat 2 +---- +ok + +forget-leader 2 +---- +INFO 2 forgetting leader 3 at term 2 + +tick-heartbeat 2 +---- +INFO 2 is starting a new election at term 2 +INFO 2 became candidate at term 3 +INFO 2 [logterm: 2, index: 12] sent MsgVote request to 1 at term 3 +INFO 2 [logterm: 2, index: 12] sent MsgVote request to 3 at term 3 + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Voter) Term:3 Lead:2 LeadEpoch:0 +2: StateLeader (Voter) Term:3 Lead:2 LeadEpoch:0 +3: StateFollower (Voter) Term:3 Lead:2 LeadEpoch:0 +4: StateFollower (Non-Voter) Term:3 Lead:2 LeadEpoch:0 diff --git a/pkg/raft/testdata/mixedversions/forget_leader_prevote_checkquorum.txt b/pkg/raft/testdata/mixedversions/forget_leader_prevote_checkquorum.txt new file mode 100644 index 000000000000..48afd9c9d6b4 --- /dev/null +++ b/pkg/raft/testdata/mixedversions/forget_leader_prevote_checkquorum.txt @@ -0,0 +1,249 @@ +# Tests that a follower with PreVote+CheckQuorum can forget the leader, allowing +# it to grant prevotes despite having heard from the leader recently. +# +# Also tests that forgetting the leader still won't grant prevotes to a +# replica that isn't up-to-date. + +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test forget_leader_prevote_checkquorum outside the mixedversions +# directory that assumes that store liveness is enabled. + +log-level none +---- +ok + +# Start 3 nodes, two of them has the older version 24.2. +add-nodes 2 voters=(1,2,3) index=10 prevote=true checkquorum=true crdb-version=24.2 +---- +ok + +add-nodes 1 voters=(1,2,3) index=10 prevote=true checkquorum=true crdb-version=24.3 +---- +ok + +campaign 1 +---- +ok + +stabilize +---- +ok + +log-level debug +---- +ok + +# If 3 attempts to campaign, 2 rejects it because it has a leader. +campaign 3 +---- +INFO 3 is starting a new election at term 1 +INFO 3 became pre-candidate at term 1 +INFO 3 [logterm: 1, index: 11] sent MsgPreVote request to 1 at term 1 +INFO 3 [logterm: 1, index: 11] sent MsgPreVote request to 2 at term 1 + +stabilize 3 +---- +> 3 handling Ready + Ready MustSync=true: + State:StatePreCandidate + HardState Term:1 Vote:1 Commit:11 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgPreVote Term:2 Log:1/11 + 3->2 MsgPreVote Term:2 Log:1/11 + INFO 3 received MsgPreVoteResp from 3 at term 1 + INFO 3 has received 1 MsgPreVoteResp votes and 0 vote rejections + +deliver-msgs 1 2 +---- +3->1 MsgPreVote Term:2 Log:1/11 +INFO 1 [logterm: 1, index: 11, vote: 1] ignored MsgPreVote from 3 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) +3->2 MsgPreVote Term:2 Log:1/11 +INFO 2 [logterm: 1, index: 11, vote: 1] ignored MsgPreVote from 3 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) + +# Make 1 assert leadership over 3 again. +tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + INFO 3 became follower at term 1 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=true: + State:StateFollower + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:0 + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 + +# If 2 forgets the leader, then 3 can obtain prevotes and hold an election +# despite 2 having heard from the leader recently. +forget-leader 2 +---- +INFO 2 forgetting leader 1 at term 1 + +raft-state +---- +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:0 +2: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:0 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 + +campaign 3 +---- +INFO 3 is starting a new election at term 1 +INFO 3 became pre-candidate at term 1 +INFO 3 [logterm: 1, index: 11] sent MsgPreVote request to 1 at term 1 +INFO 3 [logterm: 1, index: 11] sent MsgPreVote request to 2 at term 1 + +stabilize 3 +---- +> 3 handling Ready + Ready MustSync=true: + State:StatePreCandidate + HardState Term:1 Vote:1 Commit:11 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgPreVote Term:2 Log:1/11 + 3->2 MsgPreVote Term:2 Log:1/11 + INFO 3 received MsgPreVoteResp from 3 at term 1 + INFO 3 has received 1 MsgPreVoteResp votes and 0 vote rejections + +stabilize 2 +---- +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:0 LeadEpoch:0 +> 2 receiving messages + 3->2 MsgPreVote Term:2 Log:1/11 + INFO 2 [logterm: 1, index: 11, vote: 1] cast MsgPreVote for 3 [logterm: 1, index: 11] at term 1 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->3 MsgPreVoteResp Term:2 Log:0/0 + +stabilize 3 +---- +> 3 receiving messages + 2->3 MsgPreVoteResp Term:2 Log:0/0 + INFO 3 received MsgPreVoteResp from 2 at term 1 + INFO 3 has received 2 MsgPreVoteResp votes and 0 vote rejections + INFO 3 became candidate at term 2 + INFO 3 [logterm: 1, index: 11] sent MsgVote request to 1 at term 2 + INFO 3 [logterm: 1, index: 11] sent MsgVote request to 2 at term 2 +> 3 handling Ready + Ready MustSync=true: + State:StateCandidate + HardState Term:2 Vote:3 Commit:11 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgVote Term:2 Log:1/11 + 3->2 MsgVote Term:2 Log:1/11 + INFO 3 received MsgVoteResp from 3 at term 2 + INFO 3 has received 1 MsgVoteResp votes and 0 vote rejections + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +2: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +3: StateLeader (Voter) Term:2 Lead:3 LeadEpoch:0 + +# Test that forgetting the leader still won't grant prevotes if the candidate +# isn't up-to-date. We first replicate a proposal on 3 and 2. +propose 3 prop_1 +---- +ok + +stabilize 3 +---- +> 3 handling Ready + Ready MustSync=true: + Entries: + 2/13 EntryNormal "prop_1" + Messages: + 3->1 MsgApp Term:2 Log:2/12 Commit:12 Entries:[2/13 EntryNormal "prop_1"] + 3->2 MsgApp Term:2 Log:2/12 Commit:12 Entries:[2/13 EntryNormal "prop_1"] + +stabilize 2 +---- +> 2 receiving messages + 3->2 MsgApp Term:2 Log:2/12 Commit:12 Entries:[2/13 EntryNormal "prop_1"] +> 2 handling Ready + Ready MustSync=true: + Entries: + 2/13 EntryNormal "prop_1" + Messages: + 2->3 MsgAppResp Term:2 Log:0/13 Commit:12 + +forget-leader 2 +---- +INFO 2 forgetting leader 3 at term 2 + +# 1 is now behind on its log. It tries to campaign, but fails. +raft-log 1 +---- +1/11 EntryNormal "" +2/12 EntryNormal "" + +campaign 1 +---- +INFO 1 is starting a new election at term 2 +INFO 1 became pre-candidate at term 2 +INFO 1 [logterm: 2, index: 12] sent MsgPreVote request to 2 at term 2 +INFO 1 [logterm: 2, index: 12] sent MsgPreVote request to 3 at term 2 + +process-ready 1 +---- +Ready MustSync=true: +State:StatePreCandidate +HardState Term:2 Commit:12 Lead:0 LeadEpoch:0 +Messages: +1->2 MsgPreVote Term:3 Log:2/12 +1->3 MsgPreVote Term:3 Log:2/12 +INFO 1 received MsgPreVoteResp from 1 at term 2 +INFO 1 has received 1 MsgPreVoteResp votes and 0 vote rejections + +stabilize 2 +---- +> 2 handling Ready + Ready MustSync=true: + HardState Term:2 Vote:3 Commit:12 Lead:0 LeadEpoch:0 +> 2 receiving messages + 1->2 MsgPreVote Term:3 Log:2/12 + INFO 2 [logterm: 2, index: 13, vote: 3] rejected MsgPreVote from 1 [logterm: 2, index: 12] at term 2 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgPreVoteResp Term:2 Log:0/0 Rejected (Hint: 0) + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +2: StateFollower (Voter) Term:2 Lead:3 LeadEpoch:0 +3: StateLeader (Voter) Term:2 Lead:3 LeadEpoch:0 diff --git a/pkg/raft/testdata/heartbeat_resp_recovers_from_probing.txt b/pkg/raft/testdata/mixedversions/heartbeat_resp_recovers_from_probing.txt similarity index 76% rename from pkg/raft/testdata/heartbeat_resp_recovers_from_probing.txt rename to pkg/raft/testdata/mixedversions/heartbeat_resp_recovers_from_probing.txt index ba034b8b166f..539e0b54b724 100644 --- a/pkg/raft/testdata/heartbeat_resp_recovers_from_probing.txt +++ b/pkg/raft/testdata/mixedversions/heartbeat_resp_recovers_from_probing.txt @@ -5,12 +5,21 @@ # StateReplicate for the follower. In other words, we don't end up in # a stable state with a fully caught up follower in StateProbe. +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test heartbeat_timeout_recovers_from_probing outside the +# mixedversions directory that assumes that store liveness is enabled. + # Turn off output during the setup of the test. log-level none ---- ok -add-nodes 3 voters=(1,2,3) index=10 +# Start 3 nodes, two of them has the older version 24.2. +add-nodes 2 voters=(1,2,3) index=10 crdb-version=24.2 +---- +ok + +add-nodes 1 voters=(1,2,3) index=10 crdb-version=24.3 ---- ok @@ -47,32 +56,28 @@ tick-heartbeat 1 ---- ok -# Heartbeat -> MsgApp -> MsgAppResp -> StateReplicate. +# Heartbeat -> HeartbeatResp -> MsgApp -> MsgAppResp -> StateReplicate. stabilize ---- > 1 handling Ready Ready MustSync=false: Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 - 1->2 MsgApp Term:1 Log:1/11 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 > 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->2 MsgApp Term:1 Log:1/11 Commit:11 + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 > 2 handling Ready Ready MustSync=false: Messages: 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgAppResp Term:1 Log:0/11 Commit:11 > 3 handling Ready Ready MustSync=false: Messages: 3->1 MsgHeartbeatResp Term:1 Log:0/0 > 1 receiving messages 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgAppResp Term:1 Log:0/11 Commit:11 3->1 MsgHeartbeatResp Term:1 Log:0/0 > 1 handling Ready Ready MustSync=false: diff --git a/pkg/raft/testdata/mixedversions/lagging_commit.txt b/pkg/raft/testdata/mixedversions/lagging_commit.txt new file mode 100644 index 000000000000..9472f409d19b --- /dev/null +++ b/pkg/raft/testdata/mixedversions/lagging_commit.txt @@ -0,0 +1,183 @@ +# This test demonstrates the effect of delayed commit on a follower node after a +# network hiccup between the leader and this follower. + +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test lagging_commit outside the mixedversions directory that assumes +# that store liveness is enabled. + +# Skip logging the boilerplate. Set up a raft group of 3 nodes, and elect node 1 +# as the leader. Nodes 2 and 3 are the followers. +log-level none +---- +ok + +# Start 3 nodes, two of them has the older version 24.2. +add-nodes 2 voters=(1,2,3) index=10 crdb-version=24.2 +---- +ok + +add-nodes 1 voters=(1,2,3) index=10 crdb-version=24.3 +---- +ok + +campaign 1 +---- +ok + +stabilize +---- +ok + +# Propose a couple of entries. +propose 1 data1 +---- +ok + +propose 1 data2 +---- +ok + +process-ready 1 +---- +ok + +# The interesting part starts below. +log-level debug +---- +ok + +deliver-msgs 2 3 +---- +1->2 MsgApp Term:1 Log:1/11 Commit:11 Entries:[1/12 EntryNormal "data1"] +1->2 MsgApp Term:1 Log:1/12 Commit:11 Entries:[1/13 EntryNormal "data2"] +1->3 MsgApp Term:1 Log:1/11 Commit:11 Entries:[1/12 EntryNormal "data1"] +1->3 MsgApp Term:1 Log:1/12 Commit:11 Entries:[1/13 EntryNormal "data2"] + +process-ready 3 +---- +Ready MustSync=true: +Entries: +1/12 EntryNormal "data1" +1/13 EntryNormal "data2" +Messages: +3->1 MsgAppResp Term:1 Log:0/12 Commit:11 +3->1 MsgAppResp Term:1 Log:0/13 Commit:11 + +# Suppose there is a network blip which prevents the leader learning that the +# follower 3 has appended the proposed entries to the log. +deliver-msgs drop=(1) +---- +dropped: 3->1 MsgAppResp Term:1 Log:0/12 Commit:11 +dropped: 3->1 MsgAppResp Term:1 Log:0/13 Commit:11 + +# In the meantime, the entries are committed, and the leader sends the commit +# index to all the followers. +stabilize 1 2 +---- +> 2 handling Ready + Ready MustSync=true: + Entries: + 1/12 EntryNormal "data1" + 1/13 EntryNormal "data2" + Messages: + 2->1 MsgAppResp Term:1 Log:0/12 Commit:11 + 2->1 MsgAppResp Term:1 Log:0/13 Commit:11 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/12 Commit:11 + 2->1 MsgAppResp Term:1 Log:0/13 Commit:11 +> 1 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/12 EntryNormal "data1" + 1/13 EntryNormal "data2" + Messages: + 1->2 MsgApp Term:1 Log:1/13 Commit:12 + 1->3 MsgApp Term:1 Log:1/13 Commit:12 + 1->2 MsgApp Term:1 Log:1/13 Commit:13 + 1->3 MsgApp Term:1 Log:1/13 Commit:13 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/13 Commit:12 + 1->2 MsgApp Term:1 Log:1/13 Commit:13 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/12 EntryNormal "data1" + 1/13 EntryNormal "data2" + Messages: + 2->1 MsgAppResp Term:1 Log:0/13 Commit:12 + 2->1 MsgAppResp Term:1 Log:0/13 Commit:13 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/13 Commit:12 + 2->1 MsgAppResp Term:1 Log:0/13 Commit:13 + +# The network blip prevents the follower 3 from learning that the previously +# appended entries are now committed. +deliver-msgs drop=(3) +---- +dropped: 1->3 MsgApp Term:1 Log:1/13 Commit:12 +dropped: 1->3 MsgApp Term:1 Log:1/13 Commit:13 + +# The network blip ends here. + +status 1 +---- +1: StateReplicate match=13 next=14 sentCommit=11 matchCommit=11 +2: StateReplicate match=13 next=14 sentCommit=13 matchCommit=13 +3: StateReplicate match=11 next=14 sentCommit=13 matchCommit=11 inflight=2 + +# The leader still observes that the entries are in-flight to the follower 3, +# since it hasn't heard from it. Nothing triggers updating the follower's +# commit index, so we have to wait up to the full heartbeat interval before +# the leader sends the commit index. +tick-heartbeat 1 +---- +ok + +# However, the leader does not push the real commit index to the follower 3. It +# cuts the commit index at the Progress.Match mark, because it thinks that it is +# unsafe to send a commit index higher than that. +process-ready 1 +---- +Ready MustSync=false: +Messages: +1->2 MsgHeartbeat Term:1 Log:0/0 Commit:13 +1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 + +# Since the heartbeat message does not bump the follower's commit index, it will +# take another roundtrip with the leader to update it. As such, the total time +# it takes for the follower to learn the commit index is: +# +# delay = HeartbeatInterval + 3/2 * RTT +# +# This is suboptimal. It could have taken HeartbeatInterval + 1/2 * RTT, if the +# leader sent the up-to-date commit index in the heartbeat message. +# +# See https://github.com/etcd-io/raft/issues/138 which aims to fix this. +stabilize 1 3 +---- +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->3 MsgApp Term:1 Log:1/13 Commit:13 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/13 Commit:13 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/12 EntryNormal "data1" + 1/13 EntryNormal "data2" + Messages: + 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 +> 1 receiving messages + 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 diff --git a/pkg/raft/testdata/msg_app_commit_index_leader_old_version.txt b/pkg/raft/testdata/mixedversions/msg_app_commit_index_leader.txt similarity index 96% rename from pkg/raft/testdata/msg_app_commit_index_leader_old_version.txt rename to pkg/raft/testdata/mixedversions/msg_app_commit_index_leader.txt index 53d6de30a943..330335388e62 100644 --- a/pkg/raft/testdata/msg_app_commit_index_leader_old_version.txt +++ b/pkg/raft/testdata/mixedversions/msg_app_commit_index_leader.txt @@ -85,7 +85,7 @@ stabilize 1 2 2->1 MsgAppResp Term:1 Log:0/13 Commit:11 > 1 handling Ready Ready MustSync=true: - HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 CommittedEntries: 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" @@ -99,7 +99,7 @@ stabilize 1 2 1->2 MsgApp Term:1 Log:1/13 Commit:13 > 2 handling Ready Ready MustSync=true: - HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 CommittedEntries: 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" @@ -150,7 +150,7 @@ stabilize 1 2 3 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: - HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:0 CommittedEntries: 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" diff --git a/pkg/raft/testdata/mixedversions/snapshot_succeed_via_app_resp.txt b/pkg/raft/testdata/mixedversions/snapshot_succeed_via_app_resp.txt new file mode 100644 index 000000000000..3b2cbf48aab4 --- /dev/null +++ b/pkg/raft/testdata/mixedversions/snapshot_succeed_via_app_resp.txt @@ -0,0 +1,150 @@ +# TestSnapshotSucceedViaAppResp regression tests the situation in which a snap- +# shot is sent to a follower at the most recent index (i.e. the snapshot index +# is the leader's last index is the committed index). In that situation, a bug +# in the past left the follower in probing status until the next log entry was +# committed. +# +# See https://github.com/etcd-io/etcd/pull/10308 for additional background. + +# TODO(ibrahim): Remove this test once store liveness is on by default since we +# have the test snapshot_succeed_via_app_resp outside the mixedversions +# directory that assumes that store liveness is enabled. + +# Turn off output during the setup of the test. +log-level none +---- +ok + +# Start with two nodes with version 24.2, but the config already has a third. +add-nodes 2 voters=(1,2,3) index=10 crdb-version=24.2 +---- +ok + +campaign 1 +---- +ok + +# Fully replicate everything, including the leader's empty index. +stabilize +---- +ok + +compact 1 11 +---- +ok + +# Drop inflight messages to n3. +deliver-msgs drop=(3) +---- +ok + +# Show the Raft log messages from now on. +log-level debug +---- +ok + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateProbe match=0 next=11 sentCommit=10 matchCommit=0 paused inactive + +# Add the node that will receive a snapshot (it has no state at all, does not +# even have a config). +add-nodes 1 +---- +INFO 3 switched to configuration voters=() +INFO 3 became follower at term 0 +INFO newRaft 3 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0] + +# Time passes on the leader so that it will try the previously missing follower +# again. +tick-heartbeat 1 +---- +ok + +process-ready 1 +---- +Ready MustSync=false: +Messages: +1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 +1->3 MsgHeartbeat Term:1 Log:0/0 + +# Iterate until no more work is done by the new peer. It receives the heartbeat +# and responds. +stabilize 3 +---- +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 + INFO 3 [term: 0] received a MsgHeartbeat message with higher term from 1 [term: 1] + INFO 3 became follower at term 1 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:0 Lead:1 LeadEpoch:0 + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + +# The leader in turn will realize that n3 needs a snapshot, which it initiates. +stabilize 1 +---- +> 1 receiving messages + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + DEBUG 1 [firstindex: 12, commit: 11] sent snapshot[index: 11, term: 1] to 3 [StateProbe match=0 next=11 sentCommit=10 matchCommit=0] + DEBUG 1 paused sending replication messages to 3 [StateSnapshot match=0 next=12 sentCommit=11 matchCommit=0 paused pendingSnap=11] +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->3 MsgSnap Term:1 Log:0/0 + Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateSnapshot match=0 next=12 sentCommit=11 matchCommit=0 paused pendingSnap=11 + +# Follower applies the snapshot. Note how it reacts with a MsgAppResp upon completion. +# The snapshot fully catches the follower up (i.e. there are no more log entries it +# needs to apply after). The bug was that the leader failed to realize that the follower +# was now fully caught up. +stabilize 3 +---- +> 3 receiving messages + 1->3 MsgSnap Term:1 Log:0/0 + Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false + INFO log [committed=0, applied=0, applying=0, unstable.offset=1, unstable.offsetInProgress=1, len(unstable.Entries)=0] starts to restore snapshot [index: 11, term: 1] + INFO 3 switched to configuration voters=(1 2 3) + INFO 3 [commit: 11, lastindex: 11, lastterm: 1] restored snapshot [index: 11, term: 1] + INFO 3 [commit: 11] restored snapshot [index: 11, term: 1] +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:11 Lead:1 LeadEpoch:0 + Snapshot Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false + Messages: + 3->1 MsgAppResp Term:1 Log:0/11 Commit:11 + +# The MsgAppResp lets the leader move the follower back to replicating state. +# Leader sends another MsgAppResp, to communicate the updated commit index. +stabilize 1 +---- +> 1 receiving messages + 3->1 MsgAppResp Term:1 Log:0/11 Commit:11 + DEBUG 1 recovered from needing snapshot, resumed sending replication messages to 3 [StateSnapshot match=11 next=12 sentCommit=11 matchCommit=11 paused pendingSnap=11] + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 + +# Let things settle. +stabilize +---- +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 Commit:11 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 diff --git a/pkg/raft/testdata/msg_app_commit_index.txt b/pkg/raft/testdata/msg_app_commit_index.txt index d464faa2fe8e..e43df6613a04 100644 --- a/pkg/raft/testdata/msg_app_commit_index.txt +++ b/pkg/raft/testdata/msg_app_commit_index.txt @@ -122,8 +122,6 @@ process-ready 1 ---- Ready MustSync=false: Messages: -1->2 MsgHeartbeat Term:1 Log:0/0 -1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 # On the next MsgApp sent to follower 3, the leader will include that the @@ -131,15 +129,8 @@ Messages: # because it knows that it has the latest commit index. stabilize 1 2 3 ---- -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:13 Lead:1 LeadEpoch:1 @@ -147,53 +138,10 @@ stabilize 1 2 3 1/12 EntryNormal "data1" 1/13 EntryNormal "data2" Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/13 Commit:13 -> 3 handling Ready - Ready MustSync=false: - Messages: 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 > 1 receiving messages 3->1 MsgAppResp Term:1 Log:0/13 Commit:13 -# If the commit index is up-to-date, no MsgApp will be sent. -tick-heartbeat 1 ----- -ok - -stabilize 1 2 3 ----- -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 3 handling Ready - Ready MustSync=false: - Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - # The leader's sentCommit and the matchCommit remain stale even after stabilize. # The reason is that the leader send itself a MsgAppResp when it receives a # MsgProp. However, the leader doesn't send itself a MsgAppResp when it @@ -204,3 +152,12 @@ status 1 1: StateReplicate match=13 next=14 sentCommit=11 matchCommit=11 2: StateReplicate match=13 next=14 sentCommit=13 matchCommit=13 3: StateReplicate match=13 next=14 sentCommit=13 matchCommit=13 + +# If the commit index is up-to-date, no MsgApp will be sent. +tick-heartbeat 1 +---- +ok + +stabilize 1 2 3 +---- +ok diff --git a/pkg/raft/testdata/refortification_basic.txt b/pkg/raft/testdata/refortification_basic.txt index 56369ceff4db..2077fe5c7621 100644 --- a/pkg/raft/testdata/refortification_basic.txt +++ b/pkg/raft/testdata/refortification_basic.txt @@ -149,19 +149,15 @@ stabilize 2->1 MsgAppResp Term:1 Log:0/3 Commit:3 3->1 MsgAppResp Term:1 Log:0/3 Commit:3 -# On the next heartbeat, the leader still won't send a MsgFortifyLeader to -# follower 3 because it doesn't support it in the store liveness fabric. +# On the next heartbeat timeout, the leader still won't send a MsgFortifyLeader +# to follower 3 because it doesn't support it in the store liveness fabric. tick-heartbeat 1 ---- ok stabilize 1 ---- -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 +ok grant-support 3 1 ---- @@ -181,33 +177,15 @@ stabilize > 1 handling Ready Ready MustSync=false: Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->2 MsgHeartbeat Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 # If the follower supports the leader at an older epoch, the leader will try @@ -229,32 +207,24 @@ stabilize Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:4 Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 1->2 MsgFortifyLeader Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:4 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:4 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:4 # If no follower needs to be refortified, the leader won't send any @@ -265,23 +235,4 @@ ok stabilize ---- -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 3 handling Ready - Ready MustSync=false: - Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 +ok diff --git a/pkg/raft/testdata/replicate_pause.txt b/pkg/raft/testdata/replicate_pause.txt index 75d8fbdd27a7..84b2cb2ad452 100644 --- a/pkg/raft/testdata/replicate_pause.txt +++ b/pkg/raft/testdata/replicate_pause.txt @@ -133,42 +133,29 @@ stabilize 1 > 1 handling Ready Ready MustSync=false: Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/14 Commit:17 # Node 3 finally receives a MsgApp, but there was a gap, so it rejects it. stabilize 2 3 ---- -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/14 Commit:17 DEBUG 3 [logterm: 0, index: 14] rejected MsgApp [logterm: 1, index: 14] from 1 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=false: Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgAppResp Term:1 Log:1/14 Rejected (Hint: 11) Commit:11 # Node 1 receives the rejection and adjusts the MsgApp sent to node 3. stabilize 1 ---- > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgAppResp Term:1 Log:1/14 Rejected (Hint: 11) Commit:11 DEBUG 1 received MsgAppResp(rejected, hint: (index 11, term 1)) from 3 for index 14 DEBUG 1 decreased progress of 3 to [StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 paused inflight=3[full]] > 1 handling Ready Ready MustSync=false: Messages: - 1->3 MsgApp Term:1 Log:1/14 Commit:17 1->3 MsgApp Term:1 Log:1/11 Commit:17 Entries:[ 1/12 EntryNormal "prop_1_12" 1/13 EntryNormal "prop_1_13" @@ -181,8 +168,6 @@ stabilize 1 stabilize 1 3 ---- > 3 receiving messages - 1->3 MsgApp Term:1 Log:1/14 Commit:17 - DEBUG 3 [logterm: 0, index: 14] rejected MsgApp [logterm: 1, index: 14] from 1 1->3 MsgApp Term:1 Log:1/11 Commit:17 Entries:[ 1/12 EntryNormal "prop_1_12" 1/13 EntryNormal "prop_1_13" @@ -209,11 +194,8 @@ stabilize 1 3 1/16 EntryNormal "prop_1_16" 1/17 EntryNormal "prop_1_17" Messages: - 3->1 MsgAppResp Term:1 Log:1/14 Rejected (Hint: 11) Commit:11 3->1 MsgAppResp Term:1 Log:0/17 Commit:17 > 1 receiving messages - 3->1 MsgAppResp Term:1 Log:1/14 Rejected (Hint: 11) Commit:11 - DEBUG 1 received MsgAppResp(rejected, hint: (index 11, term 1)) from 3 for index 14 3->1 MsgAppResp Term:1 Log:0/17 Commit:17 # Eventually all nodes catch up on the committed state. diff --git a/pkg/raft/testdata/slow_follower_after_compaction.txt b/pkg/raft/testdata/slow_follower_after_compaction.txt index a74c15db5ebe..84a6f35d7ed5 100644 --- a/pkg/raft/testdata/slow_follower_after_compaction.txt +++ b/pkg/raft/testdata/slow_follower_after_compaction.txt @@ -109,34 +109,22 @@ stabilize > 1 handling Ready Ready MustSync=false: Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgSnap Term:1 Log:0/0 Snapshot: Index:18 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgSnap Term:1 Log:0/0 Snapshot: Index:18 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false INFO log [committed=14, applied=14, applying=14, unstable.offset=15, unstable.offsetInProgress=15, len(unstable.Entries)=0] starts to restore snapshot [index: 18, term: 1] INFO 3 switched to configuration voters=(1 2 3) INFO 3 [commit: 18, lastindex: 18, lastterm: 1] restored snapshot [index: 18, term: 1] INFO 3 [commit: 18] restored snapshot [index: 18, term: 1] -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:18 Lead:1 LeadEpoch:1 Snapshot Index:18 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgAppResp Term:1 Log:0/18 Commit:18 > 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgAppResp Term:1 Log:0/18 Commit:18 DEBUG 1 recovered from needing snapshot, resumed sending replication messages to 3 [StateSnapshot match=18 next=19 sentCommit=18 matchCommit=18 paused pendingSnap=18] diff --git a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt index 2fafbdc5f4c6..db0abcd69d88 100644 --- a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt +++ b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt @@ -65,40 +65,40 @@ process-ready 1 ---- Ready MustSync=false: Messages: -1->2 MsgHeartbeat Term:1 Log:0/0 -1->3 MsgHeartbeat Term:1 Log:0/0 1->3 MsgFortifyLeader Term:1 Log:0/0 -# Iterate until no more work is done by the new peer. It receives the heartbeat -# and responds. +# Iterate until no more work is done by the new peer. stabilize 3 ---- > 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 - INFO 3 [term: 0] received a MsgHeartbeat message with higher term from 1 [term: 1] - INFO 3 became follower at term 1 1->3 MsgFortifyLeader Term:1 Log:0/0 + INFO 3 [term: 0] received a MsgFortifyLeader message with higher term from 1 [term: 1] + INFO 3 became follower at term 1 > 3 handling Ready Ready MustSync=true: HardState Term:1 Commit:0 Lead:1 LeadEpoch:1 Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:1 -# The leader in turn will realize that n3 needs a snapshot, which it initiates. -stabilize 1 +deliver-msgs 1 ---- -> 1 receiving messages - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - DEBUG 1 [firstindex: 12, commit: 11] sent snapshot[index: 11, term: 1] to 3 [StateProbe match=0 next=11 sentCommit=10 matchCommit=0] - DEBUG 1 paused sending replication messages to 3 [StateSnapshot match=0 next=12 sentCommit=11 matchCommit=0 paused pendingSnap=11] - 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:1 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->3 MsgSnap Term:1 Log:0/0 - Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false +3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:1 + +# The leader sends a snapshot to the follower on the next heartbeat timeout +# because the follower is now marked as active. +# TODO(ibrahim): Call MaybeSendAppend after receiving MsgFortifyLeaderResp. +tick-heartbeat 1 +---- +DEBUG 1 [firstindex: 12, commit: 11] sent snapshot[index: 11, term: 1] to 3 [StateProbe match=0 next=11 sentCommit=10 matchCommit=0] +DEBUG 1 paused sending replication messages to 3 [StateSnapshot match=0 next=12 sentCommit=11 matchCommit=0 paused pendingSnap=11] +process-ready 1 +---- +Ready MustSync=false: +Messages: +1->3 MsgSnap Term:1 Log:0/0 + Snapshot: Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false + status 1 ---- 1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 @@ -138,15 +138,3 @@ status 1 1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 - -# Let things settle. -stabilize ----- -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0