diff --git a/pkg/kv/kvserver/batcheval/BUILD.bazel b/pkg/kv/kvserver/batcheval/BUILD.bazel index 086873d1e94e..8afb9c324520 100644 --- a/pkg/kv/kvserver/batcheval/BUILD.bazel +++ b/pkg/kv/kvserver/batcheval/BUILD.bazel @@ -152,6 +152,7 @@ go_test( "//pkg/kv/kvserver/readsummary", "//pkg/kv/kvserver/readsummary/rspb", "//pkg/kv/kvserver/spanset", + "//pkg/kv/kvserver/stateloader", "//pkg/roachpb", "//pkg/security/securityassets", "//pkg/security/securitytest", diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go index cd2e853fac93..e7fb5fa5a0bc 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go @@ -1280,15 +1280,33 @@ func splitTriggerHelper( log.Fatalf(ctx, "LHS of split has no lease") } - replica, found := split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID) - if !found { + // Copy the lease from the left-hand side of the split over to the + // right-hand side so that it can immediately start serving requests. + // When doing so, we need to make a few modifications. + rightLease := leftLease + // Rebind the lease to the existing leaseholder store's replica from the + // right-hand side's descriptor. + var ok bool + rightLease.Replica, ok = split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID) + if !ok { return enginepb.MVCCStats{}, result.Result{}, errors.Errorf( "pre-split lease holder %+v not found in post-split descriptor %+v", leftLease.Replica, split.RightDesc, ) } - rightLease := leftLease - rightLease.Replica = replica + // Convert leader leases into expiration-based leases. A leader lease is + // tied to a specific raft leadership term within a specific raft group. + // During a range split, we initialize a new raft group on the right-hand + // side, so a leader lease term from the left-hand side is unusable. Once + // the right-hand side elects a leader and collocates the lease and leader, + // it can promote the expiration-based lease back to a leader lease. 
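+	// The replacement expiration below is the evaluating replica's current
+	// clock reading plus its configured range lease duration, i.e. the same
+	// window a freshly granted expiration-based lease would receive. Term and
+	// MinExpiration are only meaningful for leader leases, so they are cleared.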
+ if rightLease.Type() == roachpb.LeaseLeader { + exp := rec.Clock().Now().Add(int64(rec.GetRangeLeaseDuration()), 0) + rightLease.Expiration = &exp + rightLease.Term = 0 + rightLease.MinExpiration = hlc.Timestamp{} + } + gcThreshold, err := sl.LoadGCThreshold(ctx, batch) if err != nil { return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load GCThreshold") diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go index bba1b2d9b881..dda9f8229708 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go @@ -14,14 +14,19 @@ import ( "context" "fmt" "regexp" + "slices" "testing" + "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -29,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/require" ) @@ -1678,3 +1684,126 @@ func TestResolveLocalLocks(t *testing.T) { }) } } + +// TestSplitTriggerWritesInitialReplicaState tests that a split trigger sets up +// the split's right-hand side range by writing the initial replica state into +// the evaluation write batch. +func TestSplitTriggerWritesInitialReplicaState(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + version := st.Version.LatestVersion() + manual := timeutil.NewManualTime(timeutil.Unix(0, 10)) + clock := hlc.NewClockForTesting(manual) + + db := storage.NewDefaultInMemForTesting() + defer db.Close() + batch := db.NewBatch() + defer batch.Close() + + rangeLeaseDuration := 99 * time.Nanosecond + startKey := roachpb.Key("0000") + endKey := roachpb.Key("9999") + desc := roachpb.RangeDescriptor{ + RangeID: 99, + StartKey: roachpb.RKey(startKey), + EndKey: roachpb.RKey(endKey), + } + desc.AddReplica(1, 1, roachpb.VOTER_FULL) + lease := roachpb.Lease{ + Replica: desc.InternalReplicas[0], + // The range was using a leader lease. The split will need to swap this to + // an expiration-based lease. 
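+		// A nonzero Term is what marks the lease as a leader lease, so the
+		// split trigger must clear it (along with MinExpiration) when it
+		// rewrites the lease for the right-hand side.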
+		Term:          10,
+		MinExpiration: hlc.Timestamp{WallTime: 100},
+	}
+	gcThreshold := hlc.Timestamp{WallTime: 4}
+	lastGCTimestamp := hlc.Timestamp{WallTime: 5}
+	gcHint := roachpb.GCHint{GCTimestamp: gcThreshold}
+	abortSpanTxnID := uuid.MakeV4()
+	as := abortspan.New(desc.RangeID)
+	sl := stateloader.Make(desc.RangeID)
+	rec := (&MockEvalCtx{
+		ClusterSettings:        st,
+		Desc:                   &desc,
+		Clock:                  clock,
+		AbortSpan:              as,
+		LastReplicaGCTimestamp: lastGCTimestamp,
+		RangeLeaseDuration:     rangeLeaseDuration,
+	}).EvalContext()
+
+	splitKey := roachpb.RKey("5555")
+	leftDesc, rightDesc := desc, desc
+	leftDesc.EndKey = splitKey
+	rightDesc.RangeID++
+	rightDesc.StartKey = splitKey
+	rightDesc.InternalReplicas = slices.Clone(leftDesc.InternalReplicas)
+	rightDesc.InternalReplicas[0].ReplicaID++
+	split := &roachpb.SplitTrigger{
+		LeftDesc:  leftDesc,
+		RightDesc: rightDesc,
+	}
+
+	// Write the range state that will be consulted and copied during the split.
+	err := as.Put(ctx, batch, nil, abortSpanTxnID, &roachpb.AbortSpanEntry{})
+	require.NoError(t, err)
+	err = sl.SetLease(ctx, batch, nil, lease)
+	require.NoError(t, err)
+	err = sl.SetGCThreshold(ctx, batch, nil, &gcThreshold)
+	require.NoError(t, err)
+	err = sl.SetGCHint(ctx, batch, nil, &gcHint)
+	require.NoError(t, err)
+	err = sl.SetVersion(ctx, batch, nil, &version)
+	require.NoError(t, err)
+
+	// Run the split trigger, which normally runs as part of EndTxn request
+	// evaluation.
+	_, _, err = splitTrigger(ctx, rec, batch, enginepb.MVCCStats{}, split, hlc.Timestamp{})
+	require.NoError(t, err)
+
+	// Verify that range state was migrated to the right-hand side properly.
+	asRight := abortspan.New(rightDesc.RangeID)
+	slRight := stateloader.Make(rightDesc.RangeID)
+	// The abort span should have been transferred over.
+	ok, err := asRight.Get(ctx, batch, abortSpanTxnID, &roachpb.AbortSpanEntry{})
+	require.NoError(t, err)
+	require.True(t, ok)
+	// The lease should be present, pointing at the replica in the right-hand side
+	// range, and switched to an expiration-based lease.
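+	// With the manual clock at 10ns and the 99ns lease duration, that means an
+	// expiration at wall time 109ns, with Term and MinExpiration zeroed.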
+ expLease := roachpb.Lease{ + Replica: rightDesc.InternalReplicas[0], + Expiration: &hlc.Timestamp{WallTime: manual.Now().Add(rangeLeaseDuration).UnixNano()}, + } + loadedLease, err := slRight.LoadLease(ctx, batch) + require.NoError(t, err) + require.Equal(t, expLease, loadedLease) + loadedGCThreshold, err := slRight.LoadGCThreshold(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedGCThreshold) + require.Equal(t, gcThreshold, *loadedGCThreshold) + loadedGCHint, err := slRight.LoadGCHint(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedGCHint) + require.Equal(t, gcHint, *loadedGCHint) + expTruncState := kvserverpb.RaftTruncatedState{ + Term: stateloader.RaftInitialLogTerm, + Index: stateloader.RaftInitialLogIndex, + } + loadedTruncState, err := slRight.LoadRaftTruncatedState(ctx, batch) + require.NoError(t, err) + require.Equal(t, expTruncState, loadedTruncState) + loadedVersion, err := slRight.LoadVersion(ctx, batch) + require.NoError(t, err) + require.Equal(t, version, loadedVersion) + expAppliedState := kvserverpb.RangeAppliedState{ + RaftAppliedIndexTerm: stateloader.RaftInitialLogTerm, + RaftAppliedIndex: stateloader.RaftInitialLogIndex, + } + loadedAppliedState, err := slRight.LoadRangeAppliedState(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedAppliedState) + loadedAppliedState.RangeStats = kvserverpb.MVCCPersistentStats{} // ignore + require.Equal(t, &expAppliedState, loadedAppliedState) +} diff --git a/pkg/kv/kvserver/batcheval/eval_context.go b/pkg/kv/kvserver/batcheval/eval_context.go index b315c5b8cdd9..1607297e4db8 100644 --- a/pkg/kv/kvserver/batcheval/eval_context.go +++ b/pkg/kv/kvserver/batcheval/eval_context.go @@ -14,6 +14,7 @@ import ( "context" "fmt" "math" + "time" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" @@ -106,6 +107,7 @@ type EvalContext interface { ExcludeDataFromBackup(context.Context, roachpb.Span) (bool, error) GetLastReplicaGCTimestamp(context.Context) (hlc.Timestamp, error) GetLease() (roachpb.Lease, roachpb.Lease) + GetRangeLeaseDuration() time.Duration GetRangeInfo(context.Context) roachpb.RangeInfo // GetCurrentReadSummary returns a new ReadSummary reflecting all reads @@ -165,27 +167,29 @@ type ImmutableEvalContext interface { // MockEvalCtx is a dummy implementation of EvalContext for testing purposes. // For technical reasons, the interface is implemented by a wrapper .EvalContext(). 
type MockEvalCtx struct { - ClusterSettings *cluster.Settings - Desc *roachpb.RangeDescriptor - StoreID roachpb.StoreID - NodeID roachpb.NodeID - Clock *hlc.Clock - Stats enginepb.MVCCStats - QPS float64 - CPU float64 - AbortSpan *abortspan.AbortSpan - GCThreshold hlc.Timestamp - Term kvpb.RaftTerm - FirstIndex kvpb.RaftIndex - CanCreateTxnRecordFn func() (bool, kvpb.TransactionAbortedReason) - MinTxnCommitTSFn func() hlc.Timestamp - Lease roachpb.Lease - CurrentReadSummary rspb.ReadSummary - ClosedTimestamp hlc.Timestamp - RevokedLeaseSeq roachpb.LeaseSequence - MaxBytes int64 - ApproxDiskBytes uint64 - EvalKnobs kvserverbase.BatchEvalTestingKnobs + ClusterSettings *cluster.Settings + Desc *roachpb.RangeDescriptor + StoreID roachpb.StoreID + NodeID roachpb.NodeID + Clock *hlc.Clock + Stats enginepb.MVCCStats + QPS float64 + CPU float64 + AbortSpan *abortspan.AbortSpan + GCThreshold hlc.Timestamp + Term kvpb.RaftTerm + FirstIndex kvpb.RaftIndex + CanCreateTxnRecordFn func() (bool, kvpb.TransactionAbortedReason) + MinTxnCommitTSFn func() hlc.Timestamp + LastReplicaGCTimestamp hlc.Timestamp + Lease roachpb.Lease + RangeLeaseDuration time.Duration + CurrentReadSummary rspb.ReadSummary + ClosedTimestamp hlc.Timestamp + RevokedLeaseSeq roachpb.LeaseSequence + MaxBytes int64 + ApproxDiskBytes uint64 + EvalKnobs kvserverbase.BatchEvalTestingKnobs } // EvalContext returns the MockEvalCtx as an EvalContext. It will reflect future @@ -280,11 +284,14 @@ func (m *mockEvalCtxImpl) ExcludeDataFromBackup(context.Context, roachpb.Span) ( return false, nil } func (m *mockEvalCtxImpl) GetLastReplicaGCTimestamp(context.Context) (hlc.Timestamp, error) { - panic("unimplemented") + return m.LastReplicaGCTimestamp, nil } func (m *mockEvalCtxImpl) GetLease() (roachpb.Lease, roachpb.Lease) { return m.Lease, roachpb.Lease{} } +func (m *mockEvalCtxImpl) GetRangeLeaseDuration() time.Duration { + return m.RangeLeaseDuration +} func (m *mockEvalCtxImpl) GetRangeInfo(ctx context.Context) roachpb.RangeInfo { return roachpb.RangeInfo{Desc: *m.Desc(), Lease: m.Lease} } diff --git a/pkg/kv/kvserver/replica_eval_context_span.go b/pkg/kv/kvserver/replica_eval_context_span.go index 8e3a02b39c3b..a2787586f786 100644 --- a/pkg/kv/kvserver/replica_eval_context_span.go +++ b/pkg/kv/kvserver/replica_eval_context_span.go @@ -12,6 +12,7 @@ package kvserver import ( "context" + "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" @@ -207,6 +208,11 @@ func (rec SpanSetReplicaEvalContext) GetLease() (roachpb.Lease, roachpb.Lease) { return rec.i.GetLease() } +// GetRangeLeaseDuration is part of the EvalContext interface. +func (rec SpanSetReplicaEvalContext) GetRangeLeaseDuration() time.Duration { + return rec.i.GetRangeLeaseDuration() +} + // GetRangeInfo is part of the EvalContext interface. func (rec SpanSetReplicaEvalContext) GetRangeInfo(ctx context.Context) roachpb.RangeInfo { // Do the latching checks and ignore the results. diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 64731a2f4799..23541046d8b2 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -753,6 +753,11 @@ func (r *Replica) leaseSettings(ctx context.Context) leases.Settings { } } +// GetRangeLeaseDuration is part of the EvalContext interface. 
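+// It returns the store's configured range lease duration, which the split
+// trigger uses to time the temporary expiration-based lease it installs on
+// the split's right-hand side.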
+func (r *Replica) GetRangeLeaseDuration() time.Duration { + return r.store.cfg.RangeLeaseDuration +} + // requiresExpirationLeaseRLocked returns whether this range unconditionally // uses an expiration-based lease. Ranges located before or including the node // liveness table must always use expiration leases to avoid circular diff --git a/pkg/kv/kvserver/stateloader/initial.go b/pkg/kv/kvserver/stateloader/initial.go index fb885b8747b4..6a6e523ea17e 100644 --- a/pkg/kv/kvserver/stateloader/initial.go +++ b/pkg/kv/kvserver/stateloader/initial.go @@ -22,13 +22,13 @@ import ( "github.com/cockroachdb/errors" ) -// raftInitialLog{Index,Term} are the starting points for the raft log. We +// RaftInitialLog{Index,Term} are the starting points for the raft log. We // bootstrap the raft membership by synthesizing a snapshot as if there were // some discarded prefix to the log, so we must begin the log at an arbitrary // index greater than 1. const ( - raftInitialLogIndex = 10 - raftInitialLogTerm = 5 + RaftInitialLogIndex = 10 + RaftInitialLogTerm = 5 ) // WriteInitialReplicaState sets up a new Range, but without writing an @@ -51,8 +51,8 @@ func WriteInitialReplicaState( rsl := Make(desc.RangeID) var s kvserverpb.ReplicaState s.TruncatedState = &kvserverpb.RaftTruncatedState{ - Term: raftInitialLogTerm, - Index: raftInitialLogIndex, + Term: RaftInitialLogTerm, + Index: RaftInitialLogIndex, } s.RaftAppliedIndex = s.TruncatedState.Index s.RaftAppliedIndexTerm = s.TruncatedState.Term diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 0e4f0be09826..a6dbeffa5d2c 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -87,7 +87,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/log/logcrash" "github.com/cockroachdb/cockroach/pkg/util/log/severity" - "github.com/cockroachdb/cockroach/pkg/util/metamorphic" "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -334,12 +333,6 @@ var SnapshotSendLimit = settings.RegisterIntSetting( settings.NonNegativeInt, ) -// raftStepDownOnRemoval is a metamorphic test parameter that makes Raft leaders -// step down on demotion or removal. Following an upgrade, clusters may have -// replicas with mixed settings, because it's only changed when initializing -// replicas. Varying it makes sure we handle this state. -var raftStepDownOnRemoval = metamorphic.ConstantWithTestBool("raft-step-down-on-removal", true) - // TestStoreConfig has some fields initialized with values relevant in tests. func TestStoreConfig(clock *hlc.Clock) StoreConfig { return testStoreConfig(clock, clusterversion.Latest.Version()) @@ -409,16 +402,9 @@ func newRaftConfig( Storage: strg, Logger: logger, StoreLiveness: storeLiveness, - - // We only set this on replica initialization, so replicas without - // StepDownOnRemoval may remain on 23.2 nodes until they restart. That's - // totally fine, we just can't rely on this behavior until 24.1, but - // we currently don't either. - StepDownOnRemoval: raftStepDownOnRemoval, - - PreVote: true, - CheckQuorum: storeCfg.RaftEnableCheckQuorum, - CRDBVersion: storeCfg.Settings.Version, + PreVote: true, + CheckQuorum: storeCfg.RaftEnableCheckQuorum, + CRDBVersion: storeCfg.Settings.Version, } } @@ -707,7 +693,7 @@ A first phenomenon to understand is that of uninitialized Replicas, which is the State Machine at applied index zero, i.e. has an empty state. 
In CockroachDB, an uninitialized Replica can only advance to a nonzero log
position ("become initialized") via a Raft snapshot (this is because we
initialize all Ranges in
-the system at log index raftInitialLogIndex which allows us to write arbitrary
+the system at log index RaftInitialLogIndex which allows us to write arbitrary
amounts of data into the initial state without having to worry about the size
of individual log entries; see WriteInitialReplicaState).
diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go
index b7ce368761fd..4f6203fe0881 100644
--- a/pkg/kv/kvserver/store_raft.go
+++ b/pkg/kv/kvserver/store_raft.go
@@ -449,7 +449,7 @@ func (s *Store) processRaftSnapshotRequest(
 	// the snapshot is targeting an uninitialized replica. The only known reason
 	// for raft to ignore a snapshot is if it doesn't move the applied index
 	// forward, but an uninitialized replica's applied index is zero (and a
-	// snapshot's is at least raftInitialLogIndex).
+	// snapshot's is at least RaftInitialLogIndex).
 	if inSnap.placeholder != nil {
 		if _, err := s.removePlaceholder(ctx, inSnap.placeholder, typ); err != nil {
 			log.Fatalf(ctx, "unable to remove placeholder: %s", err)
diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go
index 776a89d1874d..b850e8ddc7ae 100644
--- a/pkg/raft/raft.go
+++ b/pkg/raft/raft.go
@@ -260,13 +260,6 @@ type Config struct {
 	// See: https://github.com/etcd-io/raft/issues/80
 	DisableConfChangeValidation bool
 
-	// StepDownOnRemoval makes the leader step down when it is removed from the
-	// group or demoted to a learner.
-	//
-	// This behavior will become unconditional in the future. See:
-	// https://github.com/etcd-io/raft/issues/83
-	StepDownOnRemoval bool
-
 	// StoreLiveness is a reference to the store liveness fabric.
 	StoreLiveness raftstoreliveness.StoreLiveness
 
@@ -428,7 +421,6 @@ type raft struct {
 	// when raft changes its state to follower or candidate.
 	randomizedElectionTimeout int
 	disableProposalForwarding bool
-	stepDownOnRemoval         bool
 
 	tick func()
 	step stepFunc
@@ -464,7 +456,6 @@ func newRaft(c *Config) *raft {
 		preVote:                     c.PreVote,
 		disableProposalForwarding:   c.DisableProposalForwarding,
 		disableConfChangeValidation: c.DisableConfChangeValidation,
-		stepDownOnRemoval:           c.StepDownOnRemoval,
 		storeLiveness:               c.StoreLiveness,
 		crdbVersion:                 c.CRDBVersion,
 	}
@@ -2272,7 +2263,7 @@ func (r *raft) switchToConfig(cfg quorum.Config, progressMap tracker.ProgressMap
 	r.isLearner = pr != nil && pr.IsLearner
 
 	if (pr == nil || r.isLearner) && r.state == StateLeader {
-		// This node is leader and was removed or demoted, step down if requested.
+		// This node is leader and was removed or demoted; step down.
 		//
 		// We prevent demotions at the time writing but hypothetically we handle
 		// them the same way as removing the leader.
@@ -2280,11 +2271,10 @@ func (r *raft) switchToConfig(cfg quorum.Config, progressMap tracker.ProgressMap
 		// TODO(tbg): ask follower with largest Match to TimeoutNow (to avoid
 		// interruption). This might still drop some proposals but it's better than
 		// nothing.
-		if r.stepDownOnRemoval {
-			// NB: Similar to the CheckQuorum step down case, we must remember our
-			// prior stint as leader, lest we regress the QSE.
-			r.becomeFollower(r.Term, r.lead)
-		}
+		//
+		// NB: Similar to the CheckQuorum step down case, we must remember our
+		// prior stint as leader, lest we regress the QSE.
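+		// Passing r.lead (still ourselves) rather than None is what carries
+		// that memory forward.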
+ r.becomeFollower(r.Term, r.lead) return cs } diff --git a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go index fd4f157d1be6..0d6607d4112e 100644 --- a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go +++ b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go @@ -65,8 +65,6 @@ func (env *InteractionEnv) handleAddNodes(t *testing.T, d datadriven.TestData) e arg.Scan(t, i, &cfg.MaxCommittedSizePerReady) case "disable-conf-change-validation": arg.Scan(t, i, &cfg.DisableConfChangeValidation) - case "step-down-on-removal": - arg.Scan(t, i, &cfg.StepDownOnRemoval) case "crdb-version": var key string arg.Scan(t, i, &key) diff --git a/pkg/raft/testdata/confchange_v1_remove_leader.txt b/pkg/raft/testdata/confchange_v1_remove_leader.txt index 2533cf5f6fd6..e6a46569dc7d 100644 --- a/pkg/raft/testdata/confchange_v1_remove_leader.txt +++ b/pkg/raft/testdata/confchange_v1_remove_leader.txt @@ -78,10 +78,8 @@ propose 1 bar ---- ok -# n1 applies the conf change, so it has now removed itself. But it still has -# an uncommitted entry in the log. If the leader unconditionally counted itself -# as part of the commit quorum, we'd be in trouble. In the block below, we see -# it send out appends to the other nodes for the 'bar' entry. +# n1 applies the conf change, removing itself and stepping down. But it still +# has an uncommitted 'bar' entry in the log that it sends out appends for first. stabilize 1 ---- > 1 handling Ready @@ -106,10 +104,14 @@ stabilize 1 1->2 MsgApp Term:1 Log:1/6 Commit:5 1->3 MsgApp Term:1 Log:1/6 Commit:5 INFO 1 switched to configuration voters=(2 3) + INFO 1 became follower at term 1 +> 1 handling Ready + Ready MustSync=false: + State:StateFollower raft-state ---- -1: StateLeader (Non-Voter) Term:1 Lead:1 LeadEpoch:1 +1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 @@ -134,7 +136,7 @@ stabilize 2 2->1 MsgAppResp Term:1 Log:0/6 Commit:5 INFO 2 switched to configuration voters=(2 3) -# ... which thankfully is what we see on the leader. +# ...because the old leader n1 ignores the append responses. stabilize 1 ---- > 1 receiving messages @@ -174,77 +176,14 @@ stabilize 3->1 MsgAppResp Term:1 Log:0/6 Commit:3 3->1 MsgAppResp Term:1 Log:0/6 Commit:4 3->1 MsgAppResp Term:1 Log:0/6 Commit:5 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 1->2 MsgApp Term:1 Log:1/6 Commit:6 - 1->3 MsgApp Term:1 Log:1/6 Commit:6 -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/6 Commit:6 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/6 Commit:6 -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 2->1 MsgAppResp Term:1 Log:0/6 Commit:6 -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 3->1 MsgAppResp Term:1 Log:0/6 Commit:6 -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/6 Commit:6 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:6 -# However not all is well. n1 is still leader but unconditionally drops all -# proposals on the floor, so we're effectively stuck if it still heartbeats -# its followers... +# n1 can no longer propose. 
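+# (Having stepped down with lead still pointing at itself, it refuses to
+# forward the proposal to itself and drops it.)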
propose 1 baz ---- +INFO 1 not forwarding to itself at term 1; dropping proposal raft proposal dropped -tick-heartbeat 1 ----- -ok - -# ... which, uh oh, it does. -# TODO(tbg): change behavior so that a leader that is removed immediately steps -# down, and initiates an optimistic handover. -stabilize ----- -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 3 handling Ready - Ready MustSync=false: - Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - -# Just confirming the issue above - leader does not automatically step down. -# Expected behavior: a new leader is elected after an election timeout. -raft-state +# Nor can it campaign to become leader. +campaign 1 ---- -1: StateLeader (Non-Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 +WARN 1 is unpromotable and can not campaign diff --git a/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt b/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt deleted file mode 100644 index 54a582deadde..000000000000 --- a/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt +++ /dev/null @@ -1,190 +0,0 @@ -# We'll turn this back on after the boilerplate. -log-level none ----- -ok - -# Run a V1 membership change that removes the leader, asking it -# to step down on removal. -# Bootstrap n1, n2, n3. -add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true ----- -ok - -campaign 1 ----- -ok - -stabilize ----- -ok - -log-level debug ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# Start removing n1. -propose-conf-change 1 v1=true -r1 ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# Propose an extra entry which will be sent out together with the conf change. -propose 1 foo ----- -ok - -# Send out the corresponding appends. -process-ready 1 ----- -Ready MustSync=true: -Entries: -1/4 EntryConfChange r1 -1/5 EntryNormal "foo" -Messages: -1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] -1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] -1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] -1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] - -# Send response from n2 (which is enough to commit the entries so far next time -# n1 runs). -stabilize 2 ----- -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] - 1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] -> 2 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/5 Commit:3 - -# Put another entry in n1's log. -propose 1 bar ----- -ok - -# n1 applies the conf change, removing itself and stepping down. 
But it still -# has an uncommitted 'bar' entry in the log that it sends out appends for first. -stabilize 1 ----- -> 1 handling Ready - Ready MustSync=true: - Entries: - 1/6 EntryNormal "bar" - Messages: - 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/5 Commit:3 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 1->2 MsgApp Term:1 Log:1/6 Commit:4 - 1->3 MsgApp Term:1 Log:1/6 Commit:4 - 1->2 MsgApp Term:1 Log:1/6 Commit:5 - 1->3 MsgApp Term:1 Log:1/6 Commit:5 - INFO 1 switched to configuration voters=(2 3) - INFO 1 became follower at term 1 -> 1 handling Ready - Ready MustSync=false: - State:StateFollower - -raft-state ----- -1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# n2 responds, n3 doesn't yet. Quorum for 'bar' should not be reached... -stabilize 2 ----- -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->2 MsgApp Term:1 Log:1/6 Commit:4 - 1->2 MsgApp Term:1 Log:1/6 Commit:5 -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - Entries: - 1/6 EntryNormal "bar" - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 2->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:5 - INFO 2 switched to configuration voters=(2 3) - -# ...because the old leader n1 ignores the append responses. -stabilize 1 ----- -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:5 - -# When n3 responds, quorum is reached and everything falls into place. -stabilize ----- -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] - 1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] - 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->3 MsgApp Term:1 Log:1/6 Commit:4 - 1->3 MsgApp Term:1 Log:1/6 Commit:5 -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - Entries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - 1/6 EntryNormal "bar" - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:5 - INFO 3 switched to configuration voters=(2 3) -> 1 receiving messages - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:5 - -# n1 can no longer propose. -propose 1 baz ----- -INFO 1 not forwarding to itself at term 1; dropping proposal -raft proposal dropped - -# Nor can it campaign to become leader. 
-campaign 1 ----- -WARN 1 is unpromotable and can not campaign diff --git a/pkg/raft/testdata/confchange_v2_replace_leader.txt b/pkg/raft/testdata/confchange_v2_replace_leader.txt index 676b214e0fa1..036a329b75ed 100644 --- a/pkg/raft/testdata/confchange_v2_replace_leader.txt +++ b/pkg/raft/testdata/confchange_v2_replace_leader.txt @@ -1,8 +1,9 @@ -# Run a V2 membership change that removes the leader and adds another voter as -# a single operation, using joint consensus and explicitly determining when to -# transition out of the joint config. Leadership is transferred to new joiner -# while in the joint config. After the reconfiguration completes, we verify -# that the removed leader cannot campaign to become leader. +# Run a V2 membership change that removes the leader and adds another voter as a +# single operation, using joint consensus and explicitly determining when to +# transition out of the joint config. Leadership is transferred by campaigning a +# designated voter in the new config once the old leader steps down. After the +# reconfiguration completes, we verify that the removed leader cannot campaign +# to become leader. # We'll turn this back on after the boilerplate. log-level none @@ -47,385 +48,155 @@ r1 v4 ok # Enter joint config. -stabilize ----- -> 1 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 3 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 1->2 MsgApp Term:1 Log:1/4 Commit:4 - 1->3 MsgApp Term:1 Log:1/4 Commit:4 - INFO 1 switched to configuration voters=(2 3 4)&&(1 2 3) -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/4 Commit:4 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/4 Commit:4 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->4 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:4 - INFO 2 switched to configuration voters=(2 3 4)&&(1 2 3) -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:4 - INFO 3 switched to configuration voters=(2 3 4)&&(1 2 3) -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/4 Commit:4 -> 4 receiving messages - 1->4 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 r1 v4] - INFO 4 [term: 0] received a MsgApp message with higher term from 1 [term: 1] - INFO 4 became follower at term 1 -> 4 handling Ready - Ready MustSync=true: - 
HardState Term:1 Commit:0 Lead:1 LeadEpoch:0 - Messages: - 4->1 MsgAppResp Term:1 Log:0/3 Rejected (Hint: 0) -> 1 receiving messages - 4->1 MsgAppResp Term:1 Log:0/3 Rejected (Hint: 0) -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->4 MsgSnap Term:1 Log:0/0 - Snapshot: Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false -> 4 receiving messages - 1->4 MsgSnap Term:1 Log:0/0 - Snapshot: Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false - INFO log [committed=0, applied=0, applying=0, unstable.offset=1, unstable.offsetInProgress=1, len(unstable.Entries)=0] starts to restore snapshot [index: 4, term: 1] - INFO 4 switched to configuration voters=(2 3 4)&&(1 2 3) - INFO 4 [commit: 4, lastindex: 4, lastterm: 1] restored snapshot [index: 4, term: 1] - INFO 4 [commit: 4] restored snapshot [index: 4, term: 1] -> 4 handling Ready - Ready MustSync=true: - HardState Term:1 Commit:4 Lead:1 LeadEpoch:0 - Snapshot Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false - Messages: - 4->1 MsgAppResp Term:1 Log:0/4 Commit:4 -> 1 receiving messages - 4->1 MsgAppResp Term:1 Log:0/4 Commit:4 - - -# Transfer leadership while in the joint config. -transfer-leadership from=1 to=4 +stabilize log-level=none ---- -INFO 1 [term 1] starts to transfer leadership to 4 -INFO 1 sends MsgTimeoutNow to 4 immediately as 4 already has up-to-date log -INFO 1 became follower at term 1 +ok -# Leadership transfer was initiated by the outgoing leader, but not yet -# processed by the transfer target. raft-state ---- -1: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:1 +1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 4: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 -# Leadership transfer is happening here. 
-stabilize ----- -> 1 handling Ready - Ready MustSync=false: - State:StateFollower - Messages: - 1->4 MsgTimeoutNow Term:1 Log:0/0 -> 4 receiving messages - 1->4 MsgTimeoutNow Term:1 Log:0/0 - INFO 4 [term 1] received MsgTimeoutNow from 1 and starts an election to get leadership - INFO 4 is starting a new election at term 1 - INFO 4 became candidate at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 1 at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 2 at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 3 at term 2 -> 4 handling Ready - Ready MustSync=true: - State:StateCandidate - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 4->1 MsgVote Term:2 Log:1/4 - 4->2 MsgVote Term:2 Log:1/4 - 4->3 MsgVote Term:2 Log:1/4 - INFO 4 received MsgVoteResp from 4 at term 2 - INFO 4 has received 1 MsgVoteResp votes and 0 vote rejections -> 1 receiving messages - 4->1 MsgVote Term:2 Log:1/4 - INFO 1 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 1 became follower at term 2 - INFO 1 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 2 receiving messages - 4->2 MsgVote Term:2 Log:1/4 - INFO 2 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 2 became follower at term 2 - INFO 2 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 3 receiving messages - 4->3 MsgVote Term:2 Log:1/4 - INFO 3 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 3 became follower at term 2 - INFO 3 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 1->4 MsgVoteResp Term:2 Log:0/0 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 2->4 MsgVoteResp Term:2 Log:0/0 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 3->4 MsgVoteResp Term:2 Log:0/0 -> 4 receiving messages - 1->4 MsgVoteResp Term:2 Log:0/0 - INFO 4 received MsgVoteResp from 1 at term 2 - INFO 4 has received 2 MsgVoteResp votes and 0 vote rejections - 2->4 MsgVoteResp Term:2 Log:0/0 - INFO 4 received MsgVoteResp from 2 at term 2 - INFO 4 has received 3 MsgVoteResp votes and 0 vote rejections - INFO 4 became leader at term 2 - 3->4 MsgVoteResp Term:2 Log:0/0 -> 4 handling Ready - Ready MustSync=true: - State:StateLeader - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 4->1 MsgFortifyLeader Term:2 Log:0/0 - 4->2 MsgFortifyLeader Term:2 Log:0/0 - 4->3 MsgFortifyLeader Term:2 Log:0/0 - 4->1 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] - 4->2 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] - 4->3 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 1 receiving messages - 4->1 MsgFortifyLeader Term:2 Log:0/0 - 4->1 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 2 receiving messages - 4->2 MsgFortifyLeader Term:2 Log:0/0 - 4->2 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 3 receiving messages - 4->3 MsgFortifyLeader Term:2 Log:0/0 - 4->3 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 1->4 MsgFortifyLeaderResp Term:2 Log:0/0 
LeadEpoch:1 - 1->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 2->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 3->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 4 receiving messages - 1->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 1->4 MsgAppResp Term:2 Log:0/5 Commit:4 - 2->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:4 - 3->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 4 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 4->1 MsgApp Term:2 Log:2/5 Commit:5 - 4->2 MsgApp Term:2 Log:2/5 Commit:5 - 4->3 MsgApp Term:2 Log:2/5 Commit:5 -> 1 receiving messages - 4->1 MsgApp Term:2 Log:2/5 Commit:5 -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/5 Commit:5 -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/5 Commit:5 -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 1->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 2->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 3->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/5 Commit:5 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:5 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:5 - -# Leadership transfer succeeded. -raft-state ----- -1: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -2: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -3: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -4: StateLeader (Voter) Term:2 Lead:4 LeadEpoch:1 - # n4 will propose a transition out of the joint config. propose-conf-change 4 ---- ok # The group commits the command and everyone switches to the final config. +# n1 steps down as leader. 
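+# (The conf change proposed on n4 is forwarded to the still-leader n1, which
+# commits it and steps down upon applying its own removal.)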
stabilize ---- > 4 handling Ready - Ready MustSync=true: - Entries: - 2/6 EntryConfChangeV2 + Ready MustSync=false: Messages: - 4->1 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] - 4->2 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] - 4->3 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] + 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] > 1 receiving messages - 4->1 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] + 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] > 1 handling Ready Ready MustSync=true: Entries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 1->4 MsgAppResp Term:2 Log:0/6 Commit:5 + 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] + 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] + 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] +> 4 receiving messages + 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] > 2 handling Ready Ready MustSync=true: Entries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 2->4 MsgAppResp Term:2 Log:0/6 Commit:5 + 2->1 MsgAppResp Term:1 Log:0/5 Commit:4 > 3 handling Ready Ready MustSync=true: Entries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 3->4 MsgAppResp Term:2 Log:0/6 Commit:5 -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/6 Commit:5 - 2->4 MsgAppResp Term:2 Log:0/6 Commit:5 - 3->4 MsgAppResp Term:2 Log:0/6 Commit:5 + 3->1 MsgAppResp Term:1 Log:0/5 Commit:4 > 4 handling Ready Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/6 EntryConfChangeV2 + Entries: + 1/5 EntryConfChangeV2 Messages: - 4->1 MsgApp Term:2 Log:2/6 Commit:6 - 4->2 MsgApp Term:2 Log:2/6 Commit:6 - 4->3 MsgApp Term:2 Log:2/6 Commit:6 - INFO 4 switched to configuration voters=(2 3 4) + 4->1 MsgAppResp Term:1 Log:0/5 Commit:4 > 1 receiving messages - 4->1 MsgApp Term:2 Log:2/6 Commit:6 -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/6 Commit:6 -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/6 Commit:6 + 2->1 MsgAppResp Term:1 Log:0/5 Commit:4 + 3->1 MsgAppResp Term:1 Log:0/5 Commit:4 + 4->1 MsgAppResp Term:1 Log:0/5 Commit:4 > 1 handling Ready Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 CommittedEntries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 1->4 MsgAppResp Term:2 Log:0/6 Commit:6 + 1->2 MsgApp Term:1 Log:1/5 Commit:5 + 1->3 MsgApp Term:1 Log:1/5 Commit:5 + 1->4 MsgApp Term:1 Log:1/5 Commit:5 INFO 1 switched to configuration voters=(2 3 4) + INFO 1 became follower at term 1 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/5 Commit:5 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/5 Commit:5 +> 4 receiving messages + 1->4 MsgApp Term:1 Log:1/5 Commit:5 +> 1 handling Ready + Ready MustSync=false: + State:StateFollower > 2 handling Ready Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 CommittedEntries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 
2->4 MsgAppResp Term:2 Log:0/6 Commit:6 + 2->1 MsgAppResp Term:1 Log:0/5 Commit:5 INFO 2 switched to configuration voters=(2 3 4) > 3 handling Ready Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 + HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 CommittedEntries: - 2/6 EntryConfChangeV2 + 1/5 EntryConfChangeV2 Messages: - 3->4 MsgAppResp Term:2 Log:0/6 Commit:6 + 3->1 MsgAppResp Term:1 Log:0/5 Commit:5 INFO 3 switched to configuration voters=(2 3 4) -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/6 Commit:6 - raft: cannot step as peer not found - 2->4 MsgAppResp Term:2 Log:0/6 Commit:6 - 3->4 MsgAppResp Term:2 Log:0/6 Commit:6 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Commit:5 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/5 EntryConfChangeV2 + Messages: + 4->1 MsgAppResp Term:1 Log:0/5 Commit:5 + INFO 4 switched to configuration voters=(2 3 4) +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/5 Commit:5 + 3->1 MsgAppResp Term:1 Log:0/5 Commit:5 + 4->1 MsgAppResp Term:1 Log:0/5 Commit:5 # n1 is out of the configuration. raft-state ---- -1: StateFollower (Non-Voter) Term:2 Lead:4 LeadEpoch:1 -2: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -3: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -4: StateLeader (Voter) Term:2 Lead:4 LeadEpoch:1 +1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 +2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 +3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 +4: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 # Make sure n1 cannot campaign to become leader. campaign 1 ---- WARN 1 is unpromotable and can not campaign + +# TODO(arul): this is a hack until +# https://github.com/cockroachdb/cockroach/issues/129098 is fixed. +bump-epoch 1 +---- + 1 2 3 4 +1 2 1 1 1 +2 2 1 1 1 +3 2 1 1 1 +4 2 1 1 1 + +# Campaign the dedicated voter n2 to become the new leader. +campaign 2 +---- +INFO 2 is starting a new election at term 1 +INFO 2 became candidate at term 2 +INFO 2 [logterm: 1, index: 5] sent MsgVote request to 3 at term 2 +INFO 2 [logterm: 1, index: 5] sent MsgVote request to 4 at term 2 + +stabilize log-level=none +---- +ok + +raft-state +---- +1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 +2: StateLeader (Voter) Term:2 Lead:2 LeadEpoch:1 +3: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 +4: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 diff --git a/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt b/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt deleted file mode 100644 index ec7cbcb1b1a2..000000000000 --- a/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt +++ /dev/null @@ -1,202 +0,0 @@ -# Run a V2 membership change that removes the leader and adds another voter as a -# single operation, using joint consensus and explicitly determining when to -# transition out of the joint config. Leadership is transferred by campaigning a -# designated voter in the new config once the old leader steps down. After the -# reconfiguration completes, we verify that the removed leader cannot campaign -# to become leader. - -# We'll turn this back on after the boilerplate. -log-level none ----- -ok - -# Bootstrap n1, n2, n3. -add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true ----- -ok - -# n1 campaigns to become leader. 
-campaign 1 ----- -ok - -stabilize ----- -ok - -log-level info ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# create n4 -add-nodes 1 ----- -INFO 4 switched to configuration voters=() -INFO 4 became follower at term 0 -INFO newRaft 4 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0] - -# Start reconfiguration to remove n1 and add n4. -propose-conf-change 1 v1=false transition=explicit -r1 v4 ----- -ok - -# Enter joint config. -stabilize log-level=none ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -4: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 - -# n4 will propose a transition out of the joint config. -propose-conf-change 4 ----- -ok - -# The group commits the command and everyone switches to the final config. -# n1 steps down as leader. -stabilize ----- -> 4 handling Ready - Ready MustSync=false: - Messages: - 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] -> 1 receiving messages - 4->1 MsgProp Term:0 Log:0/0 Entries:[0/0 EntryConfChangeV2] -> 1 handling Ready - Ready MustSync=true: - Entries: - 1/5 EntryConfChangeV2 - Messages: - 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] - 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] - 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] -> 4 receiving messages - 1->4 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryConfChangeV2] -> 2 handling Ready - Ready MustSync=true: - Entries: - 1/5 EntryConfChangeV2 - Messages: - 2->1 MsgAppResp Term:1 Log:0/5 Commit:4 -> 3 handling Ready - Ready MustSync=true: - Entries: - 1/5 EntryConfChangeV2 - Messages: - 3->1 MsgAppResp Term:1 Log:0/5 Commit:4 -> 4 handling Ready - Ready MustSync=true: - Entries: - 1/5 EntryConfChangeV2 - Messages: - 4->1 MsgAppResp Term:1 Log:0/5 Commit:4 -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/5 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:4 - 4->1 MsgAppResp Term:1 Log:0/5 Commit:4 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/5 EntryConfChangeV2 - Messages: - 1->2 MsgApp Term:1 Log:1/5 Commit:5 - 1->3 MsgApp Term:1 Log:1/5 Commit:5 - 1->4 MsgApp Term:1 Log:1/5 Commit:5 - INFO 1 switched to configuration voters=(2 3 4) - INFO 1 became follower at term 1 -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/5 Commit:5 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/5 Commit:5 -> 4 receiving messages - 1->4 MsgApp Term:1 Log:1/5 Commit:5 -> 1 handling Ready - Ready MustSync=false: - State:StateFollower -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/5 EntryConfChangeV2 - Messages: - 2->1 MsgAppResp Term:1 Log:0/5 Commit:5 - INFO 2 switched to configuration voters=(2 3 4) -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/5 EntryConfChangeV2 - Messages: - 3->1 MsgAppResp Term:1 Log:0/5 Commit:5 - INFO 3 switched to configuration voters=(2 3 4) -> 4 handling Ready - Ready MustSync=true: - HardState 
Term:1 Commit:5 Lead:1 LeadEpoch:0 - CommittedEntries: - 1/5 EntryConfChangeV2 - Messages: - 4->1 MsgAppResp Term:1 Log:0/5 Commit:5 - INFO 4 switched to configuration voters=(2 3 4) -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/5 Commit:5 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:5 - 4->1 MsgAppResp Term:1 Log:0/5 Commit:5 - -# n1 is out of the configuration. -raft-state ----- -1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -4: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 - -# Make sure n1 cannot campaign to become leader. -campaign 1 ----- -WARN 1 is unpromotable and can not campaign - -# TODO(arul): this is a hack until -# https://github.com/cockroachdb/cockroach/issues/129098 is fixed. -bump-epoch 1 ----- - 1 2 3 4 -1 2 1 1 1 -2 2 1 1 1 -3 2 1 1 1 -4 2 1 1 1 - -# Campaign the dedicated voter n2 to become the new leader. -campaign 2 ----- -INFO 2 is starting a new election at term 1 -INFO 2 became candidate at term 2 -INFO 2 [logterm: 1, index: 5] sent MsgVote request to 3 at term 2 -INFO 2 [logterm: 1, index: 5] sent MsgVote request to 4 at term 2 - -stabilize log-level=none ----- -ok - -raft-state ----- -1: StateFollower (Non-Voter) Term:1 Lead:0 LeadEpoch:1 -2: StateLeader (Voter) Term:2 Lead:2 LeadEpoch:1 -3: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 -4: StateFollower (Voter) Term:2 Lead:2 LeadEpoch:1 diff --git a/pkg/storage/replicas_storage.go b/pkg/storage/replicas_storage.go index 7c98f7b50581..34d1c86bf2cc 100644 --- a/pkg/storage/replicas_storage.go +++ b/pkg/storage/replicas_storage.go @@ -194,8 +194,8 @@ import ( // of this range (for a range that has never been the LHS of a merge, this // is the initial snapshot when the range came into being, followed by all // subsequent raft log entries). -// - RaftAppliedIndex >= raftInitialLogIndex -// - RaftAppliedIndexTerm >= raftInitialLogTerm +// - RaftAppliedIndex >= RaftInitialLogIndex +// - RaftAppliedIndexTerm >= RaftInitialLogTerm // - Has at least 1 non-provisional RangeDescriptor. // - Regression of the HardState.Commit and RaftAppliedIndex is permitted due // to a crash except for the following: @@ -237,7 +237,7 @@ import ( // Raft invariant is upheld externally by a combination of mostly external // invariants: // A new Range is initialized with all Replicas at truncated index equal to -// raftInitialLogIndex (10) (so they are in InitializedStateMachine state), +// RaftInitialLogIndex (10) (so they are in InitializedStateMachine state), // and any future Replicas will be initialized via a snapshot reflecting a // nonzero applied index >= 10. In particular, prior to receiving the // snapshot, no log entries can be sent to the Replica. And etcd/raft only @@ -256,7 +256,7 @@ import ( // has been deleted and RangeTombstoneKey updated and before the raft state // has been deleted. This is distinguishable from UninitializedStateMachine // since RaftTruncatedState.{Index,Term} are guaranteed to exist and have -// values >= raftInitialLogIndex, raftInitialLogTerm. ReplicasStorage.Init +// values >= RaftInitialLogIndex, RaftInitialLogTerm. ReplicasStorage.Init // will transition out of this state into DeletedReplica state. // // DEFINITION (RecoveryInconsistentReplica): This is a Replica that mostly @@ -706,7 +706,7 @@ type RangeStorage interface { // split, merge, or remove this replica (due to rebalancing) -- see the // methods in ReplicasStorage that accomplish that. 
// REQUIRES: replica is in state InitializedStateMachine (this is because we - // create a new range with the first log entry at raftInitialLogIndex (10), + // create a new range with the first log entry at RaftInitialLogIndex (10), // so a range always requires an initial state "snapshot" before it can // apply raft entries). ApplyCommittedBatch(smBatch MutationBatch) error