From 6d09d13424e2c3868a383b794872655427e2ab52 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 10 Sep 2024 18:50:56 -0400 Subject: [PATCH 1/4] kv: export stateloader RaftInitialLog constants Refactor. No logic changes. Epic: None Release note: None --- pkg/kv/kvserver/stateloader/initial.go | 10 +++++----- pkg/kv/kvserver/store.go | 2 +- pkg/kv/kvserver/store_raft.go | 2 +- pkg/storage/replicas_storage.go | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/kv/kvserver/stateloader/initial.go b/pkg/kv/kvserver/stateloader/initial.go index fb885b8747b4..6a6e523ea17e 100644 --- a/pkg/kv/kvserver/stateloader/initial.go +++ b/pkg/kv/kvserver/stateloader/initial.go @@ -22,13 +22,13 @@ import ( "github.com/cockroachdb/errors" ) -// raftInitialLog{Index,Term} are the starting points for the raft log. We +// RaftInitialLog{Index,Term} are the starting points for the raft log. We // bootstrap the raft membership by synthesizing a snapshot as if there were // some discarded prefix to the log, so we must begin the log at an arbitrary // index greater than 1. const ( - raftInitialLogIndex = 10 - raftInitialLogTerm = 5 + RaftInitialLogIndex = 10 + RaftInitialLogTerm = 5 ) // WriteInitialReplicaState sets up a new Range, but without writing an @@ -51,8 +51,8 @@ func WriteInitialReplicaState( rsl := Make(desc.RangeID) var s kvserverpb.ReplicaState s.TruncatedState = &kvserverpb.RaftTruncatedState{ - Term: raftInitialLogTerm, - Index: raftInitialLogIndex, + Term: RaftInitialLogTerm, + Index: RaftInitialLogIndex, } s.RaftAppliedIndex = s.TruncatedState.Index s.RaftAppliedIndexTerm = s.TruncatedState.Term diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 9586a23cf410..1db2eeb22e13 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -707,7 +707,7 @@ A first phenomenon to understand is that of uninitialized Replicas, which is the State Machine at applied index zero, i.e. has an empty state. In CockroachDB, an uninitialized Replica can only advance to a nonzero log position ("become initialized") via a Raft snapshot (this is because we initialize all Ranges in -the system at log index raftInitialLogIndex which allows us to write arbitrary +the system at log index RaftInitialLogIndex which allows us to write arbitrary amounts of data into the initial state without having to worry about the size of individual log entries; see WriteInitialReplicaState). diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go index 697c0cbdab55..27c49b91ed38 100644 --- a/pkg/kv/kvserver/store_raft.go +++ b/pkg/kv/kvserver/store_raft.go @@ -449,7 +449,7 @@ func (s *Store) processRaftSnapshotRequest( // the snapshot is targeting an uninitialized replica. The only known reason // for raft to ignore a snapshot is if it doesn't move the applied index // forward, but an uninitialized replica's applied index is zero (and a - // snapshot's is at least raftInitialLogIndex). + // snapshot's is at least RaftInitialLogIndex). 
if inSnap.placeholder != nil { if _, err := s.removePlaceholder(ctx, inSnap.placeholder, typ); err != nil { log.Fatalf(ctx, "unable to remove placeholder: %s", err) diff --git a/pkg/storage/replicas_storage.go b/pkg/storage/replicas_storage.go index 7c98f7b50581..34d1c86bf2cc 100644 --- a/pkg/storage/replicas_storage.go +++ b/pkg/storage/replicas_storage.go @@ -194,8 +194,8 @@ import ( // of this range (for a range that has never been the LHS of a merge, this // is the initial snapshot when the range came into being, followed by all // subsequent raft log entries). -// - RaftAppliedIndex >= raftInitialLogIndex -// - RaftAppliedIndexTerm >= raftInitialLogTerm +// - RaftAppliedIndex >= RaftInitialLogIndex +// - RaftAppliedIndexTerm >= RaftInitialLogTerm // - Has at least 1 non-provisional RangeDescriptor. // - Regression of the HardState.Commit and RaftAppliedIndex is permitted due // to a crash except for the following: @@ -237,7 +237,7 @@ import ( // Raft invariant is upheld externally by a combination of mostly external // invariants: // A new Range is initialized with all Replicas at truncated index equal to -// raftInitialLogIndex (10) (so they are in InitializedStateMachine state), +// RaftInitialLogIndex (10) (so they are in InitializedStateMachine state), // and any future Replicas will be initialized via a snapshot reflecting a // nonzero applied index >= 10. In particular, prior to receiving the // snapshot, no log entries can be sent to the Replica. And etcd/raft only @@ -256,7 +256,7 @@ import ( // has been deleted and RangeTombstoneKey updated and before the raft state // has been deleted. This is distinguishable from UninitializedStateMachine // since RaftTruncatedState.{Index,Term} are guaranteed to exist and have -// values >= raftInitialLogIndex, raftInitialLogTerm. ReplicasStorage.Init +// values >= RaftInitialLogIndex, RaftInitialLogTerm. ReplicasStorage.Init // will transition out of this state into DeletedReplica state. // // DEFINITION (RecoveryInconsistentReplica): This is a Replica that mostly @@ -706,7 +706,7 @@ type RangeStorage interface { // split, merge, or remove this replica (due to rebalancing) -- see the // methods in ReplicasStorage that accomplish that. // REQUIRES: replica is in state InitializedStateMachine (this is because we - // create a new range with the first log entry at raftInitialLogIndex (10), + // create a new range with the first log entry at RaftInitialLogIndex (10), // so a range always requires an initial state "snapshot" before it can // apply raft entries). ApplyCommittedBatch(smBatch MutationBatch) error From 9c942a8ed6080854ea9fb27c8882f4df6006528f Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 10 Sep 2024 19:00:02 -0400 Subject: [PATCH 2/4] kv: use expiration-based lease for right-hand side of range split Fixes #130112. This commit converts leader leases into expiration-based leases during the state transfer during a range split. A leader lease is tied to a specific raft leadership term within a specific raft group. During a range split, we initialize a new raft group on the right-hand side, so a leader lease term from the left-hand side is unusable. Once the right-hand side elects a leader and collocates the lease and leader, it can promote the expiration-based lease back to a leader lease. 
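In sketch form, the conversion amounts to the following fragment (simplified
from the splitTriggerHelper change in this patch; the names are taken from the
diff below, and error handling and surrounding context are omitted):

    // rightLease starts as a copy of the left-hand side's lease, rebound to
    // the leaseholder's replica in the right-hand side's descriptor.
    if rightLease.Type() == roachpb.LeaseLeader {
        // The left-hand side's raft leadership term means nothing to the
        // right-hand side's new raft group, so fall back to a clock-based
        // expiration one range lease duration in the future.
        exp := rec.Clock().Now().Add(int64(rec.GetRangeLeaseDuration()), 0)
        rightLease.Expiration = &exp
        rightLease.Term = 0
        rightLease.MinExpiration = hlc.Timestamp{}
    }

Zeroing Term and MinExpiration leaves a plain expiration-based lease, which
the right-hand side can promote back to a leader lease once it has elected its
own leader and collocated the lease with it.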
Release note: None --- pkg/kv/kvserver/batcheval/BUILD.bazel | 1 + .../kvserver/batcheval/cmd_end_transaction.go | 26 +++- .../batcheval/cmd_end_transaction_test.go | 129 ++++++++++++++++++ pkg/kv/kvserver/batcheval/eval_context.go | 51 ++++--- pkg/kv/kvserver/replica_eval_context_span.go | 6 + pkg/kv/kvserver/replica_range_lease.go | 5 + 6 files changed, 192 insertions(+), 26 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/BUILD.bazel b/pkg/kv/kvserver/batcheval/BUILD.bazel index 086873d1e94e..8afb9c324520 100644 --- a/pkg/kv/kvserver/batcheval/BUILD.bazel +++ b/pkg/kv/kvserver/batcheval/BUILD.bazel @@ -152,6 +152,7 @@ go_test( "//pkg/kv/kvserver/readsummary", "//pkg/kv/kvserver/readsummary/rspb", "//pkg/kv/kvserver/spanset", + "//pkg/kv/kvserver/stateloader", "//pkg/roachpb", "//pkg/security/securityassets", "//pkg/security/securitytest", diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go index cd2e853fac93..e7fb5fa5a0bc 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go @@ -1280,15 +1280,33 @@ func splitTriggerHelper( log.Fatalf(ctx, "LHS of split has no lease") } - replica, found := split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID) - if !found { + // Copy the lease from the left-hand side of the split over to the + // right-hand side so that it can immediately start serving requests. + // When doing so, we need to make a few modifications. + rightLease := leftLease + // Rebind the lease to the existing leaseholder store's replica from the + // right-hand side's descriptor. + var ok bool + rightLease.Replica, ok = split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID) + if !ok { return enginepb.MVCCStats{}, result.Result{}, errors.Errorf( "pre-split lease holder %+v not found in post-split descriptor %+v", leftLease.Replica, split.RightDesc, ) } - rightLease := leftLease - rightLease.Replica = replica + // Convert leader leases into expiration-based leases. A leader lease is + // tied to a specific raft leadership term within a specific raft group. + // During a range split, we initialize a new raft group on the right-hand + // side, so a leader lease term from the left-hand side is unusable. Once + // the right-hand side elects a leader and collocates the lease and leader, + // it can promote the expiration-based lease back to a leader lease. 
+ if rightLease.Type() == roachpb.LeaseLeader { + exp := rec.Clock().Now().Add(int64(rec.GetRangeLeaseDuration()), 0) + rightLease.Expiration = &exp + rightLease.Term = 0 + rightLease.MinExpiration = hlc.Timestamp{} + } + gcThreshold, err := sl.LoadGCThreshold(ctx, batch) if err != nil { return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load GCThreshold") diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go index bba1b2d9b881..dda9f8229708 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction_test.go @@ -14,14 +14,19 @@ import ( "context" "fmt" "regexp" + "slices" "testing" + "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -29,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/require" ) @@ -1678,3 +1684,126 @@ func TestResolveLocalLocks(t *testing.T) { }) } } + +// TestSplitTriggerWritesInitialReplicaState tests that a split trigger sets up +// the split's right-hand side range by writing the initial replica state into +// the evaluation write batch. +func TestSplitTriggerWritesInitialReplicaState(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + version := st.Version.LatestVersion() + manual := timeutil.NewManualTime(timeutil.Unix(0, 10)) + clock := hlc.NewClockForTesting(manual) + + db := storage.NewDefaultInMemForTesting() + defer db.Close() + batch := db.NewBatch() + defer batch.Close() + + rangeLeaseDuration := 99 * time.Nanosecond + startKey := roachpb.Key("0000") + endKey := roachpb.Key("9999") + desc := roachpb.RangeDescriptor{ + RangeID: 99, + StartKey: roachpb.RKey(startKey), + EndKey: roachpb.RKey(endKey), + } + desc.AddReplica(1, 1, roachpb.VOTER_FULL) + lease := roachpb.Lease{ + Replica: desc.InternalReplicas[0], + // The range was using a leader lease. The split will need to swap this to + // an expiration-based lease. 
+ Term: 10, + MinExpiration: hlc.Timestamp{WallTime: 100}, + } + gcThreshold := hlc.Timestamp{WallTime: 4} + lastGCTimestamp := hlc.Timestamp{WallTime: 5} + gcHint := roachpb.GCHint{GCTimestamp: gcThreshold} + abortSpanTxnID := uuid.MakeV4() + as := abortspan.New(desc.RangeID) + sl := stateloader.Make(desc.RangeID) + rec := (&MockEvalCtx{ + ClusterSettings: st, + Desc: &desc, + Clock: clock, + AbortSpan: as, + LastReplicaGCTimestamp: lastGCTimestamp, + RangeLeaseDuration: rangeLeaseDuration, + }).EvalContext() + + splitKey := roachpb.RKey("5555") + leftDesc, rightDesc := desc, desc + leftDesc.EndKey = splitKey + rightDesc.RangeID++ + rightDesc.StartKey = splitKey + rightDesc.InternalReplicas = slices.Clone(leftDesc.InternalReplicas) + rightDesc.InternalReplicas[0].ReplicaID++ + split := &roachpb.SplitTrigger{ + LeftDesc: leftDesc, + RightDesc: rightDesc, + } + + // Write the range state that will be consulted and copied during the split. + err := as.Put(ctx, batch, nil, abortSpanTxnID, &roachpb.AbortSpanEntry{}) + require.NoError(t, err) + err = sl.SetLease(ctx, batch, nil, lease) + require.NoError(t, err) + err = sl.SetGCThreshold(ctx, batch, nil, &gcThreshold) + require.NoError(t, err) + err = sl.SetGCHint(ctx, batch, nil, &gcHint) + require.NoError(t, err) + err = sl.SetVersion(ctx, batch, nil, &version) + require.NoError(t, err) + + // Run the split trigger, which is normally run as a subset of EndTxn request + // evaluation. + _, _, err = splitTrigger(ctx, rec, batch, enginepb.MVCCStats{}, split, hlc.Timestamp{}) + require.NoError(t, err) + + // Verify that range state was migrated to the right-hand side properly. + asRight := abortspan.New(rightDesc.RangeID) + slRight := stateloader.Make(rightDesc.RangeID) + // The abort span should have been transferred over. + ok, err := asRight.Get(ctx, batch, abortSpanTxnID, &roachpb.AbortSpanEntry{}) + require.NoError(t, err) + require.True(t, ok) + // The lease should be present, pointing at the replica in the right-hand side + // range, and switched to an expiration-based lease. 
+ expLease := roachpb.Lease{ + Replica: rightDesc.InternalReplicas[0], + Expiration: &hlc.Timestamp{WallTime: manual.Now().Add(rangeLeaseDuration).UnixNano()}, + } + loadedLease, err := slRight.LoadLease(ctx, batch) + require.NoError(t, err) + require.Equal(t, expLease, loadedLease) + loadedGCThreshold, err := slRight.LoadGCThreshold(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedGCThreshold) + require.Equal(t, gcThreshold, *loadedGCThreshold) + loadedGCHint, err := slRight.LoadGCHint(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedGCHint) + require.Equal(t, gcHint, *loadedGCHint) + expTruncState := kvserverpb.RaftTruncatedState{ + Term: stateloader.RaftInitialLogTerm, + Index: stateloader.RaftInitialLogIndex, + } + loadedTruncState, err := slRight.LoadRaftTruncatedState(ctx, batch) + require.NoError(t, err) + require.Equal(t, expTruncState, loadedTruncState) + loadedVersion, err := slRight.LoadVersion(ctx, batch) + require.NoError(t, err) + require.Equal(t, version, loadedVersion) + expAppliedState := kvserverpb.RangeAppliedState{ + RaftAppliedIndexTerm: stateloader.RaftInitialLogTerm, + RaftAppliedIndex: stateloader.RaftInitialLogIndex, + } + loadedAppliedState, err := slRight.LoadRangeAppliedState(ctx, batch) + require.NoError(t, err) + require.NotNil(t, loadedAppliedState) + loadedAppliedState.RangeStats = kvserverpb.MVCCPersistentStats{} // ignore + require.Equal(t, &expAppliedState, loadedAppliedState) +} diff --git a/pkg/kv/kvserver/batcheval/eval_context.go b/pkg/kv/kvserver/batcheval/eval_context.go index b315c5b8cdd9..1607297e4db8 100644 --- a/pkg/kv/kvserver/batcheval/eval_context.go +++ b/pkg/kv/kvserver/batcheval/eval_context.go @@ -14,6 +14,7 @@ import ( "context" "fmt" "math" + "time" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" @@ -106,6 +107,7 @@ type EvalContext interface { ExcludeDataFromBackup(context.Context, roachpb.Span) (bool, error) GetLastReplicaGCTimestamp(context.Context) (hlc.Timestamp, error) GetLease() (roachpb.Lease, roachpb.Lease) + GetRangeLeaseDuration() time.Duration GetRangeInfo(context.Context) roachpb.RangeInfo // GetCurrentReadSummary returns a new ReadSummary reflecting all reads @@ -165,27 +167,29 @@ type ImmutableEvalContext interface { // MockEvalCtx is a dummy implementation of EvalContext for testing purposes. // For technical reasons, the interface is implemented by a wrapper .EvalContext(). 
type MockEvalCtx struct { - ClusterSettings *cluster.Settings - Desc *roachpb.RangeDescriptor - StoreID roachpb.StoreID - NodeID roachpb.NodeID - Clock *hlc.Clock - Stats enginepb.MVCCStats - QPS float64 - CPU float64 - AbortSpan *abortspan.AbortSpan - GCThreshold hlc.Timestamp - Term kvpb.RaftTerm - FirstIndex kvpb.RaftIndex - CanCreateTxnRecordFn func() (bool, kvpb.TransactionAbortedReason) - MinTxnCommitTSFn func() hlc.Timestamp - Lease roachpb.Lease - CurrentReadSummary rspb.ReadSummary - ClosedTimestamp hlc.Timestamp - RevokedLeaseSeq roachpb.LeaseSequence - MaxBytes int64 - ApproxDiskBytes uint64 - EvalKnobs kvserverbase.BatchEvalTestingKnobs + ClusterSettings *cluster.Settings + Desc *roachpb.RangeDescriptor + StoreID roachpb.StoreID + NodeID roachpb.NodeID + Clock *hlc.Clock + Stats enginepb.MVCCStats + QPS float64 + CPU float64 + AbortSpan *abortspan.AbortSpan + GCThreshold hlc.Timestamp + Term kvpb.RaftTerm + FirstIndex kvpb.RaftIndex + CanCreateTxnRecordFn func() (bool, kvpb.TransactionAbortedReason) + MinTxnCommitTSFn func() hlc.Timestamp + LastReplicaGCTimestamp hlc.Timestamp + Lease roachpb.Lease + RangeLeaseDuration time.Duration + CurrentReadSummary rspb.ReadSummary + ClosedTimestamp hlc.Timestamp + RevokedLeaseSeq roachpb.LeaseSequence + MaxBytes int64 + ApproxDiskBytes uint64 + EvalKnobs kvserverbase.BatchEvalTestingKnobs } // EvalContext returns the MockEvalCtx as an EvalContext. It will reflect future @@ -280,11 +284,14 @@ func (m *mockEvalCtxImpl) ExcludeDataFromBackup(context.Context, roachpb.Span) ( return false, nil } func (m *mockEvalCtxImpl) GetLastReplicaGCTimestamp(context.Context) (hlc.Timestamp, error) { - panic("unimplemented") + return m.LastReplicaGCTimestamp, nil } func (m *mockEvalCtxImpl) GetLease() (roachpb.Lease, roachpb.Lease) { return m.Lease, roachpb.Lease{} } +func (m *mockEvalCtxImpl) GetRangeLeaseDuration() time.Duration { + return m.RangeLeaseDuration +} func (m *mockEvalCtxImpl) GetRangeInfo(ctx context.Context) roachpb.RangeInfo { return roachpb.RangeInfo{Desc: *m.Desc(), Lease: m.Lease} } diff --git a/pkg/kv/kvserver/replica_eval_context_span.go b/pkg/kv/kvserver/replica_eval_context_span.go index 8e3a02b39c3b..a2787586f786 100644 --- a/pkg/kv/kvserver/replica_eval_context_span.go +++ b/pkg/kv/kvserver/replica_eval_context_span.go @@ -12,6 +12,7 @@ package kvserver import ( "context" + "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" @@ -207,6 +208,11 @@ func (rec SpanSetReplicaEvalContext) GetLease() (roachpb.Lease, roachpb.Lease) { return rec.i.GetLease() } +// GetRangeLeaseDuration is part of the EvalContext interface. +func (rec SpanSetReplicaEvalContext) GetRangeLeaseDuration() time.Duration { + return rec.i.GetRangeLeaseDuration() +} + // GetRangeInfo is part of the EvalContext interface. func (rec SpanSetReplicaEvalContext) GetRangeInfo(ctx context.Context) roachpb.RangeInfo { // Do the latching checks and ignore the results. diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 64731a2f4799..23541046d8b2 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -753,6 +753,11 @@ func (r *Replica) leaseSettings(ctx context.Context) leases.Settings { } } +// GetRangeLeaseDuration is part of the EvalContext interface. 
+func (r *Replica) GetRangeLeaseDuration() time.Duration {
+	return r.store.cfg.RangeLeaseDuration
+}
+
 // requiresExpirationLeaseRLocked returns whether this range unconditionally
 // uses an expiration-based lease. Ranges located before or including the node
 // liveness table must always use expiration leases to avoid circular

From 986aa411864bf20cb63496d0ab2b3000bc07fc93 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten
Date: Sun, 15 Sep 2024 15:33:35 -0700
Subject: [PATCH 3/4] raft: remove StepDownOnRemoval

This was added in ee0fe9da and has defaulted to true in cockroachdb since
4dcbdcdf for clusters with v23.2 active. Now that compatibility with
pre-v23.2 is no longer required, we can remove the knob from the raft layer.

Epic: None
Release note: None
---
 pkg/kv/kvserver/store.go                      |  20 +-
 pkg/raft/raft.go                              |  20 +-
 .../interaction_env_handler_add_nodes.go      |   2 -
 .../testdata/confchange_v1_remove_leader.txt  | 250 ----------
 .../confchange_v1_remove_leader_stepdown.txt  |   5 +-
 .../testdata/confchange_v2_replace_leader.txt | 431 ------------------
 .../confchange_v2_replace_leader_stepdown.txt |   2 +-
 7 files changed, 11 insertions(+), 719 deletions(-)
 delete mode 100644 pkg/raft/testdata/confchange_v1_remove_leader.txt
 delete mode 100644 pkg/raft/testdata/confchange_v2_replace_leader.txt

diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index a2ac927d52e6..ae76c9b3b090 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -87,7 +87,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
 	"github.com/cockroachdb/cockroach/pkg/util/log/severity"
-	"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
 	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/cockroachdb/cockroach/pkg/util/mon"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
@@ -334,12 +333,6 @@ var SnapshotSendLimit = settings.RegisterIntSetting(
 	settings.NonNegativeInt,
 )
 
-// raftStepDownOnRemoval is a metamorphic test parameter that makes Raft leaders
-// step down on demotion or removal. Following an upgrade, clusters may have
-// replicas with mixed settings, because it's only changed when initializing
-// replicas. Varying it makes sure we handle this state.
-var raftStepDownOnRemoval = metamorphic.ConstantWithTestBool("raft-step-down-on-removal", true)
-
 // TestStoreConfig has some fields initialized with values relevant in tests.
 func TestStoreConfig(clock *hlc.Clock) StoreConfig {
 	return testStoreConfig(clock, clusterversion.Latest.Version())
@@ -409,16 +402,9 @@ func newRaftConfig(
 		Storage:       strg,
 		Logger:        logger,
 		StoreLiveness: storeLiveness,
-
-		// We only set this on replica initialization, so replicas without
-		// StepDownOnRemoval may remain on 23.2 nodes until they restart. That's
-		// totally fine, we just can't rely on this behavior until 24.1, but
-		// we currently don't either.
-		StepDownOnRemoval: raftStepDownOnRemoval,
-
-		PreVote:     true,
-		CheckQuorum: storeCfg.RaftEnableCheckQuorum,
-		CRDBVersion: storeCfg.Settings.Version,
+		PreVote:       true,
+		CheckQuorum:   storeCfg.RaftEnableCheckQuorum,
+		CRDBVersion:   storeCfg.Settings.Version,
 	}
 }
 
diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go
index 6b2abbc6d1bd..7eda3f552b9b 100644
--- a/pkg/raft/raft.go
+++ b/pkg/raft/raft.go
@@ -260,13 +260,6 @@ type Config struct {
 	// See: https://github.com/etcd-io/raft/issues/80
 	DisableConfChangeValidation bool
 
-	// StepDownOnRemoval makes the leader step down when it is removed from the
-	// group or demoted to a learner.
- // - // This behavior will become unconditional in the future. See: - // https://github.com/etcd-io/raft/issues/83 - StepDownOnRemoval bool - // StoreLiveness is a reference to the store liveness fabric. StoreLiveness raftstoreliveness.StoreLiveness @@ -428,7 +421,6 @@ type raft struct { // when raft changes its state to follower or candidate. randomizedElectionTimeout int disableProposalForwarding bool - stepDownOnRemoval bool tick func() step stepFunc @@ -464,7 +456,6 @@ func newRaft(c *Config) *raft { preVote: c.PreVote, disableProposalForwarding: c.DisableProposalForwarding, disableConfChangeValidation: c.DisableConfChangeValidation, - stepDownOnRemoval: c.StepDownOnRemoval, storeLiveness: c.StoreLiveness, crdbVersion: c.CRDBVersion, } @@ -2276,7 +2267,7 @@ func (r *raft) switchToConfig(cfg quorum.Config, progressMap tracker.ProgressMap r.isLearner = pr != nil && pr.IsLearner if (pr == nil || r.isLearner) && r.state == StateLeader { - // This node is leader and was removed or demoted, step down if requested. + // This node is leader and was removed or demoted, step down. // // We prevent demotions at the time writing but hypothetically we handle // them the same way as removing the leader. @@ -2284,11 +2275,10 @@ func (r *raft) switchToConfig(cfg quorum.Config, progressMap tracker.ProgressMap // TODO(tbg): ask follower with largest Match to TimeoutNow (to avoid // interruption). This might still drop some proposals but it's better than // nothing. - if r.stepDownOnRemoval { - // NB: Similar to the CheckQuorum step down case, we must remember our - // prior stint as leader, lest we regress the QSE. - r.becomeFollower(r.Term, r.lead) - } + // + // NB: Similar to the CheckQuorum step down case, we must remember our + // prior stint as leader, lest we regress the QSE. + r.becomeFollower(r.Term, r.lead) return cs } diff --git a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go index fd4f157d1be6..0d6607d4112e 100644 --- a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go +++ b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go @@ -65,8 +65,6 @@ func (env *InteractionEnv) handleAddNodes(t *testing.T, d datadriven.TestData) e arg.Scan(t, i, &cfg.MaxCommittedSizePerReady) case "disable-conf-change-validation": arg.Scan(t, i, &cfg.DisableConfChangeValidation) - case "step-down-on-removal": - arg.Scan(t, i, &cfg.StepDownOnRemoval) case "crdb-version": var key string arg.Scan(t, i, &key) diff --git a/pkg/raft/testdata/confchange_v1_remove_leader.txt b/pkg/raft/testdata/confchange_v1_remove_leader.txt deleted file mode 100644 index 2533cf5f6fd6..000000000000 --- a/pkg/raft/testdata/confchange_v1_remove_leader.txt +++ /dev/null @@ -1,250 +0,0 @@ -# We'll turn this back on after the boilerplate. -log-level none ----- -ok - -# Run a V1 membership change that removes the leader. -# Bootstrap n1, n2, n3. -add-nodes 3 voters=(1,2,3) index=2 ----- -ok - -campaign 1 ----- -ok - -stabilize ----- -ok - -log-level debug ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# Start removing n1. -propose-conf-change 1 v1=true -r1 ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# Propose an extra entry which will be sent out together with the conf change. 
-propose 1 foo ----- -ok - -# Send out the corresponding appends. -process-ready 1 ----- -Ready MustSync=true: -Entries: -1/4 EntryConfChange r1 -1/5 EntryNormal "foo" -Messages: -1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] -1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] -1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] -1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] - -# Send response from n2 (which is enough to commit the entries so far next time -# n1 runs). -stabilize 2 ----- -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] - 1->2 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] -> 2 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/5 Commit:3 - -# Put another entry in n1's log. -propose 1 bar ----- -ok - -# n1 applies the conf change, so it has now removed itself. But it still has -# an uncommitted entry in the log. If the leader unconditionally counted itself -# as part of the commit quorum, we'd be in trouble. In the block below, we see -# it send out appends to the other nodes for the 'bar' entry. -stabilize 1 ----- -> 1 handling Ready - Ready MustSync=true: - Entries: - 1/6 EntryNormal "bar" - Messages: - 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/5 Commit:3 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 1->2 MsgApp Term:1 Log:1/6 Commit:4 - 1->3 MsgApp Term:1 Log:1/6 Commit:4 - 1->2 MsgApp Term:1 Log:1/6 Commit:5 - 1->3 MsgApp Term:1 Log:1/6 Commit:5 - INFO 1 switched to configuration voters=(2 3) - -raft-state ----- -1: StateLeader (Non-Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# n2 responds, n3 doesn't yet. Quorum for 'bar' should not be reached... -stabilize 2 ----- -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->2 MsgApp Term:1 Log:1/6 Commit:4 - 1->2 MsgApp Term:1 Log:1/6 Commit:5 -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - Entries: - 1/6 EntryNormal "bar" - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 2->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:5 - INFO 2 switched to configuration voters=(2 3) - -# ... which thankfully is what we see on the leader. -stabilize 1 ----- -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 2->1 MsgAppResp Term:1 Log:0/6 Commit:5 - -# When n3 responds, quorum is reached and everything falls into place. 
-stabilize ----- -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChange r1] - 1->3 MsgApp Term:1 Log:1/4 Commit:3 Entries:[1/5 EntryNormal "foo"] - 1->3 MsgApp Term:1 Log:1/5 Commit:3 Entries:[1/6 EntryNormal "bar"] - 1->3 MsgApp Term:1 Log:1/6 Commit:4 - 1->3 MsgApp Term:1 Log:1/6 Commit:5 -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:5 Lead:1 LeadEpoch:1 - Entries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - 1/6 EntryNormal "bar" - CommittedEntries: - 1/4 EntryConfChange r1 - 1/5 EntryNormal "foo" - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:5 - INFO 3 switched to configuration voters=(2 3) -> 1 receiving messages - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/5 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:5 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 1->2 MsgApp Term:1 Log:1/6 Commit:6 - 1->3 MsgApp Term:1 Log:1/6 Commit:6 -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/6 Commit:6 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/6 Commit:6 -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 2->1 MsgAppResp Term:1 Log:0/6 Commit:6 -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:6 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/6 EntryNormal "bar" - Messages: - 3->1 MsgAppResp Term:1 Log:0/6 Commit:6 -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/6 Commit:6 - 3->1 MsgAppResp Term:1 Log:0/6 Commit:6 - -# However not all is well. n1 is still leader but unconditionally drops all -# proposals on the floor, so we're effectively stuck if it still heartbeats -# its followers... -propose 1 baz ----- -raft proposal dropped - -tick-heartbeat 1 ----- -ok - -# ... which, uh oh, it does. -# TODO(tbg): change behavior so that a leader that is removed immediately steps -# down, and initiates an optimistic handover. -stabilize ----- -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->2 MsgHeartbeat Term:1 Log:0/0 - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 receiving messages - 1->2 MsgHeartbeat Term:1 Log:0/0 -> 3 receiving messages - 1->3 MsgHeartbeat Term:1 Log:0/0 -> 2 handling Ready - Ready MustSync=false: - Messages: - 2->1 MsgHeartbeatResp Term:1 Log:0/0 -> 3 handling Ready - Ready MustSync=false: - Messages: - 3->1 MsgHeartbeatResp Term:1 Log:0/0 -> 1 receiving messages - 2->1 MsgHeartbeatResp Term:1 Log:0/0 - 3->1 MsgHeartbeatResp Term:1 Log:0/0 - -# Just confirming the issue above - leader does not automatically step down. -# Expected behavior: a new leader is elected after an election timeout. 
-raft-state ----- -1: StateLeader (Non-Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 diff --git a/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt b/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt index 54a582deadde..e6a46569dc7d 100644 --- a/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt +++ b/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt @@ -3,10 +3,9 @@ log-level none ---- ok -# Run a V1 membership change that removes the leader, asking it -# to step down on removal. +# Run a V1 membership change that removes the leader. # Bootstrap n1, n2, n3. -add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true +add-nodes 3 voters=(1,2,3) index=2 ---- ok diff --git a/pkg/raft/testdata/confchange_v2_replace_leader.txt b/pkg/raft/testdata/confchange_v2_replace_leader.txt deleted file mode 100644 index 676b214e0fa1..000000000000 --- a/pkg/raft/testdata/confchange_v2_replace_leader.txt +++ /dev/null @@ -1,431 +0,0 @@ -# Run a V2 membership change that removes the leader and adds another voter as -# a single operation, using joint consensus and explicitly determining when to -# transition out of the joint config. Leadership is transferred to new joiner -# while in the joint config. After the reconfiguration completes, we verify -# that the removed leader cannot campaign to become leader. - -# We'll turn this back on after the boilerplate. -log-level none ----- -ok - -# Bootstrap n1, n2, n3. -add-nodes 3 voters=(1,2,3) index=2 ----- -ok - -# n1 campaigns to become leader. -campaign 1 ----- -ok - -stabilize ----- -ok - -log-level info ----- -ok - -raft-state ----- -1: StateLeader (Voter) Term:1 Lead:1 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 - -# create n4 -add-nodes 1 ----- -INFO 4 switched to configuration voters=() -INFO 4 became follower at term 0 -INFO newRaft 4 [peers: [], term: 0, commit: 0, applied: 0, lastindex: 0, lastterm: 0] - -# Start reconfiguration to remove n1 and add n4. -propose-conf-change 1 v1=false transition=explicit -r1 v4 ----- -ok - -# Enter joint config. 
-stabilize ----- -> 1 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/3 Commit:3 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 3 handling Ready - Ready MustSync=true: - Entries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:3 - 3->1 MsgAppResp Term:1 Log:0/4 Commit:3 -> 1 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 1->2 MsgApp Term:1 Log:1/4 Commit:4 - 1->3 MsgApp Term:1 Log:1/4 Commit:4 - INFO 1 switched to configuration voters=(2 3 4)&&(1 2 3) -> 2 receiving messages - 1->2 MsgApp Term:1 Log:1/4 Commit:4 -> 3 receiving messages - 1->3 MsgApp Term:1 Log:1/4 Commit:4 -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->4 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 r1 v4] -> 2 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 2->1 MsgAppResp Term:1 Log:0/4 Commit:4 - INFO 2 switched to configuration voters=(2 3 4)&&(1 2 3) -> 3 handling Ready - Ready MustSync=true: - HardState Term:1 Vote:1 Commit:4 Lead:1 LeadEpoch:1 - CommittedEntries: - 1/4 EntryConfChangeV2 r1 v4 - Messages: - 3->1 MsgAppResp Term:1 Log:0/4 Commit:4 - INFO 3 switched to configuration voters=(2 3 4)&&(1 2 3) -> 1 receiving messages - 2->1 MsgAppResp Term:1 Log:0/4 Commit:4 - 3->1 MsgAppResp Term:1 Log:0/4 Commit:4 -> 4 receiving messages - 1->4 MsgApp Term:1 Log:1/3 Commit:4 Entries:[1/4 EntryConfChangeV2 r1 v4] - INFO 4 [term: 0] received a MsgApp message with higher term from 1 [term: 1] - INFO 4 became follower at term 1 -> 4 handling Ready - Ready MustSync=true: - HardState Term:1 Commit:0 Lead:1 LeadEpoch:0 - Messages: - 4->1 MsgAppResp Term:1 Log:0/3 Rejected (Hint: 0) -> 1 receiving messages - 4->1 MsgAppResp Term:1 Log:0/3 Rejected (Hint: 0) -> 1 handling Ready - Ready MustSync=false: - Messages: - 1->4 MsgSnap Term:1 Log:0/0 - Snapshot: Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false -> 4 receiving messages - 1->4 MsgSnap Term:1 Log:0/0 - Snapshot: Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false - INFO log [committed=0, applied=0, applying=0, unstable.offset=1, unstable.offsetInProgress=1, len(unstable.Entries)=0] starts to restore snapshot [index: 4, term: 1] - INFO 4 switched to configuration voters=(2 3 4)&&(1 2 3) - INFO 4 [commit: 4, lastindex: 4, lastterm: 1] restored snapshot [index: 4, term: 1] - INFO 4 [commit: 4] restored snapshot [index: 4, term: 1] -> 4 handling Ready - Ready MustSync=true: - HardState Term:1 Commit:4 Lead:1 LeadEpoch:0 - Snapshot Index:4 Term:1 ConfState:Voters:[2 3 4] VotersOutgoing:[1 2 3] Learners:[] LearnersNext:[] AutoLeave:false - Messages: - 4->1 MsgAppResp Term:1 Log:0/4 Commit:4 -> 1 receiving messages - 4->1 MsgAppResp Term:1 Log:0/4 Commit:4 - - 
-# Transfer leadership while in the joint config. -transfer-leadership from=1 to=4 ----- -INFO 1 [term 1] starts to transfer leadership to 4 -INFO 1 sends MsgTimeoutNow to 4 immediately as 4 already has up-to-date log -INFO 1 became follower at term 1 - -# Leadership transfer was initiated by the outgoing leader, but not yet -# processed by the transfer target. -raft-state ----- -1: StateFollower (Voter) Term:1 Lead:0 LeadEpoch:1 -2: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -3: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:1 -4: StateFollower (Voter) Term:1 Lead:1 LeadEpoch:0 - -# Leadership transfer is happening here. -stabilize ----- -> 1 handling Ready - Ready MustSync=false: - State:StateFollower - Messages: - 1->4 MsgTimeoutNow Term:1 Log:0/0 -> 4 receiving messages - 1->4 MsgTimeoutNow Term:1 Log:0/0 - INFO 4 [term 1] received MsgTimeoutNow from 1 and starts an election to get leadership - INFO 4 is starting a new election at term 1 - INFO 4 became candidate at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 1 at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 2 at term 2 - INFO 4 [logterm: 1, index: 4] sent MsgVote request to 3 at term 2 -> 4 handling Ready - Ready MustSync=true: - State:StateCandidate - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 4->1 MsgVote Term:2 Log:1/4 - 4->2 MsgVote Term:2 Log:1/4 - 4->3 MsgVote Term:2 Log:1/4 - INFO 4 received MsgVoteResp from 4 at term 2 - INFO 4 has received 1 MsgVoteResp votes and 0 vote rejections -> 1 receiving messages - 4->1 MsgVote Term:2 Log:1/4 - INFO 1 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 1 became follower at term 2 - INFO 1 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 2 receiving messages - 4->2 MsgVote Term:2 Log:1/4 - INFO 2 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 2 became follower at term 2 - INFO 2 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 3 receiving messages - 4->3 MsgVote Term:2 Log:1/4 - INFO 3 [term: 1] received a MsgVote message with higher term from 4 [term: 2] - INFO 3 became follower at term 2 - INFO 3 [logterm: 1, index: 4, vote: 0] cast MsgVote for 4 [logterm: 1, index: 4] at term 2 -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 1->4 MsgVoteResp Term:2 Log:0/0 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 2->4 MsgVoteResp Term:2 Log:0/0 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:0 LeadEpoch:0 - Messages: - 3->4 MsgVoteResp Term:2 Log:0/0 -> 4 receiving messages - 1->4 MsgVoteResp Term:2 Log:0/0 - INFO 4 received MsgVoteResp from 1 at term 2 - INFO 4 has received 2 MsgVoteResp votes and 0 vote rejections - 2->4 MsgVoteResp Term:2 Log:0/0 - INFO 4 received MsgVoteResp from 2 at term 2 - INFO 4 has received 3 MsgVoteResp votes and 0 vote rejections - INFO 4 became leader at term 2 - 3->4 MsgVoteResp Term:2 Log:0/0 -> 4 handling Ready - Ready MustSync=true: - State:StateLeader - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 4->1 MsgFortifyLeader Term:2 Log:0/0 - 4->2 MsgFortifyLeader Term:2 Log:0/0 - 4->3 MsgFortifyLeader Term:2 Log:0/0 - 4->1 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] - 4->2 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] - 4->3 MsgApp 
Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 1 receiving messages - 4->1 MsgFortifyLeader Term:2 Log:0/0 - 4->1 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 2 receiving messages - 4->2 MsgFortifyLeader Term:2 Log:0/0 - 4->2 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 3 receiving messages - 4->3 MsgFortifyLeader Term:2 Log:0/0 - 4->3 MsgApp Term:2 Log:1/4 Commit:4 Entries:[2/5 EntryNormal ""] -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 1->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 1->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 2->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:4 Lead:4 LeadEpoch:1 - Entries: - 2/5 EntryNormal "" - Messages: - 3->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 4 receiving messages - 1->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 1->4 MsgAppResp Term:2 Log:0/5 Commit:4 - 2->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:4 - 3->4 MsgFortifyLeaderResp Term:2 Log:0/0 LeadEpoch:1 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:4 -> 4 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 4->1 MsgApp Term:2 Log:2/5 Commit:5 - 4->2 MsgApp Term:2 Log:2/5 Commit:5 - 4->3 MsgApp Term:2 Log:2/5 Commit:5 -> 1 receiving messages - 4->1 MsgApp Term:2 Log:2/5 Commit:5 -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/5 Commit:5 -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/5 Commit:5 -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 1->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 2->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:5 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/5 EntryNormal "" - Messages: - 3->4 MsgAppResp Term:2 Log:0/5 Commit:5 -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/5 Commit:5 - 2->4 MsgAppResp Term:2 Log:0/5 Commit:5 - 3->4 MsgAppResp Term:2 Log:0/5 Commit:5 - -# Leadership transfer succeeded. -raft-state ----- -1: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -2: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -3: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -4: StateLeader (Voter) Term:2 Lead:4 LeadEpoch:1 - -# n4 will propose a transition out of the joint config. -propose-conf-change 4 ----- -ok - -# The group commits the command and everyone switches to the final config. 
-stabilize ----- -> 4 handling Ready - Ready MustSync=true: - Entries: - 2/6 EntryConfChangeV2 - Messages: - 4->1 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] - 4->2 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] - 4->3 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 1 receiving messages - 4->1 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/5 Commit:5 Entries:[2/6 EntryConfChangeV2] -> 1 handling Ready - Ready MustSync=true: - Entries: - 2/6 EntryConfChangeV2 - Messages: - 1->4 MsgAppResp Term:2 Log:0/6 Commit:5 -> 2 handling Ready - Ready MustSync=true: - Entries: - 2/6 EntryConfChangeV2 - Messages: - 2->4 MsgAppResp Term:2 Log:0/6 Commit:5 -> 3 handling Ready - Ready MustSync=true: - Entries: - 2/6 EntryConfChangeV2 - Messages: - 3->4 MsgAppResp Term:2 Log:0/6 Commit:5 -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/6 Commit:5 - 2->4 MsgAppResp Term:2 Log:0/6 Commit:5 - 3->4 MsgAppResp Term:2 Log:0/6 Commit:5 -> 4 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/6 EntryConfChangeV2 - Messages: - 4->1 MsgApp Term:2 Log:2/6 Commit:6 - 4->2 MsgApp Term:2 Log:2/6 Commit:6 - 4->3 MsgApp Term:2 Log:2/6 Commit:6 - INFO 4 switched to configuration voters=(2 3 4) -> 1 receiving messages - 4->1 MsgApp Term:2 Log:2/6 Commit:6 -> 2 receiving messages - 4->2 MsgApp Term:2 Log:2/6 Commit:6 -> 3 receiving messages - 4->3 MsgApp Term:2 Log:2/6 Commit:6 -> 1 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/6 EntryConfChangeV2 - Messages: - 1->4 MsgAppResp Term:2 Log:0/6 Commit:6 - INFO 1 switched to configuration voters=(2 3 4) -> 2 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/6 EntryConfChangeV2 - Messages: - 2->4 MsgAppResp Term:2 Log:0/6 Commit:6 - INFO 2 switched to configuration voters=(2 3 4) -> 3 handling Ready - Ready MustSync=true: - HardState Term:2 Vote:4 Commit:6 Lead:4 LeadEpoch:1 - CommittedEntries: - 2/6 EntryConfChangeV2 - Messages: - 3->4 MsgAppResp Term:2 Log:0/6 Commit:6 - INFO 3 switched to configuration voters=(2 3 4) -> 4 receiving messages - 1->4 MsgAppResp Term:2 Log:0/6 Commit:6 - raft: cannot step as peer not found - 2->4 MsgAppResp Term:2 Log:0/6 Commit:6 - 3->4 MsgAppResp Term:2 Log:0/6 Commit:6 - -# n1 is out of the configuration. -raft-state ----- -1: StateFollower (Non-Voter) Term:2 Lead:4 LeadEpoch:1 -2: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -3: StateFollower (Voter) Term:2 Lead:4 LeadEpoch:1 -4: StateLeader (Voter) Term:2 Lead:4 LeadEpoch:1 - -# Make sure n1 cannot campaign to become leader. -campaign 1 ----- -WARN 1 is unpromotable and can not campaign diff --git a/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt b/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt index ec7cbcb1b1a2..036a329b75ed 100644 --- a/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt +++ b/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt @@ -11,7 +11,7 @@ log-level none ok # Bootstrap n1, n2, n3. 
-add-nodes 3 voters=(1,2,3) index=2 step-down-on-removal=true +add-nodes 3 voters=(1,2,3) index=2 ---- ok From 86eda7818dfcbfcb6ffdfdb4bc7980fccd19fca5 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Sun, 15 Sep 2024 15:34:48 -0700 Subject: [PATCH 4/4] raft: rename leader stepdown test files Now that the StepDownOnRemoval option is removed, the test files don't need the "_stepdown" suffix anymore. Epic: None Release note: None --- ...remove_leader_stepdown.txt => confchange_v1_remove_leader.txt} | 0 ...place_leader_stepdown.txt => confchange_v2_replace_leader.txt} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename pkg/raft/testdata/{confchange_v1_remove_leader_stepdown.txt => confchange_v1_remove_leader.txt} (100%) rename pkg/raft/testdata/{confchange_v2_replace_leader_stepdown.txt => confchange_v2_replace_leader.txt} (100%) diff --git a/pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt b/pkg/raft/testdata/confchange_v1_remove_leader.txt similarity index 100% rename from pkg/raft/testdata/confchange_v1_remove_leader_stepdown.txt rename to pkg/raft/testdata/confchange_v1_remove_leader.txt diff --git a/pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt b/pkg/raft/testdata/confchange_v2_replace_leader.txt similarity index 100% rename from pkg/raft/testdata/confchange_v2_replace_leader_stepdown.txt rename to pkg/raft/testdata/confchange_v2_replace_leader.txt