From 4bd1fbc290c8afe1768cb7ef2e23a285954f343b Mon Sep 17 00:00:00 2001 From: Ibrahim Kettaneh Date: Mon, 9 Dec 2024 14:36:01 -0500 Subject: [PATCH] loq: clear LeadEpoch when re-writing the RangeDescriptor The LoQ tool is used when some ranges have lost quorum. It removes some replicas from the RangeDescriptor. If the fortified leader is removed from the RangeDescriptor, SupportFor() will return epoch=0. This will fire an assertion since supportFor epochs should never regress. This commit changes the LoQ behaviour so that it resets the LeadEpoch when rewriting the RangeDescriptor. Fixes: #136908 Release note: None --- pkg/kv/kvserver/loqrecovery/apply.go | 13 +++++++++++++ .../kvserver/loqrecovery/server_integration_test.go | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pkg/kv/kvserver/loqrecovery/apply.go b/pkg/kv/kvserver/loqrecovery/apply.go index 28c3b25a74e9..af4221186058 100644 --- a/pkg/kv/kvserver/loqrecovery/apply.go +++ b/pkg/kv/kvserver/loqrecovery/apply.go @@ -308,6 +308,19 @@ func applyReplicaUpdate( return PrepareReplicaReport{}, errors.Wrap(err, "updating MVCCStats") } + // Update the HardState to clear the LeadEpoch, as otherwise we may risk + // seeing an epoch regression in raft. See #136908 for more details. 
+ hs, err := sl.LoadHardState(ctx, readWriter) + if err != nil { + return PrepareReplicaReport{}, errors.Wrap(err, "loading HardState") + } + + hs.LeadEpoch = 0 + + if err := sl.SetHardState(ctx, readWriter, hs); err != nil { + return PrepareReplicaReport{}, errors.Wrap(err, "setting HardState") + } + return report, nil } diff --git a/pkg/kv/kvserver/loqrecovery/server_integration_test.go b/pkg/kv/kvserver/loqrecovery/server_integration_test.go index dd50a3f18636..f221f27d3310 100644 --- a/pkg/kv/kvserver/loqrecovery/server_integration_test.go +++ b/pkg/kv/kvserver/loqrecovery/server_integration_test.go @@ -587,7 +587,7 @@ func TestRetrieveApplyStatus(t *testing.T) { // We currently don't clear out the LeadEpoch field when recovering from a // loss of quorum, so we can't run with leader leases on in this test. - tc, _, _ := prepTestCluster(ctx, t, 5, true /* disableLeaderLease */) + tc, _, _ := prepTestCluster(ctx, t, 5, false /* disableLeaderLease */) defer tc.Stopper().Stop(ctx) // Use scratch range to ensure we have a range that loses quorum.