From b8cce55484c581f78ce162e57b63c235112a09da Mon Sep 17 00:00:00 2001
From: Austen McClernon
Date: Wed, 30 Oct 2024 17:16:47 +0000
Subject: [PATCH 1/3] kvflowcontrol: re-enable kvadmission.flow_control.mode
 metamorphism

The cluster setting `kvadmission.flow_control.mode` is used to control
whether replication admission control applies only to elastic work, or
to all work that sets an admission priority.

Prior to #132125, the cluster setting was metamorphic in tests,
selecting either value with equal probability. #132125 disabled the
metamorphism to prevent the newly introduced send queue / pull mode,
which was still under testing, from being exercised unexpectedly.

With testing now in place, re-enable the metamorphism, leaving the
default value unchanged.

Resolves: #132364
Release note: None
---
 pkg/kv/kvserver/kvflowcontrol/BUILD.bazel      | 1 +
 pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel b/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel
index 077a49e8dd87..2caac173dc03 100644
--- a/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel
+++ b/pkg/kv/kvserver/kvflowcontrol/BUILD.bazel
@@ -19,6 +19,7 @@ go_library(
         "//pkg/util/admission/admissionpb",
         "//pkg/util/ctxutil",
         "//pkg/util/humanizeutil",
+        "//pkg/util/metamorphic",
         "@com_github_cockroachdb_redact//:redact",
     ],
 )

diff --git a/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go b/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
index 09bb9a44cf7e..d3f932fd5bf4 100644
--- a/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
+++ b/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
@@ -20,6 +20,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
 	"github.com/cockroachdb/cockroach/pkg/util/ctxutil"
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
+	"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
 	"github.com/cockroachdb/redact"
 )

@@ -41,7 +42,11 @@ var Mode = settings.RegisterEnumSetting(
 	settings.SystemOnly,
 	"kvadmission.flow_control.mode",
 	"determines the 'mode' of flow control we use for replication traffic in KV, if enabled",
-	modeDict[ApplyToElastic], /* default value */
+	metamorphic.ConstantWithTestChoice(
+		"kvadmission.flow_control.mode",
+		modeDict[ApplyToElastic], /* default value */
+		modeDict[ApplyToAll],     /* other value */
+	),
 	modeDict,
 )

From 69d5559ea911af2c9f398926fd95fa2caaae49aa Mon Sep 17 00:00:00 2001
From: Austen McClernon
Date: Wed, 30 Oct 2024 17:20:49 +0000
Subject: [PATCH 2/3] kvflowcontrol: set kvadmission.flow_control.mode default
 to apply_to_all

With testing now in place, this commit changes the default value of the
cluster setting `kvadmission.flow_control.mode` from `apply_to_elastic`
to `apply_to_all`.

By default, regular work is now subject to replication admission
control: when send tokens are exhausted, a quorum is allowed to
proceed, while entries to any non-quorum-required replica are queued.
For more details, see #123509.

Resolves: #133838

Release note (performance improvement): Regular writes are now subject
to replication admission control by default, meaning that
non-quorum-required replicas may not be told about new writes from the
leader if they are unable to keep up. This brings a large performance
improvement in situations where there is a large backlog of replication
work towards a subset of nodes, such as during node restarts. The
setting can be reverted to the <=24.3 default by setting
`kvadmission.flow_control.mode` to `apply_to_elastic`.
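For reference, a minimal sketch of reverting to the prior behavior,
assuming a SQL session with privileges to modify cluster settings:

```sql
-- Revert to the <=24.3 default: apply replication admission control
-- only to elastic (background) work.
SET CLUSTER SETTING kvadmission.flow_control.mode = 'apply_to_elastic';

-- Check the value currently in effect.
SHOW CLUSTER SETTING kvadmission.flow_control.mode;
```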
---
 docs/generated/metrics/metrics.html            | 5 +++++
 pkg/kv/kvserver/client_raft_log_queue_test.go  | 4 +++-
 pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go | 4 ++--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
index c3bb5502776c..3ef9b32c5da8 100644
--- a/docs/generated/metrics/metrics.html
+++ b/docs/generated/metrics/metrics.html
@@ -19,6 +19,7 @@
 <tr><td>STORAGE</td><td>admission.admitted.kv-stores.high-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.admitted.kv-stores.locking-normal-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.admitted.kv-stores.normal-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>admission.admitted.kv-stores.user-high-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.admitted.kv.high-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.admitted.kv.locking-normal-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.admitted.kv.normal-pri</td><td>Number of requests admitted</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
@@ -54,6 +55,7 @@
 <tr><td>STORAGE</td><td>admission.errored.kv-stores.high-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.errored.kv-stores.locking-normal-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.errored.kv-stores.normal-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>admission.errored.kv-stores.user-high-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.errored.kv.high-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.errored.kv.locking-normal-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.errored.kv.normal-pri</td><td>Number of requests not admitted due to error</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
@@ -101,6 +103,7 @@
 <tr><td>STORAGE</td><td>admission.requested.kv-stores.high-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.requested.kv-stores.locking-normal-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.requested.kv-stores.normal-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
+<tr><td>STORAGE</td><td>admission.requested.kv-stores.user-high-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.requested.kv.high-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.requested.kv.locking-normal-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
 <tr><td>STORAGE</td><td>admission.requested.kv.normal-pri</td><td>Number of requests</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
@@ -128,6 +131,7 @@
 <tr><td>STORAGE</td><td>admission.wait_durations.kv-stores.high-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_durations.kv-stores.locking-normal-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_durations.kv-stores.normal-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>admission.wait_durations.kv-stores.user-high-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_durations.kv.high-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_durations.kv.locking-normal-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_durations.kv.normal-pri</td><td>Wait time durations for requests that waited</td><td>Wait time Duration</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
@@ -155,6 +159,7 @@
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv-stores.high-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv-stores.locking-normal-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv-stores.normal-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>STORAGE</td><td>admission.wait_queue_length.kv-stores.user-high-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv.high-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv.locking-normal-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>STORAGE</td><td>admission.wait_queue_length.kv.normal-pri</td><td>Length of wait queue</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>

diff --git a/pkg/kv/kvserver/client_raft_log_queue_test.go b/pkg/kv/kvserver/client_raft_log_queue_test.go
index abaadb9d5bb0..10a70e2d2888 100644
--- a/pkg/kv/kvserver/client_raft_log_queue_test.go
+++ b/pkg/kv/kvserver/client_raft_log_queue_test.go
@@ -201,8 +201,10 @@ func TestRaftTracing(t *testing.T) {
 	expectedMessages := []string{
 		`replica_proposal_buf.* flushing proposal to Raft`,
 		`replica_proposal_buf.* registering local trace`,
+		// Note that we don't assert that the 1->3 MsgApp goes through, as
+		// the ordering may change between 1->2 and 1->3. It should be
+		// sufficient to just check one of them for tracing.
 		`replica_raft.* 1->2 MsgApp`,
-		`replica_raft.* 1->3 MsgApp`,
 		`replica_raft.* AppendThread->1 MsgStorageAppendResp`,
 		`ack-ing replication success to the client`,
 	}

diff --git a/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go b/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
index d3f932fd5bf4..35365094b9c2 100644
--- a/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
+++ b/pkg/kv/kvserver/kvflowcontrol/kvflowcontrol.go
@@ -44,8 +44,8 @@ var Mode = settings.RegisterEnumSetting(
 	"determines the 'mode' of flow control we use for replication traffic in KV, if enabled",
 	metamorphic.ConstantWithTestChoice(
 		"kvadmission.flow_control.mode",
-		modeDict[ApplyToElastic], /* default value */
-		modeDict[ApplyToAll],     /* other value */
+		modeDict[ApplyToAll],     /* default value */
+		modeDict[ApplyToElastic], /* other value */
 	),
 	modeDict,
 )

From 71cdabc28ae1f81f6ad8cec41da647b541af48a4 Mon Sep 17 00:00:00 2001
From: Austen McClernon
Date: Thu, 19 Dec 2024 12:12:19 -0500
Subject: [PATCH 3/3] testcluster: retry force repl scan on startup

There have been recent flakes in tests that use a `TestCluster`. These
failures occur at startup, before much test-specific logic has run.
This patch mitigates the symptom by retrying on the error, rather than
failing. The error looks like:

```
testcluster.go:485: unable to retrieve conf reader: span configs not available
```

It is caused by span configs not being available, which is checked by
grabbing the `ConfReader` before forcing the replica queues to process.
The root cause of this behavior change still needs to be determined, so
this change should be considered temporary, although the root cause may
be benign.
Informs: #137712
Informs: #137762

Release note: None
---
 pkg/testutils/testcluster/testcluster.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go
index 93adda29e143..0c2f833dba45 100644
--- a/pkg/testutils/testcluster/testcluster.go
+++ b/pkg/testutils/testcluster/testcluster.go
@@ -1479,7 +1479,9 @@ func (tc *TestCluster) WaitForFullReplication() error {
 			// Force upreplication. Otherwise, if we rely on the scanner to do it,
 			// it'll take a while.
 			if err := s.ForceReplicationScanAndProcess(); err != nil {
-				return err
+				log.Infof(context.TODO(), "%v", err)
+				notReplicated = true
+				return nil
 			}
 			if err := s.ComputeMetrics(context.TODO()); err != nil {
 				// This can sometimes fail since ComputeMetrics calls