diff --git a/pkg/ccl/multiregionccl/BUILD.bazel b/pkg/ccl/multiregionccl/BUILD.bazel index d70c91932e7d..12586db184d5 100644 --- a/pkg/ccl/multiregionccl/BUILD.bazel +++ b/pkg/ccl/multiregionccl/BUILD.bazel @@ -86,7 +86,6 @@ go_test( "//pkg/sql/sem/tree", "//pkg/sql/sqlinstance/instancestorage", "//pkg/sql/sqlliveness", - "//pkg/sql/sqlliveness/slbase", "//pkg/sql/sqlliveness/slstorage", "//pkg/sql/sqltestutils", "//pkg/testutils", diff --git a/pkg/ccl/multiregionccl/regionliveness_test.go b/pkg/ccl/multiregionccl/regionliveness_test.go index f7593051a6c3..4c7dff1a522e 100644 --- a/pkg/ccl/multiregionccl/regionliveness_test.go +++ b/pkg/ccl/multiregionccl/regionliveness_test.go @@ -32,7 +32,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/regionliveness" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage" - "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slbase" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/skip" @@ -44,7 +43,14 @@ import ( ) const ( + // Use a low probe timeout number, but intentionally only manipulate to + // this value when *failures* are expected. testingRegionLivenessProbeTimeout = time.Second * 2 + // Use a long probe timeout by default when running this test (since CI + // can easily hit query timeouts). + testingRegionLivenessProbeTimeoutLong = time.Minute + // Use a reduced liveness TTL for region liveness infrastructure. + testingRegionLivenessTTL = time.Second * 10 ) func TestRegionLivenessProber(t *testing.T) { @@ -53,7 +59,7 @@ func TestRegionLivenessProber(t *testing.T) { // This test forces the SQL liveness TTL be a small number, // which makes the heartbeats even more critical. Under stress and // race environments this test becomes even more sensitive, if - // we can't send heartbeats within 10 seconds. + // we can't send heartbeats within 20 seconds. skip.UnderStress(t) skip.UnderRace(t) skip.UnderDeadlock(t) @@ -65,10 +71,10 @@ func TestRegionLivenessProber(t *testing.T) { makeSettings := func() *cluster.Settings { cs := cluster.MakeTestingClusterSettings() instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond) - slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second) regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true) return cs } + defer regionliveness.TestingSetUnavailableAtTTLOverride(testingRegionLivenessTTL)() expectedRegions := []string{ "us-east", @@ -88,11 +94,11 @@ func TestRegionLivenessProber(t *testing.T) { var tenants []serverutils.ApplicationLayerInterface var tenantSQL []*gosql.DB blockProbeQuery := atomic.Bool{} - defer regionliveness.TestingSetProbeLivenessTimeout(500*time.Millisecond, + defer regionliveness.TestingSetProbeLivenessTimeout( func() { // Timeout attempts to probe intentionally. if blockProbeQuery.Swap(false) { - time.Sleep(2 * time.Second) + time.Sleep(testingRegionLivenessProbeTimeout) } })() @@ -148,6 +154,11 @@ func TestRegionLivenessProber(t *testing.T) { liveRegions, err = regionProber.QueryLiveness(ctx, testTxn) require.NoError(t, err) checkExpectedRegions(expectedRegions, liveRegions) + // Override the table timeout probe for testing to ensure timeout failures + // happen now. + for _, ts := range tenants { + regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout) + } // Probe the liveness of the region, but timeout the query // intentionally to make it seem dead. blockProbeQuery.Store(true) @@ -213,7 +224,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) { // This test forces the SQL liveness TTL be a small number, // which makes the heartbeats even more critical. Under stress and // race environments this test becomes even more sensitive, if - // we can't send heartbeats within 10 seconds. + // we can't send heartbeats within 20 seconds. skip.UnderStress(t) skip.UnderRace(t) skip.UnderDeadlock(t) @@ -225,10 +236,10 @@ func TestRegionLivenessProberForLeases(t *testing.T) { makeSettings := func() *cluster.Settings { cs := cluster.MakeTestingClusterSettings() instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond) - slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second) regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true) return cs } + defer regionliveness.TestingSetUnavailableAtTTLOverride(testingRegionLivenessTTL)() expectedRegions := []string{ "us-east", @@ -239,11 +250,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) { targetCount := atomic.Int64{} var tenants []serverutils.ApplicationLayerInterface var tenantSQL []*gosql.DB - defer regionliveness.TestingSetProbeLivenessTimeout(1*time.Second, func() { + defer regionliveness.TestingSetProbeLivenessTimeout(func() { if !detectLeaseWait.Load() { return } - time.Sleep(time.Second * 2) + time.Sleep(testingRegionLivenessProbeTimeout) targetCount.Swap(0) detectLeaseWait.Swap(false) })() @@ -274,7 +285,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) { if targetCount.Add(1) != 1 { return } - time.Sleep(time.Second * 2) + time.Sleep(testingRegionLivenessProbeTimeout) } }, }, @@ -298,10 +309,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) { require.NoError(t, err) } } - // Override the table timeout probe for testing. - for _, ts := range tenants { - regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout) - } + // Create a new table and have it used on all nodes. _, err = tenantSQL[0].Exec("CREATE TABLE t1(j int)") require.NoError(t, err) @@ -312,6 +320,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) { row := tenantSQL[0].QueryRow("SELECT 't1'::REGCLASS::OID") var tableID int require.NoError(t, row.Scan(&tableID)) + // Override the table timeout probe for testing to ensure timeout failures + // happen now. + for _, ts := range tenants { + regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout) + } // Issue a schema change which should mark this region as dead, and fail // out because our probe query will time out. detectLeaseWait.Swap(true) @@ -350,5 +363,9 @@ func TestRegionLivenessProberForLeases(t *testing.T) { }) return builder.BuildExistingMutableType() }) + // Restore the override to reduce the risk of failing on overloaded systems. + for _, ts := range tenants { + regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeoutLong) + } require.NoError(t, lm.WaitForNoVersion(ctx, descpb.ID(tableID), cachedDatabaseRegions, retry.Options{})) } diff --git a/pkg/sql/regionliveness/prober.go b/pkg/sql/regionliveness/prober.go index 5b519805445a..b1f556489b40 100644 --- a/pkg/sql/regionliveness/prober.go +++ b/pkg/sql/regionliveness/prober.go @@ -122,19 +122,24 @@ type livenessProber struct { settings *clustersettings.Settings } -var probeLivenessTimeout = 15 * time.Second var testingProbeQueryCallbackFunc func() +var testingUnavailableAtTTLOverride time.Duration -func TestingSetProbeLivenessTimeout(newTimeout time.Duration, probeCallbackFn func()) func() { - oldTimeout := probeLivenessTimeout - probeLivenessTimeout = newTimeout +func TestingSetProbeLivenessTimeout(probeCallbackFn func()) func() { testingProbeQueryCallbackFunc = probeCallbackFn return func() { - probeLivenessTimeout = oldTimeout probeCallbackFn = nil } } +func TestingSetUnavailableAtTTLOverride(duration time.Duration) func() { + oldValue := testingUnavailableAtTTLOverride + testingUnavailableAtTTLOverride = duration + return func() { + testingUnavailableAtTTLOverride = oldValue + } +} + // NewLivenessProber creates a new region liveness prober. func NewLivenessProber( db *kv.DB, @@ -211,6 +216,9 @@ func (l *livenessProber) ProbeLivenessWithPhysicalRegion( // Region has gone down, set the unavailable_at time on it return l.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { defaultTTL := slbase.DefaultTTL.Get(&l.settings.SV) + if testingUnavailableAtTTLOverride != 0 { + defaultTTL = testingUnavailableAtTTLOverride + } defaultHeartbeat := slbase.DefaultHeartBeat.Get(&l.settings.SV) // Get the read timestamp and pick a commit deadline. commitDeadline := txn.ReadTimestamp().AddDuration(defaultHeartbeat) @@ -316,7 +324,8 @@ func (l *livenessProber) GetProbeTimeout() (bool, time.Duration) { // when checking for region liveness. func IsQueryTimeoutErr(err error) bool { return pgerror.GetPGCode(err) == pgcode.QueryCanceled || - errors.HasType(err, (*timeutil.TimeoutError)(nil)) + errors.HasType(err, (*timeutil.TimeoutError)(nil)) || + errors.Is(err, context.DeadlineExceeded) } // IsMissingRegionEnumErr determines if a query hit an error because of a missing