Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
118503: regionliveness: fix flakes inside TestRegionLivenessProber r=fqazi a=fqazi

Previously, the cancellation generated by timeouts set by region liveness probing could potentially fail with "context deadline exceeded". This could happen in code paths where the cancellation checker isn't used for example within the region liveness code when pure KV calls are used. To address this, this patch allow context cancelled errors to be treated as timeouts. Additionally, the region liveness tests are cleaned up and have the sqlliveness TTL increased to reduce the risk of flakes. Also the reduced probing timeouts are only setup for short periods of time when failures are expected to reduce risk of intermittent failures.

Fixes: cockroachdb#118465
fixes cockroachdb#118464

Release note: None

Co-authored-by: Faizan Qazi <[email protected]>
  • Loading branch information
craig[bot] and fqazi committed Feb 1, 2024
2 parents cc4fdff + 6f0e68e commit 709d308
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 21 deletions.
1 change: 0 additions & 1 deletion pkg/ccl/multiregionccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ go_test(
"//pkg/sql/sem/tree",
"//pkg/sql/sqlinstance/instancestorage",
"//pkg/sql/sqlliveness",
"//pkg/sql/sqlliveness/slbase",
"//pkg/sql/sqlliveness/slstorage",
"//pkg/sql/sqltestutils",
"//pkg/testutils",
Expand Down
45 changes: 31 additions & 14 deletions pkg/ccl/multiregionccl/regionliveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/regionliveness"
"github.com/cockroachdb/cockroach/pkg/sql/regions"
"github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage"
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slbase"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
Expand All @@ -44,7 +43,14 @@ import (
)

const (
// Use a low probe timeout number, but intentionally only manipulate to
// this value when *failures* are expected.
testingRegionLivenessProbeTimeout = time.Second * 2
// Use a long probe timeout by default when running this test (since CI
// can easily hit query timeouts).
testingRegionLivenessProbeTimeoutLong = time.Minute
// Use a reduced liveness TTL for region liveness infrastructure.
testingRegionLivenessTTL = time.Second * 10
)

func TestRegionLivenessProber(t *testing.T) {
Expand All @@ -53,7 +59,7 @@ func TestRegionLivenessProber(t *testing.T) {
// This test forces the SQL liveness TTL be a small number,
// which makes the heartbeats even more critical. Under stress and
// race environments this test becomes even more sensitive, if
// we can't send heartbeats within 10 seconds.
// we can't send heartbeats within 20 seconds.
skip.UnderStress(t)
skip.UnderRace(t)
skip.UnderDeadlock(t)
Expand All @@ -65,10 +71,10 @@ func TestRegionLivenessProber(t *testing.T) {
makeSettings := func() *cluster.Settings {
cs := cluster.MakeTestingClusterSettings()
instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond)
slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second)
regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true)
return cs
}
defer regionliveness.TestingSetUnavailableAtTTLOverride(testingRegionLivenessTTL)()

expectedRegions := []string{
"us-east",
Expand All @@ -88,11 +94,11 @@ func TestRegionLivenessProber(t *testing.T) {
var tenants []serverutils.ApplicationLayerInterface
var tenantSQL []*gosql.DB
blockProbeQuery := atomic.Bool{}
defer regionliveness.TestingSetProbeLivenessTimeout(500*time.Millisecond,
defer regionliveness.TestingSetProbeLivenessTimeout(
func() {
// Timeout attempts to probe intentionally.
if blockProbeQuery.Swap(false) {
time.Sleep(2 * time.Second)
time.Sleep(testingRegionLivenessProbeTimeout)
}
})()

Expand Down Expand Up @@ -148,6 +154,11 @@ func TestRegionLivenessProber(t *testing.T) {
liveRegions, err = regionProber.QueryLiveness(ctx, testTxn)
require.NoError(t, err)
checkExpectedRegions(expectedRegions, liveRegions)
// Override the table timeout probe for testing to ensure timeout failures
// happen now.
for _, ts := range tenants {
regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
}
// Probe the liveness of the region, but timeout the query
// intentionally to make it seem dead.
blockProbeQuery.Store(true)
Expand Down Expand Up @@ -213,7 +224,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
// This test forces the SQL liveness TTL be a small number,
// which makes the heartbeats even more critical. Under stress and
// race environments this test becomes even more sensitive, if
// we can't send heartbeats within 10 seconds.
// we can't send heartbeats within 20 seconds.
skip.UnderStress(t)
skip.UnderRace(t)
skip.UnderDeadlock(t)
Expand All @@ -225,10 +236,10 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
makeSettings := func() *cluster.Settings {
cs := cluster.MakeTestingClusterSettings()
instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond)
slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second)
regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true)
return cs
}
defer regionliveness.TestingSetUnavailableAtTTLOverride(testingRegionLivenessTTL)()

expectedRegions := []string{
"us-east",
Expand All @@ -239,11 +250,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
targetCount := atomic.Int64{}
var tenants []serverutils.ApplicationLayerInterface
var tenantSQL []*gosql.DB
defer regionliveness.TestingSetProbeLivenessTimeout(1*time.Second, func() {
defer regionliveness.TestingSetProbeLivenessTimeout(func() {
if !detectLeaseWait.Load() {
return
}
time.Sleep(time.Second * 2)
time.Sleep(testingRegionLivenessProbeTimeout)
targetCount.Swap(0)
detectLeaseWait.Swap(false)
})()
Expand Down Expand Up @@ -274,7 +285,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
if targetCount.Add(1) != 1 {
return
}
time.Sleep(time.Second * 2)
time.Sleep(testingRegionLivenessProbeTimeout)
}
},
},
Expand All @@ -298,10 +309,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
require.NoError(t, err)
}
}
// Override the table timeout probe for testing.
for _, ts := range tenants {
regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
}

// Create a new table and have it used on all nodes.
_, err = tenantSQL[0].Exec("CREATE TABLE t1(j int)")
require.NoError(t, err)
Expand All @@ -312,6 +320,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
row := tenantSQL[0].QueryRow("SELECT 't1'::REGCLASS::OID")
var tableID int
require.NoError(t, row.Scan(&tableID))
// Override the table timeout probe for testing to ensure timeout failures
// happen now.
for _, ts := range tenants {
regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
}
// Issue a schema change which should mark this region as dead, and fail
// out because our probe query will time out.
detectLeaseWait.Swap(true)
Expand Down Expand Up @@ -350,5 +363,9 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
})
return builder.BuildExistingMutableType()
})
// Restore the override to reduce the risk of failing on overloaded systems.
for _, ts := range tenants {
regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeoutLong)
}
require.NoError(t, lm.WaitForNoVersion(ctx, descpb.ID(tableID), cachedDatabaseRegions, retry.Options{}))
}
21 changes: 15 additions & 6 deletions pkg/sql/regionliveness/prober.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,19 +122,24 @@ type livenessProber struct {
settings *clustersettings.Settings
}

var probeLivenessTimeout = 15 * time.Second
var testingProbeQueryCallbackFunc func()
var testingUnavailableAtTTLOverride time.Duration

func TestingSetProbeLivenessTimeout(newTimeout time.Duration, probeCallbackFn func()) func() {
oldTimeout := probeLivenessTimeout
probeLivenessTimeout = newTimeout
func TestingSetProbeLivenessTimeout(probeCallbackFn func()) func() {
testingProbeQueryCallbackFunc = probeCallbackFn
return func() {
probeLivenessTimeout = oldTimeout
probeCallbackFn = nil
}
}

func TestingSetUnavailableAtTTLOverride(duration time.Duration) func() {
oldValue := testingUnavailableAtTTLOverride
testingUnavailableAtTTLOverride = duration
return func() {
testingUnavailableAtTTLOverride = oldValue
}
}

// NewLivenessProber creates a new region liveness prober.
func NewLivenessProber(
db *kv.DB,
Expand Down Expand Up @@ -211,6 +216,9 @@ func (l *livenessProber) ProbeLivenessWithPhysicalRegion(
// Region has gone down, set the unavailable_at time on it
return l.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
defaultTTL := slbase.DefaultTTL.Get(&l.settings.SV)
if testingUnavailableAtTTLOverride != 0 {
defaultTTL = testingUnavailableAtTTLOverride
}
defaultHeartbeat := slbase.DefaultHeartBeat.Get(&l.settings.SV)
// Get the read timestamp and pick a commit deadline.
commitDeadline := txn.ReadTimestamp().AddDuration(defaultHeartbeat)
Expand Down Expand Up @@ -316,7 +324,8 @@ func (l *livenessProber) GetProbeTimeout() (bool, time.Duration) {
// when checking for region liveness.
func IsQueryTimeoutErr(err error) bool {
return pgerror.GetPGCode(err) == pgcode.QueryCanceled ||
errors.HasType(err, (*timeutil.TimeoutError)(nil))
errors.HasType(err, (*timeutil.TimeoutError)(nil)) ||
errors.Is(err, context.DeadlineExceeded)
}

// IsMissingRegionEnumErr determines if a query hit an error because of a missing
Expand Down

0 comments on commit 709d308

Please sign in to comment.