diff --git a/pkg/cmd/roachtest/BUILD.bazel b/pkg/cmd/roachtest/BUILD.bazel index efae87705d1b..f532bbc04084 100644 --- a/pkg/cmd/roachtest/BUILD.bazel +++ b/pkg/cmd/roachtest/BUILD.bazel @@ -127,6 +127,7 @@ go_test( "//pkg/util/version", "@com_github_cockroachdb_datadriven//:datadriven", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", "@com_github_data_dog_go_sqlmock//:go-sqlmock", "@com_github_kr_pretty//:pretty", "@com_github_prometheus_client_golang//prometheus", diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index 0c71712f20ab..5691127c51a3 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -210,6 +210,9 @@ func (g *githubIssues) createPostRequest( // error, redirect that to Test Eng with the corresponding label as // title override. errWithOwner := failuresAsErrorWithOwnership(failures) + if errWithOwner == nil { + errWithOwner = transientErrorOwnershipFallback(failures) + } if errWithOwner != nil { handleErrorWithOwnership(*errWithOwner) } diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 7da19923c5bf..61c7466d0e66 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/echotest" "github.com/cockroachdb/datadriven" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" "github.com/stretchr/testify/require" ) @@ -214,6 +215,14 @@ func TestCreatePostRequest(t *testing.T) { refError = registry.ErrorWithOwner(registry.OwnerSQLFoundations, refError) case "error-with-owner-test-eng": refError = registry.ErrorWithOwner(registry.OwnerTestEng, refError) + case "require-no-error-failed": + // Attempts to mimic how the require package creates failures by losing + // the error object and prepending a message. Similar to above we don't use + // %+v to avoid stack traces. + refError = errors.Newf("Received unexpected error:\n%s", redact.SafeString(refError.Error())) + case "lose-error-object": + // Lose the error object which should make our flake detection fail. + refError = errors.Newf("%s", redact.SafeString(refError.Error())) } } } diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 05da8257f8d5..fa677cd1e15c 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -11,6 +11,7 @@ import ( "io" "math/rand" "os" + "regexp" "strings" "sync" "time" @@ -548,6 +549,70 @@ func failuresMatchingError(failures []failure, refError any) bool { return false } +var transientErrorRegex = regexp.MustCompile(`TRANSIENT_ERROR\((.+)\)`) + +// transientErrorOwnershipFallback string matches for `TRANSIENT_ERROR` in the provided +// failures and returns a new ErrorWithOwnership if it does. It iterates through each failure, +// checking both the squashedErr and list of errors for `TRANSIENT_ERROR`. +// +// This is needed as the `require` package does not preserve the error object needed +// for us to properly check if it is a transient error using `failuresMatchingError`. +// Note the match is additionally guarded by the unique substring which denotes that +// the error originated from the require package. If we see somewhere else that does not +// preserve the error object, we want to investigate whether it can be fixed before resorting +// to this fallback. See: #131094 +func transientErrorOwnershipFallback(failures []failure) *registry.ErrorWithOwnership { + const unexpectedErrPrefix = "Received unexpected error:" + var errWithOwner registry.ErrorWithOwnership + isTransient := func(err error) bool { + // Both squashedErr and errors should be non nil in actual roachtest failures, + // but for testing we sometimes only set one for simplicity. + if err == nil { + return false + } + // The require package prepends this message to `require.NoError` failures. + // We may see `TRANSIENT_ERROR` without this prefix, but don't mark it as + // a flake as we may be able to fix the code that doesn't preserve the error. + if !strings.Contains(err.Error(), unexpectedErrPrefix) { + return false + } + + if match := transientErrorRegex.FindString(err.Error()); match != "" { + problemCause := strings.TrimPrefix(match, "TRANSIENT_ERROR(") + problemCause = strings.TrimSuffix(problemCause, ")") + // The cause will be used to create the github issue creation, so we don't want + // it to be blank. Instead, return false and let us investigate what is creating + // a transient error with no cause. + if problemCause == "" { + return false + } + + errWithOwner = registry.ErrorWithOwner( + registry.OwnerTestEng, err, + registry.WithTitleOverride(problemCause), + registry.InfraFlake, + ) + return true + } + + return false + } + + for _, f := range failures { + for _, err := range f.errors { + if isTransient(err) { + return &errWithOwner + } + } + + if isTransient(f.squashedErr) { + return &errWithOwner + } + } + + return nil +} + func (t *testImpl) ArtifactsDir() string { return t.artifactsDir } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index f084f83c013a..00d3f76b294f 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -1227,6 +1227,9 @@ func (r *testRunner) runTest( if s.Run != nil { if t.Failed() { errWithOwner := failuresAsErrorWithOwnership(t.failures()) + if errWithOwner == nil { + errWithOwner = transientErrorOwnershipFallback(t.failures()) + } if errWithOwner == nil || !errWithOwner.InfraFlake { r.status.fail[t] = struct{}{} } diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index 7d10161dcf24..c384550d08ff 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/cloud" + rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce" @@ -527,3 +528,69 @@ func TestGCESameDefaultZone(t *testing.T) { }) } } + +func TestTransientErrorFallback(t *testing.T) { + ctx := context.Background() + stopper := stop.NewStopper() + defer stopper.Stop(ctx) + cr := newClusterRegistry() + runner := newUnitTestRunner(cr, stopper) + + var buf syncedBuffer + lopt := loggingOpt{ + l: nilLogger(), + tee: logger.NoTee, + stdout: &buf, + stderr: &buf, + artifactsDir: "", + } + copt := clustersOpt{ + typ: roachprodCluster, + user: "test_user", + cpuQuota: 1000, + debugMode: NoDebug, + } + + // Test that if a test fails with a transient error handled by the `require` package, + // the test runner will correctly still identify it as a flake and the run will have + // no failed tests. + t.Run("Require API", func(t *testing.T) { + mockTest := registry.TestSpec{ + Name: `ssh flake`, + Owner: OwnerUnitTest, + Cluster: spec.MakeClusterSpec(0), + CompatibleClouds: registry.AllExceptAWS, + Suites: registry.Suites(registry.Nightly), + CockroachBinary: registry.StandardCockroach, + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + require.NoError(t, rperrors.NewSSHError(errors.New("oops"))) + }, + } + err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */ + defaultParallelism, copt, testOpts{}, lopt) + require.NoError(t, err) + }) + + // Now test that if the transient error is not handled by the `require` package, + // but similarly lost due to casting to a string, the test runner *won't* mark + // it as a flake and we will have a failed test. + t.Run("Require API Not Used", func(t *testing.T) { + mockTest := registry.TestSpec{ + Name: `ssh flake`, + Owner: OwnerUnitTest, + Cluster: spec.MakeClusterSpec(0), + CompatibleClouds: registry.AllExceptAWS, + Suites: registry.Suites(registry.Nightly), + CockroachBinary: registry.StandardCockroach, + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + err := errors.Newf("%s", rperrors.NewSSHError(errors.New("oops"))) + t.Fatal(err) + }, + } + err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */ + defaultParallelism, copt, testOpts{}, lopt) + if !testutils.IsError(err, "some tests failed") { + t.Fatalf("expected error \"some tests failed\", got: %v", err) + } + }) +} diff --git a/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error b/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error new file mode 100644 index 000000000000..a7fef522f801 --- /dev/null +++ b/pkg/cmd/roachtest/testdata/github/lost_error_object_and_transient_error @@ -0,0 +1,60 @@ +# When a transient error is lost as a result of an unknown case +# casting it to a string, check that our fallback transient error handling +# *doesn't* catch it. This should be investigated to determine if it should +# be fixed to preserve the error object or added to the list of exceptions + +add-failure name=(oops) type=(ssh-flake, lose-error-object) +---- +ok + +post +---- +---- +roachtest.github_test [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1). + + +``` +TRANSIENT_ERROR(ssh_problem): oops +``` + +Parameters: + - ROACHTEST_arch=amd64 + - ROACHTEST_cloud=gce + - ROACHTEST_coverageBuild=false + - ROACHTEST_cpu=4 + - ROACHTEST_encrypted=false + - ROACHTEST_fs=ext4 + - ROACHTEST_localSSD=true + - ROACHTEST_runtimeAssertionsBuild=false + - ROACHTEST_ssd=0 +
Help +

+ + +See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md) + + + +See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7) + + + +See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000) + +

+
+/cc @cockroachdb/unowned + + +[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*github_test.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues) + + + +------ +Labels: +- O-roachtest +- C-test-failure +- release-blocker +Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.github_test+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Funowned%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Agithub_test.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EC-test-failure%3C%2Fcode%3E%0A-+%3Ccode%3Erelease-blocker%3C%2Fcode%3E%0A&title=roachtest%3A+github_test+failed +---- +---- diff --git a/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error b/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error new file mode 100644 index 000000000000..a1ccb0b3d423 --- /dev/null +++ b/pkg/cmd/roachtest/testdata/github/require_no_error_transient_error @@ -0,0 +1,60 @@ +# When a transient error is lost as a result of the require package +# casting it to a string, check that our fallback transient error handling +# still catches it. + +add-failure name=(oops) type=(ssh-flake, require-no-error-failed) +---- +ok + +post +---- +---- +roachtest.ssh_problem [failed]() on test_branch @ [test_SHA](). A Side-Eye cluster snapshot was captured on timeout: [https://app.side-eye.io/snapshots/1](https://app.side-eye.io/snapshots/1). + + +``` +test github_test failed: Received unexpected error: +TRANSIENT_ERROR(ssh_problem): oops +``` + +Parameters: + - ROACHTEST_arch=amd64 + - ROACHTEST_cloud=gce + - ROACHTEST_coverageBuild=false + - ROACHTEST_cpu=4 + - ROACHTEST_encrypted=false + - ROACHTEST_fs=ext4 + - ROACHTEST_localSSD=true + - ROACHTEST_runtimeAssertionsBuild=false + - ROACHTEST_ssd=0 +
Help +

+ + +See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md) + + + +See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7) + + + +See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000) + +

+
+/cc @cockroachdb/test-eng + + +[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*ssh_problem.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues) + + + +------ +Labels: +- O-roachtest +- X-infra-flake +- T-testeng +Rendered:https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.ssh_problem+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29.+A+Side-Eye+cluster+snapshot+was+captured+on+timeout%3A+%5Bhttps%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%5D%28https%3A%2F%2Fapp.side-eye.io%2Fsnapshots%2F1%29.%0A%0A%0A%60%60%60%0Atest+github_test+failed%3A+Received+unexpected+error%3A%0ATRANSIENT_ERROR%28ssh_problem%29%3A+oops%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3EROACHTEST_arch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_coverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_cpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_encrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_fs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_localSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_runtimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3EROACHTEST_ssd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Ftest-eng%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Assh_problem.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EX-infra-flake%3C%2Fcode%3E%0A-+%3Ccode%3ET-testeng%3C%2Fcode%3E%0A&title=roachtest%3A+ssh_problem+failed +---- +----