From 27fa0e5edf4935bc7f624bb3e84a9bc228162304 Mon Sep 17 00:00:00 2001 From: Mason Malone <651224+MasonM@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:48:00 -0800 Subject: [PATCH 1/2] fix: don't log non-errors as "Non-transient error:". Fixes #13881 There's many places that can call `IsTransientErr()` with nil, e.g. https://github.com/argoproj/argo-workflows/blob/2fd54884844bb76d760466027afa023c5bfd6b64/util/util.go#L39 This caused `IsTransientErr()` to log `Non-transient error: `. We shouldn't be generating logs in this case. This can be reproduced locally by running `make start PROFILE=mysql`: `` $ head -n9 logs/controller.log time="2024-11-19T05:55:16Z" level=info msg="index config" indexWorkflowSemaphoreKeys=true time="2024-11-19T05:55:16Z" level=info msg="cron config" cronSyncPeriod=10s time="2024-11-19T05:55:16Z" level=info msg="Memoization caches will be garbage-collected if they have not been hit after" gcAfterNotHitDuration=30s time="2024-11-19T05:55:16.010Z" level=info msg="not enabling pprof debug endpoints" time="2024-11-19T05:55:16.014Z" level=debug msg="Get configmaps 200" time="2024-11-19T05:55:16.017Z" level=info msg="Configuration:\nartifactRepository:\n s3:\n accessKeySecret:\n key: accesskey\n name: my-minio-cred\n bucket: my-bucket\n endpoint: minio:9000\n insecure: true\n secretKeySecret:\n key: secretkey\n name: my-minio-cred\ncolumns:\n- key: workflows.argoproj.io/completed\n name: Workflow Completed\n type: label\nexecutor:\n imagePullPolicy: IfNotPresent\n name: \"\"\n resources:\n limits:\n cpu: 500m\n memory: 256Mi\n requests:\n cpu: 100m\n memory: 64Mi\nimages:\n docker/whalesay:latest:\n cmd:\n - cowsay\ninitialDelay: 0s\nlinks:\n- name: Workflow Link\n scope: workflow\n url: http://logging-facility?namespace=${metadata.namespace}&workflowName=${metadata.name}&startedAt=${status.startedAt}&finishedAt=${status.finishedAt}\n- name: Pod Link\n scope: pod\n url: http://logging-facility?namespace=${metadata.namespace}&podName=${metadata.name}&startedAt=${status.startedAt}&finishedAt=${status.finishedAt}\n- name: Pod Logs Link\n scope: pod-logs\n url: http://logging-facility?namespace=${metadata.namespace}&podName=${metadata.name}&startedAt=${status.startedAt}&finishedAt=${status.finishedAt}\n- name: Event Source Logs Link\n scope: event-source-logs\n url: http://logging-facility?namespace=${metadata.namespace}&podName=${metadata.name}&startedAt=${status.startedAt}&finishedAt=${status.finishedAt}\n- name: Sensor Logs Link\n scope: sensor-logs\n url: http://logging-facility?namespace=${metadata.namespace}&podName=${metadata.name}&startedAt=${status.startedAt}&finishedAt=${status.finishedAt}\n- name: Completed Workflows\n scope: workflow-list\n url: http://workflows?label=workflows.argoproj.io/completed=true\nmetricsConfig:\n enabled: true\n path: /metrics\n port: 9090\nnamespaceParallelism: 10\nnodeEvents: {}\npersistence:\n archive: true\n archiveTTL: 168h0m0s\n connectionPool:\n maxIdleConns: 100\n mysql:\n database: argo\n host: mysql\n passwordSecret:\n key: password\n name: argo-mysql-config\n port: 3306\n tableName: argo_workflows\n userNameSecret:\n key: username\n name: argo-mysql-config\n nodeStatusOffLoad: true\npodSpecLogStrategy: {}\nretentionPolicy:\n completed: 10\n errored: 2\n failed: 2\nsso:\n clientId:\n key: \"\"\n clientSecret:\n key: \"\"\n issuer: \"\"\n redirectUrl: \"\"\n sessionExpiry: 0s\ntelemetryConfig: {}\nworkflowDefaults:\n metadata:\n creationTimestamp: null\n spec:\n activeDeadlineSeconds: 300\n arguments: {}\n podSpecPatch: |\n terminationGracePeriodSeconds: 3\n status:\n finishedAt: null\n startedAt: null\nworkflowEvents: {}\n" time="2024-11-19T05:55:16.017Z" level=info msg="Persistence configuration enabled" time="2024-11-19T05:55:16.019Z" level=debug msg="Get secrets 200" time="2024-11-19T05:55:16.019Z" level=warning msg="Non-transient error: " ``` Signed-off-by: Mason Malone <651224+MasonM@users.noreply.github.com> --- util/errors/errors.go | 2 +- util/errors/errors_test.go | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/util/errors/errors.go b/util/errors/errors.go index 19addc48312c..6d41bf612213 100644 --- a/util/errors/errors.go +++ b/util/errors/errors.go @@ -26,7 +26,7 @@ func IgnoreContainerNotFoundErr(err error) error { // IsTransientErr reports whether the error is transient and logs it. func IsTransientErr(err error) bool { isTransient := IsTransientErrQuiet(err) - if !isTransient { + if err != nil && !isTransient { log.Warnf("Non-transient error: %v", err) } return isTransient diff --git a/util/errors/errors_test.go b/util/errors/errors_test.go index 8090072c1ae1..702c3d594251 100644 --- a/util/errors/errors_test.go +++ b/util/errors/errors_test.go @@ -8,6 +8,8 @@ import ( "os/exec" "testing" + log "github.com/sirupsen/logrus" + logtest "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/assert" apierr "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime/schema" @@ -50,12 +52,19 @@ var ( const transientEnvVarKey = "TRANSIENT_ERROR_PATTERN" func TestIsTransientErr(t *testing.T) { + hook := &logtest.Hook{} + log.AddHook(hook) + defer log.StandardLogger().ReplaceHooks(nil) + t.Run("Nil", func(t *testing.T) { assert.False(t, IsTransientErr(nil)) + assert.Nil(t, hook.LastEntry()) }) t.Run("ResourceQuotaConflictErr", func(t *testing.T) { assert.False(t, IsTransientErr(apierr.NewConflict(schema.GroupResource{}, "", nil))) + assert.Contains(t, hook.LastEntry().Message, "Non-transient error:") assert.True(t, IsTransientErr(apierr.NewConflict(schema.GroupResource{Group: "v1", Resource: "resourcequotas"}, "", nil))) + assert.Contains(t, hook.LastEntry().Message, "Transient error:") }) t.Run("ResourceQuotaTimeoutErr", func(t *testing.T) { assert.False(t, IsTransientErr(apierr.NewInternalError(errors.New("")))) From a0b5f341816ddcc9884c2f1c3e6ac2697a54af92 Mon Sep 17 00:00:00 2001 From: Mason Malone <651224+MasonM@users.noreply.github.com> Date: Mon, 18 Nov 2024 22:40:19 -0800 Subject: [PATCH 2/2] test: increase timeout Signed-off-by: Mason Malone <651224+MasonM@users.noreply.github.com> --- test/e2e/signals_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/signals_test.go b/test/e2e/signals_test.go index 132e99a60c49..13a10020b91c 100644 --- a/test/e2e/signals_test.go +++ b/test/e2e/signals_test.go @@ -150,7 +150,7 @@ func (s *SignalsSuite) TestSignaledContainerSet() { Workflow("@testdata/signaled-container-set-workflow.yaml"). When(). SubmitWorkflow(). - WaitForWorkflow(). + WaitForWorkflow(killDuration). Then(). ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { assert.Equal(t, wfv1.WorkflowFailed, status.Phase)