From b1b6d7dc46fbe7bfc8d4f618335616c3eb245a33 Mon Sep 17 00:00:00 2001 From: isubasinghe Date: Fri, 25 Oct 2024 12:23:28 +1100 Subject: [PATCH 1/5] fix: check if pod exists instead of error message Signed-off-by: isubasinghe --- workflow/controller/taskresult.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index 8118084dd360..146ef710e2d0 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -84,8 +84,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() { if err != nil { continue } + // Mark task result as completed if it has no chance to be completed. - if label == "false" && old.IsPodDeleted() { + if label == "false" && !woc.nodePodExist(*old) { if recentlyDeleted(old) { woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.") // If the pod was deleted, then it is possible that the controller never get another informer message about it. From 29c9ff6e6275c8cbb7fa72c4e9f59b5032ec8ac7 Mon Sep 17 00:00:00 2001 From: isubasinghe Date: Fri, 25 Oct 2024 13:46:48 +1100 Subject: [PATCH 2/5] fix: remove IsPodDeleted Signed-off-by: isubasinghe --- pkg/apis/workflow/v1alpha1/workflow_types.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/apis/workflow/v1alpha1/workflow_types.go b/pkg/apis/workflow/v1alpha1/workflow_types.go index e9b824139430..c90ebd76dd38 100644 --- a/pkg/apis/workflow/v1alpha1/workflow_types.go +++ b/pkg/apis/workflow/v1alpha1/workflow_types.go @@ -2409,11 +2409,6 @@ func (n NodeStatus) IsExitNode() bool { return strings.HasSuffix(n.DisplayName, ".onExit") } -// IsPodDeleted returns whether node is error with pod deleted. -func (n NodeStatus) IsPodDeleted() bool { - return n.Phase == NodeError && n.Message == "pod deleted" -} - func (n NodeStatus) Succeeded() bool { return n.Phase == NodeSucceeded } From a8dc360206fd64b1983c798b4f3e09f74d2204b9 Mon Sep 17 00:00:00 2001 From: isubasinghe Date: Mon, 28 Oct 2024 22:32:33 +1100 Subject: [PATCH 3/5] fix: increase timeout to 2 minutes, use started time Signed-off-by: isubasinghe --- docs/environment-variables.md | 2 +- workflow/controller/taskresult.go | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 0a5e1533bc4f..bcb2d409e604 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -45,7 +45,7 @@ This document outlines environment variables that can be used to customize behav | `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. | | `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. | | `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. | -| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently deleted. | +| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `2m` | The duration of a pod before the pod is considered to be recently deleted. | | `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. | | `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. | | `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. | diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index 146ef710e2d0..1a5f8fa0906c 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex } func recentlyDeleted(node *wfv1.NodeStatus) bool { - return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second) + return time.Since(node.StartedAt.Time) >= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) } func (woc *wfOperationCtx) taskResultReconciliation() { @@ -94,10 +94,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() { // workflow will not update for 20m. Requeuing here prevents that happening. woc.requeue() continue - } else { - woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.") - woc.wf.Status.MarkTaskResultComplete(nodeID) } + woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.") + woc.wf.Status.MarkTaskResultComplete(nodeID) } newNode := old.DeepCopy() if result.Outputs.HasOutputs() { From 96282c7d59be296868ae8620e8b2d6eb1fc1c77d Mon Sep 17 00:00:00 2001 From: isubasinghe Date: Mon, 28 Oct 2024 22:40:32 +1100 Subject: [PATCH 4/5] fix: inverted logic Signed-off-by: isubasinghe --- workflow/controller/taskresult.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index 1a5f8fa0906c..d806272dae77 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex } func recentlyDeleted(node *wfv1.NodeStatus) bool { - return time.Since(node.StartedAt.Time) >= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) + return time.Since(node.StartedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) } func (woc *wfOperationCtx) taskResultReconciliation() { From 47e9ee1356228c249b449704de3bac0af5783928 Mon Sep 17 00:00:00 2001 From: isubasinghe Date: Mon, 28 Oct 2024 23:11:30 +1100 Subject: [PATCH 5/5] fix: use FinishedAt Signed-off-by: isubasinghe --- workflow/controller/taskresult.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go index d806272dae77..d6c003d0b51a 100644 --- a/workflow/controller/taskresult.go +++ b/workflow/controller/taskresult.go @@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex } func recentlyDeleted(node *wfv1.NodeStatus) bool { - return time.Since(node.StartedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) + return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute) } func (woc *wfOperationCtx) taskResultReconciliation() { @@ -86,7 +86,7 @@ func (woc *wfOperationCtx) taskResultReconciliation() { } // Mark task result as completed if it has no chance to be completed. - if label == "false" && !woc.nodePodExist(*old) { + if label == "false" && old.Completed() && !woc.nodePodExist(*old) { if recentlyDeleted(old) { woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.") // If the pod was deleted, then it is possible that the controller never get another informer message about it.