From b1b6d7dc46fbe7bfc8d4f618335616c3eb245a33 Mon Sep 17 00:00:00 2001
From: isubasinghe <isitha@pipekit.io>
Date: Fri, 25 Oct 2024 12:23:28 +1100
Subject: [PATCH 1/5] fix: check if pod exists instead of error message

Signed-off-by: isubasinghe <isitha@pipekit.io>
---
 workflow/controller/taskresult.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go
index 8118084dd360..146ef710e2d0 100644
--- a/workflow/controller/taskresult.go
+++ b/workflow/controller/taskresult.go
@@ -84,8 +84,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
 		if err != nil {
 			continue
 		}
+
 		// Mark task result as completed if it has no chance to be completed.
-		if label == "false" && old.IsPodDeleted() {
+		if label == "false" && !woc.nodePodExist(*old) {
 			if recentlyDeleted(old) {
 				woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
 				// If the pod was deleted, then it is possible that the controller never get another informer message about it.

From 29c9ff6e6275c8cbb7fa72c4e9f59b5032ec8ac7 Mon Sep 17 00:00:00 2001
From: isubasinghe <isitha@pipekit.io>
Date: Fri, 25 Oct 2024 13:46:48 +1100
Subject: [PATCH 2/5] fix: remove IsPodDeleted

Signed-off-by: isubasinghe <isitha@pipekit.io>
---
 pkg/apis/workflow/v1alpha1/workflow_types.go | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pkg/apis/workflow/v1alpha1/workflow_types.go b/pkg/apis/workflow/v1alpha1/workflow_types.go
index e9b824139430..c90ebd76dd38 100644
--- a/pkg/apis/workflow/v1alpha1/workflow_types.go
+++ b/pkg/apis/workflow/v1alpha1/workflow_types.go
@@ -2409,11 +2409,6 @@ func (n NodeStatus) IsExitNode() bool {
 	return strings.HasSuffix(n.DisplayName, ".onExit")
 }
 
-// IsPodDeleted returns whether node is error with pod deleted.
-func (n NodeStatus) IsPodDeleted() bool {
-	return n.Phase == NodeError && n.Message == "pod deleted"
-}
-
 func (n NodeStatus) Succeeded() bool {
 	return n.Phase == NodeSucceeded
 }

From a8dc360206fd64b1983c798b4f3e09f74d2204b9 Mon Sep 17 00:00:00 2001
From: isubasinghe <isitha@pipekit.io>
Date: Mon, 28 Oct 2024 22:32:33 +1100
Subject: [PATCH 3/5] fix: increase timeout to 2 minutes, use started time

Signed-off-by: isubasinghe <isitha@pipekit.io>
---
 docs/environment-variables.md     | 2 +-
 workflow/controller/taskresult.go | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/environment-variables.md b/docs/environment-variables.md
index 0a5e1533bc4f..bcb2d409e604 100644
--- a/docs/environment-variables.md
+++ b/docs/environment-variables.md
@@ -45,7 +45,7 @@ This document outlines environment variables that can be used to customize behav
 | `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int`               | `6`                                                                                         | The number of buckets to collect the metric for the operation duration.                                                                                                                                                                                                  |
 | `POD_NAMES`                              | `string`            | `v2`                                                                                        | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server.                                                                                                                                                |
 | `RECENTLY_STARTED_POD_DURATION`          | `time.Duration`     | `10s`                                                                                       | The duration of a pod before the pod is considered to be recently started.                                                                                                                                                                                               |
-| `RECENTLY_DELETED_POD_DURATION`          | `time.Duration`     | `10s`                                                                                       | The duration of a pod before the pod is considered to be recently deleted.                                                                                                                                                                                               |
+| `RECENTLY_DELETED_POD_DURATION`          | `time.Duration`     | `2m`                                                                                       | The duration of a pod before the pod is considered to be recently deleted.                                                                                                                                                                                               |
 | `RETRY_BACKOFF_DURATION`                 | `time.Duration`     | `10ms`                                                                                      | The retry back-off duration when retrying API calls.                                                                                                                                                                                                                     |
 | `RETRY_BACKOFF_FACTOR`                   | `float`             | `2.0`                                                                                       | The retry back-off factor when retrying API calls.                                                                                                                                                                                                                       |
 | `RETRY_BACKOFF_STEPS`                    | `int`               | `5`                                                                                         | The retry back-off steps when retrying API calls.                                                                                                                                                                                                                        |
diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go
index 146ef710e2d0..1a5f8fa0906c 100644
--- a/workflow/controller/taskresult.go
+++ b/workflow/controller/taskresult.go
@@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
 }
 
 func recentlyDeleted(node *wfv1.NodeStatus) bool {
-	return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second)
+	return time.Since(node.StartedAt.Time) >= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
 }
 
 func (woc *wfOperationCtx) taskResultReconciliation() {
@@ -94,10 +94,9 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
 				// workflow will not update for 20m. Requeuing here prevents that happening.
 				woc.requeue()
 				continue
-			} else {
-				woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
-				woc.wf.Status.MarkTaskResultComplete(nodeID)
 			}
+			woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
+			woc.wf.Status.MarkTaskResultComplete(nodeID)
 		}
 		newNode := old.DeepCopy()
 		if result.Outputs.HasOutputs() {

From 96282c7d59be296868ae8620e8b2d6eb1fc1c77d Mon Sep 17 00:00:00 2001
From: isubasinghe <isitha@pipekit.io>
Date: Mon, 28 Oct 2024 22:40:32 +1100
Subject: [PATCH 4/5] fix: inverted logic

Signed-off-by: isubasinghe <isitha@pipekit.io>
---
 workflow/controller/taskresult.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go
index 1a5f8fa0906c..d806272dae77 100644
--- a/workflow/controller/taskresult.go
+++ b/workflow/controller/taskresult.go
@@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
 }
 
 func recentlyDeleted(node *wfv1.NodeStatus) bool {
-	return time.Since(node.StartedAt.Time) >= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
+	return time.Since(node.StartedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
 }
 
 func (woc *wfOperationCtx) taskResultReconciliation() {

From 47e9ee1356228c249b449704de3bac0af5783928 Mon Sep 17 00:00:00 2001
From: isubasinghe <isitha@pipekit.io>
Date: Mon, 28 Oct 2024 23:11:30 +1100
Subject: [PATCH 5/5] fix: use FinishedAt

Signed-off-by: isubasinghe <isitha@pipekit.io>
---
 workflow/controller/taskresult.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go
index d806272dae77..d6c003d0b51a 100644
--- a/workflow/controller/taskresult.go
+++ b/workflow/controller/taskresult.go
@@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
 }
 
 func recentlyDeleted(node *wfv1.NodeStatus) bool {
-	return time.Since(node.StartedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
+	return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
 }
 
 func (woc *wfOperationCtx) taskResultReconciliation() {
@@ -86,7 +86,7 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
 		}
 
 		// Mark task result as completed if it has no chance to be completed.
-		if label == "false" && !woc.nodePodExist(*old) {
+		if label == "false" && old.Completed() && !woc.nodePodExist(*old) {
 			if recentlyDeleted(old) {
 				woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
 				// If the pod was deleted, then it is possible that the controller never get another informer message about it.