Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: mark taskresult complete when failed or error. Fixes #12993, Fixes #13533 #13798

Merged
merged 5 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ This document outlines environment variables that can be used to customize behav
| `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. |
| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. |
| `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. |
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently deleted. |
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `2m` | The duration of a pod before the pod is considered to be recently deleted. |
| `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. |
| `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. |
| `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. |
Expand Down
5 changes: 0 additions & 5 deletions pkg/apis/workflow/v1alpha1/workflow_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2409,11 +2409,6 @@ func (n NodeStatus) IsExitNode() bool {
return strings.HasSuffix(n.DisplayName, ".onExit")
}

// IsPodDeleted returns whether node is error with pod deleted.
func (n NodeStatus) IsPodDeleted() bool {
return n.Phase == NodeError && n.Message == "pod deleted"
}

func (n NodeStatus) Succeeded() bool {
return n.Phase == NodeSucceeded
}
Expand Down
10 changes: 5 additions & 5 deletions workflow/controller/taskresult.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
}

func recentlyDeleted(node *wfv1.NodeStatus) bool {
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second)
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
}

func (woc *wfOperationCtx) taskResultReconciliation() {
Expand Down Expand Up @@ -84,19 +84,19 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
if err != nil {
continue
}

// Mark task result as completed if it has no chance to be completed.
if label == "false" && old.IsPodDeleted() {
if label == "false" && old.Completed() && !woc.nodePodExist(*old) {
if recentlyDeleted(old) {
woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
// If the pod was deleted, then it is possible that the controller never get another informer message about it.
// In this case, the workflow will only be requeued after the resync period (20m). This means
// workflow will not update for 20m. Requeuing here prevents that happening.
woc.requeue()
continue
} else {
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
woc.wf.Status.MarkTaskResultComplete(nodeID)
}
newNode := old.DeepCopy()
if result.Outputs.HasOutputs() {
Expand Down
Loading