From b19870d737a14b21d86f6267642a63dd14e5acd5 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sun, 20 Nov 2022 18:47:19 -0500 Subject: [PATCH] fix(operator): Workflow stuck at running when init container failed. Fixes #10045 (#10047) Signed-off-by: Yuan Tang Signed-off-by: Saravanan Balasubramanian --- workflow/controller/operator.go | 17 +++++++++++++++++ workflow/controller/operator_test.go | 26 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index d50b9baf19ba..1c52ec84ba8e 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1277,6 +1277,23 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus new.Outputs.ExitCode = pointer.StringPtr(fmt.Sprint(*exitCode)) } + // We cannot fail the node until the wait container is finished because it may be busy saving outputs, and these + // would not get captured successfully. + for _, c := range pod.Status.ContainerStatuses { + if c.Name == common.WaitContainerName && c.State.Terminated == nil && new.Phase.Completed() { + woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ") + new.Phase = old.Phase + } + } + // If the init container failed, we should mark the node as failed. + for _, c := range pod.Status.InitContainerStatuses { + if c.State.Terminated != nil && int(c.State.Terminated.ExitCode) != 0 { + new.Phase = wfv1.NodeFailed + woc.log.WithField("new.phase", new.Phase).Info("marking node as failed since init container has non-zero exit code") + break + } + } + // if we are transitioning from Pending to a different state, clear out unchanged message if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message { new.Message = "" diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index f9710528ba09..5297d001b6e2 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -1332,6 +1332,32 @@ func TestAssessNodeStatus(t *testing.T) { }, node: &wfv1.NodeStatus{TemplateName: templateName}, want: wfv1.NodeFailed, + }, { + name: "pod failed - init container failed", + pod: &apiv1.Pod{ + Status: apiv1.PodStatus{ + InitContainerStatuses: []apiv1.ContainerStatus{ + { + Name: common.InitContainerName, + State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 1}}, + }, + }, + ContainerStatuses: []apiv1.ContainerStatus{ + { + Name: common.WaitContainerName, + State: apiv1.ContainerState{Terminated: nil}, + }, + { + Name: common.MainContainerName, + State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 0}}, + }, + }, + Message: "failed since init container failed", + Phase: apiv1.PodFailed, + }, + }, + node: &wfv1.NodeStatus{TemplateName: templateName}, + want: wfv1.NodeFailed, }, { name: "pod running", pod: &apiv1.Pod{