From 8de0afaebde97dff0dd0be8c1cc269cbd134afb4 Mon Sep 17 00:00:00 2001
From: henrywangx
Date: Mon, 20 Dec 2021 21:04:14 +0800
Subject: [PATCH] fix: add retry for kill pod

Signed-off-by: henrywangx
---
 cmd/argoexec/commands/wait.go      | 20 +++++++++++++++++++-
 workflow/executor/k8sapi/client.go | 24 +++++++++++++++++++-----
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/cmd/argoexec/commands/wait.go b/cmd/argoexec/commands/wait.go
index 045580e060d4..e84424a9ef90 100644
--- a/cmd/argoexec/commands/wait.go
+++ b/cmd/argoexec/commands/wait.go
@@ -7,6 +7,10 @@ import (
 	"github.com/argoproj/pkg/stats"
 	log "github.com/sirupsen/logrus"
 	"github.com/spf13/cobra"
+	"k8s.io/apimachinery/pkg/util/wait"
+
+	errorsutil "github.com/argoproj/argo-workflows/v3/util/errors"
+	"github.com/argoproj/argo-workflows/v3/workflow/executor"
 )
 
 func NewWaitCommand() *cobra.Command {
@@ -31,7 +35,21 @@ func waitContainer(ctx context.Context) error {
 	stats.StartStatsTicker(5 * time.Minute)
 
 	defer func() {
-		if err := wfExecutor.KillSidecars(ctx); err != nil {
+		// Killing sidecar containers
+		retryCnt := 0
+		err := wait.ExponentialBackoff(executor.ExecutorRetry, func() (bool, error) {
+			err := wfExecutor.KillSidecars(ctx)
+			if err == nil {
+				return true, nil
+			}
+			if errorsutil.IsTransientErr(err) {
+				log.WithError(err).WithField("retryCnt", retryCnt).Warn("fail to kill sidecar")
+				retryCnt++
+				return false, nil
+			}
+			return false, err
+		})
+		if err != nil {
 			wfExecutor.AddError(err)
 		}
 	}()
diff --git a/workflow/executor/k8sapi/client.go b/workflow/executor/k8sapi/client.go
index d7190bfb03ad..4efa0f56bdf7 100644
--- a/workflow/executor/k8sapi/client.go
+++ b/workflow/executor/k8sapi/client.go
@@ -20,6 +20,7 @@ import (
 	errorsutil "github.com/argoproj/argo-workflows/v3/util/errors"
 	waitutil "github.com/argoproj/argo-workflows/v3/util/wait"
 	"github.com/argoproj/argo-workflows/v3/workflow/common"
+	"github.com/argoproj/argo-workflows/v3/workflow/executor"
 	execcommon "github.com/argoproj/argo-workflows/v3/workflow/executor/common"
 )
 
@@ -110,11 +111,24 @@ func (c *k8sAPIClient) GetContainerStatuses(ctx context.Context) (*corev1.Pod, [
 
+// KillContainer execs `kill -<sig> 1` inside the given container, retrying
+// transient exec failures with the executor.ExecutorRetry backoff.
 func (c *k8sAPIClient) KillContainer(pod *corev1.Pod, container *corev1.ContainerStatus, sig syscall.Signal) error {
 	command := []string{"/bin/sh", "-c", fmt.Sprintf("kill -%d 1", sig)}
-	exec, err := common.ExecPodContainer(c.config, c.namespace, c.podName, container.Name, true, true, command...)
-	if err != nil {
-		return err
-	}
-	_, _, err = common.GetExecutorOutput(exec)
+	err := wait.ExponentialBackoff(executor.ExecutorRetry, func() (bool, error) {
+		exec, err := common.ExecPodContainer(c.config, c.namespace, c.podName, container.Name, true, true, command...)
+		if err != nil {
+			// Retry only transient failures; a permanent error must reach the
+			// caller instead of being masked by the backoff timeout error.
+			if errorsutil.IsTransientErr(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		_, _, err = common.GetExecutorOutput(exec)
+		if err != nil {
+			return false, err
+		}
+		return true, nil
+	})
 	return err
 }
 