diff --git a/api/jsonschema/schema.json b/api/jsonschema/schema.json index d96eb25b7bc3..f6cd69c3cbbe 100644 --- a/api/jsonschema/schema.json +++ b/api/jsonschema/schema.json @@ -4770,6 +4770,7 @@ "type": "object" }, "io.argoproj.workflow.v1alpha1.ContainerSetRetryStrategy": { + "description": "ContainerSetRetryStrategy provides controls on how to retry a container set", "properties": { "duration": { "description": "Duration is the time between each retry, examples values are \"300ms\", \"1s\" or \"5m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\".", @@ -4777,7 +4778,7 @@ }, "retries": { "$ref": "#/definitions/io.k8s.apimachinery.pkg.util.intstr.IntOrString", - "description": "Nbr of retries" + "description": "Retries is the maximum number of retry attempts for each container. It does not include the first, original attempt; the maximum number of total attempts will be `retries + 1`." } }, "required": [ @@ -4795,7 +4796,7 @@ }, "retryStrategy": { "$ref": "#/definitions/io.argoproj.workflow.v1alpha1.ContainerSetRetryStrategy", - "description": "RetryStrategy describes how to retry a container nodes in the container set if it fails. Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set." + "description": "RetryStrategy describes how to retry container nodes if the container set fails. Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers." 
}, "volumeMounts": { "items": { diff --git a/api/openapi-spec/swagger.json b/api/openapi-spec/swagger.json index f2dcef958b36..483615edffd0 100644 --- a/api/openapi-spec/swagger.json +++ b/api/openapi-spec/swagger.json @@ -8719,6 +8719,7 @@ } }, "io.argoproj.workflow.v1alpha1.ContainerSetRetryStrategy": { + "description": "ContainerSetRetryStrategy provides controls on how to retry a container set", "type": "object", "required": [ "retries" @@ -8729,7 +8730,7 @@ "type": "string" }, "retries": { - "description": "Nbr of retries", + "description": "Retries is the maximum number of retry attempts for each container. It does not include the first, original attempt; the maximum number of total attempts will be `retries + 1`.", "$ref": "#/definitions/io.k8s.apimachinery.pkg.util.intstr.IntOrString" } } @@ -8747,7 +8748,7 @@ } }, "retryStrategy": { - "description": "RetryStrategy describes how to retry a container nodes in the container set if it fails. Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set.", + "description": "RetryStrategy describes how to retry container nodes if the container set fails. Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers.", "$ref": "#/definitions/io.argoproj.workflow.v1alpha1.ContainerSetRetryStrategy" }, "volumeMounts": { diff --git a/docs/container-set-template.md b/docs/container-set-template.md index d27cb27df95b..2108fa3d08b9 100644 --- a/docs/container-set-template.md +++ b/docs/container-set-template.md @@ -116,3 +116,73 @@ Example B: Lopsided requests, e.g. `a -> b` where `a` is cheap and `b` is expens Can you see the problem here? `a` only has small requests, but the container set will use the total of all requests. So it's as if you're using all that GPU for 10h. This will be expensive. Solution: do not use container set when you have lopsided requests. 
+ +## Inner `retryStrategy` usage + +> v3.3 and after + +You can set an inner `retryStrategy` to apply to all containers of a container set, including the `duration` between each retry and the total number of `retries`. + +See an example below: + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + name: containerset-with-retrystrategy + annotations: + workflows.argoproj.io/description: | + This workflow creates a container set with a retryStrategy. +spec: + entrypoint: containerset-retrystrategy-example + templates: + - name: containerset-retrystrategy-example + containerSet: + retryStrategy: + retries: "10" # if fails, retry at most ten times + duration: 30s # wait 30s between each retry + containers: + # this container completes successfully, so it won't be retried. + - name: success + image: python:alpine3.6 + command: + - python + - -c + args: + - | + print("hi") + # if fails, it will retry at most ten times. + - name: fail-retry + image: python:alpine3.6 + command: ["python", -c] + # fail with a 66% probability + args: ["import random; import sys; exit_code = random.choice([0, 1, 1]); sys.exit(exit_code)"] +``` + + + +!!! Note "Template-level `retryStrategy` vs Container Set `retryStrategy`" + `containerSet.retryStrategy` works differently from [template-level retries](retries.md): + + 1. Your `command` will be re-run by the Executor inside the same container if it fails. + + - As no new containers are created, the nodes in the UI remain the same, and the retried logs are appended to the original container's logs. 
For example, your container logs may look like: + ```text + time="2024-03-29T06:40:25 UTC" level=info msg="capturing logs" argo=true + intentional failure + time="2024-03-29T06:40:25 UTC" level=debug msg="ignore signal child exited" argo=true + time="2024-03-29T06:40:26 UTC" level=info msg="capturing logs" argo=true + time="2024-03-29T06:40:26 UTC" level=debug msg="ignore signal urgent I/O condition" argo=true + intentional failure + time="2024-03-29T06:40:26 UTC" level=debug msg="ignore signal child exited" argo=true + time="2024-03-29T06:40:26 UTC" level=debug msg="forwarding signal terminated" argo=true + time="2024-03-29T06:40:27 UTC" level=info msg="sub-process exited" argo=true error="" + time="2024-03-29T06:40:27 UTC" level=info msg="not saving outputs - not main container" argo=true + Error: exit status 1 + ``` + + 1. If a container's `command` cannot be located, it will not be retried. + + - As it will fail each time, the retry logic is short-circuited. + + diff --git a/docs/executor_swagger.md b/docs/executor_swagger.md index e159c9f26106..e78cbb50df6b 100644 --- a/docs/executor_swagger.md +++ b/docs/executor_swagger.md @@ -1026,10 +1026,13 @@ referred to by services. ### ContainerSetRetryStrategy +> ContainerSetRetryStrategy provides controls on how to retry a container set + + **Properties** | Name | Type | Go type | Required | Default | Description | Example | diff --git a/docs/fields.md b/docs/fields.md index 2eda10c6e4e8..880604d9c1e4 100644 --- a/docs/fields.md +++ b/docs/fields.md @@ -2411,7 +2411,7 @@ _No description available_ | Field Name | Field Type | Description | |:----------:|:----------:|---------------| |`containers`|`Array<`[`ContainerNode`](#containernode)`>`|_No description available_| -|`retryStrategy`|[`ContainerSetRetryStrategy`](#containersetretrystrategy)|RetryStrategy describes how to retry a container nodes in the container set if it fails. 
Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set.| +|`retryStrategy`|[`ContainerSetRetryStrategy`](#containersetretrystrategy)|RetryStrategy describes how to retry container nodes if the container set fails. Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers.| |`volumeMounts`|`Array<`[`VolumeMount`](#volumemount)`>`|_No description available_| ## DAGTemplate @@ -3748,7 +3748,7 @@ _No description available_ ## ContainerSetRetryStrategy -_No description available_ +ContainerSetRetryStrategy provides controls on how to retry a container set
Examples with this field (click to open) @@ -3780,7 +3780,7 @@ _No description available_ | Field Name | Field Type | Description | |:----------:|:----------:|---------------| |`duration`|`string`|Duration is the time between each retry, examples values are "300ms", "1s" or "5m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".| -|`retries`|[`IntOrString`](#intorstring)|Nbr of retries| +|`retries`|[`IntOrString`](#intorstring)|Retries is the maximum number of retry attempts for each container. It does not include the first, original attempt; the maximum number of total attempts will be `retries + 1`.| ## DAGTask diff --git a/pkg/apis/workflow/v1alpha1/container_set_template_types.go b/pkg/apis/workflow/v1alpha1/container_set_template_types.go index fb685a3e81f5..ac1a4f44205f 100644 --- a/pkg/apis/workflow/v1alpha1/container_set_template_types.go +++ b/pkg/apis/workflow/v1alpha1/container_set_template_types.go @@ -12,16 +12,18 @@ import ( type ContainerSetTemplate struct { Containers []ContainerNode `json:"containers" protobuf:"bytes,4,rep,name=containers"` VolumeMounts []corev1.VolumeMount `json:"volumeMounts,omitempty" protobuf:"bytes,3,rep,name=volumeMounts"` - // RetryStrategy describes how to retry a container nodes in the container set if it fails. - // Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set. + // RetryStrategy describes how to retry container nodes if the container set fails. + // Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers. RetryStrategy *ContainerSetRetryStrategy `json:"retryStrategy,omitempty" protobuf:"bytes,5,opt,name=retryStrategy"` } +// ContainerSetRetryStrategy provides controls on how to retry a container set type ContainerSetRetryStrategy struct { // Duration is the time between each retry, examples values are "300ms", "1s" or "5m". 
// Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". Duration string `json:"duration,omitempty" protobuf:"bytes,1,opt,name=duration"` - // Nbr of retries + // Retries is the maximum number of retry attempts for each container. It does not include the + // first, original attempt; the maximum number of total attempts will be `retries + 1`. Retries *intstr.IntOrString `json:"retries" protobuf:"bytes,2,rep,name=retries"` } diff --git a/pkg/apis/workflow/v1alpha1/generated.proto b/pkg/apis/workflow/v1alpha1/generated.proto index 221f9f64d9c9..f3910066d35b 100644 --- a/pkg/apis/workflow/v1alpha1/generated.proto +++ b/pkg/apis/workflow/v1alpha1/generated.proto @@ -405,12 +405,14 @@ message ContainerNode { repeated string dependencies = 2; } +// ContainerSetRetryStrategy provides controls on how to retry a container set message ContainerSetRetryStrategy { // Duration is the time between each retry, examples values are "300ms", "1s" or "5m". // Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". optional string duration = 1; - // Nbr of retries + // Retries is the maximum number of retry attempts for each container. It does not include the + // first, original attempt; the maximum number of total attempts will be `retries + 1`. optional k8s.io.apimachinery.pkg.util.intstr.IntOrString retries = 2; } @@ -419,8 +421,8 @@ message ContainerSetTemplate { repeated k8s.io.api.core.v1.VolumeMount volumeMounts = 3; - // RetryStrategy describes how to retry a container nodes in the container set if it fails. - // Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set. + // RetryStrategy describes how to retry container nodes if the container set fails. + // Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers. 
optional ContainerSetRetryStrategy retryStrategy = 5; } diff --git a/pkg/apis/workflow/v1alpha1/openapi_generated.go b/pkg/apis/workflow/v1alpha1/openapi_generated.go index 8f025f9528fc..07904b8206fd 100644 --- a/pkg/apis/workflow/v1alpha1/openapi_generated.go +++ b/pkg/apis/workflow/v1alpha1/openapi_generated.go @@ -1930,7 +1930,8 @@ func schema_pkg_apis_workflow_v1alpha1_ContainerSetRetryStrategy(ref common.Refe return common.OpenAPIDefinition{ Schema: spec.Schema{ SchemaProps: spec.SchemaProps{ - Type: []string{"object"}, + Description: "ContainerSetRetryStrategy provides controls on how to retry a container set", + Type: []string{"object"}, Properties: map[string]spec.Schema{ "duration": { SchemaProps: spec.SchemaProps{ @@ -1941,7 +1942,7 @@ func schema_pkg_apis_workflow_v1alpha1_ContainerSetRetryStrategy(ref common.Refe }, "retries": { SchemaProps: spec.SchemaProps{ - Description: "Nbr of retries", + Description: "Retries is the maximum number of retry attempts for each container. It does not include the first, original attempt; the maximum number of total attempts will be `retries + 1`.", Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), }, }, @@ -1988,7 +1989,7 @@ func schema_pkg_apis_workflow_v1alpha1_ContainerSetTemplate(ref common.Reference }, "retryStrategy": { SchemaProps: spec.SchemaProps{ - Description: "RetryStrategy describes how to retry a container nodes in the container set if it fails. Nbr of retries(default 0) and sleep duration between retries(default 0s, instant retry) can be set.", + Description: "RetryStrategy describes how to retry container nodes if the container set fails. 
Note that this works differently from the template-level `retryStrategy` as it is a process-level retry that does not create new Pods or containers.", Ref: ref("github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1.ContainerSetRetryStrategy"), }, }, diff --git a/pkg/plugins/executor/swagger.yml b/pkg/plugins/executor/swagger.yml index ef4de7072265..5e44fc6844fe 100644 --- a/pkg/plugins/executor/swagger.yml +++ b/pkg/plugins/executor/swagger.yml @@ -1002,6 +1002,8 @@ definitions: title: ContainerPort represents a network port in a single container. type: object ContainerSetRetryStrategy: + description: ContainerSetRetryStrategy provides controls on how to retry a container + set properties: duration: description: |- diff --git a/sdks/java/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md b/sdks/java/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md index 19c11c6bb471..6a059f3b0315 100644 --- a/sdks/java/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md +++ b/sdks/java/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md @@ -2,6 +2,7 @@ # IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy +ContainerSetRetryStrategy provides controls on how to retry a container set ## Properties diff --git a/sdks/python/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md b/sdks/python/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md index 50e09bc46bd8..b7bfad4cd414 100644 --- a/sdks/python/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md +++ b/sdks/python/client/docs/IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy.md @@ -1,5 +1,6 @@ # IoArgoprojWorkflowV1alpha1ContainerSetRetryStrategy +ContainerSetRetryStrategy provides controls on how to retry a container set ## Properties Name | Type | Description | Notes diff --git a/test/e2e/retry_test.go b/test/e2e/retry_test.go index 740ef42d1967..bc1ad53e928e 100644 --- a/test/e2e/retry_test.go +++ 
b/test/e2e/retry_test.go @@ -4,6 +4,9 @@ package e2e import ( + "context" + "io" + "strings" "testing" "time" @@ -120,6 +123,77 @@ spec: }) } +func (s *RetryTestSuite) TestWorkflowTemplateWithRetryStrategyInContainerSet() { + var name string + var ns string + s.Given(). + WorkflowTemplate("@testdata/workflow-template-with-containerset.yaml"). + Workflow(` +metadata: + name: workflow-template-containerset +spec: + workflowTemplateRef: + name: containerset-with-retrystrategy +`). + When(). + CreateWorkflowTemplates(). + SubmitWorkflow(). + WaitForWorkflow(fixtures.ToBeFailed). + Then(). + ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { + assert.Equal(t, status.Phase, wfv1.WorkflowFailed) + }). + ExpectWorkflowNode(func(status v1alpha1.NodeStatus) bool { + return status.Name == "workflow-template-containerset" + }, func(t *testing.T, status *v1alpha1.NodeStatus, pod *apiv1.Pod) { + name = pod.GetName() + ns = pod.GetNamespace() + }) + // Success, no need retry + s.Run("ContainerLogs", func() { + ctx := context.Background() + podLogOptions := &apiv1.PodLogOptions{Container: "c1"} + stream, err := s.KubeClient.CoreV1().Pods(ns).GetLogs(name, podLogOptions).Stream(ctx) + assert.Nil(s.T(), err) + defer stream.Close() + logBytes, err := io.ReadAll(stream) + assert.Nil(s.T(), err) + output := string(logBytes) + count := strings.Count(output, "capturing logs") + assert.Equal(s.T(), 1, count) + assert.Contains(s.T(), output, "hi") + }) + // Command err. No retry logic is entered. 
+ s.Run("ContainerLogs", func() { + ctx := context.Background() + podLogOptions := &apiv1.PodLogOptions{Container: "c2"} + stream, err := s.KubeClient.CoreV1().Pods(ns).GetLogs(name, podLogOptions).Stream(ctx) + assert.Nil(s.T(), err) + defer stream.Close() + logBytes, err := io.ReadAll(stream) + assert.Nil(s.T(), err) + output := string(logBytes) + count := strings.Count(output, "capturing logs") + assert.Equal(s.T(), 0, count) + assert.Contains(s.T(), output, "executable file not found in $PATH") + }) + // Retry when err. + s.Run("ContainerLogs", func() { + ctx := context.Background() + podLogOptions := &apiv1.PodLogOptions{Container: "c3"} + stream, err := s.KubeClient.CoreV1().Pods(ns).GetLogs(name, podLogOptions).Stream(ctx) + assert.Nil(s.T(), err) + defer stream.Close() + logBytes, err := io.ReadAll(stream) + assert.Nil(s.T(), err) + output := string(logBytes) + count := strings.Count(output, "capturing logs") + assert.Equal(s.T(), 2, count) + countFailureInfo := strings.Count(output, "intentional failure") + assert.Equal(s.T(), 2, countFailureInfo) + }) +} + func TestRetrySuite(t *testing.T) { suite.Run(t, new(RetryTestSuite)) } diff --git a/test/e2e/testdata/workflow-template-with-containerset.yaml b/test/e2e/testdata/workflow-template-with-containerset.yaml new file mode 100644 index 000000000000..b2f4c32a880a --- /dev/null +++ b/test/e2e/testdata/workflow-template-with-containerset.yaml @@ -0,0 +1,32 @@ +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: containerset-with-retrystrategy + annotations: + workflows.argoproj.io/description: | + This workflow creates a container set with a retryStrategy. 
+spec: + entrypoint: test + templates: + - name: test + containerSet: + retryStrategy: + retries: "2" + containers: + - name: c1 + image: python:alpine3.6 + command: + - python + - -c + args: + - | + print("hi") + - name: c2 + image: python:alpine3.6 + command: + - invalid + - command + - name: c3 + image: alpine:latest + command: [ sh, -c ] + args: [ "echo intentional failure; exit 1" ] \ No newline at end of file