Skip to content

Commit

Permalink
refactor step execution to distinguish between terminal and retryable…
Browse files Browse the repository at this point in the history
… errors/failures

Signed-off-by: Kent Rancourt <[email protected]>
  • Loading branch information
krancour committed Dec 11, 2024
1 parent 6c5a5ca commit 0fac579
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 25 deletions.
79 changes: 60 additions & 19 deletions internal/directives/simple_engine_promote.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,29 +112,70 @@ func (e *SimpleEngine) executeSteps(

// Execute the step
result, err := e.executeStep(ctx, promoCtx, step, reg, workDir, state)
stepExecMeta.Status = result.Status
stepExecMeta.Message = result.Message
state[step.Alias] = result.Output

switch result.Status {
case kargoapi.PromotionPhaseErrored, kargoapi.PromotionPhaseFailed,
kargoapi.PromotionPhaseRunning, kargoapi.PromotionPhaseSucceeded:
default:
// Deal with statuses that no step should have returned.
stepExecMeta.FinishedAt = ptr.To(metav1.Now())
return PromotionResult{
Status: kargoapi.PromotionPhaseErrored,
CurrentStep: i,
StepExecutionMetadata: stepExecMetas,
State: state,
HealthCheckSteps: healthChecks,
}, fmt.Errorf("step %d returned an invalid status", i)

Check warning on line 131 in internal/directives/simple_engine_promote.go

View check run for this annotation

Codecov / codecov/patch

internal/directives/simple_engine_promote.go#L122-L131

Added lines #L122 - L131 were not covered by tests
}

// Reconcile status and err...
if err != nil {
// Let a hard error take precedence over the result status and message.
stepExecMeta.Status = kargoapi.PromotionPhaseErrored
if stepExecMeta.Status != kargoapi.PromotionPhaseFailed {
// All states other than Errored and Failed should be mutually exclusive
// with a hard error. If we got to here, a step has violated this
// assumption. We will prioritize the error over the status and change
// the status to Errored.
stepExecMeta.Status = kargoapi.PromotionPhaseErrored
}
// Let the hard error take precedence over the message.
stepExecMeta.Message = err.Error()
} else {
stepExecMeta.Status = result.Status
stepExecMeta.Message = result.Message
} else if result.Status == kargoapi.PromotionPhaseErrored {
// A nil err should be mutually exclusive with an Errored status. If we
// got to here, a step has violated this assumption. We will prioritize
// the Errored status over the nil error and create an error.
message := stepExecMeta.Message
if message == "" {
message = "no details provided"
}
err = fmt.Errorf("step %d errored: %s", i, message)

Check warning on line 153 in internal/directives/simple_engine_promote.go

View check run for this annotation

Codecov / codecov/patch

internal/directives/simple_engine_promote.go#L146-L153

Added lines #L146 - L153 were not covered by tests
}
state[step.Alias] = result.Output

if stepExecMeta.Status == kargoapi.PromotionPhaseSucceeded {
// At this point, we've sorted out any discrepancies between the status and
// err.

switch {
case stepExecMeta.Status == kargoapi.PromotionPhaseSucceeded:
// Best case scenario: The step succeeded.
stepExecMeta.FinishedAt = ptr.To(metav1.Now())
if healthCheck := result.HealthCheckStep; healthCheck != nil {
healthChecks = append(healthChecks, *healthCheck)
}
continue // Move on to the next step
}

// Treat errors and logical failures the same for now.
// TODO(krancour): In the future, we should fail without retry for logical
// failures and unrecoverable errors and retry only those errors with a
// chance of recovery.
if stepExecMeta.Status != kargoapi.PromotionPhaseRunning {
case isTerminal(err):
// This is an unrecoverable error.
stepExecMeta.FinishedAt = ptr.To(metav1.Now())
return PromotionResult{
Status: stepExecMeta.Status,
CurrentStep: i,
StepExecutionMetadata: stepExecMetas,
State: state,
HealthCheckSteps: healthChecks,
}, fmt.Errorf("an unrecoverable error occurred: %w", err)
case err != nil:
// If we get to here, the error is POTENTIALLY recoverable.
stepExecMeta.ErrorCount++
// Check if the error threshold has been met.
errorThreshold := step.GetErrorThreshold(reg.Runner)
Expand All @@ -154,8 +195,8 @@ func (e *SimpleEngine) executeSteps(
}
}

// If we get to here, the step is either running (waiting for some external
// condition to be met) or it errored/failed but did not meet the error
// If we get to here, the step is either Running (waiting for some external
// condition to be met) or it Errored/Failed but did not meet the error
// threshold. Now we need to check if the timeout has elapsed. A nil timeout
// or any non-positive timeout interval are treated as NO timeout, although
// a nil timeout really shouldn't happen.
Expand All @@ -172,8 +213,8 @@ func (e *SimpleEngine) executeSteps(
}, fmt.Errorf("step %d timeout of %s has elapsed", i, timeout.String())
}

if stepExecMeta.Status != kargoapi.PromotionPhaseRunning {
// Treat the error/failure as if the step is still running so that the
if err != nil {
// Treat Errored/Failed as if the step is still running so that the
// Promotion will be requeued. The step will be retried on the next
// reconciliation.
stepExecMeta.Message += "; step will be retried"
Expand All @@ -186,7 +227,7 @@ func (e *SimpleEngine) executeSteps(
}, nil
}

// If we get to here, the step is still running (waiting for some external
// If we get to here, the step is still Running (waiting for some external
// condition to be met).
stepExecMeta.ErrorCount = 0 // Reset the error count
return PromotionResult{
Expand Down
82 changes: 76 additions & 6 deletions internal/directives/simple_engine_promote_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,35 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
},
},
{
name: "error on step execution; error threshold met",
name: "terminal error on step execution",
steps: []PromotionStep{
{Kind: "success-step", Alias: "step1"},
{Kind: "terminal-error-step", Alias: "step2"},
},
assertions: func(t *testing.T, result PromotionResult, err error) {
assert.ErrorContains(t, err, "an unrecoverable error occurred")
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status)
assert.Equal(t, int64(1), result.CurrentStep)
assert.Len(t, result.StepExecutionMetadata, 2)
assert.Equal(t, kargoapi.PromotionPhaseSucceeded, result.StepExecutionMetadata[0].Status)
assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt)
assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt)
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.StepExecutionMetadata[1].Status)
assert.NotNil(t, result.StepExecutionMetadata[1].StartedAt)
assert.NotNil(t, result.StepExecutionMetadata[1].FinishedAt)
assert.Contains(t, result.StepExecutionMetadata[1].Message, "something went wrong")

// Verify first step output is preserved in state
assert.Equal(t, State{
"step1": map[string]any{
"key": "value",
},
"step2": map[string]any(nil),
}, result.State)
},
},
{
name: "non-terminal error on step execution; error threshold met",
steps: []PromotionStep{
{Kind: "success-step", Alias: "step1"},
{Kind: "error-step", Alias: "step2"},
Expand Down Expand Up @@ -263,7 +291,7 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
},
},
{
name: "error on step execution; error threshold not met",
name: "non-terminal error on step execution; error threshold not met",
steps: []PromotionStep{
{
Kind: "error-step",
Expand All @@ -283,6 +311,37 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
assert.Contains(t, result.StepExecutionMetadata[0].Message, "will be retried")
},
},
{
name: "non-terminal error on step execution; timeout elapsed",
promoCtx: PromotionContext{
StepExecutionMetadata: kargoapi.StepExecutionMetadataList{{
// Start time is set to an hour ago
StartedAt: ptr.To(metav1.NewTime(time.Now().Add(-time.Hour))),
}},
},
steps: []PromotionStep{
{
Kind: "error-step",
Retry: &kargoapi.PromotionStepRetry{
ErrorThreshold: 3,
Timeout: &metav1.Duration{
Duration: time.Hour,
},
},
},
},
assertions: func(t *testing.T, result PromotionResult, err error) {
assert.ErrorContains(t, err, "timeout")
assert.ErrorContains(t, err, "has elapsed")
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status)
assert.Equal(t, int64(0), result.CurrentStep)
assert.Len(t, result.StepExecutionMetadata, 1)
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.StepExecutionMetadata[0].Status)
assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt)
assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt)
assert.Equal(t, uint32(1), result.StepExecutionMetadata[0].ErrorCount)
},
},
{
name: "step is still running; timeout elapsed",
promoCtx: PromotionContext{
Expand All @@ -306,6 +365,10 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
assert.ErrorContains(t, err, "has elapsed")
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status)
assert.Equal(t, int64(0), result.CurrentStep)
assert.Len(t, result.StepExecutionMetadata, 1)
assert.Equal(t, kargoapi.PromotionPhaseRunning, result.StepExecutionMetadata[0].Status)
assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt)
assert.NotNil(t, result.StepExecutionMetadata[0].FinishedAt)
},
},
{
Expand All @@ -318,13 +381,12 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
assert.Len(t, result.StepExecutionMetadata, 1)
assert.Equal(t, kargoapi.PromotionPhaseRunning, result.StepExecutionMetadata[0].Status)
assert.NotNil(t, result.StepExecutionMetadata[0].StartedAt)
assert.Nil(t, result.StepExecutionMetadata[0].FinishedAt)
},
},
{
name: "context cancellation",
steps: []PromotionStep{
{Kind: "context-waiter", Alias: "step1"},
},
name: "context cancellation",
steps: []PromotionStep{{Kind: "context-waiter"}},
assertions: func(t *testing.T, result PromotionResult, err error) {
assert.ErrorContains(t, err, "met error threshold")
assert.Equal(t, kargoapi.PromotionPhaseErrored, result.Status)
Expand Down Expand Up @@ -369,6 +431,14 @@ func TestSimpleEngine_executeSteps(t *testing.T) {
},
&StepRunnerPermissions{},
)
testRegistry.RegisterPromotionStepRunner(
&mockPromotionStepRunner{
name: "terminal-error-step",
runResult: PromotionStepResult{Status: kargoapi.PromotionPhaseErrored},
runErr: &terminalError{err: errors.New("something went wrong")},
},
&StepRunnerPermissions{},
)
testRegistry.RegisterPromotionStepRunner(
&mockPromotionStepRunner{
name: "context-waiter",
Expand Down

0 comments on commit 0fac579

Please sign in to comment.