Skip to content

Commit

Permalink
deploy: hint that machines may time out due to region issue (#3074)
Browse files: browse the repository at this point in the history
* `machine`: standardize on `WaitTimeoutErr`

* `deploy`: suggest deploying to different region on timeout
  • Loading branch information
alichay authored Nov 30, 2023
1 parent 9441656 commit b2f4319
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 13 deletions.
41 changes: 36 additions & 5 deletions internal/command/deploy/machines_deploymachinesapp.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,12 +286,19 @@ func (m machineUpdateEntries) machines() []machine.LeasableMachine {
}

func errorIsTimeout(err error) bool {
// Trying to match the errors in a typed way is incredibly difficult and makes this function massive.

// Match an error against various known timeout conditions.
// This is probably a sign that we need to standardize this better, but it works for now.

var timeoutErr machine.WaitTimeoutErr
if errors.As(err, &timeoutErr) {
return true
}

if strings.Contains(err.Error(), "net/http: request canceled") {
return true
}

// Look for an underlying context.DeadlineExceeded error
if errors.Is(err, context.DeadlineExceeded) {
return true
}
Expand All @@ -305,12 +312,36 @@ func errorIsTimeout(err error) bool {

// suggestChangeWaitTimeout appends a suggestion to change the specified flag name
// if and only if the error is caused by a timeout.
// If the err is not a timeout, it's returned unchanged.
// If the error is not a timeout, it's returned unchanged.
func suggestChangeWaitTimeout(err error, flagName string) error {
if errorIsTimeout(err) {

suggestIncreaseTimeout := fmt.Sprintf("increasing the timeout with the --%s flag", flagName)

descript := ""
suggest := ""

// Both of these branches give the suggestion to change the timeout,
// but we only suggest changing region on machine start.

var timeoutErr machine.WaitTimeoutErr
if errors.As(err, &timeoutErr) && timeoutErr.DesiredState() == api.MachineStateStarted {
// If we timed out waiting for a machine to start, we want to suggest that there could be a region issue preventing
// the machine from finishing its state transition. (e.g. slow image pulls, volume trouble, etc.)
descript = "Your machine was created, but never started. This could mean that your app is taking a long time to start,\nbut it could be indicative of a region issue."
suggest = fmt.Sprintf("You can try deploying to a different region,\nor you can try %s", suggestIncreaseTimeout)
} else {
// If we timed out waiting for a different state, we want to suggest that the timeout could be too short.
// You can't really suggest changing regions in cases where you're not starting machines, so this is the
// best advice we can give.
descript = "Your machine never reached the state \"%s\"."
suggest = fmt.Sprintf("You can try %s", suggestIncreaseTimeout)
}

err = flyerr.GenericErr{
Err: err.Error(),
Suggest: fmt.Sprintf("You can increase the timeout with the --%s flag", flagName),
Err: err.Error(),
Descript: descript,
Suggest: suggest,
}
}
return err
Expand Down
6 changes: 5 additions & 1 deletion internal/machine/leasable_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@ func (lm *leasableMachine) WaitForState(ctx context.Context, desiredState string
case errors.Is(waitCtx.Err(), context.Canceled):
return err
case errors.Is(waitCtx.Err(), context.DeadlineExceeded):
return fmt.Errorf("timed out waiting for machine to reach %s state: %w", desiredState, err)
return WaitTimeoutErr{
machineID: lm.machine.ID,
timeout: timeout,
desiredState: desiredState,
}
case notFoundResponse && desiredState != api.MachineStateDestroyed:
return err
case !notFoundResponse && err != nil:
Expand Down
18 changes: 11 additions & 7 deletions internal/machine/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ func WaitForStartOrStop(ctx context.Context, machine *api.Machine, action string
return err
case errors.Is(waitCtx.Err(), context.DeadlineExceeded):
return WaitTimeoutErr{
machineID: machine.ID,
timeout: timeout,
action: action,
machineID: machine.ID,
timeout: timeout,
desiredState: waitOnAction,
}
default:
var flapsErr *flaps.FlapsError
Expand All @@ -65,17 +65,21 @@ func WaitForStartOrStop(ctx context.Context, machine *api.Machine, action string
}

// WaitTimeoutErr reports that a machine did not reach the awaited state
// before the wait deadline expired. It records which machine was being
// waited on and how long the wait was allowed to run.
//
// NOTE(review): this listing appears to show both sides of a diff, with the
// first three fields being the pre-change set (ending in action) and the last
// three the post-change set (ending in desiredState) — confirm against the
// real file before relying on the field list.
type WaitTimeoutErr struct {
machineID string
timeout time.Duration
action string
machineID string
timeout time.Duration
desiredState string
}

// Error implements the error interface. The message is fixed and does not
// include the machine ID, timeout, or target state; those details are
// reported by Description.
func (e WaitTimeoutErr) Error() string {
	const msg = "timeout reached waiting for machine's state to change"
	return msg
}

// Description returns a human-readable sentence naming the machine that was
// being waited on and the timeout it exceeded.
//
// NOTE(review): the two return statements look like the removed and added
// sides of a diff shown together; presumably only the second (which reports
// the desired state in quotes) exists in the real file — confirm.
func (e WaitTimeoutErr) Description() string {
return fmt.Sprintf("The machine %s took more than %s to %s", e.machineID, e.timeout, e.action)
return fmt.Sprintf("The machine %s took more than %s to reach \"%s\"", e.machineID, e.timeout, e.desiredState)
}

// DesiredState returns the machine state the timed-out wait was waiting for,
// letting callers tailor their advice to the kind of wait that failed.
func (e WaitTimeoutErr) DesiredState() string {
	state := e.desiredState
	return state
}

var invalidAction flyerr.GenericErr = flyerr.GenericErr{
Expand Down

0 comments on commit b2f4319

Please sign in to comment.