Skip to content

Commit

Permalink
Add retryable errors & Refactor error-retrying mechanism
Browse files Browse the repository at this point in the history
  • Loading branch information
HamadaSalhab committed Oct 28, 2024
1 parent bcec6d2 commit d096955
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 22 deletions.
61 changes: 42 additions & 19 deletions agents-api/agents_api/common/exceptions/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import requests
import temporalio.exceptions

### FIXME: This should be the opposite. We should retry on only known errors

# List of error types that should not be retried
NON_RETRYABLE_ERROR_TYPES = (
Expand Down Expand Up @@ -56,7 +55,6 @@
UnicodeTranslateError,
#
# HTTP and API-related errors
fastapi.exceptions.HTTPException,
fastapi.exceptions.RequestValidationError,
#
# Asynchronous programming errors
Expand Down Expand Up @@ -98,36 +96,61 @@
litellm.exceptions.AuthenticationError,
litellm.exceptions.ServiceUnavailableError,
litellm.exceptions.OpenAIError,
litellm.exceptions.APIError,
)

RETRYABLE_ERROR_TYPES = (
# LiteLLM exceptions
litellm.exceptions.RateLimitError,
litellm.exceptions.APIError, # Added to retry on "APIError: OpenAIException - Connection error"
#
# HTTP/Network related errors
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.ConnectTimeout,
requests.exceptions.ReadTimeout,
httpx.ConnectError,
httpx.ConnectTimeout,
httpx.ReadTimeout,
httpx.WriteTimeout,
httpx.PoolTimeout,
#
# Standard library errors that are typically transient
ConnectionError,
TimeoutError,
OSError, # Covers many IO-related errors that may be transient
IOError,
#
# Database/storage related
asyncio.TimeoutError,
)

### FIXME: This should be the opposite. So `is_retryable_error` instead of `is_non_retryable_error`
def is_non_retryable_error(error: BaseException) -> bool:
"""
Determines if the given error is non-retryable.
RETRYABLE_HTTP_STATUS_CODES = (408, 429, 503, 504)

This function checks if the error is an instance of any of the error types
defined in NON_RETRYABLE_ERROR_TYPES.
def is_retryable_error(error: BaseException) -> bool:
"""
Determines if the given error should be retried or not.
Args:
error (Exception): The error to check.
Returns:
bool: True if the error is non-retryable, False otherwise.
bool: True if the error is retryable, False otherwise.
"""

if isinstance(error, NON_RETRYABLE_ERROR_TYPES):
return False

if isinstance(error, RETRYABLE_ERROR_TYPES):
return True

# Check for specific HTTP errors (status code == 429)
# Check for specific HTTP errors that should be retried
if isinstance(error, fastapi.exceptions.HTTPException):
if error.status_code in RETRYABLE_HTTP_STATUS_CODES:
return True

if isinstance(error, httpx.HTTPStatusError):
if error.response.status_code in (
408,
429,
503,
504,
): # pytype: disable=attribute-error
return False
if error.response.status_code in RETRYABLE_HTTP_STATUS_CODES:
return True

# If we don't know about the error, we should not retry
return True
return False
6 changes: 3 additions & 3 deletions agents-api/agents_api/common/interceptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
ReadOnlyContextError,
)

from .exceptions.tasks import is_non_retryable_error
from .exceptions.tasks import is_retryable_error


class CustomActivityInterceptor(ActivityInboundInterceptor):
Expand All @@ -50,7 +50,7 @@ async def execute_activity(self, input: ExecuteActivityInput):
):
raise
except BaseException as e:
if is_non_retryable_error(e):
if not is_retryable_error(e):
raise ApplicationError(
str(e),
type=type(e).__name__,
Expand Down Expand Up @@ -83,7 +83,7 @@ async def execute_workflow(self, input: ExecuteWorkflowInput):
):
raise
except BaseException as e:
if is_non_retryable_error(e):
if not is_retryable_error(e):
raise ApplicationError(
str(e),
type=type(e).__name__,
Expand Down
9 changes: 9 additions & 0 deletions agents-api/agents_api/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,16 @@ def cozo_query_dec(func: Callable[P, tuple[str | list[Any], dict]]):
"""

from pprint import pprint
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception

def is_resource_busy(e: Exception) -> bool:
return isinstance(e, HTTPException) and e.status_code == 429

@retry(
stop=stop_after_attempt(2),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_exception(is_resource_busy)
)
@wraps(func)
def wrapper(*args: P.args, client=None, **kwargs: P.kwargs) -> pd.DataFrame:
queries, variables = func(*args, **kwargs)
Expand Down

0 comments on commit d096955

Please sign in to comment.