Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(agents-api): Fix Error-Retrying Mechanism #760

Merged
merged 6 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 43 additions & 20 deletions agents-api/agents_api/common/exceptions/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
import requests
import temporalio.exceptions

### FIXME: This should be the opposite. We should retry on only known errors

# List of error types that should not be retried
NON_RETRYABLE_ERROR_TYPES = (
# Temporal-specific errors
Expand Down Expand Up @@ -56,7 +54,6 @@
UnicodeTranslateError,
#
# HTTP and API-related errors
fastapi.exceptions.HTTPException,
fastapi.exceptions.RequestValidationError,
#
# Asynchronous programming errors
Expand Down Expand Up @@ -98,36 +95,62 @@
litellm.exceptions.AuthenticationError,
litellm.exceptions.ServiceUnavailableError,
litellm.exceptions.OpenAIError,
litellm.exceptions.APIError,
)

# Exception types that is_retryable_error() reports as retryable (transient).
# That function checks NON_RETRYABLE_ERROR_TYPES first, so a type that appears
# in both tuples is NOT retried.
RETRYABLE_ERROR_TYPES = (
# LiteLLM exceptions
litellm.exceptions.RateLimitError,
# NOTE(review): litellm.exceptions.APIError also shows up in the
# NON_RETRYABLE_ERROR_TYPES hunk of this change; if that entry is still
# present it takes precedence and this line has no effect -- confirm it was
# removed there.
litellm.exceptions.APIError, # Added to retry on "APIError: OpenAIException - Connection error"
#
# HTTP/Network related errors
# (requests' ConnectTimeout and ReadTimeout are subclasses of
# requests.exceptions.Timeout, so listing them is redundant but harmless.)
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.ConnectTimeout,
requests.exceptions.ReadTimeout,
httpx.ConnectError,
httpx.ConnectTimeout,
httpx.ReadTimeout,
httpx.WriteTimeout,
httpx.PoolTimeout,
#
# Standard library errors that are typically transient
# In Python 3, ConnectionError and TimeoutError are subclasses of OSError and
# IOError is an alias of OSError, so the OSError entry alone already matches
# all four. OSError also matches non-network subclasses such as
# FileNotFoundError and PermissionError; those are only excluded if they are
# listed in NON_RETRYABLE_ERROR_TYPES -- TODO confirm.
ConnectionError,
TimeoutError,
OSError, # Covers many IO-related errors that may be transient
IOError,
#
# Asyncio timeouts (labelled "Database/storage related" in the original;
# presumably raised by awaited storage calls -- TODO confirm). Since Python
# 3.11 asyncio.TimeoutError is an alias of the builtin TimeoutError; on older
# versions it is a distinct class.
asyncio.TimeoutError,
)

# HTTP status codes that is_retryable_error() treats as retryable when carried
# by a fastapi HTTPException or an httpx.HTTPStatusError:
# 408 Request Timeout, 429 Too Many Requests, 503 Service Unavailable,
# 504 Gateway Timeout.
RETRYABLE_HTTP_STATUS_CODES = (408, 429, 503, 504)

### FIXME: This should be the opposite. So `is_retryable_error` instead of `is_non_retryable_error`
def is_non_retryable_error(error: BaseException) -> bool:
"""
Determines if the given error is non-retryable.

This function checks if the error is an instance of any of the error types
defined in NON_RETRYABLE_ERROR_TYPES.
def is_retryable_error(error: BaseException) -> bool:
"""
Determines if the given error should be retried or not.

Args:
error (Exception): The error to check.

Returns:
bool: True if the error is non-retryable, False otherwise.
bool: True if the error is retryable, False otherwise.
"""

if isinstance(error, NON_RETRYABLE_ERROR_TYPES):
return False

if isinstance(error, RETRYABLE_ERROR_TYPES):
return True

# Check for specific HTTP errors (status code == 429)
# Check for specific HTTP errors that should be retried
if isinstance(error, fastapi.exceptions.HTTPException):
if error.status_code in RETRYABLE_HTTP_STATUS_CODES:
return True

if isinstance(error, httpx.HTTPStatusError):
if error.response.status_code in (
408,
429,
503,
504,
): # pytype: disable=attribute-error
return False
if error.response.status_code in RETRYABLE_HTTP_STATUS_CODES:
return True

# If we don't know about the error, we should not retry
return True
return False
6 changes: 3 additions & 3 deletions agents-api/agents_api/common/interceptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
ReadOnlyContextError,
)

from .exceptions.tasks import is_non_retryable_error
from .exceptions.tasks import is_retryable_error


class CustomActivityInterceptor(ActivityInboundInterceptor):
Expand All @@ -50,7 +50,7 @@ async def execute_activity(self, input: ExecuteActivityInput):
):
creatorrr marked this conversation as resolved.
Show resolved Hide resolved
raise
except BaseException as e:
if is_non_retryable_error(e):
if not is_retryable_error(e):
raise ApplicationError(
str(e),
type=type(e).__name__,
Expand Down Expand Up @@ -83,7 +83,7 @@ async def execute_workflow(self, input: ExecuteWorkflowInput):
):
raise
except BaseException as e:
if is_non_retryable_error(e):
if not is_retryable_error(e):
raise ApplicationError(
str(e),
type=type(e).__name__,
Expand Down
15 changes: 15 additions & 0 deletions agents-api/agents_api/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,21 @@ def cozo_query_dec(func: Callable[P, tuple[str | list[Any], dict]]):

from pprint import pprint

from tenacity import (
retry,
retry_if_exception,
stop_after_attempt,
wait_exponential,
)

def is_resource_busy(e: Exception) -> bool:
return isinstance(e, HTTPException) and e.status_code == 429

@retry(
stop=stop_after_attempt(2),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_exception(is_resource_busy),
)
@wraps(func)
def wrapper(*args: P.args, client=None, **kwargs: P.kwargs) -> pd.DataFrame:
queries, variables = func(*args, **kwargs)
Expand Down
Loading