From d096955cf14854e64af79ab77855a8b0ee7c32ad Mon Sep 17 00:00:00 2001 From: HamadaSalhab Date: Mon, 28 Oct 2024 22:00:10 +0300 Subject: [PATCH] Add retryable errors & Refactor error-retrying mechanism --- .../agents_api/common/exceptions/tasks.py | 61 +++++++++++++------ agents-api/agents_api/common/interceptors.py | 6 +- agents-api/agents_api/models/utils.py | 9 +++ 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/agents-api/agents_api/common/exceptions/tasks.py b/agents-api/agents_api/common/exceptions/tasks.py index 86fbaa65d..d51696485 100644 --- a/agents-api/agents_api/common/exceptions/tasks.py +++ b/agents-api/agents_api/common/exceptions/tasks.py @@ -19,7 +19,6 @@ import requests import temporalio.exceptions -### FIXME: This should be the opposite. We should retry on only known errors # List of error types that should not be retried NON_RETRYABLE_ERROR_TYPES = ( @@ -56,7 +55,6 @@ UnicodeTranslateError, # # HTTP and API-related errors - fastapi.exceptions.HTTPException, fastapi.exceptions.RequestValidationError, # # Asynchronous programming errors @@ -98,36 +96,61 @@ litellm.exceptions.AuthenticationError, litellm.exceptions.ServiceUnavailableError, litellm.exceptions.OpenAIError, - litellm.exceptions.APIError, ) +RETRYABLE_ERROR_TYPES = ( + # LiteLLM exceptions + litellm.exceptions.RateLimitError, + litellm.exceptions.APIError, # Added to retry on "APIError: OpenAIException - Connection error" + # + # HTTP/Network related errors + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ConnectTimeout, + requests.exceptions.ReadTimeout, + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadTimeout, + httpx.WriteTimeout, + httpx.PoolTimeout, + # + # Standard library errors that are typically transient + ConnectionError, + TimeoutError, + OSError, # Covers many IO-related errors that may be transient + IOError, + # + # Database/storage related + asyncio.TimeoutError, +) -### FIXME: This should be the opposite. So `is_retryable_error` instead of `is_non_retryable_error` -def is_non_retryable_error(error: BaseException) -> bool: - """ - Determines if the given error is non-retryable. +RETRYABLE_HTTP_STATUS_CODES = (408, 429, 503, 504) - This function checks if the error is an instance of any of the error types - defined in NON_RETRYABLE_ERROR_TYPES. +def is_retryable_error(error: BaseException) -> bool: + """ + Determines if the given error should be retried or not. Args: error (Exception): The error to check. Returns: - bool: True if the error is non-retryable, False otherwise. + bool: True if the error is retryable, False otherwise. """ + if isinstance(error, NON_RETRYABLE_ERROR_TYPES): + return False + + if isinstance(error, RETRYABLE_ERROR_TYPES): return True - # Check for specific HTTP errors (status code == 429) + # Check for specific HTTP errors that should be retried + if isinstance(error, fastapi.exceptions.HTTPException): + if error.status_code in RETRYABLE_HTTP_STATUS_CODES: + return True + if isinstance(error, httpx.HTTPStatusError): - if error.response.status_code in ( - 408, - 429, - 503, - 504, - ): # pytype: disable=attribute-error - return False + if error.response.status_code in RETRYABLE_HTTP_STATUS_CODES: + return True # If we don't know about the error, we should not retry - return True + return False diff --git a/agents-api/agents_api/common/interceptors.py b/agents-api/agents_api/common/interceptors.py index 408005da5..796e14966 100644 --- a/agents-api/agents_api/common/interceptors.py +++ b/agents-api/agents_api/common/interceptors.py @@ -23,7 +23,7 @@ ReadOnlyContextError, ) -from .exceptions.tasks import is_non_retryable_error +from .exceptions.tasks import is_retryable_error class CustomActivityInterceptor(ActivityInboundInterceptor): @@ -50,7 +50,7 @@ async def execute_activity(self, input: ExecuteActivityInput): ): raise except BaseException as e: - if is_non_retryable_error(e): + if not is_retryable_error(e): raise ApplicationError( str(e), type=type(e).__name__, @@ -83,7 +83,7 @@ async def execute_workflow(self, input: ExecuteWorkflowInput): ): raise except BaseException as e: - if is_non_retryable_error(e): + if not is_retryable_error(e): raise ApplicationError( str(e), type=type(e).__name__, diff --git a/agents-api/agents_api/models/utils.py b/agents-api/agents_api/models/utils.py index d618f97b6..f08c54c6d 100644 --- a/agents-api/agents_api/models/utils.py +++ b/agents-api/agents_api/models/utils.py @@ -201,7 +201,16 @@ def cozo_query_dec(func: Callable[P, tuple[str | list[Any], dict]]): """ from pprint import pprint + from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception + def is_resource_busy(e: Exception) -> bool: + return isinstance(e, HTTPException) and e.status_code == 429 + + @retry( + stop=stop_after_attempt(2), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception(is_resource_busy) + ) @wraps(func) def wrapper(*args: P.args, client=None, **kwargs: P.kwargs) -> pd.DataFrame: queries, variables = func(*args, **kwargs)