From 2354b818b6faa06ac65e1eb5bfad7e2278ca43f8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 19 Jan 2025 00:27:16 -0800 Subject: [PATCH] [GCP] Add retry for `set_labels` when creating instances (#4593) * Add retry for set_labels on GCP * Use exception retry function * revert provisioner error --- sky/provision/gcp/instance_utils.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index 933df5e08a1..44fe150845e 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -38,7 +38,7 @@ r'The resource \'projects/.*/global/firewalls/.*\' was not found') -def _retry_on_http_exception( +def _retry_on_gcp_http_exception( regex: Optional[str] = None, max_retries: int = GCP_MAX_RETRIES, retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS, @@ -49,17 +49,18 @@ def dec(func): @functools.wraps(func) def wrapper(*args, **kwargs): - exception_type = gcp.http_error_exception() def try_catch_exc(): try: value = func(*args, **kwargs) return value except Exception as e: # pylint: disable=broad-except - if not isinstance(e, exception_type) or ( - regex and not re.search(regex, str(e))): - raise - return e + if (isinstance(e, gcp.http_error_exception()) and + (regex is None or re.search(regex, str(e)))): + logger.error( + f'Retrying for gcp.http_error_exception: {e}') + return e + raise for _ in range(max_retries): ret = try_catch_exc() @@ -431,7 +432,7 @@ def wait_for_operation(cls, logger.debug( f'Waiting GCP operation {operation["name"]} to be ready ...') - @_retry_on_http_exception( + @_retry_on_gcp_http_exception( f'Failed to wait for operation {operation["name"]}') def call_operation(fn, timeout: int): request = fn( @@ -613,6 +614,11 @@ def create_or_update_firewall_rule( return operation @classmethod + # When there is a cloud function running in parallel to set labels for + # newly created instances, it may fail with the following error: + # "Labels fingerprint either invalid or resource labels have changed" + # We should retry until the labels are set successfully. + @_retry_on_gcp_http_exception('Labels fingerprint either invalid') def set_labels(cls, project_id: str, availability_zone: str, node_id: str, labels: dict) -> None: node = cls.load_resource().instances().get( @@ -1211,7 +1217,7 @@ def wait_for_operation(cls, """Poll for TPU operation until finished.""" del project_id, region, zone # unused - @_retry_on_http_exception( + @_retry_on_gcp_http_exception( f'Failed to wait for operation {operation["name"]}') def call_operation(fn, timeout: int): request = fn(name=operation['name']) @@ -1379,7 +1385,7 @@ def get_vpc_name( f'Failed to get VPC name for instance {instance}') from e @classmethod - @_retry_on_http_exception('unable to queue the operation') + @_retry_on_gcp_http_exception('unable to queue the operation') def set_labels(cls, project_id: str, availability_zone: str, node_id: str, labels: dict) -> None: while True: