Skip to content

Commit

Permalink
[GCP] Add retry for set_labels when creating instances (#4593)
Browse files Browse the repository at this point in the history
* Add retry for set_labels on GCP

* Use exception retry function

* revert provisioner error
  • Loading branch information
Michaelvll authored Jan 19, 2025
1 parent 11861fd commit 2354b81
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions sky/provision/gcp/instance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
r'The resource \'projects/.*/global/firewalls/.*\' was not found')


def _retry_on_http_exception(
def _retry_on_gcp_http_exception(
regex: Optional[str] = None,
max_retries: int = GCP_MAX_RETRIES,
retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
Expand All @@ -49,17 +49,18 @@ def dec(func):

@functools.wraps(func)
def wrapper(*args, **kwargs):
exception_type = gcp.http_error_exception()

def try_catch_exc():
try:
value = func(*args, **kwargs)
return value
except Exception as e: # pylint: disable=broad-except
if not isinstance(e, exception_type) or (
regex and not re.search(regex, str(e))):
raise
return e
if (isinstance(e, gcp.http_error_exception()) and
(regex is None or re.search(regex, str(e)))):
logger.error(
f'Retrying for gcp.http_error_exception: {e}')
return e
raise

for _ in range(max_retries):
ret = try_catch_exc()
Expand Down Expand Up @@ -431,7 +432,7 @@ def wait_for_operation(cls,
logger.debug(
f'Waiting GCP operation {operation["name"]} to be ready ...')

@_retry_on_http_exception(
@_retry_on_gcp_http_exception(
f'Failed to wait for operation {operation["name"]}')
def call_operation(fn, timeout: int):
request = fn(
Expand Down Expand Up @@ -613,6 +614,11 @@ def create_or_update_firewall_rule(
return operation

@classmethod
# When a cloud function is running in parallel to set labels on newly
# created instances, this set_labels call may fail with the following error:
# "Labels fingerprint either invalid or resource labels have changed"
# We retry the call (bounded by the decorator's max_retries) so that the
# labels can be set once the concurrent update has finished.
@_retry_on_gcp_http_exception('Labels fingerprint either invalid')
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
labels: dict) -> None:
node = cls.load_resource().instances().get(
Expand Down Expand Up @@ -1211,7 +1217,7 @@ def wait_for_operation(cls,
"""Poll for TPU operation until finished."""
del project_id, region, zone # unused

@_retry_on_http_exception(
@_retry_on_gcp_http_exception(
f'Failed to wait for operation {operation["name"]}')
def call_operation(fn, timeout: int):
request = fn(name=operation['name'])
Expand Down Expand Up @@ -1379,7 +1385,7 @@ def get_vpc_name(
f'Failed to get VPC name for instance {instance}') from e

@classmethod
@_retry_on_http_exception('unable to queue the operation')
@_retry_on_gcp_http_exception('unable to queue the operation')
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
labels: dict) -> None:
while True:
Expand Down

0 comments on commit 2354b81

Please sign in to comment.