Skip to content

Commit

Permalink
[k8s] Fix --purge not cleaning up cluster in stale k8s context (#4514)
Browse files: browse the repository at this point in the history
* Fix purge not cleaning up the cluster in a stale k8s context

* update comment

* Apply purge after printing warnings.

* lint

* Fix comments

* clean up condition
  • Loading branch information
romilbhardwaj authored Jan 6, 2025
1 parent 0e14982 commit 59cb4e9
Showing 1 changed file with 21 additions and 7 deletions.
28 changes: 21 additions & 7 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4216,11 +4216,20 @@ def post_teardown_cleanup(self,
attempts = 0
while True:
logger.debug(f'instance statuses attempt {attempts + 1}')
node_status_dict = provision_lib.query_instances(
repr(cloud),
cluster_name_on_cloud,
config['provider'],
non_terminated_only=False)
try:
node_status_dict = provision_lib.query_instances(
repr(cloud),
cluster_name_on_cloud,
config['provider'],
non_terminated_only=False)
except Exception as e: # pylint: disable=broad-except
if purge:
logger.warning(
f'Failed to query instances. Skipping since purge is '
f'set. Details: '
f'{common_utils.format_exception(e, use_bracket=True)}')
break
raise

unexpected_node_state: Optional[Tuple[str, str]] = None
for node_id, node_status in node_status_dict.items():
Expand All @@ -4239,8 +4248,13 @@ def post_teardown_cleanup(self,
time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
else:
(node_id, node_status) = unexpected_node_state
raise RuntimeError(f'Instance {node_id} in unexpected state '
f'{node_status}.')
if purge:
logger.warning(f'Instance {node_id} in unexpected '
f'state {node_status}. Skipping since purge '
'is set.')
break
raise RuntimeError(f'Instance {node_id} in unexpected '
f'state {node_status}.')

global_user_state.remove_cluster(handle.cluster_name,
terminate=terminate)
Expand Down

0 comments on commit 59cb4e9

Please sign in to comment.