Skip to content

Commit

Permalink
[Core] Refresh for spot controller in INIT status (#3302)
Browse files Browse the repository at this point in the history
* Do not reset local autostop for spot controller

* use verb

* Add comments

* fix comment
  • Loading branch information
Michaelvll authored Mar 13, 2024
1 parent 96febed commit 606cede
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 16 deletions.
6 changes: 3 additions & 3 deletions docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Install SkyPilot using pip:
.. code-block:: shell
# Recommended: use a new conda env to avoid package conflicts.
# SkyPilot requires 3.7 <= python <= 3.10.
# SkyPilot requires 3.7 <= python <= 3.11.
conda create -y -n sky python=3.10
conda activate sky
Expand All @@ -43,7 +43,7 @@ Install SkyPilot using pip:
.. code-block:: shell
# Recommended: use a new conda env to avoid package conflicts.
# SkyPilot requires 3.7 <= python <= 3.10.
# SkyPilot requires 3.7 <= python <= 3.11.
conda create -y -n sky python=3.10
conda activate sky
Expand All @@ -69,7 +69,7 @@ Install SkyPilot using pip:
.. code-block:: shell
# Recommended: use a new conda env to avoid package conflicts.
# SkyPilot requires 3.7 <= python <= 3.10.
# SkyPilot requires 3.7 <= python <= 3.11.
conda create -y -n sky python=3.10
conda activate sky
Expand Down
42 changes: 30 additions & 12 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1888,24 +1888,32 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
backends.CloudVmRayBackend) and record['autostop'] >= 0:
if not backend.is_definitely_autostopping(handle,
stream_logs=False):
# Friendly hint.
autostop = record['autostop']
maybe_down_str = ' --down' if record['to_down'] else ''
noun = 'autodown' if record['to_down'] else 'autostop'

# Reset the autostopping as the cluster is abnormal, and may
# not correctly autostop. Resetting the autostop will let
# the user know that the autostop may not happen to avoid
# leakages from the assumption that the cluster will autostop.
success = True
reset_local_autostop = True
try:
backend.set_autostop(handle, -1, stream_logs=False)
except exceptions.CommandError as e:
success = False
if e.returncode == 255:
logger.debug(f'The cluster is likely {noun}ed.')
reset_local_autostop = False
except (Exception, SystemExit) as e: # pylint: disable=broad-except
success = False
logger.debug(f'Failed to reset autostop. Due to '
f'{common_utils.format_exception(e)}')
global_user_state.set_cluster_autostop_value(
handle.cluster_name, -1, to_down=False)
if reset_local_autostop:
global_user_state.set_cluster_autostop_value(
handle.cluster_name, -1, to_down=False)

# Friendly hint.
autostop = record['autostop']
maybe_down_str = ' --down' if record['to_down'] else ''
noun = 'autodown' if record['to_down'] else 'autostop'
if success:
operation_str = (f'Canceled {noun} on the cluster '
f'{cluster_name!r}')
Expand Down Expand Up @@ -2264,12 +2272,22 @@ def is_controller_accessible(
need_connection_check = False
controller_status, handle = None, None
try:
# Set force_refresh_statuses=None to make sure the refresh only happens
# Set force_refresh_statuses=[INIT] to make sure the refresh happens
# when the controller is INIT/UP (triggered in these statuses as the
# autostop is always set for the controller). This optimization avoids
# unnecessary costly refresh when the controller is already stopped.
# This optimization is based on the assumption that the user will not
# start the controller manually from the cloud console.
# autostop is always set for the controller). The controller can be in
# following cases:
# * (UP, autostop set): it will be refreshed without force_refresh set.
# * (UP, no autostop): very rare (a user ctrl-c when the controller is
# launching), does not matter if refresh or not, since no autostop. We
# don't include UP in force_refresh_statuses to avoid overheads.
# * (INIT, autostop set)
# * (INIT, no autostop): very rare (_update_cluster_status_no_lock may
# reset local autostop config), but force_refresh will make sure
# status is refreshed.
#
# We avoids unnecessary costly refresh when the controller is already
# STOPPED. This optimization is based on the assumption that the user
# will not start the controller manually from the cloud console.
#
# The acquire_lock_timeout is set to 0 to avoid hanging the command when
# multiple spot_launch commands are running at the same time. Our later
Expand All @@ -2278,7 +2296,7 @@ def is_controller_accessible(
# status of the controller.
controller_status, handle = refresh_cluster_status_handle(
cluster_name,
force_refresh_statuses=None,
force_refresh_statuses=[status_lib.ClusterStatus.INIT],
cluster_status_lock_timeout=0)
except exceptions.ClusterStatusFetchingError as e:
# We do not catch the exceptions related to the cluster owner identity
Expand Down
4 changes: 3 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4042,7 +4042,9 @@ def is_definitely_autostopping(self,

if returncode == 0:
return common_utils.decode_payload(stdout)
logger.debug(f'Failed to check if cluster is autostopping: {stderr}')
logger.debug('Failed to check if cluster is autostopping with '
f'{returncode}: {stdout+stderr}\n'
f'Command: {code}')
return False

# TODO(zhwu): Refactor this to a CommandRunner class, so different backends
Expand Down
1 change: 1 addition & 0 deletions sky/setup_files/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ def parse_readme(readme: str) -> str:
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Topic :: Software Development :: Libraries :: Python Modules',
Expand Down

0 comments on commit 606cede

Please sign in to comment.