From d860b172c813a328af8e5044d3982a199e8b1a6b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 16 Jul 2023 14:43:19 -0700 Subject: [PATCH 1/4] fix --- sky/backends/cloud_vm_ray_backend.py | 45 ++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3f776ec4dfe..e906a9020a2 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -764,6 +764,11 @@ def _update_blocklist_on_gcp_error( logger.warning(f'Got \'resource not found\' in {zone.name}.') self._blocked_resources.add( launchable_resources.copy(zone=zone.name)) + elif 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') else: logger.info('====== stdout ======') for s in stdout.split('\n'): @@ -806,6 +811,11 @@ def _update_blocklist_on_aws_error( line.startswith('<1/1> Setting up head node') for line in stdout_splits + stderr_splits) if not errors or head_node_up: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') # TODO: Got transient 'Failed to create security group' that goes # away after a few minutes. Should we auto retry other regions, or # let the user retry. @@ -858,6 +868,11 @@ def _update_blocklist_on_azure_error( in s.strip() or '(ReadOnlyDisabledSubscription)' in s.strip()) ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -891,6 +906,11 @@ def _update_blocklist_on_lambda_error( if 'LambdaCloudError:' in s.strip() ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -927,6 +947,11 @@ def _update_blocklist_on_scp_error( if 'SCPError:' in s.strip() ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -964,6 +989,11 @@ def _update_blocklist_on_ibm_error( if 'ERR' in s.strip() or 'PANIC' in s.strip() ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -995,6 +1025,11 @@ def _update_blocklist_on_local_error( if 'ERR' in s.strip() or 'PANIC' in s.strip() ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1030,6 +1065,11 @@ def _update_blocklist_on_oci_error( 'LimitExceeded' in s.strip() or 'NotAuthenticated' in s.strip())) ] if not errors: + if 'rsync: command not found' in stderr: + with ux_utils.print_exception_no_traceback(): + raise RuntimeError( + 'rsync is not installed on the specific image. ' + 'Please install rsync and try again.') logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1758,6 +1798,11 @@ def need_ray_up( 'error.') return True + if 'rsync: command not found' in stderr: + logger.info('Skipping retry due to `rsync` not found in ' + 'the specified image.') + return False + if ('Processing file mounts' in stdout and 'Running setup commands' not in stdout and 'Failed to setup head node.' in stderr): From 91872cc3b652c86a4d75ebaf31a382d49ede4997 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 16 Jul 2023 16:55:07 -0700 Subject: [PATCH 2/4] Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Zhanghao Wu --- sky/backends/cloud_vm_ray_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index e906a9020a2..7e82556f3e0 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -767,8 +767,8 @@ def _update_blocklist_on_gcp_error( elif 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + '`rsync` command is not found in the specified image. ' + 'Please use an image with rsync installed.') else: logger.info('====== stdout ======') for s in stdout.split('\n'): From 949cf757060c3aa933c589f30aaf42d2c692c090 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Sun, 16 Jul 2023 17:01:41 -0700 Subject: [PATCH 3/4] style --- sky/backends/cloud_vm_ray_backend.py | 36 ++++++++++------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 7e82556f3e0..38443186fa9 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -103,6 +103,10 @@ 'Details: {details}' f'{colorama.Style.RESET_ALL}') +_RSYNC_NOT_FOUND_WARNING = ( + '`rsync` command is not found in the specified image. ' + 'Please use an image with rsync installed.') + _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND' _CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming ' @@ -766,9 +770,7 @@ def _update_blocklist_on_gcp_error( launchable_resources.copy(zone=zone.name)) elif 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - '`rsync` command is not found in the specified image. ' - 'Please use an image with rsync installed.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) else: logger.info('====== stdout ======') for s in stdout.split('\n'): @@ -813,9 +815,7 @@ def _update_blocklist_on_aws_error( if not errors or head_node_up: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) # TODO: Got transient 'Failed to create security group' that goes # away after a few minutes. Should we auto retry other regions, or # let the user retry. @@ -870,9 +870,7 @@ def _update_blocklist_on_azure_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -908,9 +906,7 @@ def _update_blocklist_on_lambda_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -949,9 +945,7 @@ def _update_blocklist_on_scp_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -991,9 +985,7 @@ def _update_blocklist_on_ibm_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1027,9 +1019,7 @@ def _update_blocklist_on_local_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1067,9 +1057,7 @@ def _update_blocklist_on_oci_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - 'rsync is not installed on the specific image. ' - 'Please install rsync and try again.') + raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) logger.info('====== stdout ======') for s in stdout_splits: print(s) From ea86163b681c6fe361c936e8b2ad61bd77616bcf Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 17 Jul 2023 14:08:40 -0700 Subject: [PATCH 4/4] rerun the test --- sky/backends/cloud_vm_ray_backend.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 38443186fa9..5eff0762e88 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -103,7 +103,7 @@ 'Details: {details}' f'{colorama.Style.RESET_ALL}') -_RSYNC_NOT_FOUND_WARNING = ( +_RSYNC_NOT_FOUND_MESSAGE = ( '`rsync` command is not found in the specified image. ' 'Please use an image with rsync installed.') @@ -770,7 +770,7 @@ def _update_blocklist_on_gcp_error( launchable_resources.copy(zone=zone.name)) elif 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) else: logger.info('====== stdout ======') for s in stdout.split('\n'): @@ -815,7 +815,7 @@ def _update_blocklist_on_aws_error( if not errors or head_node_up: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) # TODO: Got transient 'Failed to create security group' that goes # away after a few minutes. Should we auto retry other regions, or # let the user retry. @@ -870,7 +870,7 @@ def _update_blocklist_on_azure_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -906,7 +906,7 @@ def _update_blocklist_on_lambda_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -945,7 +945,7 @@ def _update_blocklist_on_scp_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -985,7 +985,7 @@ def _update_blocklist_on_ibm_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1019,7 +1019,7 @@ def _update_blocklist_on_local_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s) @@ -1057,7 +1057,7 @@ def _update_blocklist_on_oci_error( if not errors: if 'rsync: command not found' in stderr: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(_RSYNC_NOT_FOUND_WARNING) + raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE) logger.info('====== stdout ======') for s in stdout_splits: print(s)