diff --git a/sky/authentication.py b/sky/authentication.py
index f6181620c3d..d77d4a098f4 100644
--- a/sky/authentication.py
+++ b/sky/authentication.py
@@ -1,6 +1,7 @@
 """Module to enable a single SkyPilot key for all VMs in each cloud."""
 import copy
 import functools
+import json
 import os
 import re
 import socket
@@ -102,7 +103,7 @@ def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any],
 def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r') as f:
-        public_key = f.read()
+        public_key = f.read().strip()
     config = _replace_cloud_init_ssh_info_in_config(config, public_key)
     return config
 
@@ -132,6 +133,105 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
     return result
 
 
+def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project,
+                                                config: Dict[str, Any],
+                                                os_login_enabled: bool):
+    """Add the ssh key to the GCP account if a Debian image without cloud-init is used.
+
+    This function exists for backward compatibility: it is only needed when
+    the user is on the old Debian image, which does not ship with cloud-init.
+    In that case we have to add the ssh key to the GCP account directly so
+    that we can ssh into the instance.
+    """
+    private_key_path, public_key_path = get_or_generate_keys()
+    user = config['auth']['ssh_user']
+
+    node_config = config.get('available_node_types',
+                             {}).get('ray_head_default',
+                                     {}).get('node_config', {})
+    image_id = node_config.get('disks', [{}])[0].get('initializeParams',
+                                                     {}).get('sourceImage')
+    # image_id is None when a TPU VM is used, as TPU VMs do not use images.
+    if image_id is not None and 'debian' not in image_id.lower():
+        image_info = clouds.GCP.get_image_info(image_id)
+        if 'debian' not in json.dumps(image_info).lower():
+            # Non-Debian images have the ssh key set up by cloud-init.
+            return
+    logger.info('Adding ssh key to GCP account.')
+    if os_login_enabled:
+        # Add the ssh key to GCP with OS Login.
+        subprocess.run(
+            'gcloud compute os-login ssh-keys add '
+            f'--key-file={public_key_path}',
+            check=True,
+            shell=True,
+            stdout=subprocess.DEVNULL)
+        # Enable the ssh port for all the instances.
+        enable_ssh_cmd = ('gcloud compute firewall-rules create '
+                          'allow-ssh-ingress-from-iap '
+                          '--direction=INGRESS '
+                          '--action=allow '
+                          '--rules=tcp:22 '
+                          '--source-ranges=0.0.0.0/0')
+        proc = subprocess.run(enable_ssh_cmd,
+                              check=False,
+                              shell=True,
+                              stdout=subprocess.DEVNULL,
+                              stderr=subprocess.PIPE)
+        if proc.returncode != 0 and 'already exists' not in proc.stderr.decode(
+                'utf-8'):
+            subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd,
+                                               'Failed to enable ssh port.',
+                                               proc.stderr.decode('utf-8'))
+        return
+
+    # OS Login is not enabled for the project. Add the ssh key directly to the
+    # project metadata.
+    project_keys: str = next(  # type: ignore
+        (item for item in project['commonInstanceMetadata'].get('items', [])
+         if item['key'] == 'ssh-keys'), {}).get('value', '')
+    ssh_keys = project_keys.split('\n') if project_keys else []
+
+    # Get the public key from the file.
+    with open(public_key_path, 'r') as f:
+        public_key = f.read()
+
+    # Check whether the ssh key is already in the Google project's metadata.
+    public_key_token = public_key.split(' ')[1]
+
+    key_found = False
+    for key in ssh_keys:
+        key_list = key.split(' ')
+        if len(key_list) != 3:
+            continue
+        if user == key_list[-1] and os.path.exists(
+                private_key_path) and key_list[1] == public_key_token:
+            key_found = True
+
+    if not key_found:
+        new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
+            user=user, public_key_token=public_key_token)
+        metadata = project['commonInstanceMetadata'].get('items', [])
+
+        ssh_key_index = [
+            k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
+        ]
+        assert len(ssh_key_index) <= 1
+
+        if len(ssh_key_index) == 0:
+            metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
+        else:
+            first_ssh_key_index = ssh_key_index[0]
+            metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key
+
+        project['commonInstanceMetadata']['items'] = metadata
+
+        operation = compute.projects().setCommonInstanceMetadata(
+            project=project['name'],
+            body=project['commonInstanceMetadata']).execute()
+        _wait_for_compute_global_operation(project['name'], operation['name'],
+                                           compute)
+
+
 # Snippets of code inspired from
 # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
 # Takes in config, a yaml dict and outputs a postprocessed dict
@@ -140,7 +240,9 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
 # Retry for the GCP as sometimes there will be connection reset by peer error.
 @common_utils.retry
 def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
-    private_key_path, public_key_path = get_or_generate_keys()
+    _, public_key_path = get_or_generate_keys()
+    with open(public_key_path, 'r') as f:
+        public_key = f.read()
     config = copy.deepcopy(config)
 
     project_id = config['provider']['project_id']
@@ -148,7 +250,6 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
                         'v1',
                         credentials=None,
                         cache_discovery=False)
-    user = config['auth']['ssh_user']
 
     try:
         project = compute.projects().get(project=project_id).execute()
@@ -191,7 +292,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
         (item for item in project['commonInstanceMetadata'].get('items', [])
         if item['key'] == 'enable-oslogin'), {}).get('value', 'False')
 
-    if project_oslogin.lower() == 'true':
+    oslogin_enabled = project_oslogin.lower() == 'true'
+    if oslogin_enabled:
         logger.info(
             f'OS Login is enabled for GCP project {project_id}. Running '
             'additional authentication steps.')
@@ -243,81 +345,12 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
         os_login_username = account.replace('@', '_').replace('.', '_')
         config['auth']['ssh_user'] = os_login_username
 
-        # Add ssh key to GCP with oslogin
-        subprocess.run(
-            'gcloud compute os-login ssh-keys add '
-            f'--key-file={public_key_path}',
-            check=True,
-            shell=True,
-            stdout=subprocess.DEVNULL)
-        # Enable ssh port for all the instances
-        enable_ssh_cmd = ('gcloud compute firewall-rules create '
-                          'allow-ssh-ingress-from-iap '
-                          '--direction=INGRESS '
-                          '--action=allow '
-                          '--rules=tcp:22 '
-                          '--source-ranges=0.0.0.0/0')
-        proc = subprocess.run(enable_ssh_cmd,
-                              check=False,
-                              shell=True,
-                              stdout=subprocess.DEVNULL,
-                              stderr=subprocess.PIPE)
-        if proc.returncode != 0 and 'already exists' not in proc.stderr.decode(
-                'utf-8'):
-            subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd,
-                                               'Failed to enable ssh port.',
-                                               proc.stderr.decode('utf-8'))
-        return config
-
-    # OS Login is not enabled for the project. Add the ssh key directly to the
-    # metadata.
-    # TODO(zhwu): Use cloud init to add ssh public key, to avoid the permission
-    # issue. A blocker is that the cloud init is not installed in the debian
-    # image by default.
-    project_keys: str = next(  # type: ignore
-        (item for item in project['commonInstanceMetadata'].get('items', [])
-         if item['key'] == 'ssh-keys'), {}).get('value', '')
-    ssh_keys = project_keys.split('\n') if project_keys else []
-
-    # Get public key from file.
-    with open(public_key_path, 'r') as f:
-        public_key = f.read()
-
-    # Check if ssh key in Google Project's metadata
-    public_key_token = public_key.split(' ')[1]
-
-    key_found = False
-    for key in ssh_keys:
-        key_list = key.split(' ')
-        if len(key_list) != 3:
-            continue
-        if user == key_list[-1] and os.path.exists(
-                private_key_path) and key_list[1] == public_key.split(' ')[1]:
-            key_found = True
-
-    if not key_found:
-        new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
-            user=user, public_key_token=public_key_token)
-        metadata = project['commonInstanceMetadata'].get('items', [])
-
-        ssh_key_index = [
-            k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
-        ]
-        assert len(ssh_key_index) <= 1
-
-        if len(ssh_key_index) == 0:
-            metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
-        else:
-            first_ssh_key_index = ssh_key_index[0]
-            metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key
-
-        project['commonInstanceMetadata']['items'] = metadata
-
-        operation = compute.projects().setCommonInstanceMetadata(
-            project=project['name'],
-            body=project['commonInstanceMetadata']).execute()
-        _wait_for_compute_global_operation(project['name'], operation['name'],
-                                           compute)
+    config = _replace_cloud_init_ssh_info_in_config(config, public_key)
+    # This call is for backward compatibility: a user on the old Debian-based
+    # image may not have cloud-init enabled, so the ssh key also has to be
+    # added to the account directly.
+    _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config,
+                                                oslogin_enabled)
 
     return config
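For context, `_replace_cloud_init_ssh_info_in_config` (defined earlier in sky/authentication.py and not shown in this diff) is what fills in the `skypilot:ssh_user` and `skypilot:ssh_public_key_content` placeholders that the gcp-ray.yml.j2 template below embeds in the instance metadata. A minimal sketch of that substitution, assuming the helper simply rewrites the placeholder strings in each node type's metadata (hypothetical code, not the actual implementation):

    from typing import Any, Dict

    def replace_ssh_info(config: Dict[str, Any],
                         public_key: str) -> Dict[str, Any]:
        ssh_user = config['auth']['ssh_user']
        for node_type in config.get('available_node_types', {}).values():
            metadata = node_type.get('node_config', {}).get('metadata', {})
            for item in metadata.get('items', []):
                # The placeholders appear in the 'ssh-keys' and 'user-data'
                # values rendered from the template.
                item['value'] = (item['value'].replace(
                    'skypilot:ssh_user',
                    ssh_user).replace('skypilot:ssh_public_key_content',
                                      public_key))
        return config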
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index c9d44f54cf8..deb98c2016a 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -5,7 +5,7 @@
 import subprocess
 import time
 import typing
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from sky import clouds
 from sky import exceptions
@@ -77,6 +77,11 @@
 # TODO(zhwu): Move the default AMI size to the catalog instead.
 DEFAULT_GCP_IMAGE_GB = 50
 
+_DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004'
+# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6.
+# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install
+# the older CUDA driver in gcp-ray.yml.j2 to support K80).
+_DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004'
 
 
 def _run_output(cmd):
@@ -243,26 +248,24 @@ def get_egress_cost(self, num_gigabytes):
     def is_same_cloud(self, other):
         return isinstance(other, GCP)
 
-    def get_image_size(self, image_id: str, region: Optional[str]) -> float:
-        del region  # unused
-        if image_id.startswith('skypilot:'):
-            return DEFAULT_GCP_IMAGE_GB
+    @classmethod
+    def get_image_info(cls, image_id: str) -> Dict[str, Any]:
         try:
             compute = gcp.build('compute',
                                 'v1',
                                 credentials=None,
                                 cache_discovery=False)
         except gcp.credential_error_exception() as e:
-            return DEFAULT_GCP_IMAGE_GB
+            return {}
         try:
             image_attrs = image_id.split('/')
             if len(image_attrs) == 1:
                 raise ValueError(f'Image {image_id!r} not found in GCP.')
             project = image_attrs[1]
             image_name = image_attrs[-1]
-            image_infos = compute.images().get(project=project,
-                                               image=image_name).execute()
-            return float(image_infos['diskSizeGb'])
+            image_info = compute.images().get(project=project,
+                                              image=image_name).execute()
+            return image_info
         except gcp.http_error_exception() as e:
             if e.resp.status == 403:
                 with ux_utils.print_exception_no_traceback():
@@ -274,6 +277,21 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
                         'GCP.') from None
             raise
 
+    def get_image_size(self, image_id: str, region: Optional[str]) -> float:
+        del region  # unused
+        if image_id.startswith('skypilot:'):
+            # Hack: this relies on the knowledge that the selected Debian and
+            # Ubuntu images on GCP both have a size of 50 GB, which lets us
+            # skip the overhead of querying the image size.
+            return DEFAULT_GCP_IMAGE_GB
+        image_info = self.get_image_info(image_id)
+        if 'diskSizeGb' not in image_info:
+            # Every GCP image should have the diskSizeGb field, but fall back
+            # to the default just in case; the image size check is not
+            # critical enough to crash the program over.
+            return DEFAULT_GCP_IMAGE_GB
+        return float(image_info['diskSizeGb'])
+
     @classmethod
     def get_default_instance_type(
         cls,
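The new `get_image_info` classmethod returns the raw image resource from the GCP Compute API, which is what lets `_maybe_add_ssh_key_to_gcp_project_if_debian` above detect old Debian images by inspecting the response. A brief usage sketch (the image id here is a hypothetical example, not one used by SkyPilot):

    import json

    from sky import clouds

    image_info = clouds.GCP.get_image_info(
        'projects/debian-cloud/global/images/debian-10-buster-v20220920')
    if 'debian' in json.dumps(image_info).lower():
        # Old Debian image without cloud-init: fall back to adding the ssh
        # key via project metadata / OS Login.
        ...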
@@ -295,10 +313,8 @@ def make_deploy_resources_variables(
 
         #   gcloud compute images list \
         #   --project deeplearning-platform-release \
-        #   --no-standard-images
-        # We use the debian image, as the ubuntu image has some connectivity
-        # issue when first booted.
-        image_id = 'skypilot:cpu-debian-10'
+        #   --no-standard-images | grep ubuntu-2004
+        image_id = _DEFAULT_CPU_IMAGE
 
         r = resources
         # Find GPU spec, if any.
@@ -338,17 +354,8 @@ def make_deploy_resources_variables(
                 resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
                     acc.lower())
                 resources_vars['gpu_count'] = acc_count
-                if acc == 'K80':
-                    # Though the image is called cu113, it actually has later
-                    # versions of CUDA as noted below.
-                    # CUDA driver version 470.57.02, CUDA Library 11.4
-                    image_id = 'skypilot:k80-debian-10'
-                else:
-                    # Though the image is called cu113, it actually has later
-                    # versions of CUDA as noted below.
-                    # CUDA driver version 510.47.03, CUDA Library 11.6
-                    # Does not support torch==1.13.0 with cu117
-                    image_id = 'skypilot:gpu-debian-10'
+
+                image_id = _DEFAULT_GPU_IMAGE
 
         if resources.image_id is not None:
             if None in resources.image_id:
diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py
index e55f5b33506..aa323456a90 100644
--- a/sky/clouds/service_catalog/common.py
+++ b/sky/clouds/service_catalog/common.py
@@ -67,7 +67,7 @@ def read_catalog(filename: str,
     """
     assert filename.endswith('.csv'), 'The catalog file must be a CSV file.'
     assert (pull_frequency_hours is None or
-            pull_frequency_hours > 0), pull_frequency_hours
+            pull_frequency_hours >= 0), pull_frequency_hours
 
     catalog_path = get_catalog_path(filename)
     cloud = cloud_lib.CLOUD_REGISTRY.from_str(os.path.dirname(filename))
diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py
index 3558e8b7f0a..ccbfeadc1d4 100644
--- a/sky/clouds/service_catalog/gcp_catalog.py
+++ b/sky/clouds/service_catalog/gcp_catalog.py
@@ -26,6 +26,10 @@
                           pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 _image_df = common.read_catalog('gcp/images.csv',
                                 pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+if _image_df[_image_df['Tag'] == 'skypilot:cpu-ubuntu-2004'].empty:
+    # Force-refresh the image catalog if it does not yet include the updated
+    # images from https://github.com/skypilot-org/skypilot-catalog/pull/25.
+    _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=0)
 
 _TPU_REGIONS = [
     'us-central1',
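The relaxed assertion in `read_catalog` (`>= 0` instead of `> 0`) is what makes the forced refresh above possible: with `pull_frequency_hours=0`, any cached catalog counts as stale. A sketch of the staleness check this implies (an assumption about `read_catalog`'s internals, which are not part of this diff):

    import os
    import time

    def _catalog_is_stale(catalog_path: str,
                          pull_frequency_hours: int) -> bool:
        if not os.path.exists(catalog_path):
            return True
        age_hours = (time.time() - os.path.getmtime(catalog_path)) / 3600
        # With pull_frequency_hours == 0 this is always True for a cached
        # file, forcing an immediate re-download of the catalog.
        return age_hours >= pull_frequency_hours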
+ """ + retry_cnt = 0 + import click + + while True: + try: + return super()._run_helper( + final_cmd, with_output, exit_on_fail, silent + ) + except click.ClickException as e: + retry_cnt += 1 + if retry_cnt > 3: + raise e + logger.info(f"Retrying SSH command in 5 seconds: {e}") + time.sleep(5) + + # Adopted from super().get_command_runner() + common_args = { + "log_prefix": log_prefix, + "node_id": node_id, + "provider": self, + "auth_config": auth_config, + "cluster_name": cluster_name, + "process_runner": process_runner, + "use_internal_ip": use_internal_ip, + } + if docker_config and docker_config["container_name"] != "": + return DockerCommandRunner(docker_config, **common_args) + else: + return SSHCommandRunnerWithRetry(**common_args) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 11115723a1c..b87223bede8 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -172,7 +172,6 @@ initialization_commands: [] # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 1 setup_commands: - # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.) # Create ~/.ssh/config file in case the file does not exist in the custom image. # Make sure python3 & pip3 are available on this image. # We set auto_activate_base to be false for pre-installed conda. @@ -187,7 +186,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (which conda > /dev/null 2>&1 && conda init > /dev/null) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index a489bf6d1ec..b3755278af6 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -126,7 +126,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval 
"$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; + (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 3221ab2c21f..80283170b86 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -55,14 +55,36 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content + - key: user-data + value: | + #cloud-config + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py + - python3 install_gpu_driver.py + {%- else %} + - key: install-nvidia-driver + value: true + {%- endif %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} - metadata: - items: - - key: install-nvidia-driver - value: "True" {%- endif %} scheduling: {%- if use_spot %} @@ -98,14 +120,36 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content + - key: user-data + value: | + #cloud-config + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl 
diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2
index 3221ab2c21f..80283170b86 100644
--- a/sky/templates/gcp-ray.yml.j2
+++ b/sky/templates/gcp-ray.yml.j2
@@ -55,14 +55,36 @@ available_node_types:
           # See https://cloud.google.com/deep-learning-vm/docs/images
           sourceImage: {{image_id}}
           diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
+      metadata:
+        items:
+          - key: ssh-keys
+            value: |
+              skypilot:ssh_user:skypilot:ssh_public_key_content
+          - key: user-data
+            value: |
+              #cloud-config
+              write_files:
+                - path: /etc/apt/apt.conf.d/20auto-upgrades
+                  content: |
+                    APT::Periodic::Update-Package-Lists "0";
+                    APT::Periodic::Download-Upgradeable-Packages "0";
+                    APT::Periodic::AutocleanInterval "0";
+                    APT::Periodic::Unattended-Upgrade "0";
+                - path: /etc/apt/apt.conf.d/10cloudinit-disable
+                  content: |
+                    APT::Periodic::Enable "0";
 {%- if gpu is not none %}
+{%- if 'tesla-k80' in gpu %}
+              runcmd:
+                - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py
+                - python3 install_gpu_driver.py
+{%- else %}
+          - key: install-nvidia-driver
+            value: true
+{%- endif %}
       guestAccelerators:
         - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
           acceleratorCount: {{gpu_count}}
-      metadata:
-        items:
-          - key: install-nvidia-driver
-            value: "True"
 {%- endif %}
       scheduling:
 {%- if use_spot %}
@@ -98,14 +120,36 @@ available_node_types:
           # See https://cloud.google.com/deep-learning-vm/docs/images
           sourceImage: {{image_id}}
           diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
+      metadata:
+        items:
+          - key: ssh-keys
+            value: |
+              skypilot:ssh_user:skypilot:ssh_public_key_content
+          - key: user-data
+            value: |
+              #cloud-config
+              write_files:
+                - path: /etc/apt/apt.conf.d/20auto-upgrades
+                  content: |
+                    APT::Periodic::Update-Package-Lists "0";
+                    APT::Periodic::Download-Upgradeable-Packages "0";
+                    APT::Periodic::AutocleanInterval "0";
+                    APT::Periodic::Unattended-Upgrade "0";
+                - path: /etc/apt/apt.conf.d/10cloudinit-disable
+                  content: |
+                    APT::Periodic::Enable "0";
 {%- if gpu is not none %}
+{%- if 'tesla-k80' in gpu %}
+              runcmd:
+                - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py
+                - python3 install_gpu_driver.py
+{%- else %}
+          - key: install-nvidia-driver
+            value: true
+{%- endif %}
       guestAccelerators:
         - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
           acceleratorCount: {{gpu_count}}
-      metadata:
-        items:
-          - key: install-nvidia-driver
-            value: "True"
 {%- endif %}
       scheduling:
 {%- if use_spot %}
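After `_replace_cloud_init_ssh_info_in_config` runs, the placeholders in the metadata above become concrete values. Roughly what a rendered head-node metadata block looks like, expressed as the Python dict the GCP API receives (the key material and the trimmed user-data are illustrative, not taken from a real cluster):

    node_config_metadata = {
        'items': [
            {
                'key': 'ssh-keys',
                'value': 'gcpuser:ssh-rsa AAAAB3NzaC1yc2E... gcpuser\n',
            },
            {
                'key': 'user-data',
                'value': ('#cloud-config\n'
                          'write_files:\n'
                          '  - path: /etc/apt/apt.conf.d/20auto-upgrades\n'
                          '    content: |\n'
                          '      APT::Periodic::Unattended-Upgrade "0";\n'
                          # ... remaining apt settings as in the template ...
                         ),
            },
            # For non-K80 GPUs the template also appends:
            # {'key': 'install-nvidia-driver', 'value': 'true'},
        ]
    }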
@@ -140,27 +184,21 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1 (+1 if tpu_vm)
 setup_commands:
-  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
   # Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image.
   # Line 'pip3 --v ..': Make sure python3 & pip3 are available on this image.
   # Line 'which conda ..': some images (TPU VM) do not install conda by
   # default. 'source ~/.bashrc' is needed so conda takes effect for the next
   # commands.
+  # Line 'pip3 list | ..': Ensure only one Ray version (our ray_version) is installed,
+  # regardless of whether the image comes pre-installed with another Ray version. The
+  # 'python3 -c "import ray"' checks the integrity of the installed ray package; the
+  # check is needed because a reboot of the machine during the ray installation may
+  # corrupt it.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited, to avoid ray job submissions getting stuck as the number of running ray jobs increases.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
-    sudo systemctl stop unattended-upgrades || true;
-    sudo systemctl disable unattended-upgrades || true;
-    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
-    p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p";
-    sudo kill -9 `echo "$p" | tail -n 1` || true;
-    sudo rm /var/lib/dpkg/lock-frontend;
-    sudo pkill -9 dpkg;
-    sudo pkill -9 apt-get;
-    sudo dpkg --configure --force-overwrite -a;
-    mkdir -p ~/.ssh; touch ~/.ssh/config;
+  - mkdir -p ~/.ssh; touch ~/.ssh/config;
     pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
     (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
     (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
@@ -170,8 +208,8 @@ setup_commands:
     test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true;
     pip3 install --upgrade google-api-python-client;
 {%- endif %}
-    (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
-    (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
+    (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null && python3 -c "import ray" || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
+    (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) && python3 -c "import sky" || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
@@ -184,12 +222,26 @@ setup_commands:
 # items! The same comment applies for worker_start_ray_commands.
 #
 # Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
+# current num items (num SSH connections): 2
 head_start_ray_commands:
     # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before sky is installed.)
     # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
     # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
     # all the sessions to be reloaded. This is a workaround.
+{%- if gpu is not none %}
+  - |
+    echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog
+    while ! nvidia-smi &> /dev/null
+    do
+      echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog
+      sleep 5
+    done
+    # Magic number: wait for the nvidia driver to be ready and for the
+    # instance reboot to complete. Determined by experiment; it is the
+    # minimum value that works.
+    sleep 18
+    echo "NVIDIA GPU is ready." >> ~/.sky/nvlog
+{%- endif %}
   - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &);
     export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
     ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
@@ -199,6 +251,20 @@ head_start_ray_commands:
 # Worker commands are needed for TPU VM Pods
 {%- if num_nodes > 1 or tpu_vm %}
 worker_start_ray_commands:
+{%- if gpu is not none %}
+  - |
+    echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog
+    while ! nvidia-smi &> /dev/null
+    do
+      echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog
+      sleep 5
+    done
+    # Magic number: wait for the nvidia driver to be ready and for the
+    # instance reboot to complete. Determined by experiment; it is the
+    # minimum value that works.
+    sleep 18
+    echo "NVIDIA GPU is ready." >> ~/.sky/nvlog
+{%- endif %}
   - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
     ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
     which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
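The wait loop added to the head and worker start commands blocks `ray start` until the asynchronously installed NVIDIA driver is usable. Its logic, restated in Python for clarity (a sketch; the actual command runs as bash on the node, and this assumes nvidia-smi is already on PATH):

    import subprocess
    import time

    def wait_for_gpu_driver(grace_period_seconds: int = 18) -> None:
        # Poll until the driver responds, then sleep through the
        # post-install reboot window (the "magic number" in the template).
        while subprocess.run(['nvidia-smi'],
                             stdout=subprocess.DEVNULL,
                             stderr=subprocess.DEVNULL,
                             check=False).returncode != 0:
            time.sleep(5)
        time.sleep(grace_period_seconds)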
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index dcc159a9888..a9d05388d4c 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -916,6 +916,7 @@ def test_job_queue(generic_cloud: str):
             f'sky logs {name} 5 --status',
         ],
         f'sky down -y {name}',
+        timeout=20 * 60,
     )
     run_one_test(test)
 
@@ -1149,7 +1150,7 @@ def test_multi_echo(generic_cloud: str):
         # unfulfilled' error. If process not found, grep->ssh returns 1.
         [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''],
         f'sky down -y {name}',
-        timeout=20 * 60,
+        timeout=25 * 60,
     )
     run_one_test(test)
 
@@ -1325,6 +1326,7 @@ def test_gcp_start_stop():
             f'sky status -r {name} | grep "INIT\|STOPPED"',
         ],
         f'sky down -y {name}',
+        timeout=20 * 60,
    )
     run_one_test(test)
 
@@ -2229,7 +2231,7 @@ def _get_aws_query_command(region, instance_id, field, expected):
                                            specs['disk_throughput']))),
         ],
         f'sky down -y {name}',
-        timeout=10 * 60,  # 10 mins  (it takes around ~6 mins)
+        timeout=12 * 60,
     )
     run_one_test(test)