Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Default to ubuntu for GCP and avoid key pair checking #1641

Closed
wants to merge 16 commits into from
Closed
189 changes: 111 additions & 78 deletions sky/authentication.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module to enable a single SkyPilot key for all VMs in each cloud."""
import copy
import functools
import json
import os
import re
import socket
Expand Down Expand Up @@ -132,6 +133,105 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
return result


def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project,
                                                config: Dict[str, Any],
                                                os_login_enabled: bool):
    """Add the SkyPilot ssh key to the GCP project if a Debian image is used.

    This function is for backward compatibility only. The legacy Debian
    images do not ship with cloud-init, so the public key cannot be injected
    via instance user data; instead it must be registered with OS Login or
    written into the project-wide ``ssh-keys`` metadata so that we can ssh
    into the instance.

    Args:
        compute: GCP Compute Engine API client resource.
        project: Project resource dict as returned by
            ``compute.projects().get(...)``; its ``commonInstanceMetadata``
            may be mutated and written back.
        config: Cluster config dict; ``auth.ssh_user`` and the head node's
            ``sourceImage`` are read from it.
        os_login_enabled: Whether OS Login is enabled for the project;
            selects between the OS Login flow and the metadata flow.

    Returns:
        ``config`` when the OS Login path is taken, ``None`` otherwise.
        (The caller does not use the return value.)
    """
    private_key_path, public_key_path = get_or_generate_keys()
    user = config['auth']['ssh_user']

    node_config = config.get('available_node_types',
                             {}).get('ray_head_default',
                                     {}).get('node_config', {})
    image_id = node_config.get('disks', [{}])[0].get('initializeParams',
                                                     {}).get('sourceImage')
    # image_id is None when TPU VM is used, as TPU VM does not use image.
    if image_id is not None and 'debian' not in image_id.lower():
        # The image id itself does not mention Debian; inspect the full
        # image metadata before deciding to skip.
        image_info = clouds.GCP.get_image_info(image_id)
        if 'debian' not in json.dumps(image_info).lower():
            # The non-Debian images have the ssh key setup by cloud-init.
            return
    logger.info('Adding ssh key to GCP account.')
    if os_login_enabled:
        # Add ssh key to GCP with oslogin.
        subprocess.run(
            'gcloud compute os-login ssh-keys add '
            f'--key-file={public_key_path}',
            check=True,
            shell=True,
            stdout=subprocess.DEVNULL)
        # Enable ssh port for all the instances. An existing rule is not an
        # error, so 'already exists' in stderr is tolerated.
        enable_ssh_cmd = ('gcloud compute firewall-rules create '
                          'allow-ssh-ingress-from-iap '
                          '--direction=INGRESS '
                          '--action=allow '
                          '--rules=tcp:22 '
                          '--source-ranges=0.0.0.0/0')
        proc = subprocess.run(enable_ssh_cmd,
                              check=False,
                              shell=True,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE)
        if proc.returncode != 0:
            stderr = proc.stderr.decode('utf-8')
            if 'already exists' not in stderr:
                subprocess_utils.handle_returncode(proc.returncode,
                                                   enable_ssh_cmd,
                                                   'Failed to enable ssh port.',
                                                   stderr)
        return config

    # OS Login is not enabled for the project. Add the ssh key directly to
    # the project metadata.
    project_keys: str = next(  # type: ignore
        (item for item in project['commonInstanceMetadata'].get('items', [])
         if item['key'] == 'ssh-keys'), {}).get('value', '')
    ssh_keys = project_keys.split('\n') if project_keys else []

    # Get public key from file.
    with open(public_key_path, 'r') as f:
        public_key = f.read()

    # Key material: second field of 'ssh-rsa <token> <comment>'.
    public_key_token = public_key.split(' ')[1]

    # Check if our ssh key is already in the project's metadata. The local
    # private key must also exist, otherwise the registered public key is
    # unusable and the (re)generated pair must be re-added.
    key_found = False
    if os.path.exists(private_key_path):
        for key in ssh_keys:
            key_list = key.split(' ')
            if len(key_list) != 3:
                continue
            if user == key_list[-1] and key_list[1] == public_key_token:
                key_found = True
                break

    if not key_found:
        new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
            user=user, public_key_token=public_key_token)
        metadata = project['commonInstanceMetadata'].get('items', [])

        # There should be at most one 'ssh-keys' entry in the metadata.
        ssh_key_index = [
            k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
        ]
        assert len(ssh_key_index) <= 1

        if len(ssh_key_index) == 0:
            metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
        else:
            first_ssh_key_index = ssh_key_index[0]
            metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key

        project['commonInstanceMetadata']['items'] = metadata

        operation = compute.projects().setCommonInstanceMetadata(
            project=project['name'],
            body=project['commonInstanceMetadata']).execute()
        _wait_for_compute_global_operation(project['name'], operation['name'],
                                           compute)


# Snippets of code inspired from
# https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
# Takes in config, a yaml dict and outputs a postprocessed dict
Expand All @@ -140,15 +240,16 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
# Retry for the GCP as sometimes there will be connection reset by peer error.
@common_utils.retry
def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
private_key_path, public_key_path = get_or_generate_keys()
_, public_key_path = get_or_generate_keys()
with open(public_key_path, 'r') as f:
public_key = f.read()
config = copy.deepcopy(config)

project_id = config['provider']['project_id']
compute = gcp.build('compute',
'v1',
credentials=None,
cache_discovery=False)
user = config['auth']['ssh_user']

try:
project = compute.projects().get(project=project_id).execute()
Expand Down Expand Up @@ -191,7 +292,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
(item for item in project['commonInstanceMetadata'].get('items', [])
if item['key'] == 'enable-oslogin'), {}).get('value', 'False')

if project_oslogin.lower() == 'true':
oslogin_enabled = project_oslogin.lower() == 'true'
if oslogin_enabled:
# project.
logger.info(
f'OS Login is enabled for GCP project {project_id}. Running '
Expand All @@ -218,81 +320,12 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
'account information.')
config['auth']['ssh_user'] = account.replace('@', '_').replace('.', '_')

# Add ssh key to GCP with oslogin
subprocess.run(
'gcloud compute os-login ssh-keys add '
f'--key-file={public_key_path}',
check=True,
shell=True,
stdout=subprocess.DEVNULL)
# Enable ssh port for all the instances
enable_ssh_cmd = ('gcloud compute firewall-rules create '
'allow-ssh-ingress-from-iap '
'--direction=INGRESS '
'--action=allow '
'--rules=tcp:22 '
'--source-ranges=0.0.0.0/0')
proc = subprocess.run(enable_ssh_cmd,
check=False,
shell=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE)
if proc.returncode != 0 and 'already exists' not in proc.stderr.decode(
'utf-8'):
subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd,
'Failed to enable ssh port.',
proc.stderr.decode('utf-8'))
return config

# OS Login is not enabled for the project. Add the ssh key directly to the
# metadata.
# TODO(zhwu): Use cloud init to add ssh public key, to avoid the permission
# issue. A blocker is that the cloud init is not installed in the debian
# image by default.
project_keys: str = next( # type: ignore
(item for item in project['commonInstanceMetadata'].get('items', [])
if item['key'] == 'ssh-keys'), {}).get('value', '')
ssh_keys = project_keys.split('\n') if project_keys else []

# Get public key from file.
with open(public_key_path, 'r') as f:
public_key = f.read()

# Check if ssh key in Google Project's metadata
public_key_token = public_key.split(' ')[1]

key_found = False
for key in ssh_keys:
key_list = key.split(' ')
if len(key_list) != 3:
continue
if user == key_list[-1] and os.path.exists(
private_key_path) and key_list[1] == public_key.split(' ')[1]:
key_found = True

if not key_found:
new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
user=user, public_key_token=public_key_token)
metadata = project['commonInstanceMetadata'].get('items', [])

ssh_key_index = [
k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
]
assert len(ssh_key_index) <= 1

if len(ssh_key_index) == 0:
metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
else:
first_ssh_key_index = ssh_key_index[0]
metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key

project['commonInstanceMetadata']['items'] = metadata

operation = compute.projects().setCommonInstanceMetadata(
project=project['name'],
body=project['commonInstanceMetadata']).execute()
_wait_for_compute_global_operation(project['name'], operation['name'],
compute)
config = _replace_cloud_init_ssh_info_in_config(config, public_key)
# This function is for backward compatibility, as the user using the old
# Debian-based image may not have the cloud-init enabled, and we need to
# add the ssh key to the account.
_maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config,
oslogin_enabled)
return config


Expand Down
55 changes: 31 additions & 24 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import subprocess
import time
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -69,6 +69,11 @@

# TODO(zhwu): Move the default AMI size to the catalog instead.
# Default boot-disk size (GB) assumed for 'skypilot:' tagged images.
DEFAULT_GCP_IMAGE_GB = 50
# Default image tag for CPU-only instances (Ubuntu 20.04 based).
_DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004'
# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6.
# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install
# the older CUDA driver in the gcp-ray.yaml to support K80).
_DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004'


def _run_output(cmd):
Expand Down Expand Up @@ -235,26 +240,24 @@ def get_egress_cost(self, num_gigabytes):
def is_same_cloud(self, other):
return isinstance(other, GCP)

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
del region # unused
if image_id.startswith('skypilot:'):
return DEFAULT_GCP_IMAGE_GB
@classmethod
def get_image_info(cls, image_id) -> Dict[str, Any]:
try:
compute = gcp.build('compute',
'v1',
credentials=None,
cache_discovery=False)
except gcp.credential_error_exception() as e:
return DEFAULT_GCP_IMAGE_GB
return {}
try:
image_attrs = image_id.split('/')
if len(image_attrs) == 1:
raise ValueError(f'Image {image_id!r} not found in GCP.')
project = image_attrs[1]
image_name = image_attrs[-1]
image_infos = compute.images().get(project=project,
image=image_name).execute()
return float(image_infos['diskSizeGb'])
image_info = compute.images().get(project=project,
image=image_name).execute()
return image_info
except gcp.http_error_exception() as e:
if e.resp.status == 403:
with ux_utils.print_exception_no_traceback():
Expand All @@ -266,6 +269,21 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
'GCP.') from None
raise

def get_image_size(self, image_id: str, region: Optional[str]) -> float:
    """Return the size of the image in GB, best-effort.

    Args:
        image_id: Either a 'skypilot:' image tag or a GCP image path of the
            form 'projects/<project>/global/images/<name>'.
        region: Unused (GCP images are global).

    Returns:
        The image's disk size in GB; falls back to ``DEFAULT_GCP_IMAGE_GB``
        when the size cannot be determined.
    """
    del region  # unused
    if image_id.startswith('skypilot:'):
        # Hack: this utilizes the knowledge that both the selected debian
        # and ubuntu images on GCP have the same size of 50GB, to reduce
        # the overhead for querying the image size.
        return DEFAULT_GCP_IMAGE_GB
    image_info = self.get_image_info(image_id)
    # All the images in GCP should have the diskSizeGb field, but just in
    # case, we do not want to crash the program, as the image size check is
    # not critical.
    return float(image_info.get('diskSizeGb', DEFAULT_GCP_IMAGE_GB))

@classmethod
def get_default_instance_type(
cls,
Expand All @@ -287,10 +305,8 @@ def make_deploy_resources_variables(

# gcloud compute images list \
# --project deeplearning-platform-release \
# --no-standard-images
# We use the debian image, as the ubuntu image has some connectivity
# issue when first booted.
image_id = 'skypilot:cpu-debian-10'
# --no-standard-images | grep ubuntu-2004
image_id = _DEFAULT_CPU_IMAGE

r = resources
# Find GPU spec, if any.
Expand Down Expand Up @@ -330,17 +346,8 @@ def make_deploy_resources_variables(
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
acc.lower())
resources_vars['gpu_count'] = acc_count
if acc == 'K80':
# Though the image is called cu113, it actually has later
# versions of CUDA as noted below.
# CUDA driver version 470.57.02, CUDA Library 11.4
image_id = 'skypilot:k80-debian-10'
else:
# Though the image is called cu113, it actually has later
# versions of CUDA as noted below.
# CUDA driver version 510.47.03, CUDA Library 11.6
# Does not support torch==1.13.0 with cu117
image_id = 'skypilot:gpu-debian-10'

image_id = _DEFAULT_GPU_IMAGE

if resources.image_id is not None:
if None in resources.image_id:
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/service_catalog/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
import os

# URL of the hosted SkyPilot catalog files.
HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
# Schema version of the catalogs. NOTE(review): bumped v5 -> v6 in this
# change; presumably so locally cached catalogs are refetched — confirm
# against the catalog loading code.
CATALOG_SCHEMA_VERSION = 'v6'
# Local directory where the catalogs are cached.
LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/')
Loading