diff --git a/sky/authentication.py b/sky/authentication.py
index f6181620c3d..d77d4a098f4 100644
--- a/sky/authentication.py
+++ b/sky/authentication.py
@@ -1,6 +1,7 @@
 """Module to enable a single SkyPilot key for all VMs in each cloud."""
 import copy
 import functools
+import json
 import os
 import re
 import socket
@@ -102,7 +103,7 @@ def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any],
 def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r') as f:
-        public_key = f.read()
+        public_key = f.read().strip()
     config = _replace_cloud_init_ssh_info_in_config(config, public_key)
     return config
 
@@ -132,6 +133,105 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
     return result
 
 
+def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project,
+                                                config: Dict[str, Any],
+                                                os_login_enabled: bool):
+    """Add the ssh key to the GCP account if a Debian image without cloud-init is used.
+
+    This function exists for backward compatibility: it is only needed when
+    the user is on the old Debian image, which does not ship with cloud-init.
+    In that case we have to add the ssh key to the GCP account directly so
+    that we can ssh into the instance.
+    """
+    private_key_path, public_key_path = get_or_generate_keys()
+    user = config['auth']['ssh_user']
+
+    node_config = config.get('available_node_types',
+                             {}).get('ray_head_default',
+                                     {}).get('node_config', {})
+    image_id = node_config.get('disks', [{}])[0].get('initializeParams',
+                                                     {}).get('sourceImage')
+    # image_id is None when a TPU VM is used, as TPU VMs do not use images.
+    if image_id is not None and 'debian' not in image_id.lower():
+        image_info = clouds.GCP.get_image_info(image_id)
+        if 'debian' not in json.dumps(image_info).lower():
+            # Non-Debian images have the ssh key set up by cloud-init.
+            return
+    logger.info('Adding ssh key to GCP account.')
+    if os_login_enabled:
+        # Add the ssh key to GCP with OS Login.
+        subprocess.run(
+            'gcloud compute os-login ssh-keys add '
+            f'--key-file={public_key_path}',
+            check=True,
+            shell=True,
+            stdout=subprocess.DEVNULL)
+        # Enable the ssh port for all the instances.
+        enable_ssh_cmd = ('gcloud compute firewall-rules create '
+                          'allow-ssh-ingress-from-iap '
+                          '--direction=INGRESS '
+                          '--action=allow '
+                          '--rules=tcp:22 '
+                          '--source-ranges=0.0.0.0/0')
+        proc = subprocess.run(enable_ssh_cmd,
+                              check=False,
+                              shell=True,
+                              stdout=subprocess.DEVNULL,
+                              stderr=subprocess.PIPE)
+        if proc.returncode != 0 and 'already exists' not in proc.stderr.decode(
+                'utf-8'):
+            subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd,
+                                               'Failed to enable ssh port.',
+                                               proc.stderr.decode('utf-8'))
+        return
+
+    # OS Login is not enabled for the project. Add the ssh key directly to the
+    # project metadata.
+    project_keys: str = next(  # type: ignore
+        (item for item in project['commonInstanceMetadata'].get('items', [])
+         if item['key'] == 'ssh-keys'), {}).get('value', '')
+    ssh_keys = project_keys.split('\n') if project_keys else []
+
+    # Get the public key from the file.
+    with open(public_key_path, 'r') as f:
+        public_key = f.read()
+
+    # Check whether the ssh key is already in the Google project's metadata.
+    public_key_token = public_key.split(' ')[1]
+
+    key_found = False
+    for key in ssh_keys:
+        key_list = key.split(' ')
+        if len(key_list) != 3:
+            continue
+        if user == key_list[-1] and os.path.exists(
+                private_key_path) and key_list[1] == public_key_token:
+            key_found = True
+
+    if not key_found:
+        new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
+            user=user, public_key_token=public_key_token)
+        metadata = project['commonInstanceMetadata'].get('items', [])
+
+        ssh_key_index = [
+            k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
+        ]
+        assert len(ssh_key_index) <= 1
+
+        if len(ssh_key_index) == 0:
+            metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
+        else:
+            first_ssh_key_index = ssh_key_index[0]
+            metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key
+
+        project['commonInstanceMetadata']['items'] = metadata
+
+        operation = compute.projects().setCommonInstanceMetadata(
+            project=project['name'],
+            body=project['commonInstanceMetadata']).execute()
+        _wait_for_compute_global_operation(project['name'], operation['name'],
+                                           compute)
+
+
 # Snippets of code inspired from
 # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
 # Takes in config, a yaml dict and outputs a postprocessed dict
@@ -140,7 +240,9 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str,
 # Retry for the GCP as sometimes there will be connection reset by peer error.
 @common_utils.retry
 def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
-    private_key_path, public_key_path = get_or_generate_keys()
+    _, public_key_path = get_or_generate_keys()
+    with open(public_key_path, 'r') as f:
+        public_key = f.read()
     config = copy.deepcopy(config)
 
     project_id = config['provider']['project_id']
@@ -148,7 +250,6 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
                         'v1',
                         credentials=None,
                         cache_discovery=False)
-    user = config['auth']['ssh_user']
 
     try:
         project = compute.projects().get(project=project_id).execute()
@@ -191,7 +292,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
         (item for item in project['commonInstanceMetadata'].get('items', [])
         if item['key'] == 'enable-oslogin'), {}).get('value', 'False')
 
-    if project_oslogin.lower() == 'true':
+    oslogin_enabled = project_oslogin.lower() == 'true'
+    if oslogin_enabled:
         logger.info(
             f'OS Login is enabled for GCP project {project_id}. Running '
             'additional authentication steps.')
@@ -243,81 +345,12 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
         os_login_username = account.replace('@', '_').replace('.', '_')
         config['auth']['ssh_user'] = os_login_username
 
-        # Add ssh key to GCP with oslogin
-        subprocess.run(
-            'gcloud compute os-login ssh-keys add '
-            f'--key-file={public_key_path}',
-            check=True,
-            shell=True,
-            stdout=subprocess.DEVNULL)
-        # Enable ssh port for all the instances
-        enable_ssh_cmd = ('gcloud compute firewall-rules create '
-                          'allow-ssh-ingress-from-iap '
-                          '--direction=INGRESS '
-                          '--action=allow '
-                          '--rules=tcp:22 '
-                          '--source-ranges=0.0.0.0/0')
-        proc = subprocess.run(enable_ssh_cmd,
-                              check=False,
-                              shell=True,
-                              stdout=subprocess.DEVNULL,
-                              stderr=subprocess.PIPE)
-        if proc.returncode != 0 and 'already exists' not in proc.stderr.decode(
-                'utf-8'):
-            subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd,
-                                               'Failed to enable ssh port.',
-                                               proc.stderr.decode('utf-8'))
-        return config
-
-    # OS Login is not enabled for the project. Add the ssh key directly to the
-    # metadata.
-    # TODO(zhwu): Use cloud init to add ssh public key, to avoid the permission
-    # issue. A blocker is that the cloud init is not installed in the debian
-    # image by default.
-    project_keys: str = next(  # type: ignore
-        (item for item in project['commonInstanceMetadata'].get('items', [])
-         if item['key'] == 'ssh-keys'), {}).get('value', '')
-    ssh_keys = project_keys.split('\n') if project_keys else []
-
-    # Get public key from file.
-    with open(public_key_path, 'r') as f:
-        public_key = f.read()
-
-    # Check if ssh key in Google Project's metadata
-    public_key_token = public_key.split(' ')[1]
-
-    key_found = False
-    for key in ssh_keys:
-        key_list = key.split(' ')
-        if len(key_list) != 3:
-            continue
-        if user == key_list[-1] and os.path.exists(
-                private_key_path) and key_list[1] == public_key.split(' ')[1]:
-            key_found = True
-
-    if not key_found:
-        new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format(
-            user=user, public_key_token=public_key_token)
-        metadata = project['commonInstanceMetadata'].get('items', [])
-
-        ssh_key_index = [
-            k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys'
-        ]
-        assert len(ssh_key_index) <= 1
-
-        if len(ssh_key_index) == 0:
-            metadata.append({'key': 'ssh-keys', 'value': new_ssh_key})
-        else:
-            first_ssh_key_index = ssh_key_index[0]
-            metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key
-
-        project['commonInstanceMetadata']['items'] = metadata
-
-        operation = compute.projects().setCommonInstanceMetadata(
-            project=project['name'],
-            body=project['commonInstanceMetadata']).execute()
-        _wait_for_compute_global_operation(project['name'], operation['name'],
-                                           compute)
+    config = _replace_cloud_init_ssh_info_in_config(config, public_key)
+    # This call is for backward compatibility: a user on the old Debian-based
+    # image may not have cloud-init enabled, so the ssh key also has to be
+    # added to the account directly.
+    _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config,
+                                                oslogin_enabled)
 
     return config
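For context, `_replace_cloud_init_ssh_info_in_config` (defined earlier in sky/authentication.py and not shown in this diff) is what fills in the `skypilot:ssh_user` and `skypilot:ssh_public_key_content` placeholders that the gcp-ray.yml.j2 template below embeds in the instance metadata. A minimal sketch of that substitution, assuming the helper simply rewrites the placeholder strings in each node type's metadata (hypothetical code, not the actual implementation):

    from typing import Any, Dict

    def replace_ssh_info(config: Dict[str, Any],
                         public_key: str) -> Dict[str, Any]:
        ssh_user = config['auth']['ssh_user']
        for node_type in config.get('available_node_types', {}).values():
            metadata = node_type.get('node_config', {}).get('metadata', {})
            for item in metadata.get('items', []):
                # The placeholders appear in the 'ssh-keys' and 'user-data'
                # values rendered from the template.
                item['value'] = (item['value'].replace(
                    'skypilot:ssh_user',
                    ssh_user).replace('skypilot:ssh_public_key_content',
                                      public_key))
        return config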
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index c9d44f54cf8..deb98c2016a 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -5,7 +5,7 @@
 import subprocess
 import time
 import typing
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple
 
 from sky import clouds
 from sky import exceptions
@@ -77,6 +77,11 @@
 # TODO(zhwu): Move the default AMI size to the catalog instead.
 DEFAULT_GCP_IMAGE_GB = 50
 
+_DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004'
+# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6.
+# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install
+# the older CUDA driver in gcp-ray.yml.j2 to support K80).
+_DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004'
 
 
 def _run_output(cmd):
@@ -243,26 +248,24 @@ def get_egress_cost(self, num_gigabytes):
     def is_same_cloud(self, other):
         return isinstance(other, GCP)
 
-    def get_image_size(self, image_id: str, region: Optional[str]) -> float:
-        del region  # unused
-        if image_id.startswith('skypilot:'):
-            return DEFAULT_GCP_IMAGE_GB
+    @classmethod
+    def get_image_info(cls, image_id: str) -> Dict[str, Any]:
         try:
             compute = gcp.build('compute',
                                 'v1',
                                 credentials=None,
                                 cache_discovery=False)
         except gcp.credential_error_exception() as e:
-            return DEFAULT_GCP_IMAGE_GB
+            return {}
         try:
             image_attrs = image_id.split('/')
             if len(image_attrs) == 1:
                 raise ValueError(f'Image {image_id!r} not found in GCP.')
             project = image_attrs[1]
             image_name = image_attrs[-1]
-            image_infos = compute.images().get(project=project,
-                                               image=image_name).execute()
-            return float(image_infos['diskSizeGb'])
+            image_info = compute.images().get(project=project,
+                                              image=image_name).execute()
+            return image_info
         except gcp.http_error_exception() as e:
             if e.resp.status == 403:
                 with ux_utils.print_exception_no_traceback():
@@ -274,6 +277,21 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float:
                         'GCP.') from None
             raise
 
+    def get_image_size(self, image_id: str, region: Optional[str]) -> float:
+        del region  # unused
+        if image_id.startswith('skypilot:'):
+            # Hack: this relies on the knowledge that the selected Debian and
+            # Ubuntu images on GCP both have a size of 50 GB, which lets us
+            # skip the overhead of querying the image size.
+            return DEFAULT_GCP_IMAGE_GB
+        image_info = self.get_image_info(image_id)
+        if 'diskSizeGb' not in image_info:
+            # Every GCP image should have the diskSizeGb field, but fall back
+            # to the default just in case; the image size check is not
+            # critical enough to crash the program over.
+            return DEFAULT_GCP_IMAGE_GB
+        return float(image_info['diskSizeGb'])
+
     @classmethod
     def get_default_instance_type(
         cls,
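The new `get_image_info` classmethod returns the raw image resource from the GCP Compute API, which is what lets `_maybe_add_ssh_key_to_gcp_project_if_debian` above detect old Debian images by inspecting the response. A brief usage sketch (the image id here is a hypothetical example, not one used by SkyPilot):

    import json

    from sky import clouds

    image_info = clouds.GCP.get_image_info(
        'projects/debian-cloud/global/images/debian-10-buster-v20220920')
    if 'debian' in json.dumps(image_info).lower():
        # Old Debian image without cloud-init: fall back to adding the ssh
        # key via project metadata / OS Login.
        ...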
@@ -295,10 +313,8 @@ def make_deploy_resources_variables(
 
         #   gcloud compute images list \
         #   --project deeplearning-platform-release \
-        #   --no-standard-images
-        # We use the debian image, as the ubuntu image has some connectivity
-        # issue when first booted.
-        image_id = 'skypilot:cpu-debian-10'
+        #   --no-standard-images | grep ubuntu-2004
+        image_id = _DEFAULT_CPU_IMAGE
 
         r = resources
         # Find GPU spec, if any.
@@ -338,17 +354,8 @@ def make_deploy_resources_variables(
                 resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
                     acc.lower())
                 resources_vars['gpu_count'] = acc_count
-                if acc == 'K80':
-                    # Though the image is called cu113, it actually has later
-                    # versions of CUDA as noted below.
-                    # CUDA driver version 470.57.02, CUDA Library 11.4
-                    image_id = 'skypilot:k80-debian-10'
-                else:
-                    # Though the image is called cu113, it actually has later
-                    # versions of CUDA as noted below.
-                    # CUDA driver version 510.47.03, CUDA Library 11.6
-                    # Does not support torch==1.13.0 with cu117
-                    image_id = 'skypilot:gpu-debian-10'
+
+                image_id = _DEFAULT_GPU_IMAGE
 
         if resources.image_id is not None:
             if None in resources.image_id:
diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py
index e55f5b33506..aa323456a90 100644
--- a/sky/clouds/service_catalog/common.py
+++ b/sky/clouds/service_catalog/common.py
@@ -67,7 +67,7 @@ def read_catalog(filename: str,
     """
     assert filename.endswith('.csv'), 'The catalog file must be a CSV file.'
     assert (pull_frequency_hours is None or
-            pull_frequency_hours > 0), pull_frequency_hours
+            pull_frequency_hours >= 0), pull_frequency_hours
 
     catalog_path = get_catalog_path(filename)
     cloud = cloud_lib.CLOUD_REGISTRY.from_str(os.path.dirname(filename))
diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py
index 3558e8b7f0a..ccbfeadc1d4 100644
--- a/sky/clouds/service_catalog/gcp_catalog.py
+++ b/sky/clouds/service_catalog/gcp_catalog.py
@@ -26,6 +26,10 @@
                           pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 _image_df = common.read_catalog('gcp/images.csv',
                                 pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+if _image_df[_image_df['Tag'] == 'skypilot:cpu-ubuntu-2004'].empty:
+    # Force-refresh the image catalog if it does not yet include the updated
+    # images from https://github.com/skypilot-org/skypilot-catalog/pull/25.
+    _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=0)
 
 _TPU_REGIONS = [
     'us-central1',
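The relaxed assertion in `read_catalog` (`>= 0` instead of `> 0`) is what makes the forced refresh above possible: with `pull_frequency_hours=0`, any cached catalog counts as stale. A sketch of the staleness check this implies (an assumption about `read_catalog`'s internals, which are not part of this diff):

    import os
    import time

    def _catalog_is_stale(catalog_path: str,
                          pull_frequency_hours: int) -> bool:
        if not os.path.exists(catalog_path):
            return True
        age_hours = (time.time() - os.path.getmtime(catalog_path)) / 3600
        # With pull_frequency_hours == 0 this is always True for a cached
        # file, forcing an immediate re-download of the catalog.
        return age_hours >= pull_frequency_hours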
+ """ + retry_cnt = 0 + import click + + while True: + try: + return super()._run_helper( + final_cmd, with_output, exit_on_fail, silent + ) + except click.ClickException as e: + retry_cnt += 1 + if retry_cnt > 3: + raise e + logger.info(f"Retrying SSH command in 5 seconds: {e}") + time.sleep(5) + + # Adopted from super().get_command_runner() + common_args = { + "log_prefix": log_prefix, + "node_id": node_id, + "provider": self, + "auth_config": auth_config, + "cluster_name": cluster_name, + "process_runner": process_runner, + "use_internal_ip": use_internal_ip, + } + if docker_config and docker_config["container_name"] != "": + return DockerCommandRunner(docker_config, **common_args) + else: + return SSHCommandRunnerWithRetry(**common_args) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 11115723a1c..b87223bede8 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -172,7 +172,6 @@ initialization_commands: [] # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 1 setup_commands: - # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.) # Create ~/.ssh/config file in case the file does not exist in the custom image. # Make sure python3 & pip3 are available on this image. # We set auto_activate_base to be false for pre-installed conda. @@ -187,7 +186,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (which conda > /dev/null 2>&1 && conda init > /dev/null) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index a489bf6d1ec..b3755278af6 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -126,7 +126,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval 
"$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; + (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 3221ab2c21f..80283170b86 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -55,14 +55,36 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content + - key: user-data + value: | + #cloud-config + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py + - python3 install_gpu_driver.py + {%- else %} + - key: install-nvidia-driver + value: true + {%- endif %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} - metadata: - items: - - key: install-nvidia-driver - value: "True" {%- endif %} scheduling: {%- if use_spot %} @@ -98,14 +120,36 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content + - key: user-data + value: | + #cloud-config + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl 
diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2
index 3221ab2c21f..80283170b86 100644
--- a/sky/templates/gcp-ray.yml.j2
+++ b/sky/templates/gcp-ray.yml.j2
@@ -55,14 +55,36 @@ available_node_types:
           # See https://cloud.google.com/deep-learning-vm/docs/images
           sourceImage: {{image_id}}
           diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
+      metadata:
+        items:
+          - key: ssh-keys
+            value: |
+              skypilot:ssh_user:skypilot:ssh_public_key_content
+          - key: user-data
+            value: |
+              #cloud-config
+              write_files:
+                - path: /etc/apt/apt.conf.d/20auto-upgrades
+                  content: |
+                    APT::Periodic::Update-Package-Lists "0";
+                    APT::Periodic::Download-Upgradeable-Packages "0";
+                    APT::Periodic::AutocleanInterval "0";
+                    APT::Periodic::Unattended-Upgrade "0";
+                - path: /etc/apt/apt.conf.d/10cloudinit-disable
+                  content: |
+                    APT::Periodic::Enable "0";
 {%- if gpu is not none %}
+{%- if 'tesla-k80' in gpu %}
+              runcmd:
+                - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py
+                - python3 install_gpu_driver.py
+{%- else %}
+          - key: install-nvidia-driver
+            value: true
+{%- endif %}
       guestAccelerators:
         - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
           acceleratorCount: {{gpu_count}}
-      metadata:
-        items:
-          - key: install-nvidia-driver
-            value: "True"
 {%- endif %}
       scheduling:
 {%- if use_spot %}
@@ -98,14 +120,36 @@ available_node_types:
           # See https://cloud.google.com/deep-learning-vm/docs/images
           sourceImage: {{image_id}}
           diskType: zones/{{zones}}/diskTypes/{{disk_tier}}
+      metadata:
+        items:
+          - key: ssh-keys
+            value: |
+              skypilot:ssh_user:skypilot:ssh_public_key_content
+          - key: user-data
+            value: |
+              #cloud-config
+              write_files:
+                - path: /etc/apt/apt.conf.d/20auto-upgrades
+                  content: |
+                    APT::Periodic::Update-Package-Lists "0";
+                    APT::Periodic::Download-Upgradeable-Packages "0";
+                    APT::Periodic::AutocleanInterval "0";
+                    APT::Periodic::Unattended-Upgrade "0";
+                - path: /etc/apt/apt.conf.d/10cloudinit-disable
+                  content: |
+                    APT::Periodic::Enable "0";
 {%- if gpu is not none %}
+{%- if 'tesla-k80' in gpu %}
+              runcmd:
+                - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py
+                - python3 install_gpu_driver.py
+{%- else %}
+          - key: install-nvidia-driver
+            value: true
+{%- endif %}
       guestAccelerators:
         - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
           acceleratorCount: {{gpu_count}}
-      metadata:
-        items:
-          - key: install-nvidia-driver
-            value: "True"
 {%- endif %}
       scheduling:
 {%- if use_spot %}
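After `_replace_cloud_init_ssh_info_in_config` runs, the placeholders in the metadata above become concrete values. Roughly what a rendered head-node metadata block looks like, expressed as the Python dict the GCP API receives (the key material and the trimmed user-data are illustrative, not taken from a real cluster):

    node_config_metadata = {
        'items': [
            {
                'key': 'ssh-keys',
                'value': 'gcpuser:ssh-rsa AAAAB3NzaC1yc2E... gcpuser\n',
            },
            {
                'key': 'user-data',
                'value': ('#cloud-config\n'
                          'write_files:\n'
                          '  - path: /etc/apt/apt.conf.d/20auto-upgrades\n'
                          '    content: |\n'
                          '      APT::Periodic::Unattended-Upgrade "0";\n'
                          # ... remaining apt settings as in the template ...
                         ),
            },
            # For non-K80 GPUs the template also appends:
            # {'key': 'install-nvidia-driver', 'value': 'true'},
        ]
    }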
@@ -140,27 +184,21 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 # current num items (num SSH connections): 1 (+1 if tpu_vm)
 setup_commands:
-  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
   # Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image.
   # Line 'pip3 --v ..': Make sure python3 & pip3 are available on this image.
   # Line 'which conda ..': some images (TPU VM) do not install conda by
   # default. 'source ~/.bashrc' is needed so conda takes effect for the next
   # commands.
+  # Line 'pip3 list | ..': Ensure only one Ray version (our ray_version) is installed,
+  # regardless of whether the image comes pre-installed with another Ray version. The
+  # 'python3 -c "import ray"' checks the integrity of the installed ray package; the
+  # check is needed because a reboot of the machine during the ray installation may
+  # corrupt it.
   # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
   # Line 'sudo grep ..': set the number of threads per process to unlimited, to avoid ray job submissions getting stuck as the number of running ray jobs increases.
   # Line 'mkdir -p ..': disable host key check
   # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
-  - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
-    sudo systemctl stop unattended-upgrades || true;
-    sudo systemctl disable unattended-upgrades || true;
-    sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
-    p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p";
-    sudo kill -9 `echo "$p" | tail -n 1` || true;
-    sudo rm /var/lib/dpkg/lock-frontend;
-    sudo pkill -9 dpkg;
-    sudo pkill -9 apt-get;
-    sudo dpkg --configure --force-overwrite -a;
-    mkdir -p ~/.ssh; touch ~/.ssh/config;
+  - mkdir -p ~/.ssh; touch ~/.ssh/config;
     pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
     (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
     (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
@@ -170,8 +208,8 @@ setup_commands:
     test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true;
     pip3 install --upgrade google-api-python-client;
 {%- endif %}
-    (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
-    (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
+    (pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null && python3 -c "import ray" || pip3 uninstall -y ray ray-cpp && pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
+    (pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) && python3 -c "import sky" || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
     mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
@@ -184,12 +222,26 @@ setup_commands:
 # items! The same comment applies for worker_start_ray_commands.
 #
 # Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
+# current num items (num SSH connections): 2
 head_start_ray_commands:
     # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before sky is installed.)
     # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
     # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
     # all the sessions to be reloaded. This is a workaround.
+{%- if gpu is not none %}
+  - |
+    echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog
+    while ! nvidia-smi &> /dev/null
+    do
+      echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog
+      sleep 5
+    done
+    # Magic number: wait for the nvidia driver to be ready and for the
+    # instance reboot to complete. Determined by experiment; it is the
+    # minimum value that works.
+    sleep 18
+    echo "NVIDIA GPU is ready." >> ~/.sky/nvlog
+{%- endif %}
   - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &);
     export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
     ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
@@ -199,6 +251,20 @@ head_start_ray_commands:
 # Worker commands are needed for TPU VM Pods
 {%- if num_nodes > 1 or tpu_vm %}
 worker_start_ray_commands:
+{%- if gpu is not none %}
+  - |
+    echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog
+    while ! nvidia-smi &> /dev/null
+    do
+      echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog
+      sleep 5
+    done
+    # Magic number: wait for the nvidia driver to be ready and for the
+    # instance reboot to complete. Determined by experiment; it is the
+    # minimum value that works.
+    sleep 18
+    echo "NVIDIA GPU is ready." >> ~/.sky/nvlog
+{%- endif %}
   - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
     ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
     which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
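The wait loop added to the head and worker start commands blocks `ray start` until the asynchronously installed NVIDIA driver is usable. Its logic, restated in Python for clarity (a sketch; the actual command runs as bash on the node, and this assumes nvidia-smi is already on PATH):

    import subprocess
    import time

    def wait_for_gpu_driver(grace_period_seconds: int = 18) -> None:
        # Poll until the driver responds, then sleep through the
        # post-install reboot window (the "magic number" in the template).
        while subprocess.run(['nvidia-smi'],
                             stdout=subprocess.DEVNULL,
                             stderr=subprocess.DEVNULL,
                             check=False).returncode != 0:
            time.sleep(5)
        time.sleep(grace_period_seconds)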
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index dcc159a9888..a9d05388d4c 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -916,6 +916,7 @@ def test_job_queue(generic_cloud: str):
             f'sky logs {name} 5 --status',
         ],
         f'sky down -y {name}',
+        timeout=20 * 60,
     )
     run_one_test(test)
 
@@ -1149,7 +1150,7 @@ def test_multi_echo(generic_cloud: str):
         # unfulfilled' error. If process not found, grep->ssh returns 1.
         [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''],
         f'sky down -y {name}',
-        timeout=20 * 60,
+        timeout=25 * 60,
     )
     run_one_test(test)
 
@@ -1325,6 +1326,7 @@ def test_gcp_start_stop():
             f'sky status -r {name} | grep "INIT\|STOPPED"',
         ],
         f'sky down -y {name}',
+        timeout=20 * 60,
    )
     run_one_test(test)
 
@@ -2229,7 +2231,7 @@ def _get_aws_query_command(region, instance_id, field, expected):
                                            specs['disk_throughput']))),
         ],
         f'sky down -y {name}',
-        timeout=10 * 60,  # 10 mins  (it takes around ~6 mins)
+        timeout=12 * 60,
     )
     run_one_test(test)