From 8d762ddf0d52410f42338a1b77868f39b1e4baca Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 2 Jan 2025 08:08:33 -0800 Subject: [PATCH 01/16] [Docs] Refactor pod_config docs (#4427) * refactor pod_config docs * Update docs/source/reference/kubernetes/kubernetes-getting-started.rst Co-authored-by: Zongheng Yang * Update docs/source/reference/kubernetes/kubernetes-getting-started.rst Co-authored-by: Zongheng Yang --------- Co-authored-by: Zongheng Yang --- .../kubernetes/kubernetes-getting-started.rst | 93 ++++++++++++------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/docs/source/reference/kubernetes/kubernetes-getting-started.rst b/docs/source/reference/kubernetes/kubernetes-getting-started.rst index e4bbb2c8915..3323559bb36 100644 --- a/docs/source/reference/kubernetes/kubernetes-getting-started.rst +++ b/docs/source/reference/kubernetes/kubernetes-getting-started.rst @@ -258,6 +258,67 @@ After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `. +Customizing SkyPilot pods +------------------------- + +You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. +The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. This will apply to all pods created by SkyPilot. + +For example, to set custom environment variables and use GPUDirect RDMA, you can add the following to your :code:`~/.sky/config.yaml` file: + +.. 
code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - env: # Custom environment variables to set in pod + - name: MY_ENV_VAR + value: MY_ENV_VALUE + resources: # Custom resources for GPUDirect RDMA + requests: + rdma/rdma_shared_device_a: 1 + limits: + rdma/rdma_shared_device_a: 1 + + +Similarly, you can attach `Kubernetes volumes `_ (e.g., an `NFS volume `_) directly to your SkyPilot pods: + +.. code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - volumeMounts: # Custom volume mounts for the pod + - mountPath: /data + name: nfs-volume + volumes: + - name: nfs-volume + nfs: # Alternatively, use hostPath if your NFS is directly attached to the nodes + server: nfs.example.com + path: /nfs + + +.. tip:: + + As an alternative to setting ``pod_config`` globally, you can also set it on a per-task basis directly in your task YAML with the ``config_overrides`` :ref:`field `. + + .. code-block:: yaml + + # task.yaml + run: | + python myscript.py + + # Set pod_config for this task + experimental: + config_overrides: + pod_config: + ... + + FAQs ---- @@ -293,38 +354,6 @@ FAQs You can use your existing observability tools to filter resources with the label :code:`parent=skypilot` (:code:`kubectl get pods -l 'parent=skypilot'`). As an example, follow the instructions :ref:`here ` to deploy the Kubernetes Dashboard on your cluster. -* **How can I specify custom configuration for the pods created by SkyPilot?** - - You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. - The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. - - For example, to set custom environment variables and attach a volume on your pods, you can add the following to your :code:`~/.sky/config.yaml` file: - - .. 
code-block:: yaml - - kubernetes: - pod_config: - spec: - containers: - - env: - - name: MY_ENV_VAR - value: MY_ENV_VALUE - volumeMounts: # Custom volume mounts for the pod - - mountPath: /foo - name: example-volume - resources: # Custom resource requests and limits - requests: - rdma/rdma_shared_device_a: 1 - limits: - rdma/rdma_shared_device_a: 1 - volumes: - - name: example-volume - hostPath: - path: /tmp - type: Directory - - For more details refer to :ref:`config-yaml`. - * **I am using a custom image. How can I speed up the pod startup time?** You can pre-install SkyPilot dependencies in your custom image to speed up the pod startup time. Simply add these lines at the end of your Dockerfile: From 6a6d6671958c9fce56ffa314144daa1156a43342 Mon Sep 17 00:00:00 2001 From: Hysun He Date: Fri, 3 Jan 2025 15:45:50 +0800 Subject: [PATCH 02/16] [OCI] Set default image to ubuntu LTS 22.04 (#4517) * set default gpu image to skypilot:gpu-ubuntu-2204 * add example * remove comment line * set cpu default image to 2204 * update change history --- examples/oci/gpu-oraclelinux9.yaml | 33 ++++++++++++++++++++++++++++++ examples/oci/gpu-ubuntu-2204.yaml | 33 ++++++++++++++++++++++++++++++ sky/clouds/utils/oci_utils.py | 8 ++++++-- 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 examples/oci/gpu-oraclelinux9.yaml create mode 100644 examples/oci/gpu-ubuntu-2204.yaml diff --git a/examples/oci/gpu-oraclelinux9.yaml b/examples/oci/gpu-oraclelinux9.yaml new file mode 100644 index 00000000000..cc7b05ea0fc --- /dev/null +++ b/examples/oci/gpu-oraclelinux9.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. + cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-oraclelinux9 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . 
+ +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/examples/oci/gpu-ubuntu-2204.yaml b/examples/oci/gpu-ubuntu-2204.yaml new file mode 100644 index 00000000000..e0012a31a1a --- /dev/null +++ b/examples/oci/gpu-ubuntu-2204.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. + cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-ubuntu-2204 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . + +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py index 0cd4f33e647..581d4d72d3c 100644 --- a/sky/clouds/utils/oci_utils.py +++ b/sky/clouds/utils/oci_utils.py @@ -6,6 +6,10 @@ configuration. 
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant SERVICE_PORT_RULE_TAG + - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Set the default image + from ubuntu 20.04 to ubuntu 22.04, including: + - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204 + - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204 """ import os @@ -117,7 +121,7 @@ def get_default_gpu_image_tag(cls) -> str: # the sky's user-config file (if not specified, use the hardcode one at # last) return skypilot_config.get_nested(('oci', 'default', 'image_tag_gpu'), - 'skypilot:gpu-ubuntu-2004') + 'skypilot:gpu-ubuntu-2204') @classmethod def get_default_image_tag(cls) -> str: @@ -125,7 +129,7 @@ def get_default_image_tag(cls) -> str: # set the default image tag in the sky's user-config file. (if not # specified, use the hardcode one at last) return skypilot_config.get_nested( - ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2004') + ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2204') @classmethod def get_sky_user_config_file(cls) -> str: From 459c5ae9d56239d90658bf782c56cc6ad36f5dfe Mon Sep 17 00:00:00 2001 From: Hysun He Date: Fri, 3 Jan 2025 21:54:04 +0800 Subject: [PATCH 03/16] [OCI] 1. Support specify OS with custom image id. 2. Corner case fix (#4524) * Support specify os type with custom image id. * trim space * nit * comment --- sky/clouds/oci.py | 29 ++++++++++++++++++++--------- sky/provision/oci/query_utils.py | 7 +++++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index d4ae6f298d2..b0234e2802c 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -232,6 +232,14 @@ def make_deploy_resources_variables( listing_id = None res_ver = None + os_type = None + if ':' in image_id: + # OS type provided in the --image-id. This is the case where + # custom image's ocid provided in the --image-id parameter. 
+ # - ocid1.image...aaa:oraclelinux (os type is oraclelinux) + # - ocid1.image...aaa (OS not provided) + image_id, os_type = image_id.replace(' ', '').split(':') + cpus = resources.cpus instance_type_arr = resources.instance_type.split( oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR) @@ -297,15 +305,18 @@ def make_deploy_resources_variables( cpus=None if cpus is None else float(cpus), disk_tier=resources.disk_tier) - image_str = self._get_image_str(image_id=resources.image_id, - instance_type=resources.instance_type, - region=region.name) - - # pylint: disable=import-outside-toplevel - from sky.clouds.service_catalog import oci_catalog - os_type = oci_catalog.get_image_os_from_tag(tag=image_str, - region=region.name) - logger.debug(f'OS type for the image {image_str} is {os_type}') + if os_type is None: + # OS type is not determined yet. So try to get it from vms.csv + image_str = self._get_image_str( + image_id=resources.image_id, + instance_type=resources.instance_type, + region=region.name) + + # pylint: disable=import-outside-toplevel + from sky.clouds.service_catalog import oci_catalog + os_type = oci_catalog.get_image_os_from_tag(tag=image_str, + region=region.name) + logger.debug(f'OS type for the image {image_id} is {os_type}') return { 'instance_type': instance_type, diff --git a/sky/provision/oci/query_utils.py b/sky/provision/oci/query_utils.py index 8cca0629305..3037fcc2703 100644 --- a/sky/provision/oci/query_utils.py +++ b/sky/provision/oci/query_utils.py @@ -506,8 +506,11 @@ def find_nsg(cls, region: str, nsg_name: str, raise exceptions.ResourcesUnavailableError( 'The VCN is not available') - # Get the primary vnic. - assert len(list_vcns_resp.data) > 0 + # Get the primary vnic. The vnic might be an empty list for the + # corner case when the cluster was exited during provision. 
+ if not list_vcns_resp.data: + return None + vcn = list_vcns_resp.data[0] list_nsg_resp = net_client.list_network_security_groups( From 0db9846889dbea6e60440b090c6590eb7016d713 Mon Sep 17 00:00:00 2001 From: zpoint Date: Fri, 3 Jan 2025 23:38:48 +0800 Subject: [PATCH 04/16] Update intermediate bucket related doc (#4521) * doc * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * add tip * minor changes --------- Co-authored-by: Romil Bhardwaj --- docs/source/examples/managed-jobs.rst | 42 ++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index 99fa461249d..2cd99b6c24b 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -152,6 +152,7 @@ The :code:`MOUNT` mode in :ref:`SkyPilot bucket mounting ` ensures Note that the application code should save program checkpoints periodically and reload those states when the job is restarted. This is typically achieved by reloading the latest checkpoint at the beginning of your program. + .. _spot-jobs-end-to-end: An End-to-End Example @@ -455,6 +456,46 @@ especially useful when there are many in-progress jobs to monitor, which the terminal-based CLI may need more than one page to display. +.. _intermediate-bucket: + +Intermediate storage for files +------------------------------ + +For managed jobs, SkyPilot requires an intermediate bucket to store files used in the task, such as local file mounts, temporary files, and the workdir. 
+If you do not configure a bucket, SkyPilot will automatically create a temporary bucket named :code:`skypilot-filemounts-{username}-{run_id}` for each job launch. SkyPilot automatically deletes the bucket after the job completes. + +Alternatively, you can pre-provision a bucket and use it as an intermediate for storing file by setting :code:`jobs.bucket` in :code:`~/.sky/config.yaml`: + +.. code-block:: yaml + + # ~/.sky/config.yaml + jobs: + bucket: s3://my-bucket # Supports s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// + + +If you choose to specify a bucket, ensure that the bucket already exists and that you have the necessary permissions. + +When using a pre-provisioned intermediate bucket with :code:`jobs.bucket`, SkyPilot creates job-specific directories under the bucket root to store files. They are organized in the following structure: + +.. code-block:: text + + # cloud bucket, s3://my-bucket/ for example + my-bucket/ + ├── job-15891b25/ # Job-specific directory + │ ├── local-file-mounts/ # Files from local file mounts + │ ├── tmp-files/ # Temporary files + │ └── workdir/ # Files from workdir + └── job-cae228be/ # Another job's directory + ├── local-file-mounts/ + ├── tmp-files/ + └── workdir/ + +When using a custom bucket (:code:`jobs.bucket`), the job-specific directories (e.g., :code:`job-15891b25/`) created by SkyPilot are removed when the job completes. + +.. tip:: + Multiple users can share the same intermediate bucket. Each user's jobs will have their own unique job-specific directories, ensuring that files are kept separate and organized. + + Concept: Jobs Controller ------------------------ @@ -505,4 +546,3 @@ The :code:`resources` field has the same spec as a normal SkyPilot job; see `her These settings will not take effect if you have an existing controller (either stopped or live). For them to take effect, tear down the existing controller first, which requires all in-progress jobs to finish or be canceled. 
- From 2ccbbffb99cda5894f294d6db98b453eb430da5b Mon Sep 17 00:00:00 2001 From: Aylei Date: Sat, 4 Jan 2025 02:50:29 +0800 Subject: [PATCH 05/16] [aws] cache user identity by 'aws configure list' (#4507) * [aws] cache user identity by 'aws configure list' Signed-off-by: Aylei * refine get_user_identities docstring Signed-off-by: Aylei * address review comments Signed-off-by: Aylei --------- Signed-off-by: Aylei --- sky/clouds/aws.py | 126 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 39 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index cafc789c5be..c665263e22e 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -2,6 +2,8 @@ import enum import fnmatch import functools +import hashlib +import json import os import re import subprocess @@ -16,6 +18,7 @@ from sky import skypilot_config from sky.adaptors import aws from sky.clouds import service_catalog +from sky.clouds.service_catalog import common as catalog_common from sky.clouds.utils import aws_utils from sky.skylet import constants from sky.utils import common_utils @@ -624,14 +627,10 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: @classmethod def _current_identity_type(cls) -> Optional[AWSIdentityType]: - proc = subprocess.run('aws configure list', - shell=True, - check=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if proc.returncode != 0: + stdout = cls._aws_configure_list() + if stdout is None: return None - stdout = proc.stdout.decode() + output = stdout.decode() # We determine the identity type by looking at the output of # `aws configure list`. The output looks like: @@ -646,10 +645,10 @@ def _current_identity_type(cls) -> Optional[AWSIdentityType]: def _is_access_key_of_type(type_str: str) -> bool: # The dot (.) does not match line separators. 
- results = re.findall(fr'access_key.*{type_str}', stdout) + results = re.findall(fr'access_key.*{type_str}', output) if len(results) > 1: raise RuntimeError( - f'Unexpected `aws configure list` output:\n{stdout}') + f'Unexpected `aws configure list` output:\n{output}') return len(results) == 1 if _is_access_key_of_type(AWSIdentityType.SSO.value): @@ -664,37 +663,20 @@ def _is_access_key_of_type(type_str: str) -> bool: return AWSIdentityType.SHARED_CREDENTIALS_FILE @classmethod - @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. - def get_user_identities(cls) -> Optional[List[List[str]]]: - """Returns a [UserId, Account] list that uniquely identifies the user. - - These fields come from `aws sts get-caller-identity`. We permit the same - actual user to: - - - switch between different root accounts (after which both elements - of the list will be different) and have their clusters owned by - each account be protected; or - - - within the same root account, switch between different IAM - users, and treat [user_id=1234, account=A] and - [user_id=4567, account=A] to be the *same*. Namely, switching - between these IAM roles within the same root account will cause - the first element of the returned list to differ, and will allow - the same actual user to continue to interact with their clusters. - Note: this is not 100% safe, since the IAM users can have very - specific permissions, that disallow them to access the clusters - but it is a reasonable compromise as that could be rare. - - Returns: - A list of strings that uniquely identifies the user on this cloud. - For identity check, we will fallback through the list of strings - until we find a match, and print a warning if we fail for the - first string. 
+ @functools.lru_cache(maxsize=1) + def _aws_configure_list(cls) -> Optional[bytes]: + proc = subprocess.run('aws configure list', + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if proc.returncode != 0: + return None + return proc.stdout - Raises: - exceptions.CloudUserIdentityError: if the user identity cannot be - retrieved. - """ + @classmethod + @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. + def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]: try: sts = aws.client('sts') # The caller identity contains 3 fields: UserId, Account, Arn. @@ -773,6 +755,72 @@ def get_user_identities(cls) -> Optional[List[List[str]]]: # automatic switching for AWS. Currently we only support one identity. return [user_ids] + @classmethod + @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. + def get_user_identities(cls) -> Optional[List[List[str]]]: + """Returns a [UserId, Account] list that uniquely identifies the user. + + These fields come from `aws sts get-caller-identity` and are cached + locally by `aws configure list` output. The identities are assumed to + be stable for the duration of the `sky` process. Modifying the + credentials while the `sky` process is running will not affect the + identity returned by this function. + + We permit the same actual user to: + + - switch between different root accounts (after which both elements + of the list will be different) and have their clusters owned by + each account be protected; or + + - within the same root account, switch between different IAM + users, and treat [user_id=1234, account=A] and + [user_id=4567, account=A] to be the *same*. Namely, switching + between these IAM roles within the same root account will cause + the first element of the returned list to differ, and will allow + the same actual user to continue to interact with their clusters. 
+ Note: this is not 100% safe, since the IAM users can have very + specific permissions, that disallow them to access the clusters + but it is a reasonable compromise as that could be rare. + + Returns: + A list of strings that uniquely identifies the user on this cloud. + For identity check, we will fallback through the list of strings + until we find a match, and print a warning if we fail for the + first string. + + Raises: + exceptions.CloudUserIdentityError: if the user identity cannot be + retrieved. + """ + stdout = cls._aws_configure_list() + if stdout is None: + # `aws configure list` is not available, possible reasons: + # - awscli is not installed but credentials are valid, e.g. run from + # an EC2 instance with IAM role + # - aws credentials are not set, proceed anyway to get unified error + # message for users + return cls._sts_get_caller_identity() + config_hash = hashlib.md5(stdout).hexdigest()[:8] + # Getting aws identity cost ~1s, so we cache the result with the output of + # `aws configure list` as cache key. Different `aws configure list` output + # can have same aws identity, our assumption is the output would be stable + # in real world, so the number of cache files would be limited. + # TODO(aylei): consider using a more stable cache key and evalute eviction. 
+ cache_path = catalog_common.get_catalog_path( + f'aws/.cache/user-identity-{config_hash}.txt') + if os.path.exists(cache_path): + try: + with open(cache_path, 'r', encoding='utf-8') as f: + return json.loads(f.read()) + except json.JSONDecodeError: + # cache is invalid, ignore it and fetch identity again + pass + + result = cls._sts_get_caller_identity() + with open(cache_path, 'w', encoding='utf-8') as f: + f.write(json.dumps(result)) + return result + @classmethod def get_active_user_identity_str(cls) -> Optional[str]: user_identity = cls.get_active_user_identity() From 061d4bd998739e16fa704a855233e62290f6cbdb Mon Sep 17 00:00:00 2001 From: Chester Li Date: Sat, 4 Jan 2025 03:29:33 +0800 Subject: [PATCH 06/16] [k8s] Add validation for pod_config #4206 (#4466) * [k8s] Add validation for pod_config #4206 Check pod_config when run 'sky check k8s' by using k8s api * update: check pod_config when launch check merged pod_config during launch using k8s api * fix test * ignore check failed when test with dryrun if there is no kube config in env, ignore ValueError when launch with dryrun. For now, we don't support check schema offline. 
* use deserialize api to check pod_config schema * test * create another api_client with no kubeconfig * test * update error message * update test * test * test * Update sky/backends/backend_utils.py --------- Co-authored-by: Romil Bhardwaj --- sky/backends/backend_utils.py | 7 +++++ sky/provision/kubernetes/utils.py | 46 +++++++++++++++++++++++++++++++ tests/test_config.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1de799e7cf8..6e79469a819 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -926,6 +926,13 @@ def write_cluster_config( tmp_yaml_path, cluster_config_overrides=to_provision.cluster_config_overrides) kubernetes_utils.combine_metadata_fields(tmp_yaml_path) + yaml_obj = common_utils.read_yaml(tmp_yaml_path) + pod_config = yaml_obj['available_node_types']['ray_head_default'][ + 'node_config'] + valid, message = kubernetes_utils.check_pod_config(pod_config) + if not valid: + raise exceptions.InvalidCloudConfigs( + f'Invalid pod_config. Details: {message}') if dryrun: # If dryrun, return the unfinished tmp yaml path. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 487868d1d9e..14b6b42aa58 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -893,6 +893,52 @@ def check_credentials(context: Optional[str], return True, None +def check_pod_config(pod_config: dict) \ + -> Tuple[bool, Optional[str]]: + """Check if the pod_config is a valid pod config + + Using deserialize api to check the pod_config is valid or not. + + Returns: + bool: True if pod_config is valid. + str: Error message about why the pod_config is invalid, None otherwise. 
+ """ + errors = [] + # This api_client won't be used to send any requests, so there is no need to + # load kubeconfig + api_client = kubernetes.kubernetes.client.ApiClient() + + # Used for kubernetes api_client deserialize function, the function will use + # data attr, the detail ref: + # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244 + class InnerResponse(): + + def __init__(self, data: dict): + self.data = json.dumps(data) + + try: + # Validate metadata if present + if 'metadata' in pod_config: + try: + value = InnerResponse(pod_config['metadata']) + api_client.deserialize( + value, kubernetes.kubernetes.client.V1ObjectMeta) + except ValueError as e: + errors.append(f'Invalid metadata: {str(e)}') + # Validate spec if present + if 'spec' in pod_config: + try: + value = InnerResponse(pod_config['spec']) + api_client.deserialize(value, + kubernetes.kubernetes.client.V1PodSpec) + except ValueError as e: + errors.append(f'Invalid spec: {str(e)}') + return len(errors) == 0, '.'.join(errors) + except Exception as e: # pylint: disable=broad-except + errors.append(f'Validation error: {str(e)}') + return False, '.'.join(errors) + + def is_kubeconfig_exec_auth( context: Optional[str] = None) -> Tuple[bool, Optional[str]]: """Checks if the kubeconfig file uses exec-based authentication diff --git a/tests/test_config.py b/tests/test_config.py index 5789214dc61..d3eaeb261bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,6 +7,7 @@ import sky from sky import skypilot_config +import sky.exceptions from sky.skylet import constants from sky.utils import common_utils from sky.utils import kubernetes_enums @@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None: """)) +def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None: + task_file_path.write_text( + textwrap.dedent("""\ + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + labels: + 
test-key: test-value + annotations: + abc: def + spec: + containers: + - name: + imagePullSecrets: + - name: my-secret-2 + + setup: echo 'Setting up...' + run: echo 'Running...' + """)) + + def test_nested_config(monkeypatch) -> None: """Test that the nested config works.""" config = skypilot_config.Config() @@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path, assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia' +def test_k8s_config_with_invalid_config(monkeypatch, tmp_path, + enable_all_clouds) -> None: + config_path = tmp_path / 'config.yaml' + _create_config_file(config_path) + monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path) + + _reload_config() + task_path = tmp_path / 'task.yaml' + _create_invalid_config_yaml_file(task_path) + task = sky.Task.from_yaml(task_path) + + # Test Kubernetes pod_config invalid + cluster_name = 'test_k8s_config_with_invalid_config' + task.set_resources_override({'cloud': sky.Kubernetes()}) + exception_occurred = False + try: + sky.launch(task, cluster_name=cluster_name, dryrun=True) + except sky.exceptions.ResourcesUnavailableError: + exception_occurred = True + assert exception_occurred + + def test_gcp_config_with_override(monkeypatch, tmp_path, enable_all_clouds) -> None: config_path = tmp_path / 'config.yaml' From 4ab8e1668053fef8ae87ba9c832073c444078e49 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Fri, 3 Jan 2025 21:03:39 -0800 Subject: [PATCH 07/16] [core] fix wheel timestamp check (#4488) Previously, we were only taking the max timestamp of all the subdirectories of the given directory. So the timestamp could be incorrect if only a file changed, and no directory changed. This fixes the issue by looking at all directories and files given by os.walk(). 
--- sky/backends/wheel_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/backends/wheel_utils.py b/sky/backends/wheel_utils.py index ed580569e0b..805117ee2a3 100644 --- a/sky/backends/wheel_utils.py +++ b/sky/backends/wheel_utils.py @@ -153,7 +153,10 @@ def _get_latest_modification_time(path: pathlib.Path) -> float: if not path.exists(): return -1. try: - return max(os.path.getmtime(root) for root, _, _ in os.walk(path)) + return max( + os.path.getmtime(os.path.join(root, f)) + for root, dirs, files in os.walk(path) + for f in (*dirs, *files)) except ValueError: return -1. From e4939f9fafde4985836689211d9e5f67731e792a Mon Sep 17 00:00:00 2001 From: Hysun He Date: Mon, 6 Jan 2025 09:20:33 +0800 Subject: [PATCH 08/16] [docs] Add image_id doc in task YAML for OCI (#4526) * Add image_id doc for OCI * nit * Update docs/source/reference/yaml-spec.rst Co-authored-by: Tian Xia --------- Co-authored-by: Tian Xia --- docs/source/reference/yaml-spec.rst | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 8a490b7e817..d2f0506993a 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -176,9 +176,9 @@ Available fields: # tpu_vm: True # True to use TPU VM (the default); False to use TPU node. # Custom image id (optional, advanced). The image id used to boot the - # instances. Only supported for AWS and GCP (for non-docker image). If not - # specified, SkyPilot will use the default debian-based image suitable for - # machine learning tasks. + # instances. Only supported for AWS, GCP, OCI and IBM (for non-docker image). + # If not specified, SkyPilot will use the default debian-based image + # suitable for machine learning tasks. 
# # Docker support # You can specify docker image to use by setting the image_id to @@ -204,7 +204,7 @@ Available fields: # image_id: # us-east-1: ami-0729d913a335efca7 # us-west-2: ami-050814f384259894c - image_id: ami-0868a20f5a3bf9702 + # # GCP # To find GCP images: https://cloud.google.com/compute/docs/images # image_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310 @@ -215,6 +215,24 @@ Available fields: # To find Azure images: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # image_id: microsoft-dsvm:ubuntu-2004:2004:21.11.04 # + # OCI + # To find OCI images: https://docs.oracle.com/en-us/iaas/images + # You can choose the image with OS version from the following image tags + # provided by SkyPilot: + # image_id: skypilot:gpu-ubuntu-2204 + # image_id: skypilot:gpu-ubuntu-2004 + # image_id: skypilot:gpu-oraclelinux9 + # image_id: skypilot:gpu-oraclelinux8 + # image_id: skypilot:cpu-ubuntu-2204 + # image_id: skypilot:cpu-ubuntu-2004 + # image_id: skypilot:cpu-oraclelinux9 + # image_id: skypilot:cpu-oraclelinux8 + # + # It is also possible to specify your custom image's OCID with OS type, + # for example: + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaaywwfvy67wwe7f24juvjwhyjn3u7g7s3wzkhduxcbewzaeki2nt5q:oraclelinux + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaa5tnuiqevhoyfnaa5pqeiwjv6w5vf6w4q2hpj3atyvu3yd6rhlhyq:ubuntu + # # IBM # Create a private VPC image and paste its ID in the following format: # image_id: @@ -224,6 +242,7 @@ Available fields: # https://www.ibm.com/cloud/blog/use-ibm-packer-plugin-to-create-custom-images-on-ibm-cloud-vpc-infrastructure # To use a more limited but easier to manage tool: # https://github.com/IBM/vpc-img-inst + image_id: ami-0868a20f5a3bf9702 # Labels to apply to the instances (optional). 
# From 9828f6b9b3ea50a35352c2b530c6717c6eef82b4 Mon Sep 17 00:00:00 2001 From: Hong Date: Mon, 6 Jan 2025 10:26:59 +0800 Subject: [PATCH 09/16] [UX] warning before launching jobs/serve when using a reauth required credentials (#4479) * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * Update sky/backends/cloud_vm_ray_backend.py Minor fix * Update sky/clouds/aws.py Co-authored-by: Romil Bhardwaj * wip * minor changes * wip --------- Co-authored-by: hong Co-authored-by: Romil Bhardwaj --- sky/backends/backend_utils.py | 36 ++++++++++++++++++++++++++++ sky/backends/cloud_vm_ray_backend.py | 17 +++++++++++++ sky/clouds/aws.py | 24 +++++++++++++++++++ sky/clouds/cloud.py | 4 ++++ sky/clouds/gcp.py | 9 +++++++ 5 files changed, 90 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6e79469a819..bf92f442d2f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -650,6 +650,42 @@ def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): return common_utils.dump_yaml_str(new_config) +def get_expirable_clouds( + enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]: + """Returns a list of clouds that use local credentials and whose credentials can expire. + + This function checks each cloud in the provided sequence to determine if it uses local credentials + and if its credentials can expire. If both conditions are met, the cloud is added to the list of + expirable clouds. + + Args: + enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check. + + Returns: + list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire. 
+ """ + expirable_clouds = [] + local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value + for cloud in enabled_clouds: + remote_identities = skypilot_config.get_nested( + (str(cloud).lower(), 'remote_identity'), None) + if remote_identities is None: + remote_identities = schemas.get_default_remote_identity( + str(cloud).lower()) + + local_credential_expiring = cloud.can_credential_expire() + if isinstance(remote_identities, str): + if remote_identities == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + elif isinstance(remote_identities, list): + for profile in remote_identities: + if list(profile.values( + ))[0] == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + break + return expirable_clouds + + # TODO: too many things happening here - leaky abstraction. Refactor. @timeline.event def write_cluster_config( diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 156f43181b2..c972928cd7d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -26,6 +26,7 @@ import sky from sky import backends +from sky import check as sky_check from sky import cloud_stores from sky import clouds from sky import exceptions @@ -1996,6 +1997,22 @@ def provision_with_retries( skip_unnecessary_provisioning else None) failover_history: List[Exception] = list() + # If the user is using local credentials which may expire, the + # controller may leak resources if the credentials expire while a job + # is running. Here we check the enabled clouds and expiring credentials + # and raise a warning to the user. + if task.is_controller_task(): + enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh() + expirable_clouds = backend_utils.get_expirable_clouds( + enabled_clouds) + + if len(expirable_clouds) > 0: + warnings = (f'\033[93mWarning: Credentials used for ' + f'{expirable_clouds} may expire. 
Clusters may be ' + f'leaked if the credentials expire while jobs ' + f'are running. It is recommended to use credentials' + f' that never expire or a service account.\033[0m') + logger.warning(warnings) # Retrying launchable resources. while True: diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index c665263e22e..a86a87f4feb 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -103,6 +103,24 @@ class AWSIdentityType(enum.Enum): # region us-east-1 config-file ~/.aws/config SHARED_CREDENTIALS_FILE = 'shared-credentials-file' + def can_credential_expire(self) -> bool: + """Check if the AWS identity type can expire. + + SSO,IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed + automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived + credentials without refresh. + IAM ROLE: + https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html + SSO/Container-role refresh token: + https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html + """ + # TODO(hong): Add a CLI based check for the expiration of the temporary + # credentials + expirable_types = { + AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE + } + return self in expirable_types + @clouds.CLOUD_REGISTRY.register class AWS(clouds.Cloud): @@ -860,6 +878,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]: if os.path.exists(os.path.expanduser(f'~/.aws/{filename}')) } + @functools.lru_cache(maxsize=1) + def can_credential_expire(self) -> bool: + identity_type = self._current_identity_type() + return identity_type is not None and identity_type.can_credential_expire( + ) + def instance_type_exists(self, instance_type): return service_catalog.instance_type_exists(instance_type, clouds='aws') diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 455baeaf5d9..2cb45ca14fc 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -536,6 +536,10 @@ def get_credential_file_mounts(self) -> Dict[str, str]: """ raise
NotImplementedError + def can_credential_expire(self) -> bool: + """Returns whether the cloud credential can expire.""" + return False + @classmethod def get_image_size(cls, image_id: str, region: Optional[str]) -> float: """Check the image size from the cloud. diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index ff200f84147..3502fee8e1c 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -132,6 +132,9 @@ class GCPIdentityType(enum.Enum): SHARED_CREDENTIALS_FILE = '' + def can_credential_expire(self) -> bool: + return self == GCPIdentityType.SHARED_CREDENTIALS_FILE + @clouds.CLOUD_REGISTRY.register class GCP(clouds.Cloud): @@ -863,6 +866,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]: pass return credentials + @functools.lru_cache(maxsize=1) + def can_credential_expire(self) -> bool: + identity_type = self._get_identity_type() + return identity_type is not None and identity_type.can_credential_expire( + ) + @classmethod def _get_identity_type(cls) -> Optional[GCPIdentityType]: try: From 38a822ac6b553df0e784e559715ee4269c21f780 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 5 Jan 2025 22:51:04 -0800 Subject: [PATCH 10/16] [GCP] Activate service account for storage and controller (#4529) * Activate service account for storage * disable logging if not using service account * Activate for controller as well. 
* revert controller activate * Add comments * format * fix smoke --- sky/cloud_stores.py | 12 ++++++++++-- sky/data/data_utils.py | 12 ++++++++---- tests/smoke_tests/test_managed_job.py | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index 108f33f2c1f..e24c4f3ad03 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -113,8 +113,16 @@ class GcsCloudStorage(CloudStorage): @property def _gsutil_command(self): gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return (f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' - f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH} {gsutil_alias}') + return ( + f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' + f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; ' + # Explicitly activate service account. Unlike the gcp packages + # and other GCP commands, gsutil does not automatically pick up + # the default credential keys when it is a service account. + 'gcloud auth activate-service-account ' + '--key-file=$GOOGLE_APPLICATION_CREDENTIALS ' + '2> /dev/null || true; ' + f'{gsutil_alias}') def is_directory(self, url: str) -> bool: """Returns whether 'url' is a directory. diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 05c2b42c844..e8dcaa83017 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -523,10 +523,14 @@ def get_gsutil_command() -> Tuple[str, str]: def run_upload_cli(command: str, access_denied_message: str, bucket_name: str, log_path: str): - returncode, stdout, stderr = log_lib.run_with_log(command, - log_path, - shell=True, - require_outputs=True) + returncode, stdout, stderr = log_lib.run_with_log( + command, + log_path, + shell=True, + require_outputs=True, + # We need to use bash as some of the cloud commands uses bash syntax, + # such as [[ ... 
]] + executable='/bin/bash') if access_denied_message in stderr: with ux_utils.print_exception_no_traceback(): raise PermissionError('Failed to upload files to ' diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 22381fc45e3..5c930724523 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -365,7 +365,7 @@ def test_managed_jobs_pipeline_recovery_gcp(): # separated by `-`. (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - smoke_tests_utils.zJOB_WAIT_NOT_RUNNING.format(job_name=name), + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name( From 8952fecd776cab6326e099a2255c15dd8d385890 Mon Sep 17 00:00:00 2001 From: Hysun He Date: Mon, 6 Jan 2025 20:42:21 +0800 Subject: [PATCH 11/16] [OCI] Support reuse existing VCN for SkyServe (#4530) * Support reuse existing VCN for SkyServe * fix * remove unused import * format --- sky/clouds/utils/oci_utils.py | 9 +++++++++ sky/provision/oci/query_utils.py | 34 ++++++++++++++++---------------- sky/utils/schemas.py | 3 +++ 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py index 581d4d72d3c..46d4454d866 100644 --- a/sky/clouds/utils/oci_utils.py +++ b/sky/clouds/utils/oci_utils.py @@ -10,6 +10,8 @@ from ubuntu 20.04 to ubuntu 22.04, including: - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204 - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204 + - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing + VCN for SkyServe. 
""" import os @@ -109,8 +111,15 @@ def get_compartment(cls, region): ('oci', region, 'compartment_ocid'), default_compartment_ocid) return compartment + @classmethod + def get_vcn_ocid(cls, region): + # Will reuse the regional VCN if specified. + vcn = skypilot_config.get_nested(('oci', region, 'vcn_ocid'), None) + return vcn + @classmethod def get_vcn_subnet(cls, region): + # Will reuse the subnet if specified. vcn = skypilot_config.get_nested(('oci', region, 'vcn_subnet'), None) return vcn diff --git a/sky/provision/oci/query_utils.py b/sky/provision/oci/query_utils.py index 3037fcc2703..3f545aca4ba 100644 --- a/sky/provision/oci/query_utils.py +++ b/sky/provision/oci/query_utils.py @@ -7,6 +7,8 @@ find_compartment: allow search subtree when find a compartment. - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add methods to Add/remove security rules: create_nsg_rules & remove_nsg + - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Support reuse existing + VCN for SkyServe. """ from datetime import datetime import functools @@ -17,7 +19,6 @@ import typing from typing import List, Optional, Tuple -from sky import exceptions from sky import sky_logging from sky.adaptors import common as adaptors_common from sky.adaptors import oci as oci_adaptor @@ -496,26 +497,25 @@ def find_nsg(cls, region: str, nsg_name: str, compartment = cls.find_compartment(region) - list_vcns_resp = net_client.list_vcns( - compartment_id=compartment, - display_name=oci_utils.oci_config.VCN_NAME, - lifecycle_state='AVAILABLE', - ) - - if not list_vcns_resp: - raise exceptions.ResourcesUnavailableError( - 'The VCN is not available') + vcn_id = oci_utils.oci_config.get_vcn_ocid(region) + if vcn_id is None: + list_vcns_resp = net_client.list_vcns( + compartment_id=compartment, + display_name=oci_utils.oci_config.VCN_NAME, + lifecycle_state='AVAILABLE', + ) - # Get the primary vnic. The vnic might be an empty list for the - # corner case when the cluster was exited during provision. 
- if not list_vcns_resp.data: - return None + # Get the primary vnic. The vnic might be an empty list for the + # corner case when the cluster was exited during provision. + if not list_vcns_resp.data: + return None - vcn = list_vcns_resp.data[0] + vcn = list_vcns_resp.data[0] + vcn_id = vcn.id list_nsg_resp = net_client.list_network_security_groups( compartment_id=compartment, - vcn_id=vcn.id, + vcn_id=vcn_id, limit=1, display_name=nsg_name, ) @@ -532,7 +532,7 @@ def find_nsg(cls, region: str, nsg_name: str, create_network_security_group_details=oci_adaptor.oci.core.models. CreateNetworkSecurityGroupDetails( compartment_id=compartment, - vcn_id=vcn.id, + vcn_id=vcn_id, display_name=nsg_name, )) get_nsg_resp = net_client.get_network_security_group( diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index a424ae074b9..3194dc79da5 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -886,6 +886,9 @@ def get_config_schema(): 'image_tag_gpu': { 'type': 'string', }, + 'vcn_ocid': { + 'type': 'string', + }, 'vcn_subnet': { 'type': 'string', }, From 0e149822cc91ec57202ab071ab33d0f5d8c0a3ba Mon Sep 17 00:00:00 2001 From: Hysun He Date: Mon, 6 Jan 2025 20:43:22 +0800 Subject: [PATCH 12/16] [docs] OCI: advanced configuration & add vcn_ocid (#4531) * Add vcn_ocid configuration * Update config.rst --- docs/source/reference/config.rst | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 99bd347942a..a76dc473206 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -628,20 +628,30 @@ Available fields and semantics: # Advanced OCI configurations (optional). oci: # A dict mapping region names to region-specific configurations, or - # `default` for the default configuration. + # `default` for the default/global configuration. default: - # The OCID of the profile to use for launching instances (optional). 
- oci_config_profile: DEFAULT - # The OCID of the compartment to use for launching instances (optional). + # The profile name in ~/.oci/config to use for launching instances. If not + # set, the one named DEFAULT will be used (optional). + oci_config_profile: SKY_PROVISION_PROFILE + # The OCID of the compartment to use for launching instances. If not set, + # the root compartment will be used (optional). compartment_ocid: ocid1.compartment.oc1..aaaaaaaahr7aicqtodxmcfor6pbqn3hvsngpftozyxzqw36gj4kh3w3kkj4q - # The image tag to use for launching general instances (optional). - image_tag_general: skypilot:cpu-ubuntu-2004 - # The image tag to use for launching GPU instances (optional). - image_tag_gpu: skypilot:gpu-ubuntu-2004 - + # The default image tag to use for launching general instances (CPU) if the + # image_id parameter is not specified. If not set, the default is + # skypilot:cpu-ubuntu-2204 (optional). + image_tag_general: skypilot:cpu-oraclelinux8 + # The default image tag to use for launching GPU instances if the image_id + # parameter is not specified. If not set, the default is + # skypilot:gpu-ubuntu-2204 (optional). + image_tag_gpu: skypilot:gpu-oraclelinux8 + + # Region-specific configurations ap-seoul-1: + # The OCID of the VCN to use for instances (optional). + vcn_ocid: ocid1.vcn.oc1.ap-seoul-1.amaaaaaaak7gbriarkfs2ssus5mh347ktmi3xa72tadajep6asio3ubqgarq # The OCID of the subnet to use for instances (optional). 
vcn_subnet: ocid1.subnet.oc1.ap-seoul-1.aaaaaaaa5c6wndifsij6yfyfehmi3tazn6mvhhiewqmajzcrlryurnl7nuja us-ashburn-1: + vcn_ocid: ocid1.vcn.oc1.ap-seoul-1.amaaaaaaak7gbriarkfs2ssus5mh347ktmi3xa72tadajep6asio3ubqgarq vcn_subnet: ocid1.subnet.oc1.iad.aaaaaaaafbj7i3aqc4ofjaapa5edakde6g4ea2yaslcsay32cthp7qo55pxa From 59cb4e9625e98b06fd293d0dd5cea5deb89ea358 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 6 Jan 2025 14:10:55 -0800 Subject: [PATCH 13/16] [k8s] Fix `--purge` not cleaning up cluster in stale k8s context (#4514) * Fix purge not cleaning up stale k8s context cluster * update comment * Apply purge after printing warnings. * lint * Fix comments * clean up condition --- sky/backends/cloud_vm_ray_backend.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index c972928cd7d..2316888b44c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4216,11 +4216,20 @@ def post_teardown_cleanup(self, attempts = 0 while True: logger.debug(f'instance statuses attempt {attempts + 1}') - node_status_dict = provision_lib.query_instances( - repr(cloud), - cluster_name_on_cloud, - config['provider'], - non_terminated_only=False) + try: + node_status_dict = provision_lib.query_instances( + repr(cloud), + cluster_name_on_cloud, + config['provider'], + non_terminated_only=False) + except Exception as e: # pylint: disable=broad-except + if purge: + logger.warning( + f'Failed to query instances. Skipping since purge is ' + f'set. 
Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') + break + raise unexpected_node_state: Optional[Tuple[str, str]] = None for node_id, node_status in node_status_dict.items(): @@ -4239,8 +4248,13 @@ def post_teardown_cleanup(self, time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) else: (node_id, node_status) = unexpected_node_state - raise RuntimeError(f'Instance {node_id} in unexpected state ' - f'{node_status}.') + if purge: + logger.warning(f'Instance {node_id} in unexpected ' + f'state {node_status}. Skipping since purge ' + 'is set.') + break + raise RuntimeError(f'Instance {node_id} in unexpected ' + f'state {node_status}.') global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) From 6cf98a3cfbed39984b106f95ec7209058c73570d Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 7 Jan 2025 15:11:27 +0800 Subject: [PATCH 14/16] [Catalog] TPU V6e pricing fetcher (#4540) * [Catalog] TPU V6e pricing fetcher * Update sky/clouds/service_catalog/data_fetchers/fetch_gcp.py Co-authored-by: Zhanghao Wu * comment --------- Co-authored-by: Zhanghao Wu --- .../data_fetchers/fetch_gcp.py | 61 +++---------------- 1 file changed, 10 insertions(+), 51 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 570bc773d2e..b3a71e9514a 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -47,10 +47,6 @@ TPU_V4_ZONES = ['us-central2-b'] # TPU v3 pods are available in us-east1-d, but hidden in the skus. # We assume the TPU prices are the same as us-central1. -# TPU v6e's pricing info is not available on the SKUs. However, in -# https://cloud.google.com/tpu/pricing, it listed the price for 4 regions: -# us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here -# and filtered out the other regions (us-central{1,2}, us-south1). 
HIDDEN_TPU_DF = pd.read_csv( io.StringIO( textwrap.dedent("""\ @@ -62,49 +58,10 @@ ,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d - ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b - ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c - ,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a - ,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d - ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b - ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c - ,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a - ,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d - ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b - ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c - ,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a - ,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d - ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b - ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c - ,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a - ,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d - ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b - ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c - ,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a - ,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d - ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b - ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c - ,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a - ,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b - 
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d - ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b - ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c - ,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a - ,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d - ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b - ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c - ,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a - ,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b - ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d """))) -TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1'] +# TPU V6e price for us-central2 is missing in the SKUs. +TPU_V6E_MISSING_REGIONS = ['us-central2'] # TPU V5 is not visible in specific zones. We hardcode the missing zones here. # NOTE(dev): Keep the zones and the df in sync. @@ -670,6 +627,8 @@ def _get_tpu_description_str(tpu_version: str) -> str: return 'TpuV5p' assert tpu_version == 'v5litepod', tpu_version return 'TpuV5e' + if tpu_version.startswith('v6e'): + return 'TpuV6e' return f'Tpu-{tpu_version}' def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: @@ -684,10 +643,10 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: # whether the TPU is a single device or a pod. # For TPU-v4, the pricing is uniform, and thus the pricing API # only provides the price of TPU-v4 pods. - # The price shown for v5 TPU is per chip hour, so there is no 'Pod' - # keyword in the description. + # The price shown for v5 & v6e TPU is per chip hour, so there is + # no 'Pod' keyword in the description. 
is_pod = ((num_cores > 8 or tpu_version == 'v4') and - not tpu_version.startswith('v5')) + not tpu_version.startswith('v5') and tpu_version != 'v6e') for sku in gce_skus + tpu_skus: if tpu_region not in sku['serviceRegions']: @@ -718,7 +677,9 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: # for v5e. Reference here: # https://cloud.google.com/tpu/docs/v5p#using-accelerator-type # https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config - core_per_sku = (1 if tpu_version == 'v5litepod' else + # v6e is also per chip price. Reference here: + # https://cloud.google.com/tpu/docs/v6e#configurations + core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else 2 if tpu_version == 'v5p' else 8) tpu_core_price = tpu_device_price / core_per_sku tpu_price = num_cores * tpu_core_price @@ -738,8 +699,6 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: spot_str = 'spot ' if spot else '' print(f'The {spot_str}price of {tpu_name} in {tpu_region} is ' 'not found in SKUs or hidden TPU price DF.') - # TODO(tian): Hack. Should investigate how to retrieve the price - # for TPU-v6e. 
if (tpu_name.startswith('tpu-v6e') and tpu_region in TPU_V6E_MISSING_REGIONS): if not spot: From bee46471a0fe2f6df141fb30f32d167f5b7ec53b Mon Sep 17 00:00:00 2001 From: zpoint Date: Wed, 8 Jan 2025 10:13:04 +0800 Subject: [PATCH 15/16] Update peoetry-build.yml to ensure github CI pass (#4541) * change the dependencies * remove --no-update * remove poetry build --- .github/workflows/test-poetry-build.yml | 63 ------------------------- 1 file changed, 63 deletions(-) delete mode 100644 .github/workflows/test-poetry-build.yml diff --git a/.github/workflows/test-poetry-build.yml b/.github/workflows/test-poetry-build.yml deleted file mode 100644 index 4cce22809ef..00000000000 --- a/.github/workflows/test-poetry-build.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: Poetry Test -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - master - - 'releases/**' - pull_request: - branches: - - master - - 'releases/**' - merge_group: - -jobs: - poetry-build-test: - runs-on: ubuntu-latest - steps: - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - echo "$HOME/.poetry/bin" >> $GITHUB_PATH - - name: Create foo package - run: | - mkdir foo - MASTER_REPO_URL=${{ github.server_url }}/${{ github.repository }} - REPO_URL=${{ github.event.pull_request.head.repo.html_url }} - if [ -z "$REPO_URL" ]; then - # This is a push, not a PR, so use the repo URL - REPO_URL=$MASTER_REPO_URL - fi - echo Master repo URL: $MASTER_REPO_URL - echo Using repo URL: $REPO_URL - cat < foo/pyproject.toml - [tool.poetry] - name = "foo" - version = "1.0.0" - authors = ["skypilot-bot"] - description = "" - - [tool.poetry.dependencies] - python = "3.10.x" - - [tool.poetry.group.dev.dependencies] - skypilot = {git = "${REPO_URL}.git", branch = "${{ github.head_ref }}"} - - [build-system] - requires = ["poetry-core"] - 
build-backend = "poetry.core.masonry.api" - - EOF - - - name: Check poetry lock time - run: | - cd foo - poetry lock --no-update - timeout-minutes: 2 - - From 2fa37ec2a68bdd1e69f3726ca46aaeebcaf07a2d Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 8 Jan 2025 11:19:40 +0800 Subject: [PATCH 16/16] [Core][Docker] Support docker login on RunPod. (#4287) * [Core][Docker] Support docker login on RunPod. * nit * works * remove unnecessary * delete template and registry after termination * move to graphql api & remove ephemeral resourcdes * nits --- docs/source/getting-started/installation.rst | 2 +- sky/provision/docker_utils.py | 11 +- sky/provision/runpod/instance.py | 11 +- sky/provision/runpod/utils.py | 183 +++++++++++++++++-- sky/setup_files/dependencies.py | 4 +- sky/skylet/providers/command_runner.py | 12 +- sky/templates/runpod-ray.yml.j2 | 13 ++ 7 files changed, 210 insertions(+), 26 deletions(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index 1d36b5ef6b8..93c730ef651 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -304,7 +304,7 @@ RunPod .. 
code-block:: shell - pip install "runpod>=1.5.1" + pip install "runpod>=1.6.1" runpod config diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 848c7a06983..0aadcc55335 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -38,6 +38,13 @@ class DockerLoginConfig: password: str server: str + def format_image(self, image: str) -> str: + """Format the image name with the server prefix.""" + server_prefix = f'{self.server}/' + if not image.startswith(server_prefix): + return f'{server_prefix}{image}' + return image + @classmethod def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig': return cls( @@ -220,9 +227,7 @@ def initialize(self) -> str: wait_for_docker_daemon=True) # We automatically add the server prefix to the image name if # the user did not add it. - server_prefix = f'{docker_login_config.server}/' - if not specific_image.startswith(server_prefix): - specific_image = f'{server_prefix}{specific_image}' + specific_image = docker_login_config.format_image(specific_image) if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' + diff --git a/sky/provision/runpod/instance.py b/sky/provision/runpod/instance.py index 8f992f569d9..9e57887c3f1 100644 --- a/sky/provision/runpod/instance.py +++ b/sky/provision/runpod/instance.py @@ -83,7 +83,8 @@ def run_instances(region: str, cluster_name_on_cloud: str, node_type = 'head' if head_instance_id is None else 'worker' try: instance_id = utils.launch( - name=f'{cluster_name_on_cloud}-{node_type}', + cluster_name=cluster_name_on_cloud, + node_type=node_type, instance_type=config.node_config['InstanceType'], region=region, disk_size=config.node_config['DiskSize'], @@ -92,6 +93,8 @@ def run_instances(region: str, cluster_name_on_cloud: str, public_key=config.node_config['PublicKey'], preemptible=config.node_config['Preemptible'], bid_per_gpu=config.node_config['BidPerGPU'], + 
docker_login_config=config.provider_config.get( + 'docker_login_config'), ) except Exception as e: # pylint: disable=broad-except logger.warning(f'run_instances error: {e}') @@ -145,6 +148,8 @@ def terminate_instances( """See sky/provision/__init__.py""" del provider_config # unused instances = _filter_instances(cluster_name_on_cloud, None) + template_name, registry_auth_id = utils.get_registry_auth_resources( + cluster_name_on_cloud) for inst_id, inst in instances.items(): logger.debug(f'Terminating instance {inst_id}: {inst}') if worker_only and inst['name'].endswith('-head'): @@ -157,6 +162,10 @@ def terminate_instances( f'Failed to terminate instance {inst_id}: ' f'{common_utils.format_exception(e, use_bracket=False)}' ) from e + if template_name is not None: + utils.delete_pod_template(template_name) + if registry_auth_id is not None: + utils.delete_register_auth(registry_auth_id) def get_cluster_info( diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index d0a06b026b3..6600cfd6198 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -2,10 +2,11 @@ import base64 import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from sky import sky_logging from sky.adaptors import runpod +from sky.provision import docker_utils import sky.provision.runpod.api.commands as runpod_commands from sky.skylet import constants from sky.utils import common_utils @@ -47,6 +48,11 @@ } +def _construct_docker_login_template_name(cluster_name: str) -> str: + """Constructs the registry auth template name.""" + return f'{cluster_name}-docker-login-template' + + def retry(func): """Decorator to retry a function.""" @@ -66,9 +72,83 @@ def wrapper(*args, **kwargs): return wrapper +# Adapted from runpod.api.queries.pods.py::QUERY_POD. +# Adding containerRegistryAuthId to the query. 
+_QUERY_POD = """ +query myPods { + myself { + pods { + id + containerDiskInGb + containerRegistryAuthId + costPerHr + desiredStatus + dockerArgs + dockerId + env + gpuCount + imageName + lastStatusChange + machineId + memoryInGb + name + podType + port + ports + uptimeSeconds + vcpuCount + volumeInGb + volumeMountPath + runtime { + ports{ + ip + isIpPublic + privatePort + publicPort + type + } + } + machine { + gpuDisplayName + } + } + } +} +""" + + +def _sky_get_pods() -> dict: + """List all pods with extra registry auth information. + + Adapted from runpod.get_pods() to include containerRegistryAuthId. + """ + raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD) + cleaned_return = raw_return['data']['myself']['pods'] + return cleaned_return + + +_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """ +query myself { + myself { + podTemplates { + name + containerRegistryAuthId + } + } +} +""" + + +def _list_pod_templates_with_container_registry() -> dict: + """List all pod templates.""" + raw_return = runpod.runpod.api.graphql.run_graphql_query( + _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH) + return raw_return['data']['myself']['podTemplates'] + + def list_instances() -> Dict[str, Dict[str, Any]]: """Lists instances associated with API key.""" - instances = runpod.runpod.get_pods() + instances = _sky_get_pods() instance_dict: Dict[str, Dict[str, Any]] = {} for instance in instances: @@ -100,14 +180,75 @@ def list_instances() -> Dict[str, Dict[str, Any]]: return instance_dict -def launch(name: str, instance_type: str, region: str, disk_size: int, - image_name: str, ports: Optional[List[int]], public_key: str, - preemptible: Optional[bool], bid_per_gpu: float) -> str: +def delete_pod_template(template_name: str) -> None: + """Deletes a pod template.""" + try: + runpod.runpod.api.graphql.run_graphql_query( + f'mutation {{deleteTemplate(templateName: "{template_name}")}}') + except runpod.runpod.error.QueryError as e: + logger.warning(f'Failed to delete template 
{template_name}: {e}' + 'Please delete it manually.') + + +def delete_register_auth(registry_auth_id: str) -> None: + """Deletes a registry auth.""" + try: + runpod.runpod.delete_container_registry_auth(registry_auth_id) + except runpod.runpod.error.QueryError as e: + logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}' + 'Please delete it manually.') + + +def _create_template_for_docker_login( + cluster_name: str, + image_name: str, + docker_login_config: Optional[Dict[str, str]], +) -> Tuple[str, Optional[str]]: + """Creates a template for the given image with the docker login config. + + Returns: + formatted_image_name: The formatted image name. + template_id: The template ID. None for no docker login config. + """ + if docker_login_config is None: + return image_name, None + login_config = docker_utils.DockerLoginConfig(**docker_login_config) + container_registry_auth_name = f'{cluster_name}-registry-auth' + container_template_name = _construct_docker_login_template_name( + cluster_name) + # The `name` argument is only for display purposes and the registry server + # will be split from the docker image name (Tested with AWS ECR). + # Here we only need the username and password to create the registry auth. + # TODO(tian): Now we create a template and a registry auth for each cluster. + # Consider creating one for each server and reuse them. Challenges include + # calculating the reference count and deleting them when no longer needed.
+ create_auth_resp = runpod.runpod.create_container_registry_auth( + name=container_registry_auth_name, + username=login_config.username, + password=login_config.password, + ) + registry_auth_id = create_auth_resp['id'] + create_template_resp = runpod.runpod.create_template( + name=container_template_name, + image_name=None, + registry_auth_id=registry_auth_id, + ) + return login_config.format_image(image_name), create_template_resp['id'] + + +def launch(cluster_name: str, node_type: str, instance_type: str, region: str, + disk_size: int, image_name: str, ports: Optional[List[int]], + public_key: str, preemptible: Optional[bool], bid_per_gpu: float, + docker_login_config: Optional[Dict[str, str]]) -> str: """Launches an instance with the given parameters. Converts the instance_type to the RunPod GPU name, finds the specs for the GPU, and launches the instance. + + Returns: + instance_id: The instance ID. """ + name = f'{cluster_name}-{node_type}' gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]] gpu_quantity = int(instance_type.split('_')[0].replace('x', '')) cloud_type = instance_type.split('_')[2] @@ -139,21 +280,24 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, # Use base64 to deal with the tricky quoting issues caused by runpod API. encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8') + docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; ' + f'bash init.sh\'') + # Port 8081 is occupied for nginx in the base image. 
custom_ports_str = '' if ports is not None: custom_ports_str = ''.join([f'{p}/tcp,' for p in ports]) + ports_str = (f'22/tcp,' + f'{custom_ports_str}' + f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,' + f'{constants.SKY_REMOTE_RAY_PORT}/http') - docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; ' - f'bash init.sh\'') - ports = (f'22/tcp,' - f'{custom_ports_str}' - f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,' - f'{constants.SKY_REMOTE_RAY_PORT}/http') + image_name_formatted, template_id = _create_template_for_docker_login( + cluster_name, image_name, docker_login_config) params = { 'name': name, - 'image_name': image_name, + 'image_name': image_name_formatted, 'gpu_type_id': gpu_type, 'cloud_type': cloud_type, 'container_disk_in_gb': disk_size, @@ -161,9 +305,10 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity, 'gpu_count': gpu_quantity, 'country_code': region, - 'ports': ports, + 'ports': ports_str, 'support_public_ip': True, 'docker_args': docker_args, + 'template_id': template_id, } if preemptible is None or not preemptible: @@ -177,6 +322,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, return new_instance['id'] +def get_registry_auth_resources( + cluster_name: str) -> Tuple[Optional[str], Optional[str]]: + """Gets the registry auth resources.""" + container_registry_auth_name = _construct_docker_login_template_name( + cluster_name) + for template in _list_pod_templates_with_container_registry(): + if template['name'] == container_registry_auth_name: + return container_registry_auth_name, template[ + 'containerRegistryAuthId'] + return None, None + + def remove(instance_id: str) -> None: """Terminates the given instance.""" runpod.runpod.terminate_pod(instance_id) diff --git a/sky/setup_files/dependencies.py b/sky/setup_files/dependencies.py index 16590a9fd0d..13b99770e5b 100644 --- a/sky/setup_files/dependencies.py +++ 
b/sky/setup_files/dependencies.py @@ -123,7 +123,9 @@ 'oci': ['oci'] + local_ray, 'kubernetes': ['kubernetes>=20.0.0'], 'remote': remote, - 'runpod': ['runpod>=1.5.1'], + # For the container registry auth api. Reference: + # https://github.com/runpod/runpod-python/releases/tag/1.6.1 + 'runpod': ['runpod>=1.6.1'], 'fluidstack': [], # No dependencies needed for fluidstack 'cudo': ['cudo-compute>=0.1.10'], 'paperspace': [], # No dependencies needed for paperspace diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 4f66ef54383..16dbc4d2668 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -25,7 +25,7 @@ def docker_start_cmds( docker_cmd, ): """Generating docker start command without --rm. - + The code is borrowed from `ray.autoscaler._private.docker`. Changes we made: @@ -159,19 +159,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], return True # SkyPilot: Docker login if user specified a private docker registry. - if "docker_login_config" in self.docker_config: + if 'docker_login_config' in self.docker_config: # TODO(tian): Maybe support a command to get the login password? - docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[ - "docker_login_config"] + docker_login_config: docker_utils.DockerLoginConfig = ( + self.docker_config['docker_login_config']) self._run_with_retry( f'{self.docker_cmd} login --username ' f'{docker_login_config.username} --password ' f'{docker_login_config.password} {docker_login_config.server}') # We automatically add the server prefix to the image name if # the user did not add it. 
-            server_prefix = f'{docker_login_config.server}/'
-            if not specific_image.startswith(server_prefix):
-                specific_image = f'{server_prefix}{specific_image}'
+            specific_image = docker_login_config.format_image(specific_image)
 
         if self.docker_config.get('pull_before_run', True):
             assert specific_image, ('Image must be included in config if '
diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2
index 853b9142037..ea57c9ac808 100644
--- a/sky/templates/runpod-ray.yml.j2
+++ b/sky/templates/runpod-ray.yml.j2
@@ -10,6 +10,19 @@ provider:
   module: sky.provision.runpod
   region: "{{region}}"
   disable_launch_config_check: true
+  # For RunPod, we directly set the image id for the docker as runtime environment
+  # support, thus we need to prevent the DockerInitializer from detecting the
+  # docker field and performing the initialization. Therefore we put the docker
+  # login config in the provider config here.
+  {%- if docker_login_config is not none %}
+  docker_login_config:
+    username: |-
+      {{docker_login_config.username}}
+    password: |-
+      {{docker_login_config.password}}
+    server: |-
+      {{docker_login_config.server}}
+  {%- endif %}
 auth:
   ssh_user: root