Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[k8s] Add validation for pod_config #4206 #4466

Merged
merged 13 commits into from
Jan 3, 2025
7 changes: 7 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,13 @@ def write_cluster_config(
tmp_yaml_path,
cluster_config_overrides=to_provision.cluster_config_overrides)
kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
yaml_obj = common_utils.read_yaml(tmp_yaml_path)
pod_config = yaml_obj['available_node_types']['ray_head_default'][
'node_config']
valid, message = kubernetes_utils.check_pod_config(pod_config)
if not valid:
raise exceptions.InvalidCloudConfigs(
f'Invalid pod_config. Deatils: {message}')
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved

if dryrun:
# If dryrun, return the unfinished tmp yaml path.
Expand Down
46 changes: 46 additions & 0 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,52 @@ def check_credentials(context: Optional[str],
return True, None


def check_pod_config(pod_config: dict) \
-> Tuple[bool, Optional[str]]:
"""Check if the pod_config is a valid pod config

Using deserialize api to check the pod_config is valid or not.

Returns:
bool: True if pod_config is valid.
str: Error message about why the pod_config is invalid, None otherwise.
"""
errors = []
# This api_client won't be used to send any requests, so there is no need to
# load kubeconfig
api_client = kubernetes.kubernetes.client.ApiClient()

# Used for kubernetes api_client deserialize function, the function will use
# data attr, the detail ref:
# https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
class InnerResponse():

def __init__(self, data: dict):
self.data = json.dumps(data)

try:
# Validate metadata if present
if 'metadata' in pod_config:
try:
value = InnerResponse(pod_config['metadata'])
api_client.deserialize(
value, kubernetes.kubernetes.client.V1ObjectMeta)
except ValueError as e:
errors.append(f'Invalid metadata: {str(e)}')
# Validate spec if present
if 'spec' in pod_config:
try:
value = InnerResponse(pod_config['spec'])
api_client.deserialize(value,
kubernetes.kubernetes.client.V1PodSpec)
except ValueError as e:
errors.append(f'Invalid spec: {str(e)}')
return len(errors) == 0, '.'.join(errors)
except Exception as e: # pylint: disable=broad-except
errors.append(f'Validation error: {str(e)}')
return False, '.'.join(errors)


def is_kubeconfig_exec_auth(
context: Optional[str] = None) -> Tuple[bool, Optional[str]]:
"""Checks if the kubeconfig file uses exec-based authentication
Expand Down
46 changes: 46 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import sky
from sky import skypilot_config
import sky.exceptions
from sky.skylet import constants
from sky.utils import common_utils
from sky.utils import kubernetes_enums
Expand Down Expand Up @@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None:
"""))


def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None:
task_file_path.write_text(
textwrap.dedent("""\
experimental:
config_overrides:
kubernetes:
pod_config:
metadata:
labels:
test-key: test-value
annotations:
abc: def
spec:
containers:
- name:
imagePullSecrets:
- name: my-secret-2
Comment on lines +108 to +119
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we retain just the relevant kubernetes field and remove the docker, gcp, nvidia_gpus and other fields?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also can we put a quick comment on what's the invalid field here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the deserialize api will ignore other invalid field here, as we can see the implement in the https://github.com/kubernetes-client/python/blob/e10470291526c82f12a0a3405910ccc3f3cdeb26/kubernetes/client/api_client.py#L620

for attr, attr_type in six.iteritems(klass.openapi_types):
  if klass.attribute_map[attr] in data:
      value = data[klass.attribute_map[attr]]
      kwargs[attr] = self.__deserialize(value, attr_type)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I meant for readability in tests, let's have only this:

        experimental:
            config_overrides:
                kubernetes:
                    pod_config:
                        metadata:
                            labels:
                                test-key: test-value
                            annotations:
                                abc: def
                        spec:
                            containers:
                                - name:
                            imagePullSecrets:
                                - name: my-secret-2


setup: echo 'Setting up...'
run: echo 'Running...'
"""))


def test_nested_config(monkeypatch) -> None:
"""Test that the nested config works."""
config = skypilot_config.Config()
Expand Down Expand Up @@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path,
assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia'


def test_k8s_config_with_invalid_config(monkeypatch, tmp_path,
enable_all_clouds) -> None:
config_path = tmp_path / 'config.yaml'
_create_config_file(config_path)
monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path)

_reload_config()
task_path = tmp_path / 'task.yaml'
_create_invalid_config_yaml_file(task_path)
task = sky.Task.from_yaml(task_path)

# Test Kubernetes pod_config invalid
cluster_name = 'test_k8s_config_with_invalid_config'
task.set_resources_override({'cloud': sky.Kubernetes()})
exception_occurred = False
try:
sky.launch(task, cluster_name=cluster_name, dryrun=True)
except sky.exceptions.ResourcesUnavailableError:
exception_occurred = True
Comment on lines +379 to +380
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also verify the error message?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to validate this error message, but it's meaningless for this test because the error message returned by _provision in the end is not directly related to the actual error.

assert exception_occurred


def test_gcp_config_with_override(monkeypatch, tmp_path,
enable_all_clouds) -> None:
config_path = tmp_path / 'config.yaml'
Expand Down
Loading