diff --git a/requirements.txt b/requirements.txt index d988e02..6c38399 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ requests_mock dataclasses-json python-dotenv pytest -git+https://github.com/ucsd-ets/awsed_python_client.git@2024.1.2-RC1 +git+https://github.com/ucsd-ets/awsed_python_client.git@2024.2.1-RC1 diff --git a/src/dsmlp/app/gpu_validator.py b/src/dsmlp/app/gpu_validator.py index 78f277f..898ab42 100644 --- a/src/dsmlp/app/gpu_validator.py +++ b/src/dsmlp/app/gpu_validator.py @@ -21,7 +21,7 @@ def __init__(self, awsed: AwsedClient, kube: KubeClient, logger: Logger) -> None self.kube = kube self.logger = logger - def get_ultilized_gpu(self, request: Request): + def calculate_utilized_gpu(self, request: Request): # Calculate the number of GPUs requested for kube client utilized_gpus = 0 for container in request.object.spec.containers: @@ -37,16 +37,14 @@ def get_ultilized_gpu(self, request: Request): utilized_gpus += max(requested, limit) - # Short circuit if no GPUs requested (permits overcap) or return - if utilized_gpus == 0: - raise ValueError("Error: No GPUs requested.") return utilized_gpus - def get_gpu_quota(self, awsed_quota, kube_client_quota): + def determine_gpu_quota(self, awsed_quota, kube_client_quota): """ - Use AWSED GPU quota if it is not None and greater than 0 - else use namespace GPU quota if it is not None and greater than 0 - else use 1 as default + Determine the GPU quota to be used based on AWSED GPU quota and namespace GPU quota. + If AWSED GPU quota is not None and greater than 0, use it. + If AWSED GPU quota is None or 0, check if namespace GPU quota is not None and greater than 0, use it. + If both AWSED GPU quota and namespace GPU quota are None or 0, use 1 as the default GPU quota. """ default_gpu_quota = 1 @@ -71,12 +69,15 @@ def validate_pod(self, request: Request): curr_gpus = self.kube.get_gpus_in_namespace(request.namespace) awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace) - # check ultilized gpu - utilized_gpus = self.get_ultilized_gpu(request=request) + # check utilized gpu + utilized_gpus = self.calculate_utilized_gpu(request=request) # request gpu_quota from method - gpu_quota = self.get_gpu_quota(awsed_gpu_quota, namespace.gpu_quota) + gpu_quota = self.determine_gpu_quota(awsed_gpu_quota, namespace.gpu_quota) + # Short circuit if no GPUs requested (permits overcap) or return + if utilized_gpus == 0: + return # Check if the total number of utilized GPUs exceeds the GPU quota if utilized_gpus + curr_gpus > gpu_quota: raise ValidationFailure( diff --git a/src/dsmlp/ext/awsed.py b/src/dsmlp/ext/awsed.py index 53ff38a..2d85fea 100644 --- a/src/dsmlp/ext/awsed.py +++ b/src/dsmlp/ext/awsed.py @@ -3,7 +3,7 @@ import requests from dacite import from_dict -from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, TeamJson, UnsuccessfulRequest, UserResponse, UserGpuQuotaResponse +from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, TeamJson, UnsuccessfulRequest, UserResponse, UserQuotaResponse, Quota import awsed.client import awsed.types @@ -28,12 +28,14 @@ def list_user_teams(self, username: str) -> ListTeamsResponse: return ListTeamsResponse(teams=teams) - # Fetch user's GPU quota with AWSED Api and assign to UserGpuQuotaResponse object - def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse: + # Fetch user's GPU quota with AWSED Api and assign to UserQuotaResponse object + def get_user_gpu_quota(self, username: str) -> UserQuotaResponse: try: - usrGpuQuota = self.client.fetch_user_gpu_quota(username) + usrGpuQuota = self.client.get_user_quota(username) if not usrGpuQuota: return None - return UserGpuQuotaResponse(gpu_quota=usrGpuQuota.gpuQuota) + gpu_quota = usrGpuQuota.get("gpu", 0) + quota = Quota(user=username, resources=gpu_quota) + return UserQuotaResponse(quota=quota) except: return None diff --git a/src/dsmlp/plugin/awsed.py b/src/dsmlp/plugin/awsed.py index ea960b6..537c6e4 100644 --- a/src/dsmlp/plugin/awsed.py +++ b/src/dsmlp/plugin/awsed.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod from dataclasses import dataclass -from typing import List +from typing import List, Dict, Any @dataclass @@ -19,8 +19,13 @@ class UserResponse: enrollments: List[str] @dataclass -class UserGpuQuotaResponse: - gpu_quota: int +class Quota: + user: str + resources: Dict[str, Any] + +@dataclass +class UserQuotaResponse: + quota: Quota class AwsedClient(metaclass=ABCMeta): @abstractmethod @@ -33,8 +38,8 @@ def describe_user(self, username: str) -> UserResponse: pass @abstractmethod - def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse: - # Return the gpu quota of a user + def get_user_gpu_quota(self, username: str) -> UserQuotaResponse: + # Return the quota (DICT) of a user pass class UnsuccessfulRequest(Exception): diff --git a/tests/app/test_gpu_validator.py b/tests/app/test_gpu_validator.py index 827236f..df09044 100644 --- a/tests/app/test_gpu_validator.py +++ b/tests/app/test_gpu_validator.py @@ -1,8 +1,10 @@ import inspect +import unittest +from unittest.mock import MagicMock from operator import contains from dsmlp.app.types import ValidationFailure from dsmlp.app.validator import Validator -from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse +from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse, Quota, UserQuotaResponse from dsmlp.plugin.kube import Namespace from hamcrest import assert_that, contains_inanyorder, equal_to, has_item from tests.fakes import FakeAwsedClient, FakeLogger, FakeKubeClient @@ -11,11 +13,12 @@ from tests.app.utils import gen_request, try_val_with_component -class TestGPUValidator: - def setup_method(self) -> None: +class TestGPUValidator(unittest.TestCase): + def setup_method(self, method) -> None: self.logger = FakeLogger() self.awsed_client = FakeAwsedClient() self.kube_client = FakeKubeClient() + self.gpu_validator = GPUValidator(self.awsed_client, self.kube_client, self.logger) # added for unit testing self.awsed_client.add_user( 'user10', UserResponse(uid=10, enrollments=[])) @@ -28,7 +31,7 @@ def setup_method(self) -> None: self.kube_client.set_existing_gpus('user10', 0) # Set gpu quota for user 10 with AWSED client - self.awsed_client.add_user_gpu_quota('user10', 10) + self.awsed_client.assign_user_gpu_quota('user10',{"gpu": 10}) # Set up user11 without any quota & namespace. self.awsed_client.add_user( @@ -39,7 +42,7 @@ def setup_method(self) -> None: def test_no_gpus_requested(self): self.try_validate( - gen_request(), expected=True, message="Allowed" + gen_request(), expected=True ) def test_quota_not_reached(self): @@ -81,7 +84,7 @@ def test_low_priority_overcap(self): self.kube_client.set_existing_gpus('user10', 11) self.try_validate( - gen_request(), expected=True) + gen_request(), expected=True, message="Allowed") def try_validate(self, json, expected: bool, message: str = None): try_val_with_component(GPUValidator(self.awsed_client, @@ -89,7 +92,7 @@ def try_validate(self, json, expected: bool, message: str = None): # Test correct response for get_user_gpu_quota method def test_awsed_gpu_quota_correct_response(self): - self.awsed_client.add_user_gpu_quota('user11', 5) + self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 5}) user_gpu_quota = self.awsed_client.get_user_gpu_quota('user11') assert_that(user_gpu_quota, equal_to(5)) @@ -119,23 +122,95 @@ def test_gpu_quota_client_priority(self): name='user11', labels={'k8s-sync': 'true'}, gpu_quota=8)) self.kube_client.set_existing_gpus('user11', 3) - self.awsed_client.add_user_gpu_quota('user11', 6) + # add awsed quota + self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 6}) self.try_validate( gen_request(gpu_req=6, username='user11'), expected=False, message="GPU quota exceeded. Wanted 6 but with 3 already in use, the quota of 6 would be exceeded." ) # Quota both set for user 11 from kube and awsed, should prioritize AWSED quota def test_gpu_quota_client_priority2(self): - self.awsed_client.add_user_gpu_quota('user11', 18) self.kube_client.add_namespace('user11', Namespace( name='user11', labels={'k8s-sync': 'true'}, gpu_quota=12)) + # add awsed quota + self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 18}) # set existing gpu = kube client quota self.kube_client.set_existing_gpus('user11', 12) self.try_validate( - gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 18 would be exceeded." + gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 12 already in use, the quota of 18 would be exceeded." ) - # --- Modular / Unit Testing for Validate Pod --- - def test \ No newline at end of file + ''' --- Modular / Unit Testing for Validate Pod --- ''' + # Test determine_gpu_quota (helper method for validate_pod) + def test_determine_gpu_quota_awsed_quota(self): + # Test when awsed_quota is greater than 0 (should prioritize it) + awsed_quota = 10 + kube_client_quota = 5 + result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota) + self.assertEqual(result, 10) + + def test_determine_gpu_quota_kube_client_quota(self): + # Test when awsed_quota is 0 or None, but kube_client_quota is greater than 0 + awsed_quota = 0 + kube_client_quota = 5 + result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota) + self.assertEqual(result, 5) + + awsed_quota = None + kube_client_quota = 5 + result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota) + self.assertEqual(result, 5) + + def test_determine_gpu_quota_default(self): + # Test when both awsed_quota and kube_client_quota are 0 or None + awsed_quota = 0 + kube_client_quota = 0 + result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota) + self.assertEqual(result, 1) + + awsed_quota = None + kube_client_quota = None + result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota) + self.assertEqual(result, 1) + + # Test caculate_utilized_gpu (helper method for validate_pod) + def test_calculate_utilized_gpu_with_requested_greater_than_limit(self): + request = self._create_request([{'requests': {'nvidia.com/gpu': '5'}, 'limits': {'nvidia.com/gpu': '3'}}]) + result = self.gpu_validator.calculate_utilized_gpu(request) + self.assertEqual(result, 5) + + def test_calculate_utilized_gpu_with_limit_greater_than_requested(self): + request = self._create_request([{'requests': {'nvidia.com/gpu': '2'}, 'limits': {'nvidia.com/gpu': '4'}}]) + result = self.gpu_validator.calculate_utilized_gpu(request) + self.assertEqual(result, 4) + + # Test low priority GPUs, no more requested -> no error should be raised + def test_calculate_utilized_gpu_with_no_gpus_requested(self): + request = self._create_request([{'requests': {}, 'limits': {}}]) + result = self.gpu_validator.calculate_utilized_gpu(request) + self.assertEqual(result, 0) + + def test_user_with_more_gpus_than_quota_not_requesting_more(self): + self.kube_client.add_namespace('user11', Namespace( + name='user11', labels={'k8s-sync': 'true'}, gpu_quota=5)) + self.kube_client.set_existing_gpus('user11', 8) + + # Create a request where no additional GPUs are requested + request = self._create_request([{'requests': {}, 'limits': {}}]) + request.namespace = 'user11' + + # Ensure the ValidationFailure exception is not raised + try: + self.gpu_validator.validate_pod(request) + except ValidationFailure: + self.fail("ValidationFailure was raised unexpectedly!") + + def _create_request(self, containers): + request = MagicMock() + request.object.spec.containers = [ + MagicMock(resources=MagicMock(requests=container['requests'], limits=container['limits'])) + for container in containers + ] + return request \ No newline at end of file diff --git a/tests/app/test_id_validator.py b/tests/app/test_id_validator.py index de9e384..ed0553e 100644 --- a/tests/app/test_id_validator.py +++ b/tests/app/test_id_validator.py @@ -3,7 +3,7 @@ from dsmlp.app.types import * from dsmlp.app.id_validator import IDValidator from dsmlp.app.validator import Validator -from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse +from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse, UserQuotaResponse, Quota from dsmlp.plugin.kube import Namespace from hamcrest import assert_that, contains_inanyorder, equal_to, has_item from tests.app.utils import gen_request, try_val_with_component diff --git a/tests/app/utils.py b/tests/app/utils.py index 11858a0..74f8a8d 100644 --- a/tests/app/utils.py +++ b/tests/app/utils.py @@ -82,4 +82,7 @@ def try_val_with_component(validator: Validator, json, expected: bool, message: except Exception as e: if expected: raise AssertionError(f"Expected no exception but got {e}") - assert_that(e.message, equal_to(message)) + assert_that(str(e), equal_to(message)) + actual_message = str(e) + print(f"Actual exception message: {actual_message}") + assert_that(actual_message, equal_to(message)) diff --git a/tests/fakes.py b/tests/fakes.py index d82366f..b2f2114 100644 --- a/tests/fakes.py +++ b/tests/fakes.py @@ -1,11 +1,11 @@ from collections.abc import Mapping from dataclasses import dataclass from os import path -from typing import List, TypedDict, Dict +from typing import List, TypedDict, Dict, Any from dacite import from_dict -from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, UnsuccessfulRequest, UserResponse, UserGpuQuotaResponse +from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, UnsuccessfulRequest, UserResponse, UserQuotaResponse, Quota from dsmlp.plugin.kube import KubeClient, Namespace, NotFound from dsmlp.plugin.logger import Logger @@ -14,7 +14,7 @@ class FakeAwsedClient(AwsedClient): def __init__(self): self.teams: Dict[str, ListTeamsResponse] = {} self.users: Dict[str, UserResponse] = {} - self.user_gpu_quota: Dict[str, UserGpuQuotaResponse] = {} + self.user_quota: Dict[str, UserQuotaResponse] = {} def list_user_teams(self, username: str) -> ListTeamsResponse: try: @@ -30,15 +30,22 @@ def describe_user(self, username: str) -> UserResponse: except KeyError: return None - # Get user GPU quota. If user does not exist, return 0 - def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse: + # Get user GPU quota. If user does not exist, return a value of 0 + def get_user_gpu_quota(self, username: str) -> int: try: - return self.user_gpu_quota[username] + user_quota_response = self.user_quota[username] + return user_quota_response.quota.resources.get("gpu", 0) except KeyError: return 0 - def add_user_gpu_quota(self, username, gpu_quota: UserGpuQuotaResponse): - self.user_gpu_quota[username] = gpu_quota + # def add_user_gpu_quota(self, username: str, quota: UserQuotaResponse): + # self.user_quota[username] = quota + + # Assign user GPU quota and create a UserQuotaResponse & Quota objects + def assign_user_gpu_quota(self, username: str, resources: Dict[str, Any]): + quota = Quota(user=username, resources=resources) + user_quota_response = UserQuotaResponse(quota=quota) + self.user_quota[username] = user_quota_response def add_user(self, username, user: UserResponse): self.users[username] = user