Skip to content

Commit

Permalink
Add unit test cases for GPU Validation, including updated AWSED API a…
Browse files Browse the repository at this point in the history
…nd mimicking its behavior in the client
  • Loading branch information
trn024 committed Aug 9, 2024
1 parent ed6e623 commit 44bc970
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 44 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ requests_mock
dataclasses-json
python-dotenv
pytest
git+https://github.com/ucsd-ets/awsed_python_client.git@2024.1.2-RC1
git+https://github.com/ucsd-ets/awsed_python_client.git@2024.2.1-RC1
23 changes: 12 additions & 11 deletions src/dsmlp/app/gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(self, awsed: AwsedClient, kube: KubeClient, logger: Logger) -> None
self.kube = kube
self.logger = logger

def get_ultilized_gpu(self, request: Request):
def calculate_utilized_gpu(self, request: Request):
# Calculate the number of GPUs requested for kube client
utilized_gpus = 0
for container in request.object.spec.containers:
Expand All @@ -37,16 +37,14 @@ def get_ultilized_gpu(self, request: Request):

utilized_gpus += max(requested, limit)

# Short circuit if no GPUs requested (permits overcap) or return
if utilized_gpus == 0:
raise ValueError("Error: No GPUs requested.")
return utilized_gpus

def get_gpu_quota(self, awsed_quota, kube_client_quota):
def determine_gpu_quota(self, awsed_quota, kube_client_quota):
"""
Use AWSED GPU quota if it is not None and greater than 0
else use namespace GPU quota if it is not None and greater than 0
else use 1 as default
Determine the GPU quota to be used based on AWSED GPU quota and namespace GPU quota.
If AWSED GPU quota is not None and greater than 0, use it.
If AWSED GPU quota is None or 0, check if namespace GPU quota is not None and greater than 0, use it.
If both AWSED GPU quota and namespace GPU quota are None or 0, use 1 as the default GPU quota.
"""

default_gpu_quota = 1
Expand All @@ -71,12 +69,15 @@ def validate_pod(self, request: Request):
curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace)

# check ultilized gpu
utilized_gpus = self.get_ultilized_gpu(request=request)
# check utilized gpu
utilized_gpus = self.calculate_utilized_gpu(request=request)

# request gpu_quota from method
gpu_quota = self.get_gpu_quota(awsed_gpu_quota, namespace.gpu_quota)
gpu_quota = self.determine_gpu_quota(awsed_gpu_quota, namespace.gpu_quota)

# Short circuit if no GPUs requested (permits overcap) or return
if utilized_gpus == 0:
return
# Check if the total number of utilized GPUs exceeds the GPU quota
if utilized_gpus + curr_gpus > gpu_quota:
raise ValidationFailure(
Expand Down
12 changes: 7 additions & 5 deletions src/dsmlp/ext/awsed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import requests
from dacite import from_dict

from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, TeamJson, UnsuccessfulRequest, UserResponse, UserGpuQuotaResponse
from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, TeamJson, UnsuccessfulRequest, UserResponse, UserQuotaResponse, Quota

import awsed.client
import awsed.types
Expand All @@ -28,12 +28,14 @@ def list_user_teams(self, username: str) -> ListTeamsResponse:

return ListTeamsResponse(teams=teams)

# Fetch user's GPU quota with AWSED Api and assign to UserGpuQuotaResponse object
def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse:
# Fetch user's GPU quota with AWSED Api and assign to UserQuotaResponse object
def get_user_gpu_quota(self, username: str) -> UserQuotaResponse:
try:
usrGpuQuota = self.client.fetch_user_gpu_quota(username)
usrGpuQuota = self.client.get_user_quota(username)
if not usrGpuQuota:
return None
return UserGpuQuotaResponse(gpu_quota=usrGpuQuota.gpuQuota)
gpu_quota = usrGpuQuota.get("gpu", 0)
quota = Quota(user=username, resources=gpu_quota)
return UserQuotaResponse(quota=quota)
except:
return None
15 changes: 10 additions & 5 deletions src/dsmlp/plugin/awsed.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from typing import List
from typing import List, Dict, Any


@dataclass
Expand All @@ -19,8 +19,13 @@ class UserResponse:
enrollments: List[str]

@dataclass
class UserGpuQuotaResponse:
gpu_quota: int
class Quota:
user: str
resources: Dict[str, Any]

@dataclass
class UserQuotaResponse:
quota: Quota

class AwsedClient(metaclass=ABCMeta):
@abstractmethod
Expand All @@ -33,8 +38,8 @@ def describe_user(self, username: str) -> UserResponse:
pass

@abstractmethod
def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse:
# Return the gpu quota of a user
def get_user_gpu_quota(self, username: str) -> UserQuotaResponse:
# Return the quota (DICT) of a user
pass

class UnsuccessfulRequest(Exception):
Expand Down
99 changes: 87 additions & 12 deletions tests/app/test_gpu_validator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import inspect
import unittest
from unittest.mock import MagicMock
from operator import contains
from dsmlp.app.types import ValidationFailure
from dsmlp.app.validator import Validator
from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse
from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse, Quota, UserQuotaResponse
from dsmlp.plugin.kube import Namespace
from hamcrest import assert_that, contains_inanyorder, equal_to, has_item
from tests.fakes import FakeAwsedClient, FakeLogger, FakeKubeClient
Expand All @@ -11,11 +13,12 @@
from tests.app.utils import gen_request, try_val_with_component


class TestGPUValidator:
def setup_method(self) -> None:
class TestGPUValidator(unittest.TestCase):
def setup_method(self, method) -> None:
self.logger = FakeLogger()
self.awsed_client = FakeAwsedClient()
self.kube_client = FakeKubeClient()
self.gpu_validator = GPUValidator(self.awsed_client, self.kube_client, self.logger) # added for unit testing

self.awsed_client.add_user(
'user10', UserResponse(uid=10, enrollments=[]))
Expand All @@ -28,7 +31,7 @@ def setup_method(self) -> None:
self.kube_client.set_existing_gpus('user10', 0)

# Set gpu quota for user 10 with AWSED client
self.awsed_client.add_user_gpu_quota('user10', 10)
self.awsed_client.assign_user_gpu_quota('user10',{"gpu": 10})

# Set up user11 without any quota & namespace.
self.awsed_client.add_user(
Expand All @@ -39,7 +42,7 @@ def setup_method(self) -> None:

def test_no_gpus_requested(self):
self.try_validate(
gen_request(), expected=True, message="Allowed"
gen_request(), expected=True
)

def test_quota_not_reached(self):
Expand Down Expand Up @@ -81,15 +84,15 @@ def test_low_priority_overcap(self):
self.kube_client.set_existing_gpus('user10', 11)

self.try_validate(
gen_request(), expected=True)
gen_request(), expected=True, message="Allowed")

def try_validate(self, json, expected: bool, message: str = None):
try_val_with_component(GPUValidator(self.awsed_client,
self.kube_client, self.logger), json, expected, message)

# Test correct response for get_user_gpu_quota method
def test_awsed_gpu_quota_correct_response(self):
self.awsed_client.add_user_gpu_quota('user11', 5)
self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 5})
user_gpu_quota = self.awsed_client.get_user_gpu_quota('user11')
assert_that(user_gpu_quota, equal_to(5))

Expand Down Expand Up @@ -119,23 +122,95 @@ def test_gpu_quota_client_priority(self):
name='user11', labels={'k8s-sync': 'true'}, gpu_quota=8))

self.kube_client.set_existing_gpus('user11', 3)
self.awsed_client.add_user_gpu_quota('user11', 6)
# add awsed quota
self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 6})
self.try_validate(
gen_request(gpu_req=6, username='user11'), expected=False, message="GPU quota exceeded. Wanted 6 but with 3 already in use, the quota of 6 would be exceeded."
)

# Quota both set for user 11 from kube and awsed, should prioritize AWSED quota
def test_gpu_quota_client_priority2(self):
self.awsed_client.add_user_gpu_quota('user11', 18)
self.kube_client.add_namespace('user11', Namespace(
name='user11', labels={'k8s-sync': 'true'}, gpu_quota=12))
# add awsed quota
self.awsed_client.assign_user_gpu_quota('user11', {"gpu": 18})

# set existing gpu = kube client quota
self.kube_client.set_existing_gpus('user11', 12)

self.try_validate(
gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 18 would be exceeded."
gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 12 already in use, the quota of 18 would be exceeded."
)

# --- Modular / Unit Testing for Validate Pod ---
def test
''' --- Modular / Unit Testing for Validate Pod --- '''
# Test determine_gpu_quota (helper method for validate_pod)
def test_determine_gpu_quota_awsed_quota(self):
# Test when awsed_quota is greater than 0 (should prioritize it)
awsed_quota = 10
kube_client_quota = 5
result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota)
self.assertEqual(result, 10)

def test_determine_gpu_quota_kube_client_quota(self):
# Test when awsed_quota is 0 or None, but kube_client_quota is greater than 0
awsed_quota = 0
kube_client_quota = 5
result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota)
self.assertEqual(result, 5)

awsed_quota = None
kube_client_quota = 5
result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota)
self.assertEqual(result, 5)

def test_determine_gpu_quota_default(self):
# Test when both awsed_quota and kube_client_quota are 0 or None
awsed_quota = 0
kube_client_quota = 0
result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota)
self.assertEqual(result, 1)

awsed_quota = None
kube_client_quota = None
result = self.gpu_validator.determine_gpu_quota(awsed_quota, kube_client_quota)
self.assertEqual(result, 1)

# Test caculate_utilized_gpu (helper method for validate_pod)
def test_calculate_utilized_gpu_with_requested_greater_than_limit(self):
request = self._create_request([{'requests': {'nvidia.com/gpu': '5'}, 'limits': {'nvidia.com/gpu': '3'}}])
result = self.gpu_validator.calculate_utilized_gpu(request)
self.assertEqual(result, 5)

def test_calculate_utilized_gpu_with_limit_greater_than_requested(self):
request = self._create_request([{'requests': {'nvidia.com/gpu': '2'}, 'limits': {'nvidia.com/gpu': '4'}}])
result = self.gpu_validator.calculate_utilized_gpu(request)
self.assertEqual(result, 4)

# Test low priority GPUs, no more requested -> no error should be raised
def test_calculate_utilized_gpu_with_no_gpus_requested(self):
request = self._create_request([{'requests': {}, 'limits': {}}])
result = self.gpu_validator.calculate_utilized_gpu(request)
self.assertEqual(result, 0)

def test_user_with_more_gpus_than_quota_not_requesting_more(self):
self.kube_client.add_namespace('user11', Namespace(
name='user11', labels={'k8s-sync': 'true'}, gpu_quota=5))
self.kube_client.set_existing_gpus('user11', 8)

# Create a request where no additional GPUs are requested
request = self._create_request([{'requests': {}, 'limits': {}}])
request.namespace = 'user11'

# Ensure the ValidationFailure exception is not raised
try:
self.gpu_validator.validate_pod(request)
except ValidationFailure:
self.fail("ValidationFailure was raised unexpectedly!")

def _create_request(self, containers):
request = MagicMock()
request.object.spec.containers = [
MagicMock(resources=MagicMock(requests=container['requests'], limits=container['limits']))
for container in containers
]
return request
2 changes: 1 addition & 1 deletion tests/app/test_id_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dsmlp.app.types import *
from dsmlp.app.id_validator import IDValidator
from dsmlp.app.validator import Validator
from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse
from dsmlp.plugin.awsed import ListTeamsResponse, TeamJson, UserResponse, UserQuotaResponse, Quota
from dsmlp.plugin.kube import Namespace
from hamcrest import assert_that, contains_inanyorder, equal_to, has_item
from tests.app.utils import gen_request, try_val_with_component
Expand Down
5 changes: 4 additions & 1 deletion tests/app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,7 @@ def try_val_with_component(validator: Validator, json, expected: bool, message:
except Exception as e:
if expected:
raise AssertionError(f"Expected no exception but got {e}")
assert_that(e.message, equal_to(message))
assert_that(str(e), equal_to(message))
actual_message = str(e)
print(f"Actual exception message: {actual_message}")
assert_that(actual_message, equal_to(message))
23 changes: 15 additions & 8 deletions tests/fakes.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from collections.abc import Mapping
from dataclasses import dataclass
from os import path
from typing import List, TypedDict, Dict
from typing import List, TypedDict, Dict, Any

from dacite import from_dict

from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, UnsuccessfulRequest, UserResponse, UserGpuQuotaResponse
from dsmlp.plugin.awsed import AwsedClient, ListTeamsResponse, UnsuccessfulRequest, UserResponse, UserQuotaResponse, Quota
from dsmlp.plugin.kube import KubeClient, Namespace, NotFound
from dsmlp.plugin.logger import Logger

Expand All @@ -14,7 +14,7 @@ class FakeAwsedClient(AwsedClient):
def __init__(self):
self.teams: Dict[str, ListTeamsResponse] = {}
self.users: Dict[str, UserResponse] = {}
self.user_gpu_quota: Dict[str, UserGpuQuotaResponse] = {}
self.user_quota: Dict[str, UserQuotaResponse] = {}

def list_user_teams(self, username: str) -> ListTeamsResponse:
try:
Expand All @@ -30,15 +30,22 @@ def describe_user(self, username: str) -> UserResponse:
except KeyError:
return None

# Get user GPU quota. If user does not exist, return 0
def get_user_gpu_quota(self, username: str) -> UserGpuQuotaResponse:
# Get user GPU quota. If user does not exist, return a value of 0
def get_user_gpu_quota(self, username: str) -> int:
try:
return self.user_gpu_quota[username]
user_quota_response = self.user_quota[username]
return user_quota_response.quota.resources.get("gpu", 0)
except KeyError:
return 0

def add_user_gpu_quota(self, username, gpu_quota: UserGpuQuotaResponse):
self.user_gpu_quota[username] = gpu_quota
# def add_user_gpu_quota(self, username: str, quota: UserQuotaResponse):
# self.user_quota[username] = quota

# Assign user GPU quota and create a UserQuotaResponse & Quota objects
def assign_user_gpu_quota(self, username: str, resources: Dict[str, Any]):
quota = Quota(user=username, resources=resources)
user_quota_response = UserQuotaResponse(quota=quota)
self.user_quota[username] = user_quota_response

def add_user(self, username, user: UserResponse):
self.users[username] = user
Expand Down

0 comments on commit 44bc970

Please sign in to comment.