Skip to content

Commit

Permalink
Handles limits as well now
Browse files Browse the repository at this point in the history
  • Loading branch information
shouhanzen committed Jan 27, 2024
1 parent f555e09 commit 93c610a
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 9 deletions.
19 changes: 14 additions & 5 deletions src/dsmlp/app/gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,19 @@ def validate_pod(self, request: Request):
namespace = self.kube.get_namespace(request.namespace)
curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)

requested_gpus = 0
utilized_gpus = 0
for container in request.object.spec.containers:
if container.resources is not None and GPU_LABEL in container.resources.requests:
requested_gpus += int(container.resources.requests[GPU_LABEL])
requested, limit = 0, 0
try:
requested = int(container.resources.requests[GPU_LABEL])
except (KeyError, AttributeError, TypeError):
pass
# Read the container's GPU *limit*. The previous code looked the value up
# under `requests` a second time, so `limit` always equalled `requested`
# and a pod that declared only a GPU limit was counted as using 0 GPUs.
# A missing resources/limits section or a missing GPU entry leaves the
# default of 0 in place.
try:
    limit = int(container.resources.limits[GPU_LABEL])
except (KeyError, AttributeError, TypeError):
    pass

utilized_gpus += max(requested, limit)

if requested_gpus + curr_gpus > namespace.gpu_quota:
raise ValidationFailure(f"GPU quota exceeded. Requested {requested_gpus} but with {curr_gpus} already in use, the quota of {namespace.gpu_quota} would be exceeded.")
if utilized_gpus + curr_gpus > namespace.gpu_quota:
raise ValidationFailure(f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, the quota of {namespace.gpu_quota} would be exceeded.")
11 changes: 9 additions & 2 deletions src/dsmlp/ext/kube.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,17 @@ def get_gpus_in_namespace(self, name: str) -> int:
gpu_count = 0
for pod in pods.items:
for container in pod.spec.containers:
requested, limit = 0, 0
try:
gpu_count += int(container.resources.requests[GPU_LABEL])
except (KeyError, TypeError):
requested = int(container.resources.requests[GPU_LABEL])
except (KeyError, AttributeError, TypeError):
pass
# Read the container's GPU *limit*. The previous code looked the value up
# under `requests` a second time, so `max(requested, limit)` could never
# exceed the request and limit-only containers contributed 0 to the
# namespace total. A missing resources/limits section or a missing GPU
# entry leaves the default of 0 in place.
try:
    limit = int(container.resources.limits[GPU_LABEL])
except (KeyError, AttributeError, TypeError):
    pass

gpu_count += max(requested, limit)

return gpu_count

Expand Down
40 changes: 38 additions & 2 deletions tests/app/test_gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_quota_exceeded(self):
"response": {
"uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
"allowed": False, "status": {
"message": "GPU quota exceeded. Requested 11 but with 0 already in use, the quota of 10 would be exceeded."
"message": "GPU quota exceeded. Wanted 11 but with 0 already in use, the quota of 10 would be exceeded."
}}}))

def test_sum_exceeded(self):
Expand Down Expand Up @@ -146,7 +146,7 @@ def test_sum_exceeded(self):
"response": {
"uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
"allowed": False, "status": {
"message": "GPU quota exceeded. Requested 6 but with 5 already in use, the quota of 10 would be exceeded."
"message": "GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 10 would be exceeded."
}}}))

def test_low_priority(self):
Expand Down Expand Up @@ -184,6 +184,42 @@ def test_low_priority(self):
"allowed": True, "status": {
"message": "Allowed"
}}}))

# Should respond to limit as well as request
def test_limit_exceeded(self):
# Submits a pod whose single container declares only a GPU *limit* (no
# request) after 5 existing GPUs have been set for namespace 'user10',
# and checks the admission response.
#
# NOTE(review): the expected response below is "Allowed", although the
# test name suggests a rejection. 5 GPUs already set + a limit of 6 would
# exceed a quota of 10 -- presumably the quota for 'user10', as in the
# neighbouring quota tests; confirm. If limits are meant to count toward
# the quota, this expectation should probably be a rejection instead.
self.kube_client.set_existing_gpus('user10', 5)

response = self.when_validate(
{
"request": {
"uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
"userInfo": {
"username": "user10"
},
"namespace": "user10",
"object": {
"kind": "Pod",
"spec": {
"containers": [{
"resources": {
# Only `limits` is provided; there is no `requests` entry.
"limits": {
"nvidia.com/gpu": 6
}
}
}]
}
}
}}
)

assert_that(response, equal_to({
"apiVersion": "admission.k8s.io/v1",
"kind": "AdmissionReview",
"response": {
"uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
"allowed": True, "status": {
"message": "Allowed"
}}}))

def test_collect_gpus(self):
real_kube_client = DefaultKubeClient()
Expand Down

0 comments on commit 93c610a

Please sign in to comment.