Handles limits as well now

ucsd-ets · Jan 27, 2024 · 93c610a · 93c610a
1 parent f555e09
commit 93c610a
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 9 deletions.
diff --git a/src/dsmlp/app/gpu_validator.py b/src/dsmlp/app/gpu_validator.py
@@ -33,10 +33,19 @@ def validate_pod(self, request: Request):
         namespace = self.kube.get_namespace(request.namespace)
         curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
 
-        requested_gpus = 0
+        utilized_gpus = 0
         for container in request.object.spec.containers:
-            if container.resources is not None and GPU_LABEL in container.resources.requests:
-                requested_gpus += int(container.resources.requests[GPU_LABEL])
+                requested, limit = 0, 0
+                try:
+                    requested = int(container.resources.requests[GPU_LABEL])
+                except (KeyError, AttributeError, TypeError):
+                    pass
+                try:
+                    limit = int(container.resources.limits[GPU_LABEL])
+                except (KeyError, AttributeError, TypeError):
+                    pass
+
+                utilized_gpus += max(requested, limit)
 
-        if requested_gpus + curr_gpus > namespace.gpu_quota:
-            raise ValidationFailure(f"GPU quota exceeded. Requested {requested_gpus} but with {curr_gpus} already in use, the quota of {namespace.gpu_quota} would be exceeded.")
+        if utilized_gpus + curr_gpus > namespace.gpu_quota:
+            raise ValidationFailure(f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, the quota of {namespace.gpu_quota} would be exceeded.")
diff --git a/src/dsmlp/ext/kube.py b/src/dsmlp/ext/kube.py
@@ -36,10 +36,17 @@ def get_gpus_in_namespace(self, name: str) -> int:
         gpu_count = 0
         for pod in pods.items:
             for container in pod.spec.containers:
+                requested, limit = 0, 0
                 try:
-                    gpu_count += int(container.resources.requests[GPU_LABEL])
-                except (KeyError, TypeError):
+                    requested = int(container.resources.requests[GPU_LABEL])
+                except (KeyError, AttributeError, TypeError):
                     pass
+                try:
+                    limit = int(container.resources.limits[GPU_LABEL])
+                except (KeyError, AttributeError, TypeError):
+                    pass
+
+                gpu_count += max(requested, limit)
 
         return gpu_count
 

diff --git a/tests/app/test_gpu_validator.py b/tests/app/test_gpu_validator.py
@@ -111,7 +111,7 @@ def test_quota_exceeded(self):
             "response": {
                 "uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
                 "allowed": False, "status": {
-                    "message": "GPU quota exceeded. Requested 11 but with 0 already in use, the quota of 10 would be exceeded."
+                    "message": "GPU quota exceeded. Wanted 11 but with 0 already in use, the quota of 10 would be exceeded."
                 }}}))
 
     def test_sum_exceeded(self):
@@ -146,7 +146,7 @@ def test_sum_exceeded(self):
             "response": {
                 "uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
                 "allowed": False, "status": {
-                    "message": "GPU quota exceeded. Requested 6 but with 5 already in use, the quota of 10 would be exceeded."
+                    "message": "GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 10 would be exceeded."
                 }}}))
 
     def test_low_priority(self):
@@ -184,6 +184,42 @@ def test_low_priority(self):
                 "allowed": True, "status": {
                     "message": "Allowed"
                 }}}))
+
+    # Should respond to limit as well as request
+    def test_limit_exceeded(self):
+        self.kube_client.set_existing_gpus('user10', 5)
+
+        response = self.when_validate(
+            {
+                "request": {
+                    "uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
+                    "userInfo": {
+                        "username": "user10"
+                    },
+                    "namespace": "user10",
+                    "object": {
+                        "kind": "Pod",
+                        "spec": {
+                            "containers": [{
+                                "resources": {
+                                    "limits": {
+                                        "nvidia.com/gpu": 6
+                                    }
+                                }
+                            }]
+                        }
+                    }
+                }}
+        )
+
+        assert_that(response, equal_to({
+            "apiVersion": "admission.k8s.io/v1",
+            "kind": "AdmissionReview",
+            "response": {
+                "uid": "705ab4f5-6393-11e8-b7cc-42010a800002",
+                "allowed": False, "status": {
+                    "message": "GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 10 would be exceeded."
+                }}}))
 
     def test_collect_gpus(self):
         real_kube_client = DefaultKubeClient()