Annotate GPU Reservation Pods with GpuIdx
gshaibi committed Mar 20, 2024
1 parent f6207af commit 5a9fee7
Showing 5 changed files with 54 additions and 15 deletions.
@@ -42,7 +42,7 @@ func (p *PodHandler) handleDedicatedGpuPodAddition(pod *v1.Pod, nodeTopology *to
gpu.Status.AllocatedBy.Pod = pod.Name
gpu.Status.AllocatedBy.Container = pod.Spec.Containers[0].Name

- if pod.Namespace != constants.ReservationNs {
+ if !util.IsGpuReservationPod(pod) {
gpu.Status.PodGpuUsageStatus[pod.UID] = calculateUsage(p.dynamicClient, pod, nodeTopology.GpuMemory)
}

@@ -65,7 +65,7 @@ func (p *PodHandler) handleDedicatedGpuPodUpdate(pod *v1.Pod, nodeTopology *topo
gpu.Status.AllocatedBy.Pod == pod.Name &&
gpu.Status.AllocatedBy.Container == pod.Spec.Containers[0].Name
if isGpuOccupiedByPod {
- if pod.Namespace != constants.ReservationNs {
+ if !util.IsGpuReservationPod(pod) {
gpu.Status.PodGpuUsageStatus[pod.UID] =
calculateUsage(p.dynamicClient, pod, nodeTopology.GpuMemory)
}
@@ -0,0 +1,40 @@
package pod

import (
	"context"
	"fmt"

	"github.com/google/uuid"
	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
	"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

func (p *PodHandler) handleGpuReservationPodAddition(pod *v1.Pod) error {
	if !util.IsGpuReservationPod(pod) {
		return nil
	}

	err := p.setReservationPodGpuIdxAnnotation(pod)
	if err != nil {
		return fmt.Errorf("failed to set GPU index annotation for reservation pod %s: %w", pod.Name, err)
	}

	return nil
}

func (p *PodHandler) setReservationPodGpuIdxAnnotation(pod *v1.Pod) error {
	annotationKey := constants.ReservationPodGpuIdxAnnotation
	annotationVal := fmt.Sprintf("GPU-%s", uuid.NewString())
	patch := []byte(fmt.Sprintf(`{"metadata": {"annotations": {"%s": "%s"}}}`, annotationKey, annotationVal))

	_, err := p.kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.MergePatchType, patch, metav1.PatchOptions{})
	if err != nil {
		return fmt.Errorf("failed to update pod %s: %w", pod.Name, err)
	}

	return nil
}
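
For context, a minimal standalone sketch (not part of this commit) of the same merge-patch flow, exercised against a fake clientset from k8s.io/client-go/kubernetes/fake. The namespace and annotation key are placeholders standing in for constants.ReservationNs and constants.ReservationPodGpuIdxAnnotation, and the fixed UUID is used only so the output is predictable:

package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/fake"
)

func main() {
	// Placeholder values; the real handler reads these from the constants package.
	const reservationNs = "gpu-reservation"
	const gpuIdxAnnotation = "example.com/reserve-for-gpu-idx"

	// Seed the fake clientset with a reservation pod.
	client := fake.NewSimpleClientset(&v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "reservation-pod-0", Namespace: reservationNs},
	})

	// Same merge-patch shape as setReservationPodGpuIdxAnnotation above.
	patch := []byte(fmt.Sprintf(`{"metadata": {"annotations": {"%s": "%s"}}}`,
		gpuIdxAnnotation, "GPU-00000000-0000-0000-0000-000000000000"))
	if _, err := client.CoreV1().Pods(reservationNs).Patch(context.TODO(), "reservation-pod-0",
		types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
		panic(err)
	}

	// The annotation is now visible on the stored pod object.
	pod, err := client.CoreV1().Pods(reservationNs).Get(context.TODO(), "reservation-pod-0", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Println(pod.Annotations[gpuIdxAnnotation])
}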
5 changes: 5 additions & 0 deletions internal/status-updater/handlers/pod/handler.go
@@ -38,6 +38,11 @@ func (p *PodHandler) HandleAdd(pod *v1.Pod) error {
return fmt.Errorf("could not get node %s topology: %w", pod.Spec.NodeName, err)
}

+ err = p.handleGpuReservationPodAddition(pod)
+ if err != nil {
+ return err
+ }

err = p.handleDedicatedGpuPodAddition(pod, nodeTopology)
if err != nil {
return err
@@ -15,10 +15,6 @@ import (
"k8s.io/client-go/kubernetes"
)

- const (
- runaiReservationNs = constants.ReservationNs
- )

func (p *PodHandler) handleSharedGpuPodAddition(pod *v1.Pod, nodeTopology *topology.NodeTopology) error {
if !util.IsSharedGpuPod(pod) {
return nil
@@ -155,5 +151,5 @@ func getMatchingReservationPodNameByRunaiGpuGroupLabel(kubeclient kubernetes.Int
}

func getNodeReservationPods(kubeclient kubernetes.Interface, nodeName string) (*v1.PodList, error) {
- return kubeclient.CoreV1().Pods(runaiReservationNs).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
+ return kubeclient.CoreV1().Pods(constants.ReservationNs).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
}
14 changes: 6 additions & 8 deletions internal/status-updater/util/util.go
@@ -9,9 +9,8 @@ import (
func IsSharedGpuPod(pod *v1.Pod) bool {
_, runaiGpuExists := pod.Annotations[constants.GpuIdxAnnotation]
_, runaiGpuGroupExists := pod.Labels[constants.GpuGroupLabel]
- isReservationPod := pod.Namespace == constants.ReservationNs

- return !isReservationPod && (runaiGpuExists || runaiGpuGroupExists)
+ return !IsGpuReservationPod(pod) && (runaiGpuExists || runaiGpuGroupExists)
}

func IsDedicatedGpuPod(pod *v1.Pod) bool {
@@ -27,11 +26,10 @@ func IsPodTerminated(pod *v1.Pod) bool {
}

func IsPodScheduled(pod *v1.Pod) bool {
- for _, condition := range pod.Status.Conditions {
- if condition.Type == v1.PodScheduled && condition.Status == v1.ConditionTrue {
- return true
- }
- }
-
- return false
+ // This should be checked using the pod's PodScheduled condition once https://github.com/run-ai/runai-engine/pull/174 is merged and available.
+ return pod.Spec.NodeName != ""
}

+ func IsGpuReservationPod(pod *v1.Pod) bool {
+ return pod.Namespace == constants.ReservationNs
+ }
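
A minimal test sketch (not part of this commit) for the classification helpers, assuming only the identifiers visible in this diff; the concrete namespace, annotation, and label values on the non-reservation pod are arbitrary:

package util_test

import (
	"testing"

	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
	"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestPodClassification(t *testing.T) {
	// A pod in the reservation namespace, even with a GPU index annotation,
	// is a reservation pod and must not count as a shared-GPU pod.
	reservationPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Namespace:   constants.ReservationNs,
		Annotations: map[string]string{constants.GpuIdxAnnotation: "0"},
	}}

	// A pod outside the reservation namespace carrying the GPU group label
	// counts as a shared-GPU pod.
	sharedPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Namespace: "default",
		Labels:    map[string]string{constants.GpuGroupLabel: "group-a"},
	}}

	if !util.IsGpuReservationPod(reservationPod) {
		t.Error("expected pod in the reservation namespace to be a GPU reservation pod")
	}
	if util.IsSharedGpuPod(reservationPod) {
		t.Error("reservation pod must not be classified as a shared-GPU pod")
	}
	if !util.IsSharedGpuPod(sharedPod) {
		t.Error("pod with the GPU group label should be classified as a shared-GPU pod")
	}
}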
