Skip to content

Commit

Permalink
Add support for fake node deployments and update related constants
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi committed Dec 2, 2024
1 parent df53acc commit 77010de
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 5 deletions.
1 change: 1 addition & 0 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const (
LabelGpuGroup = "runai-gpu-group"
LabelGpuProduct = "nvidia.com/gpu.product"
LabelMigConfigState = "nvidia.com/mig.config.state"
LabelFakeNodeDeployment = "run.ai/fake-node-deployment"
LabelFakeNodeDeploymentTemplate = "run.ai/fake-node-deployment-template"
LabelTopologyCMNodeTopology = "node-topology"
LabelTopologyCMNodeName = "node-name"
Expand Down
11 changes: 6 additions & 5 deletions internal/status-updater/handlers/node/fake_node_deployments.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ package node
import (
"context"
"fmt"
"os"
"time"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/spf13/viper"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -18,7 +18,7 @@ import (
)

const (
dummyDcgmExporterPodTimeout = 5 * time.Minute
dummyDcgmExporterPodTimeout = 20 * time.Second
)

func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
Expand Down Expand Up @@ -62,7 +62,7 @@ func (p *NodeHandler) deleteFakeNodeDeployments(node *v1.Node) error {
}

func (p *NodeHandler) generateFakeNodeDeployments(node *v1.Node) ([]appsv1.Deployment, error) {
deploymentTemplates, err := p.kubeClient.AppsV1().Deployments(os.Getenv(constants.EnvFakeGpuOperatorNs)).List(context.TODO(), metav1.ListOptions{
deploymentTemplates, err := p.kubeClient.AppsV1().Deployments(viper.GetString(constants.EnvFakeGpuOperatorNs)).List(context.TODO(), metav1.ListOptions{
LabelSelector: fmt.Sprintf("%s=true", constants.LabelFakeNodeDeploymentTemplate),
})
if err != nil {
Expand Down Expand Up @@ -108,13 +108,14 @@ func (p *NodeHandler) applyDeployment(deployment appsv1.Deployment) error {
func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.Deployment, node *v1.Node) (*appsv1.Deployment, error) {
dummyDcgmExporterPod, err := p.getDummyDcgmExporterPod(node.Name)
if err != nil {
return nil, fmt.Errorf("failed to get dummy dcgm exporter IP: %w", err)
return nil, fmt.Errorf("failed to get dummy dcgm exporter: %w", err)
}

deployment := template.DeepCopy()

delete(deployment.Labels, constants.LabelFakeNodeDeploymentTemplate)
deployment.Name = fmt.Sprintf("%s-%s", deployment.Name, node.Name)
deployment.Labels[constants.LabelFakeNodeDeployment] = "true"
deployment.Spec.Replicas = ptr.To(int32(1))

deployment.Spec.Selector.MatchLabels[constants.LabelApp] = constants.KwokDCGMExporterApp
Expand Down Expand Up @@ -155,7 +156,7 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error)
ctx, cancel := context.WithTimeout(context.Background(), dummyDcgmExporterPodTimeout)
defer cancel()

watcher, err := p.kubeClient.CoreV1().Pods(v1.NamespaceAll).Watch(ctx, metav1.ListOptions{
watcher, err := p.kubeClient.CoreV1().Pods(viper.GetString(constants.EnvFakeGpuOperatorNs)).Watch(ctx, metav1.ListOptions{
LabelSelector: labelSelector,
FieldSelector: fieldSelector,
})
Expand Down
159 changes: 159 additions & 0 deletions internal/status-updater/handlers/node/fake_node_deployments_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package node

import (
"context"
"fmt"
"os"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/spf13/viper"

appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes/fake"
testcore "k8s.io/client-go/testing"
)

// Specs for NodeHandler.HandleAdd when the added node is annotated as a fake
// node (constants.AnnotationKwokNode = "fake"). Everything runs against a fake
// clientset; no real cluster is contacted.
var _ = Describe("NodeHandler", func() {
	const (
		nodeName = "test-node"
		fgoNS    = "gpu-operator"
	)
	var (
		nodeHandler *NodeHandler
		kubeClient  *fake.Clientset
		watcher     *watch.FakeWatcher

		clusterTopology *topology.ClusterTopology
		ctx             = context.Background()
	)

	BeforeEach(func() {
		// The code under test reads its namespace/ConfigMap settings through
		// viper, so export them as environment variables and let viper pick
		// them up. Fail the spec immediately if the environment cannot be set.
		Expect(os.Setenv(constants.EnvFakeGpuOperatorNs, fgoNS)).To(Succeed())
		Expect(os.Setenv(constants.EnvTopologyCmName, "topology")).To(Succeed())
		Expect(os.Setenv(constants.EnvTopologyCmNamespace, fgoNS)).To(Succeed())
		viper.AutomaticEnv()

		// Cluster topology with a single "default" node pool; the test node
		// selects it through the "nodepool" label.
		clusterTopology = &topology.ClusterTopology{
			NodePoolLabelKey: "nodepool",
			NodePools: map[string]topology.NodePoolTopology{
				"default": {
					GpuMemory:  100,
					GpuProduct: "Tesla V100",
					GpuCount:   2,
				},
			},
		}

		// Fake clientset whose pod watches are all served by `watcher`, so the
		// spec controls exactly which pod events the handler observes.
		// The fake watcher is created with a channel size of 1 so that a single
		// event can be queued before anything is reading from it.
		kubeClient = fake.NewSimpleClientset()
		watcher = watch.NewFakeWithChanSize(1, false)
		kubeClient.PrependWatchReactor("pods", testcore.DefaultWatchReactor(watcher, nil))
		nodeHandler = NewNodeHandler(kubeClient, clusterTopology)

		// Deployment template (labelled as a fake-node deployment template)
		// from which per-node deployments are expected to be generated.
		_, err := kubeClient.AppsV1().Deployments(fgoNS).Create(ctx, &appsv1.Deployment{
			ObjectMeta: metav1.ObjectMeta{
				Name: "nvidia-dcgm-exporter",
				Labels: map[string]string{
					constants.LabelFakeNodeDeploymentTemplate: "true",
				},
			},
			Spec: appsv1.DeploymentSpec{
				Selector: &metav1.LabelSelector{
					MatchLabels: map[string]string{},
				},
				Template: v1.PodTemplateSpec{
					ObjectMeta: metav1.ObjectMeta{
						Labels: map[string]string{
							constants.LabelApp: constants.DCGMExporterApp,
						},
					},
					Spec: v1.PodSpec{
						Containers: []v1.Container{
							{
								Env: []v1.EnvVar{},
							},
						},
					},
				},
			},
		}, metav1.CreateOptions{})
		Expect(err).NotTo(HaveOccurred())
	})

	Context("HandleAdd", func() {
		It("should handle fake node addition", func() {
			node := &v1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: nodeName,
					Annotations: map[string]string{
						constants.AnnotationKwokNode: "fake",
					},
					Labels: map[string]string{
						clusterTopology.NodePoolLabelKey: "default",
					},
				},
			}
			_, err := kubeClient.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{})
			Expect(err).NotTo(HaveOccurred())

			// DCGM exporter pod scheduled on the test node, already running
			// with an IP assigned.
			dcgmExporterPod := &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "nvidia-dcgm-exporter-123456",
					Namespace: fgoNS,
					Labels: map[string]string{
						constants.LabelApp: constants.DCGMExporterApp,
					},
				},
				Spec: v1.PodSpec{
					NodeName: nodeName,
				},
				Status: v1.PodStatus{
					PodIP: "10.0.0.1",
					Phase: v1.PodRunning,
				},
			}
			_, err = kubeClient.CoreV1().Pods(fgoNS).Create(ctx, dcgmExporterPod, metav1.CreateOptions{})
			Expect(err).NotTo(HaveOccurred())

			// Queue the pod as an "added" event on the fake watcher before
			// calling HandleAdd, so the event is already available to whatever
			// pod watch the handler opens.
			watcher.Add(dcgmExporterPod)

			err = nodeHandler.HandleAdd(node)
			Expect(err).NotTo(HaveOccurred())

			By("creating node topology ConfigMap")
			_, err = kubeClient.CoreV1().ConfigMaps(fgoNS).Get(ctx, topology.GetNodeTopologyCMName(node.Name), metav1.GetOptions{})
			Expect(err).NotTo(HaveOccurred())

			By("labeling node")
			node, err = kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
			Expect(err).NotTo(HaveOccurred())
			Expect(node.Labels).To(HaveKeyWithValue(dcgmExporterLabelKey, "true"))
			Expect(node.Labels).ToNot(HaveKeyWithValue(devicePluginLabelKey, "true"))

			By("applying fake node deployments")
			deployments, err := kubeClient.AppsV1().Deployments(fgoNS).List(ctx, metav1.ListOptions{
				LabelSelector: fmt.Sprintf("%s=true", constants.LabelFakeNodeDeployment),
			})
			Expect(err).NotTo(HaveOccurred())
			Expect(deployments.Items).To(HaveLen(1))
		})
	})
})
13 changes: 13 additions & 0 deletions internal/status-updater/handlers/node/node_suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package node_test

import (
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// TestNode is the `go test` entry point for this package's Ginkgo specs.
// It wires Gomega assertion failures into Ginkgo and then runs every
// registered spec under the suite description "Node Suite".
func TestNode(t *testing.T) {
	const suiteDescription = "Node Suite"

	// Gomega reports failed expectations through Ginkgo's Fail.
	RegisterFailHandler(Fail)

	RunSpecs(t, suiteDescription)
}

0 comments on commit 77010de

Please sign in to comment.