From 3f0fd060d19ca4d1112cabd4cec2267c8b585b99 Mon Sep 17 00:00:00 2001 From: Guy Shaibi Date: Tue, 2 Apr 2024 15:24:31 +0300 Subject: [PATCH 1/5] Update constants and annotations in code --- internal/common/constants/constants.go | 20 +++++++++---------- internal/migfaker/migfaker.go | 6 +++--- internal/migfaker/migfaker_test.go | 2 +- internal/status-updater/app_test.go | 18 ++++++++--------- .../handlers/node/fake_node_deployments.go | 6 +++--- .../pod/gpu_reservation_pod_handler.go | 4 ++-- .../handlers/pod/gpu_usage_calculator.go | 4 ++-- .../handlers/pod/shared_gpu_pod_handler.go | 8 ++++---- internal/status-updater/util/util.go | 4 ++-- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/internal/common/constants/constants.go b/internal/common/constants/constants.go index 4c4497f..5efc4bb 100644 --- a/internal/common/constants/constants.go +++ b/internal/common/constants/constants.go @@ -1,17 +1,17 @@ package constants const ( - GpuIdxAnnotation = "runai-gpu" - GpuFractionAnnotation = "gpu-fraction" - PodGroupNameAnnotation = "pod-group-name" - ReservationPodGpuIdxAnnotation = "run.ai/reserve_for_gpu_index" - MigMappingAnnotation = "run.ai/mig-mapping" - KwokNodeAnnotation = "kwok.x-k8s.io/node" + AnnotationGpuIdx = "runai-gpu" + AnnotationGpuFraction = "gpu-fraction" + AnnotationPodGroupName = "pod-group-name" + AnnotationReservationPodGpuIdx = "run.ai/reserve_for_gpu_index" + AnnotationMigMapping = "run.ai/mig-mapping" + AnnotationKwokNode = "kwok.x-k8s.io/node" - GpuGroupLabel = "runai-gpu-group" - GpuProductLabel = "nvidia.com/gpu.product" - MigConfigStateLabel = "nvidia.com/mig.config.state" - FakeNodeDeploymentTemplateLabel = "run.ai/fake-node-deployment-template" + LabelGpuGroup = "runai-gpu-group" + LabelGpuProduct = "nvidia.com/gpu.product" + LabelMigConfigState = "nvidia.com/mig.config.state" + LabelFakeNodeDeploymentTemplate = "run.ai/fake-node-deployment-template" ReservationNs = "runai-reservation" diff --git a/internal/migfaker/migfaker.go b/internal/migfaker/migfaker.go index 0847e47..148012b 100644 --- a/internal/migfaker/migfaker.go +++ b/internal/migfaker/migfaker.go @@ -48,10 +48,10 @@ func (faker *MigFaker) FakeMapping(config *MigConfigs) error { smappings, _ := json.Marshal(mappings) labels := map[string]string{ - constants.MigConfigStateLabel: "success", + constants.LabelMigConfigState: "success", } annotations := map[string]string{ - constants.MigMappingAnnotation: base64.StdEncoding.EncodeToString(smappings), + constants.AnnotationMigMapping: base64.StdEncoding.EncodeToString(smappings), } err := faker.kubeclient.SetNodeLabels(labels) @@ -95,7 +95,7 @@ func (faker *MigFaker) getGpuProduct() (string, error) { return "", fmt.Errorf("failed to get node labels: %w", err) } - return nodeLabels[constants.GpuProductLabel], nil + return nodeLabels[constants.LabelGpuProduct], nil } func migInstanceNameToGpuInstanceId(gpuProduct string, migInstanceName string) (int, error) { diff --git a/internal/migfaker/migfaker_test.go b/internal/migfaker/migfaker_test.go index 495481e..0ef6243 100644 --- a/internal/migfaker/migfaker_test.go +++ b/internal/migfaker/migfaker_test.go @@ -38,7 +38,7 @@ func TestFakeMapping(t *testing.T) { } kubeClientMock.ActualGetNodeLabels = func() (map[string]string, error) { return map[string]string{ - constants.GpuProductLabel: "NVIDIA-A100-SXM4-40GB", + constants.LabelGpuProduct: "NVIDIA-A100-SXM4-40GB", }, nil } diff --git a/internal/status-updater/app_test.go b/internal/status-updater/app_test.go index 7afb201..195774e 100644 --- 
a/internal/status-updater/app_test.go +++ b/internal/status-updater/app_test.go @@ -271,7 +271,7 @@ var _ = Describe("StatusUpdater", func() { if err != nil { return "", err } - return pod.Annotations[constants.ReservationPodGpuIdxAnnotation], nil + return pod.Annotations[constants.AnnotationReservationPodGpuIdx], nil }).Should(Equal(strconv.Itoa(gpuIdx))) }) }) @@ -294,7 +294,7 @@ var _ = Describe("StatusUpdater", func() { } for _, gpuDetails := range nodeTopology.Gpus { - if gpuDetails.ID == pod.Annotations[constants.ReservationPodGpuIdxAnnotation] { + if gpuDetails.ID == pod.Annotations[constants.AnnotationReservationPodGpuIdx] { return true, nil } } @@ -531,7 +531,7 @@ func createDedicatedGpuPod(gpuCount int64, phase v1.PodPhase, conditions []v1.Po Namespace: podNamespace, UID: podUID, Annotations: map[string]string{ - constants.PodGroupNameAnnotation: podGroupName, + constants.AnnotationPodGroupName: podGroupName, }, }, Spec: v1.PodSpec{ @@ -557,14 +557,14 @@ func createDedicatedGpuPod(gpuCount int64, phase v1.PodPhase, conditions []v1.Po func createGpuIdxSharedGpuPod(gpuIdx int, gpuFraction float64) *v1.Pod { pod := createBaseSharedGpuPod(gpuFraction) - pod.Annotations[constants.GpuIdxAnnotation] = fmt.Sprintf("%d", gpuIdx) + pod.Annotations[constants.AnnotationGpuIdx] = fmt.Sprintf("%d", gpuIdx) return pod } func createGpuGroupSharedGpuPod(gpuGroup string, gpuFraction float64) *v1.Pod { pod := createBaseSharedGpuPod(gpuFraction) - pod.Labels[constants.GpuGroupLabel] = gpuGroup + pod.Labels[constants.LabelGpuGroup] = gpuGroup return pod } @@ -575,8 +575,8 @@ func createBaseSharedGpuPod(gpuFraction float64) *v1.Pod { Namespace: podNamespace, UID: podUID, Annotations: map[string]string{ - constants.GpuFractionAnnotation: fmt.Sprintf("%f", gpuFraction), - constants.PodGroupNameAnnotation: podGroupName, + constants.AnnotationGpuFraction: fmt.Sprintf("%f", gpuFraction), + constants.AnnotationPodGroupName: podGroupName, }, Labels: map[string]string{}, }, @@ -604,14 +604,14 @@ func createGpuIdxReservationPod(gpuIdx *int) *v1.Pod { pod := createBaseReservationPod() if gpuIdx != nil { - pod.Annotations[constants.ReservationPodGpuIdxAnnotation] = strconv.Itoa(*gpuIdx) + pod.Annotations[constants.AnnotationReservationPodGpuIdx] = strconv.Itoa(*gpuIdx) } return pod } func createGpuGroupReservationPod(gpuGroup string) *v1.Pod { pod := createBaseReservationPod() - pod.Labels[constants.GpuGroupLabel] = gpuGroup + pod.Labels[constants.LabelGpuGroup] = gpuGroup return pod } diff --git a/internal/status-updater/handlers/node/fake_node_deployments.go b/internal/status-updater/handlers/node/fake_node_deployments.go index b75eb0e..2988fb0 100644 --- a/internal/status-updater/handlers/node/fake_node_deployments.go +++ b/internal/status-updater/handlers/node/fake_node_deployments.go @@ -55,7 +55,7 @@ func (p *NodeHandler) deleteFakeNodeDeployments(node *v1.Node) error { func (p *NodeHandler) generateFakeNodeDeployments(node *v1.Node) ([]appsv1.Deployment, error) { deploymentTemplates, err := p.kubeClient.AppsV1().Deployments(os.Getenv(constants.EnvFakeGpuOperatorNs)).List(context.TODO(), metav1.ListOptions{ - LabelSelector: fmt.Sprintf("%s=true", constants.FakeNodeDeploymentTemplateLabel), + LabelSelector: fmt.Sprintf("%s=true", constants.LabelFakeNodeDeploymentTemplate), }) if err != nil { return nil, fmt.Errorf("failed to list deployments: %w", err) @@ -96,7 +96,7 @@ func (p *NodeHandler) applyDeployment(deployment appsv1.Deployment) error { func generateFakeNodeDeploymentFromTemplate(template 
*appsv1.Deployment, node *v1.Node) *appsv1.Deployment { deployment := template.DeepCopy() - delete(deployment.Labels, constants.FakeNodeDeploymentTemplateLabel) + delete(deployment.Labels, constants.LabelFakeNodeDeploymentTemplate) deployment.Name = fmt.Sprintf("%s-%s", deployment.Name, node.Name) deployment.Spec.Replicas = ptr.To(int32(1)) deployment.Spec.Template.Spec.Containers[0].Env = append(deployment.Spec.Template.Spec.Containers[0].Env, v1.EnvVar{ @@ -111,5 +111,5 @@ func generateFakeNodeDeploymentFromTemplate(template *appsv1.Deployment, node *v } func isFakeNode(node *v1.Node) bool { - return node != nil && node.Annotations[constants.KwokNodeAnnotation] == "fake" + return node != nil && node.Annotations[constants.AnnotationKwokNode] == "fake" } diff --git a/internal/status-updater/handlers/pod/gpu_reservation_pod_handler.go b/internal/status-updater/handlers/pod/gpu_reservation_pod_handler.go index 47b3897..4c30cec 100644 --- a/internal/status-updater/handlers/pod/gpu_reservation_pod_handler.go +++ b/internal/status-updater/handlers/pod/gpu_reservation_pod_handler.go @@ -29,7 +29,7 @@ func (p *PodHandler) handleGpuReservationPodAddition(pod *v1.Pod, nodeTopology * func (p *PodHandler) setReservationPodGpuIdxAnnotationIfNeeded(pod *v1.Pod, nodeTopology *topology.NodeTopology) error { // DEPRECATED: Prior to 2.17, the scheduler had set the GPU index annotation for the reservation pod, // therefore we skip setting the annotation if it already exists to support backward compatibility. - if _, ok := pod.Annotations[constants.ReservationPodGpuIdxAnnotation]; ok { + if _, ok := pod.Annotations[constants.AnnotationReservationPodGpuIdx]; ok { return nil } @@ -39,7 +39,7 @@ func (p *PodHandler) setReservationPodGpuIdxAnnotationIfNeeded(pod *v1.Pod, node return fmt.Errorf("failed to find GPU allocated by pod %s: %w", pod.Name, err) } - annotationKey := constants.ReservationPodGpuIdxAnnotation + annotationKey := constants.AnnotationReservationPodGpuIdx annotationVal := allocatedGpuID patch := []byte(fmt.Sprintf(`{"metadata": {"annotations": {"%s": "%s"}}}`, annotationKey, annotationVal)) diff --git a/internal/status-updater/handlers/pod/gpu_usage_calculator.go b/internal/status-updater/handlers/pod/gpu_usage_calculator.go index 2b5971c..642d86c 100644 --- a/internal/status-updater/handlers/pod/gpu_usage_calculator.go +++ b/internal/status-updater/handlers/pod/gpu_usage_calculator.go @@ -20,7 +20,7 @@ import ( const ( gpuUtilizationAnnotationKey = "run.ai/simulated-gpu-utilization" - gpuFractionAnnotationKey = constants.GpuFractionAnnotation + gpuFractionAnnotationKey = constants.AnnotationGpuFraction idleGpuPodNamePrefix = "runai-idle-gpu-" ) @@ -104,7 +104,7 @@ func calculateUtilizationFromAnnotation(annotationValue string) (*topology.Range } func getPodType(dynamicClient dynamic.Interface, pod *v1.Pod) (string, error) { - podGroupName := pod.Annotations[constants.PodGroupNameAnnotation] + podGroupName := pod.Annotations[constants.AnnotationPodGroupName] if podGroupName == "" { return "", fmt.Errorf("pod %s has no constants.PodGroupNameAnnotation annotation", pod.Name) } diff --git a/internal/status-updater/handlers/pod/shared_gpu_pod_handler.go b/internal/status-updater/handlers/pod/shared_gpu_pod_handler.go index ab584cd..ff25fbe 100644 --- a/internal/status-updater/handlers/pod/shared_gpu_pod_handler.go +++ b/internal/status-updater/handlers/pod/shared_gpu_pod_handler.go @@ -105,7 +105,7 @@ func getMatchingReservationPodName(kubeclient kubernetes.Interface, pod *v1.Pod) } func 
getMatchingReservationPodNameByRunaiGpuAnnotation(kubeclient kubernetes.Interface, pod *v1.Pod) (string, error) { - runaiGpu := pod.Annotations[constants.GpuIdxAnnotation] + runaiGpu := pod.Annotations[constants.AnnotationGpuIdx] if runaiGpu == "" { return "", fmt.Errorf("pod %s has empty runai-gpu annotation", pod.Name) } @@ -121,7 +121,7 @@ func getMatchingReservationPodNameByRunaiGpuAnnotation(kubeclient kubernetes.Int } for _, nodeReservationPod := range nodeReservationPods.Items { - if nodeReservationPod.Annotations[constants.ReservationPodGpuIdxAnnotation] == runaiGpu { + if nodeReservationPod.Annotations[constants.AnnotationReservationPodGpuIdx] == runaiGpu { return nodeReservationPod.Name, nil } } @@ -130,7 +130,7 @@ func getMatchingReservationPodNameByRunaiGpuAnnotation(kubeclient kubernetes.Int } func getMatchingReservationPodNameByRunaiGpuGroupLabel(kubeclient kubernetes.Interface, pod *v1.Pod) (string, error) { - runaiGpuGroup := pod.Labels[constants.GpuGroupLabel] + runaiGpuGroup := pod.Labels[constants.LabelGpuGroup] if runaiGpuGroup == "" { return "", fmt.Errorf("pod %s has empty runai-gpu-group label", pod.Name) } @@ -141,7 +141,7 @@ func getMatchingReservationPodNameByRunaiGpuGroupLabel(kubeclient kubernetes.Int } for _, nodeReservationPod := range nodeReservationPods.Items { - if nodeReservationPod.Labels[constants.GpuGroupLabel] == runaiGpuGroup { + if nodeReservationPod.Labels[constants.LabelGpuGroup] == runaiGpuGroup { return nodeReservationPod.Name, nil } } diff --git a/internal/status-updater/util/util.go b/internal/status-updater/util/util.go index 5833db7..55116ca 100644 --- a/internal/status-updater/util/util.go +++ b/internal/status-updater/util/util.go @@ -7,8 +7,8 @@ import ( ) func IsSharedGpuPod(pod *v1.Pod) bool { - _, runaiGpuExists := pod.Annotations[constants.GpuIdxAnnotation] - _, runaiGpuGroupExists := pod.Labels[constants.GpuGroupLabel] + _, runaiGpuExists := pod.Annotations[constants.AnnotationGpuIdx] + _, runaiGpuGroupExists := pod.Labels[constants.LabelGpuGroup] return !IsGpuReservationPod(pod) && (runaiGpuExists || runaiGpuGroupExists) } From 2c07b0c89f196d257fe677ec63b2ffd51d654057 Mon Sep 17 00:00:00 2001 From: Guy Shaibi Date: Tue, 2 Apr 2024 15:28:29 +0300 Subject: [PATCH 2/5] Update environment variable names to use constants --- cmd/device-plugin/main.go | 5 +++-- cmd/nvidia-smi/main.go | 7 ++++--- cmd/status-updater/main.go | 3 ++- internal/common/app/apprunner.go | 5 +++-- internal/common/kubeclient/kubeclient.go | 7 ++++--- internal/common/topology/kubernetes.go | 21 ++++++++++--------- internal/migfaker/syncconfig.go | 3 ++- internal/status-exporter/app.go | 2 +- internal/status-exporter/app_test.go | 7 ++++--- .../export/labels/exporter_test.go | 3 ++- .../export/metrics/exporter.go | 3 ++- internal/status-exporter/watch/kubewatcher.go | 5 +++-- internal/status-updater/app_test.go | 4 ++-- .../controllers/node/controller.go | 5 +++-- 14 files changed, 46 insertions(+), 34 deletions(-) diff --git a/cmd/device-plugin/main.go b/cmd/device-plugin/main.go index b5d47e6..3dc1f1b 100644 --- a/cmd/device-plugin/main.go +++ b/cmd/device-plugin/main.go @@ -9,6 +9,7 @@ import ( "github.com/otiai10/copy" "github.com/run-ai/fake-gpu-operator/internal/common/config" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/deviceplugin" "github.com/spf13/viper" @@ -28,11 +29,11 @@ func main() { kubeClient := KubeClientFn(clusterConfig) 
log.Println("Fake Device Plugin Running") - requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "NODE_NAME"} + requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvNodeName} config.ValidateConfig(requiredEnvVars) viper.AutomaticEnv() - topology, err := topology.GetNodeTopologyFromCM(kubeClient, os.Getenv("NODE_NAME")) + topology, err := topology.GetNodeTopologyFromCM(kubeClient, os.Getenv(constants.EnvNodeName)) if err != nil { log.Printf("Failed to get topology: %s\n", err) os.Exit(1) diff --git a/cmd/nvidia-smi/main.go b/cmd/nvidia-smi/main.go index 51eae00..ca05268 100644 --- a/cmd/nvidia-smi/main.go +++ b/cmd/nvidia-smi/main.go @@ -11,6 +11,7 @@ import ( "time" "github.com/jedib0t/go-pretty/v6/table" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" ) @@ -25,8 +26,8 @@ type nvidiaSmiArgs struct { // main is the entry point for the application. func main() { - os.Setenv("TOPOLOGY_CM_NAMESPACE", "gpu-operator") - os.Setenv("TOPOLOGY_CM_NAME", "topology") + os.Setenv(constants.EnvTopologyCmNamespace, "gpu-operator") + os.Setenv(constants.EnvTopologyCmName, "topology") args := getNvidiaSmiArgs() @@ -34,7 +35,7 @@ func main() { } func getNvidiaSmiArgs() (args nvidiaSmiArgs) { - nodeName := os.Getenv("NODE_NAME") + nodeName := os.Getenv(constants.EnvNodeName) // Send http request to topology-server to get the topology resp, err := http.Get("http://topology-server.gpu-operator/topology/nodes/" + nodeName) diff --git a/cmd/status-updater/main.go b/cmd/status-updater/main.go index 7a8d4f8..c2a6862 100644 --- a/cmd/status-updater/main.go +++ b/cmd/status-updater/main.go @@ -3,11 +3,12 @@ package main import ( "github.com/run-ai/fake-gpu-operator/internal/common/app" "github.com/run-ai/fake-gpu-operator/internal/common/config" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater" ) func main() { - requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_GPU_OPERATOR_NAMESPACE"} + requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs} config.ValidateConfig(requiredEnvVars) appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{}) diff --git a/internal/common/app/apprunner.go b/internal/common/app/apprunner.go index 022d92f..26fb703 100644 --- a/internal/common/app/apprunner.go +++ b/internal/common/app/apprunner.go @@ -9,6 +9,7 @@ import ( "github.com/go-playground/validator" "github.com/mitchellh/mapstructure" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/spf13/viper" ) @@ -95,6 +96,6 @@ func bindStruct(input interface{}) error { } func setDefaults() { - viper.SetDefault("TOPOLOGY_CM_NAME", "topology") - viper.SetDefault("TOPOLOGY_CM_NAMESPACE", "gpu-operator") + viper.SetDefault(constants.EnvTopologyCmName, "topology") + viper.SetDefault(constants.EnvTopologyCmNamespace, "gpu-operator") } diff --git a/internal/common/kubeclient/kubeclient.go b/internal/common/kubeclient/kubeclient.go index 49b05b6..4ebbb11 100644 --- a/internal/common/kubeclient/kubeclient.go +++ b/internal/common/kubeclient/kubeclient.go @@ -4,6 +4,7 @@ import ( "context" "log" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/spf13/viper" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -44,7 +45,7 @@ func 
NewKubeClient(config *rest.Config, stop chan struct{}) *KubeClient { } func (client *KubeClient) SetNodeLabels(lables map[string]string) error { - nodeName := viper.GetString("NODE_NAME") + nodeName := viper.GetString(constants.EnvNodeName) node, err := client.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if err != nil { return err @@ -60,7 +61,7 @@ func (client *KubeClient) SetNodeLabels(lables map[string]string) error { } func (client *KubeClient) SetNodeAnnotations(annotations map[string]string) error { - nodeName := viper.GetString("NODE_NAME") + nodeName := viper.GetString(constants.EnvNodeName) node, err := client.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if err != nil { return err @@ -76,7 +77,7 @@ func (client *KubeClient) SetNodeAnnotations(annotations map[string]string) erro } func (client *KubeClient) GetNodeLabels() (map[string]string, error) { - nodeName := viper.GetString("NODE_NAME") + nodeName := viper.GetString(constants.EnvNodeName) node, err := client.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if err != nil { return nil, err diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go index 95c8e8b..e2de164 100644 --- a/internal/common/topology/kubernetes.go +++ b/internal/common/topology/kubernetes.go @@ -6,6 +6,7 @@ import ( "gopkg.in/yaml.v3" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/spf13/viper" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -15,7 +16,7 @@ import ( func GetNodeTopologyFromCM(kubeclient kubernetes.Interface, nodeName string) (*NodeTopology, error) { cmName := GetNodeTopologyCMName(nodeName) cm, err := kubeclient.CoreV1().ConfigMaps( - viper.GetString("TOPOLOGY_CM_NAMESPACE")).Get( + viper.GetString(constants.EnvTopologyCmNamespace)).Get( context.TODO(), cmName, metav1.GetOptions{}) if err != nil { return nil, err @@ -31,7 +32,7 @@ func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop } _, err = kubeclient.CoreV1().ConfigMaps( - viper.GetString("TOPOLOGY_CM_NAMESPACE")).Create(context.TODO(), cm, metav1.CreateOptions{}) + viper.GetString(constants.EnvTopologyCmNamespace)).Create(context.TODO(), cm, metav1.CreateOptions{}) return err } @@ -42,21 +43,21 @@ func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop } _, err = kubeclient.CoreV1().ConfigMaps( - viper.GetString("TOPOLOGY_CM_NAMESPACE")).Update(context.TODO(), cm, metav1.UpdateOptions{}) + viper.GetString(constants.EnvTopologyCmNamespace)).Update(context.TODO(), cm, metav1.UpdateOptions{}) return err } func DeleteNodeTopologyCM(kubeclient kubernetes.Interface, nodeName string) error { err := kubeclient.CoreV1().ConfigMaps( - viper.GetString("TOPOLOGY_CM_NAMESPACE")).Delete(context.TODO(), GetNodeTopologyCMName(nodeName), metav1.DeleteOptions{}) + viper.GetString(constants.EnvTopologyCmNamespace)).Delete(context.TODO(), GetNodeTopologyCMName(nodeName), metav1.DeleteOptions{}) return err } func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, error) { topologyCm, err := kubeclient.CoreV1().ConfigMaps( - viper.GetString("TOPOLOGY_CM_NAMESPACE")).Get( - context.TODO(), viper.GetString("TOPOLOGY_CM_NAME"), metav1.GetOptions{}) + viper.GetString(constants.EnvTopologyCmNamespace)).Get( + context.TODO(), viper.GetString(constants.EnvTopologyCmName), metav1.GetOptions{}) if err != nil { return nil, fmt.Errorf("failed to get topology 
configmap: %v", err) } @@ -92,8 +93,8 @@ func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { func ToBaseTopologyCM(baseTopology *BaseTopology) (*corev1.ConfigMap, error) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ - Name: viper.GetString("TOPOLOGY_CM_NAME"), - Namespace: viper.GetString("TOPOLOGY_CM_NAMESPACE"), + Name: viper.GetString(constants.EnvTopologyCmName), + Namespace: viper.GetString(constants.EnvTopologyCmNamespace), }, Data: make(map[string]string), } @@ -112,7 +113,7 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: GetNodeTopologyCMName(nodeName), - Namespace: viper.GetString("TOPOLOGY_CM_NAMESPACE"), + Namespace: viper.GetString(constants.EnvTopologyCmNamespace), Labels: map[string]string{ "node-topology": "true", }, @@ -131,5 +132,5 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf } func GetNodeTopologyCMName(nodeName string) string { - return viper.GetString("TOPOLOGY_CM_NAME") + "-" + nodeName + return viper.GetString(constants.EnvTopologyCmName) + "-" + nodeName } diff --git a/internal/migfaker/syncconfig.go b/internal/migfaker/syncconfig.go index 852a2f6..d4ab0b9 100644 --- a/internal/migfaker/syncconfig.go +++ b/internal/migfaker/syncconfig.go @@ -3,6 +3,7 @@ package migfaker import ( "sync" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/spf13/viper" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/fields" @@ -51,7 +52,7 @@ func ContinuouslySyncMigConfigChanges(clientset kubernetes.Interface, migConfig clientset.CoreV1().RESTClient(), ResourceNodes, v1.NamespaceAll, - fields.OneTermEqualSelector("metadata.name", viper.GetString("NODE_NAME")), + fields.OneTermEqualSelector("metadata.name", viper.GetString(constants.EnvNodeName)), ) _, controller := cache.NewInformer( diff --git a/internal/status-exporter/app.go b/internal/status-exporter/app.go index cdfe5fb..ab073d2 100644 --- a/internal/status-exporter/app.go +++ b/internal/status-exporter/app.go @@ -13,7 +13,7 @@ import ( type StatusExporterAppConfig struct { NodeName string `mapstructure:"NODE_NAME" validator:"required"` - TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validator:"required"` + TopologyCmName string `mapstructure:constants.EnvTopologyCmName validator:"required"` TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validator:"required"` TopologyMaxExportInterval string `mapstructure:"TOPOLOGY_MAX_EXPORT_INTERVAL"` } diff --git a/internal/status-exporter/app_test.go b/internal/status-exporter/app_test.go index b6de76a..7c5039f 100644 --- a/internal/status-exporter/app_test.go +++ b/internal/status-exporter/app_test.go @@ -18,6 +18,7 @@ import ( "k8s.io/client-go/kubernetes/fake" "github.com/run-ai/fake-gpu-operator/internal/common/app" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/kubeclient" "github.com/run-ai/fake-gpu-operator/internal/common/topology" status_exporter "github.com/run-ai/fake-gpu-operator/internal/status-exporter" @@ -102,9 +103,9 @@ func setupConfig() { } func setupEnvs() { - os.Setenv("TOPOLOGY_CM_NAME", topologyCmName) - os.Setenv("TOPOLOGY_CM_NAMESPACE", topologyCmNamespace) - os.Setenv("NODE_NAME", nodeName) + os.Setenv(constants.EnvTopologyCmName, topologyCmName) + os.Setenv(constants.EnvTopologyCmNamespace, topologyCmNamespace) + os.Setenv(constants.EnvNodeName, nodeName) 
os.Setenv("KUBERNETES_SERVICE_HOST", "fake-k8s-service-host") os.Setenv("KUBERNETES_SERVICE_PORT", "fake-k8s-service-port") } diff --git a/internal/status-exporter/export/labels/exporter_test.go b/internal/status-exporter/export/labels/exporter_test.go index 605b7be..d988954 100644 --- a/internal/status-exporter/export/labels/exporter_test.go +++ b/internal/status-exporter/export/labels/exporter_test.go @@ -5,6 +5,7 @@ import ( "sync" "testing" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/kubeclient" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export/labels" @@ -22,7 +23,7 @@ func (watcher *FakeWatcher) Subscribe(subscriber chan<- *topology.NodeTopology) func (watcher *FakeWatcher) Watch(stopCh <-chan struct{}) {} func TestExport(t *testing.T) { - viper.SetDefault("NODE_NAME", "my_node") + viper.SetDefault(constants.EnvNodeName, "my_node") myNode := &topology.NodeTopology{ GpuProduct: "some gpu", diff --git a/internal/status-exporter/export/metrics/exporter.go b/internal/status-exporter/export/metrics/exporter.go index 3a3fc32..a6328e3 100644 --- a/internal/status-exporter/export/metrics/exporter.go +++ b/internal/status-exporter/export/metrics/exporter.go @@ -10,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/export" "github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch" @@ -61,7 +62,7 @@ func (e *MetricsExporter) Run(stopCh <-chan struct{}) { } func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error { - nodeName := viper.GetString("NODE_NAME") + nodeName := viper.GetString(constants.EnvNodeName) gpuUtilization.Reset() gpuFbUsed.Reset() diff --git a/internal/status-exporter/watch/kubewatcher.go b/internal/status-exporter/watch/kubewatcher.go index 80b04e9..ac53cf1 100644 --- a/internal/status-exporter/watch/kubewatcher.go +++ b/internal/status-exporter/watch/kubewatcher.go @@ -4,6 +4,7 @@ import ( "log" "time" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/kubeclient" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/spf13/viper" @@ -28,7 +29,7 @@ func (w *KubeWatcher) Subscribe(subscriber chan<- *topology.NodeTopology) { } func (w *KubeWatcher) Watch(stopCh <-chan struct{}) { - cmChan, err := w.kubeclient.WatchConfigMap(viper.GetString("TOPOLOGY_CM_NAMESPACE"), topology.GetNodeTopologyCMName(viper.GetString("NODE_NAME"))) + cmChan, err := w.kubeclient.WatchConfigMap(viper.GetString(constants.EnvTopologyCmNamespace), topology.GetNodeTopologyCMName(viper.GetString(constants.EnvNodeName))) if err != nil { panic(err) } @@ -50,7 +51,7 @@ func (w *KubeWatcher) Watch(stopCh <-chan struct{}) { case <-ticker.C: log.Printf("Topology update not received within interval, publishing...\n") - cm, ok := w.kubeclient.GetConfigMap(viper.GetString("TOPOLOGY_CM_NAMESPACE"), topology.GetNodeTopologyCMName(viper.GetString("NODE_NAME"))) + cm, ok := w.kubeclient.GetConfigMap(viper.GetString(constants.EnvTopologyCmNamespace), topology.GetNodeTopologyCMName(viper.GetString(constants.EnvNodeName))) if !ok { break } diff --git a/internal/status-updater/app_test.go 
b/internal/status-updater/app_test.go index 195774e..b774579 100644 --- a/internal/status-updater/app_test.go +++ b/internal/status-updater/app_test.go @@ -501,8 +501,8 @@ func setupConfig() { } func setupEnvs() { - os.Setenv("TOPOLOGY_CM_NAME", "fake-cm-name") - os.Setenv("TOPOLOGY_CM_NAMESPACE", "fake-cm-namespace") + os.Setenv(constants.EnvTopologyCmName, "fake-cm-name") + os.Setenv(constants.EnvTopologyCmNamespace, "fake-cm-namespace") } func createTopology(gpuCount int64, nodeName string) *topology.NodeTopology { diff --git a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go index b2bf7cd..38289fb 100644 --- a/internal/status-updater/controllers/node/controller.go +++ b/internal/status-updater/controllers/node/controller.go @@ -6,6 +6,7 @@ import ( "log" "sync" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers/util" @@ -82,7 +83,7 @@ func (c *NodeController) pruneTopologyNodes() error { return fmt.Errorf("failed listing fake gpu nodes: %v", err) } - nodeTopologyCms, err := c.kubeClient.CoreV1().ConfigMaps(viper.GetString("TOPOLOGY_CM_NAMESPACE")).List(context.TODO(), metav1.ListOptions{ + nodeTopologyCms, err := c.kubeClient.CoreV1().ConfigMaps(viper.GetString(constants.EnvTopologyCmNamespace)).List(context.TODO(), metav1.ListOptions{ LabelSelector: "node-topology=true", }) if err != nil { @@ -96,7 +97,7 @@ func (c *NodeController) pruneTopologyNodes() error { for _, cm := range nodeTopologyCms.Items { if _, ok := validNodeTopologyCMMap[cm.Name]; !ok { - util.LogErrorIfExist(c.kubeClient.CoreV1().ConfigMaps(viper.GetString("TOPOLOGY_CM_NAMESPACE")).Delete(context.TODO(), cm.Name, metav1.DeleteOptions{}), fmt.Sprintf("Failed to delete node topology cm %s", cm.Name)) + util.LogErrorIfExist(c.kubeClient.CoreV1().ConfigMaps(viper.GetString(constants.EnvTopologyCmNamespace)).Delete(context.TODO(), cm.Name, metav1.DeleteOptions{}), fmt.Sprintf("Failed to delete node topology cm %s", cm.Name)) } } From 1b2c2c2e0c78ad34618805d298c3a3586dadbeb5 Mon Sep 17 00:00:00 2001 From: Guy Shaibi Date: Tue, 2 Apr 2024 15:28:40 +0300 Subject: [PATCH 3/5] Remove unused constant in constants.go --- internal/common/constants/constants.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/common/constants/constants.go b/internal/common/constants/constants.go index 5efc4bb..2a9fa7d 100644 --- a/internal/common/constants/constants.go +++ b/internal/common/constants/constants.go @@ -17,7 +17,6 @@ const ( GpuResourceName = "nvidia.com/gpu" - // GuyTodo: Use these constants in the code EnvFakeNode = "FAKE_NODE" EnvNodeName = "NODE_NAME" EnvTopologyCmName = "TOPOLOGY_CM_NAME" From 064cf863ff777e02001ab793b62d063d5e2524ae Mon Sep 17 00:00:00 2001 From: Guy Shaibi Date: Tue, 2 Apr 2024 16:00:11 +0300 Subject: [PATCH 4/5] Fix incorrect field name in StatusExporterAppConfig --- internal/status-exporter/app.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/status-exporter/app.go b/internal/status-exporter/app.go index ab073d2..cdfe5fb 100644 --- a/internal/status-exporter/app.go +++ b/internal/status-exporter/app.go @@ -13,7 +13,7 @@ import ( type StatusExporterAppConfig struct { NodeName string `mapstructure:"NODE_NAME" validator:"required"` - TopologyCmName string 
`mapstructure:constants.EnvTopologyCmName validator:"required"` + TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validator:"required"` TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validator:"required"` TopologyMaxExportInterval string `mapstructure:"TOPOLOGY_MAX_EXPORT_INTERVAL"` } From 7211148a3272555c6b96ed9f905c79cd1f368d40 Mon Sep 17 00:00:00 2001 From: Guy Shaibi Date: Tue, 2 Apr 2024 20:36:54 +0300 Subject: [PATCH 5/5] Fix memory allocation bug in FsExporter --- internal/status-exporter/export/fs/exporter.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/status-exporter/export/fs/exporter.go b/internal/status-exporter/export/fs/exporter.go index e18fa49..97d7179 100644 --- a/internal/status-exporter/export/fs/exporter.go +++ b/internal/status-exporter/export/fs/exporter.go @@ -51,7 +51,7 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) path := fmt.Sprintf("runai/proc/pod/%s/metrics/gpu/%d", podUuid, gpuIdx) - if err := os.MkdirAll(filepath.Dir(path), 0644); err != nil { + if err := os.MkdirAll(path, 0644); err != nil { log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) } @@ -59,7 +59,7 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { log.Printf("Failed exporting utilization for pod %s: %s", podUuid, err.Error()) } - if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(gpuUsageStatus.FbUsed))); err != nil { + if err := writeFile(filepath.Join(path, "memory.allocated"), []byte(strconv.Itoa(mbToBytes(gpuUsageStatus.FbUsed)))); err != nil { log.Printf("Failed exporting memory for pod %s: %s", podUuid, err.Error()) } } @@ -72,3 +72,7 @@ func writeFile(path string, content []byte) error { } return nil } + +func mbToBytes(mb int) int { + return mb * 1024 * 1024 +}
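
A few sketches follow on the mechanics the series relies on; none of this code is part of the patches themselves.

PATCH 2/5 works because viper resolves a key like "TOPOLOGY_CM_NAME" the same way whether it was registered via SetDefault or picked up through AutomaticEnv, so a single constant can serve as both the environment variable name (for os.Getenv/os.Setenv) and the configuration key (for viper.GetString). A minimal standalone demonstration, assuming only the constant values visible in the diffs (the main function below is illustrative, not repository code):

package main

import (
	"fmt"
	"os"

	"github.com/spf13/viper"
)

// One constant doubles as the process environment variable name and the
// viper configuration key, mirroring internal/common/constants.
const (
	EnvTopologyCmName      = "TOPOLOGY_CM_NAME"
	EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
)

func main() {
	// Defaults, as in apprunner.go's setDefaults().
	viper.SetDefault(EnvTopologyCmName, "topology")
	viper.SetDefault(EnvTopologyCmNamespace, "gpu-operator")
	// Let environment variables override the defaults; the lookup
	// happens lazily, at Get time.
	viper.AutomaticEnv()

	os.Setenv(EnvTopologyCmNamespace, "fake-cm-namespace")

	fmt.Println(viper.GetString(EnvTopologyCmName))      // topology (default)
	fmt.Println(viper.GetString(EnvTopologyCmNamespace)) // fake-cm-namespace (env wins)
}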
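
The tag bug that PATCH 2/5 introduces and PATCH 4/5 reverts comes from Go struct tags being raw string literals: a constant such as constants.EnvTopologyCmName cannot be interpolated into one, so the text `mapstructure:constants.EnvTopologyCmName ...` compiles but carries no parseable key/value pair. A hypothetical reflection check of both forms:

package main

import (
	"fmt"
	"reflect"
)

type cfg struct {
	// The malformed tag from PATCH 2/5: it compiles, but the value after
	// "mapstructure:" is unquoted, so the conventional key:"value" tag
	// parser cannot read it (go vet also flags it).
	Broken string `mapstructure:constants.EnvTopologyCmName validator:"required"`
	// The literal tag restored by PATCH 4/5.
	Fixed string `mapstructure:"TOPOLOGY_CM_NAME" validator:"required"`
}

func main() {
	t := reflect.TypeOf(cfg{})
	broken, okBroken := t.Field(0).Tag.Lookup("mapstructure")
	fixed, okFixed := t.Field(1).Tag.Lookup("mapstructure")
	fmt.Printf("broken: %q found=%v\n", broken, okBroken) // "" found=false
	fmt.Printf("fixed:  %q found=%v\n", fixed, okFixed)   // "TOPOLOGY_CM_NAME" found=true
}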
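
Finally, PATCH 5/5 fixes two distinct problems in the filesystem exporter: os.MkdirAll previously received filepath.Dir(path), which dropped the per-GPU leaf directory that the utilization and memory.allocated files are then written into, and memory.allocated was written in megabytes where bytes are apparently expected. A minimal regression-test sketch for the conversion, assuming the exporter's package is named fs and mbToBytes stays unexported (both inferred from the diff, not confirmed by the series):

package fs

import "testing"

// TestMbToBytes pins down the MB -> bytes conversion from PATCH 5/5.
// Expected values assume binary megabytes (1 MB = 1024 * 1024 bytes),
// matching the multiplication in mbToBytes.
func TestMbToBytes(t *testing.T) {
	cases := []struct{ mb, want int }{
		{0, 0},
		{1, 1048576},
		{256, 268435456},
	}
	for _, c := range cases {
		if got := mbToBytes(c.mb); got != c.want {
			t.Errorf("mbToBytes(%d) = %d, want %d", c.mb, got, c.want)
		}
	}
}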