From 93d896d0cc3cf5c3c37a306fe59f54fc255f3cba Mon Sep 17 00:00:00 2001 From: Vadym Fedorov Date: Thu, 14 Mar 2024 11:49:02 -0500 Subject: [PATCH] Issue-272: Added tests for K8S usecase, when MIG enabled in a mixed mode Signed-off-by: Vadym Fedorov --- .../clock_events_collector_test.go | 2 +- pkg/dcgmexporter/kubernetes.go | 24 ++++++---- pkg/dcgmexporter/kubernetes_test.go | 32 ++++++++++--- pkg/dcgmexporter/pipeline.go | 2 +- pkg/dcgmexporter/utils_test.go | 47 +++++++++++++++++++ 5 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 pkg/dcgmexporter/utils_test.go diff --git a/pkg/dcgmexporter/clock_events_collector_test.go b/pkg/dcgmexporter/clock_events_collector_test.go index 4151ec20..181c5364 100644 --- a/pkg/dcgmexporter/clock_events_collector_test.go +++ b/pkg/dcgmexporter/clock_events_collector_test.go @@ -132,7 +132,7 @@ func TestClockEventsCollector_Gather(t *testing.T) { gpuIDsAsString[i] = fmt.Sprint(g) } - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpuIDsAsString)) + podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpuIDsAsString)) // Tell that the app is running on K8S config.Kubernetes = true diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index cbebd354..ceb2bee8 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -59,7 +59,7 @@ func (p *PodMapper) Name() string { func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error { _, err := os.Stat(socketPath) if os.IsNotExist(err) { - logrus.Infof("No Kubelet socket, ignoring") + logrus.Info("No Kubelet socket, ignoring") return nil } @@ -77,6 +77,8 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error deviceToPod := p.toDeviceToPod(pods, sysInfo) + logrus.Debugf("Device to pod mapping: %+v", deviceToPod) + // Note: for loop are copies the value, if we want to change the value // and 
not the copy, we need to use the indexes for counter := range metrics { @@ -85,14 +87,18 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error if err != nil { return err } - if !p.Config.UseOldNamespace { - metrics[counter][j].Attributes[podAttribute] = deviceToPod[deviceID].Name - metrics[counter][j].Attributes[namespaceAttribute] = deviceToPod[deviceID].Namespace - metrics[counter][j].Attributes[containerAttribute] = deviceToPod[deviceID].Container - } else { - metrics[counter][j].Attributes[oldPodAttribute] = deviceToPod[deviceID].Name - metrics[counter][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceID].Namespace - metrics[counter][j].Attributes[oldContainerAttribute] = deviceToPod[deviceID].Container + + podInfo, exists := deviceToPod[deviceID] + if exists { + if !p.Config.UseOldNamespace { + metrics[counter][j].Attributes[podAttribute] = podInfo.Name + metrics[counter][j].Attributes[namespaceAttribute] = podInfo.Namespace + metrics[counter][j].Attributes[containerAttribute] = podInfo.Container + } else { + metrics[counter][j].Attributes[oldPodAttribute] = podInfo.Name + metrics[counter][j].Attributes[oldNamespaceAttribute] = podInfo.Namespace + metrics[counter][j].Attributes[oldContainerAttribute] = podInfo.Container + } } } } diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index fefa1dbe..207c75f0 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -58,7 +58,7 @@ func TestProcessPodMapper(t *testing.T) { socketPath = tmpDir + "/kubelet.sock" server := grpc.NewServer() gpus := GetGPUUUIDs(arbirtaryMetric) - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus)) + podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus)) cleanup = StartMockServer(t, server, socketPath) defer cleanup() @@ -125,12 +125,14 @@ func CreateTmpDir(t *testing.T) (string, func()) { 
// Contains a list of UUIDs type PodResourcesMockServer struct { - gpus []string + resourceName string + gpus []string } -func NewPodResourcesMockServer(used []string) *PodResourcesMockServer { +func NewPodResourcesMockServer(resourceName string, gpus []string) *PodResourcesMockServer { return &PodResourcesMockServer{ - gpus: used, + resourceName: resourceName, + gpus: gpus, } } @@ -148,7 +150,7 @@ func (s *PodResourcesMockServer) List( Name: "default", Devices: []*podresourcesapi.ContainerDevices{ { - ResourceName: nvidiaResourceName, + ResourceName: s.resourceName, DeviceIds: []string{gpu}, }, }, @@ -169,6 +171,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { type TestCase struct { KubernetesGPUIDType KubernetesGPUIDType GPUInstanceID uint + ResourceName string MetricGPUID string MetricGPUDevice string MetricMigProfile string @@ -178,17 +181,20 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { testCases := []TestCase{ { KubernetesGPUIDType: GPUUID, + ResourceName: nvidiaResourceName, MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", }, { KubernetesGPUIDType: GPUUID, + ResourceName: nvidiaResourceName, MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", MetricMigProfile: "", }, { KubernetesGPUIDType: GPUUID, + ResourceName: nvidiaResourceName, GPUInstanceID: 3, MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", MetricMigProfile: "", @@ -196,25 +202,38 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { }, { KubernetesGPUIDType: DeviceName, + ResourceName: nvidiaResourceName, GPUInstanceID: 3, MetricMigProfile: "mig", PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", }, { KubernetesGPUIDType: DeviceName, + ResourceName: nvidiaResourceName, MetricMigProfile: "mig", PODGPUID: "nvidia0/gi0", }, { KubernetesGPUIDType: DeviceName, + ResourceName: 
nvidiaResourceName, MetricGPUDevice: "0", PODGPUID: "0/vgpu", }, { KubernetesGPUIDType: GPUUID, + ResourceName: nvidiaResourceName, MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::", }, + { + KubernetesGPUIDType: GPUUID, + ResourceName: "nvidia.com/mig-1g.10gb", + MetricMigProfile: "1g.10gb", + MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricGPUDevice: "0", + GPUInstanceID: 3, + }, } for _, tc := range testCases { @@ -235,7 +254,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { defer cleanup() gpus := []string{tc.PODGPUID} - podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus)) + podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(tc.ResourceName, gpus)) cleanup = StartMockServer(t, server, socketPath) defer cleanup() @@ -261,6 +280,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { FieldName: "DCGM_FI_DEV_POWER_USAGE", PromType: "gauge", } + metrics[counter] = append(metrics[counter], Metric{ GPU: "0", GPUUUID: tc.MetricGPUID, diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index c167842c..0e524669 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -361,7 +361,7 @@ var cpuCoreMetricsFormat = ` {{- end }} {{ end }}` -// Template is passed here so that it isn't recompiled at each iteration +// FormatMetrics Template is passed here so that it isn't recompiled at each iteration func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) { // Format metrics var res bytes.Buffer diff --git a/pkg/dcgmexporter/utils_test.go b/pkg/dcgmexporter/utils_test.go new file mode 100644 index 00000000..b1a7dc94 --- /dev/null +++ b/pkg/dcgmexporter/utils_test.go @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dcgmexporter
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestWaitWithTimeout(t *testing.T) {
+	t.Run("Returns error by timeout", func(t *testing.T) {
+		wg := &sync.WaitGroup{}
+		defer wg.Done() // deferred: runs when this subtest returns, balancing the Add(1) on the next line
+		wg.Add(1)
+		timeout := 500 * time.Millisecond
+		err := WaitWithTimeout(wg, timeout) // counter is still 1 during this call, so an error is expected
+		require.Error(t, err)
+		assert.ErrorContains(t, err, "timeout waiting for WaitGroup")
+	})
+
+	t.Run("Returns no error", func(t *testing.T) {
+		wg := &sync.WaitGroup{}
+		wg.Add(1)
+		timeout := 500 * time.Millisecond
+		wg.Done() // counter is back to 0 before WaitWithTimeout is called
+		err := WaitWithTimeout(wg, timeout)
+		require.NoError(t, err)
+	})
+}