From 82736176510cbf92c7d329d16741f2b0375b8687 Mon Sep 17 00:00:00 2001 From: Zhang Wei Date: Wed, 20 Dec 2023 12:09:43 +0800 Subject: [PATCH] Create DCGMCollectorConstructor type Signed-off-by: Zhang Wei --- pkg/cmd/app.go | 6 +++--- pkg/dcgmexporter/gpu_collector.go | 2 ++ pkg/dcgmexporter/pipeline.go | 16 ++++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index f88899f9..800db21e 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -38,9 +38,9 @@ const ( This is our recommended option for single or mixed MIG Strategies. {{.MajorKey}}:0,1 = monitor GPUs 0 and 1 {{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4. - + NOTE 1: -i cannot be specified unless MIG mode is enabled. - NOTE 2: Any time indices are specified, those indices must exist on the system. + NOTE 2: Any time indices are specified, those indices must exist on the system. NOTE 3: In MIG mode, only -f or -i with a range can be specified. GPUs are not assigned to pods and therefore reporting must occur at the GPU instance level.` ) @@ -242,7 +242,7 @@ restart: } ch := make(chan string, 10) - pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config) + pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, dcgmexporter.NewDCGMCollector) defer cleanup() if err != nil { logrus.Fatal(err) diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index 0314158b..0703b4c2 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -24,6 +24,8 @@ import ( "github.com/sirupsen/logrus" ) +type DCGMCollectorConstructor func([]Counter, *Config, dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) + func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) { sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, entityType) if err != nil { diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 14bee73f..745ce0cf 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -27,41 +27,45 @@ import ( "github.com/sirupsen/logrus" ) -func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) { +func NewMetricsPipeline(c *Config, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) { counters, err := ExtractCounters(c) if err != nil { return nil, func() {}, err } cleanups := []func(){} - gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU) + gpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_GPU) if err != nil { return nil, func() {}, err } cleanups = append(cleanups, cleanup) - switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH) + switchCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_SWITCH) if err != nil { logrus.Info("Not collecting switch metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK) + linkCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_LINK) if err != nil { logrus.Info("Not collecting link metrics: ", err) } else { cleanups = append(cleanups, cleanup) } - cpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_CPU) + cpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU) if err != nil { logrus.Info("Not collecting cpu metrics: ", err) + } else { + cleanups = append(cleanups, cleanup) } - coreCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_CPU_CORE) + coreCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU_CORE) if err != nil { logrus.Info("Not collecting cpu core metrics: ", err) + } else { + cleanups = append(cleanups, cleanup) } transformations := []Transform{}