Skip to content

Commit

Permalink
Create DCGMCollectorConstructor type
Browse files Browse the repository at this point in the history
Signed-off-by: Zhang Wei <[email protected]>
  • Loading branch information
zwpaper committed Dec 21, 2023
1 parent 2e813e8 commit 8273617
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 9 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ const (
This is our recommended option for single or mixed MIG Strategies.
{{.MajorKey}}:0,1 = monitor GPUs 0 and 1
{{.MinorKey}}:0,2-4 = monitor GPU instances 0, 2, 3, and 4.
NOTE 1: -i cannot be specified unless MIG mode is enabled.
NOTE 2: Any time indices are specified, those indices must exist on the system.
NOTE 2: Any time indices are specified, those indices must exist on the system.
NOTE 3: In MIG mode, only -f or -i with a range can be specified. GPUs are not assigned to pods
and therefore reporting must occur at the GPU instance level.`
)
Expand Down Expand Up @@ -242,7 +242,7 @@ restart:
}

ch := make(chan string, 10)
pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config)
pipeline, cleanup, err := dcgmexporter.NewMetricsPipeline(config, dcgmexporter.NewDCGMCollector)
defer cleanup()
if err != nil {
logrus.Fatal(err)
Expand Down
2 changes: 2 additions & 0 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"github.com/sirupsen/logrus"
)

// DCGMCollectorConstructor is the signature of a function that builds a
// DCGMCollector from a set of counters, a configuration, and a DCGM entity
// group. It returns the collector, a cleanup function, and an error.
// NewDCGMCollector has this signature.
type DCGMCollectorConstructor func(counters []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error)

func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_Group) (*DCGMCollector, func(), error) {
sysInfo, err := InitializeSystemInfo(config.GPUDevices, config.SwitchDevices, config.CPUDevices, config.UseFakeGPUs, entityType)
if err != nil {
Expand Down
16 changes: 10 additions & 6 deletions pkg/dcgmexporter/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,41 +27,45 @@ import (
"github.com/sirupsen/logrus"
)

func NewMetricsPipeline(c *Config) (*MetricsPipeline, func(), error) {
func NewMetricsPipeline(c *Config, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) {
counters, err := ExtractCounters(c)
if err != nil {
return nil, func() {}, err
}

cleanups := []func(){}
gpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_GPU)
gpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_GPU)
if err != nil {
return nil, func() {}, err
}
cleanups = append(cleanups, cleanup)

switchCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_SWITCH)
switchCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_SWITCH)
if err != nil {
logrus.Info("Not collecting switch metrics: ", err)
} else {
cleanups = append(cleanups, cleanup)
}

linkCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_LINK)
linkCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_LINK)
if err != nil {
logrus.Info("Not collecting link metrics: ", err)
} else {
cleanups = append(cleanups, cleanup)
}

cpuCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_CPU)
cpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU)
if err != nil {
logrus.Info("Not collecting cpu metrics: ", err)
} else {
cleanups = append(cleanups, cleanup)
}

coreCollector, cleanup, err := NewDCGMCollector(counters, c, dcgm.FE_CPU_CORE)
coreCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_CPU_CORE)
if err != nil {
logrus.Info("Not collecting cpu core metrics: ", err)
} else {
cleanups = append(cleanups, cleanup)
}

transformations := []Transform{}
Expand Down

0 comments on commit 8273617

Please sign in to comment.