diff --git a/internal/pkg/testutils/testutils.go b/internal/pkg/testutils/testutils.go index 050ad284..8ed485dd 100644 --- a/internal/pkg/testutils/testutils.go +++ b/internal/pkg/testutils/testutils.go @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package testutils import ( + "reflect" "runtime" "testing" + "unsafe" ) // RequireLinux checks if @@ -27,3 +30,48 @@ func RequireLinux(t *testing.T) { t.Skipf("Test is not supported on %q", runtime.GOOS) } } + +// GetStructPrivateFieldValue returns the value of the unexported field fieldName of v. +// v must be a non-nil pointer to a struct, because the field is read through its address, +// and T must be the exact type of the field. Otherwise the test is marked as failed and +// the zero value of T is returned. +func GetStructPrivateFieldValue[T any](t *testing.T, v any, fieldName string) T { + t.Helper() + var result T + value := reflect.ValueOf(v) + if value.Kind() == reflect.Ptr { + value = value.Elem() + } + + // A nil v or a nil pointer yields an invalid value, so report the type with %T instead of value.Type(), which would panic. + if value.Kind() != reflect.Struct { + t.Errorf("The value of type %T is not a struct or a non-nil pointer to a struct", v) + return result + } + + fieldVal := value.FieldByName(fieldName) + + if !fieldVal.IsValid() { + t.Errorf("The field %s is invalid for the %s type", fieldName, value.Type()) + return result + } + + // UnsafeAddr panics on a field of a struct that was passed by value. + if !fieldVal.CanAddr() { + t.Errorf("The field %s of the %s type is not addressable; pass a pointer to the struct", fieldName, value.Type()) + return result + } + + // Reading the field memory as any other type would be memory-unsafe. + if wantType := reflect.TypeOf((*T)(nil)).Elem(); fieldVal.Type() != wantType { + t.Errorf("The field %s has the %s type, not %s", fieldName, fieldVal.Type(), wantType) + return result + } + + fieldPtr := unsafe.Pointer(fieldVal.UnsafeAddr()) + + // Cast the field pointer to a pointer of the correct type + realPtr := (*T)(fieldPtr) + + return *realPtr +} diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 779be098..9bdc6014 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -397,17 +397,12 @@ func enableDCGMExpXIDErrorsCountCollector(cs *dcgmexporter.CounterSet, fieldEnti } func getFieldEntityGroupTypeSystemInfo(cs *dcgmexporter.CounterSet, config *dcgmexporter.Config) *dcgmexporter.FieldEntityGroupTypeSystemInfo { - allCounters := []dcgmexporter.Counter{} + var allCounters []dcgmexporter.Counter allCounters = append(allCounters, cs.DCGMCounters...)
- allCounters = append(allCounters, - dcgmexporter.Counter{ - FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, - }, - dcgmexporter.Counter{ - FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, - }, - ) + + allCounters = appendDCGMXIDErrorsCountDependency(allCounters, cs) + allCounters = appendDCGMClockEventsCountDependency(cs, allCounters) fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config) @@ -420,6 +415,40 @@ func getFieldEntityGroupTypeSystemInfo(cs *dcgmexporter.CounterSet, config *dcgm return fieldEntityGroupTypeSystemInfo } +// appendDCGMClockEventsCountDependency appends DCGM counters required for the DCGM_EXP_CLOCK_EVENTS_COUNT metric +func appendDCGMClockEventsCountDependency(cs *dcgmexporter.CounterSet, allCounters []dcgmexporter.Counter) []dcgmexporter.Counter { + if len(cs.ExporterCounters) > 0 { + if containsField(cs.ExporterCounters, dcgmexporter.DCGMClockEventsCount) && + !containsField(allCounters, dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS) { + allCounters = append(allCounters, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, + }) + } + } + return allCounters +} + +// appendDCGMXIDErrorsCountDependency appends DCGM counters required for the DCGM_EXP_XID_ERRORS_COUNT metric +func appendDCGMXIDErrorsCountDependency(allCounters []dcgmexporter.Counter, cs *dcgmexporter.CounterSet) []dcgmexporter.Counter { + if len(cs.ExporterCounters) > 0 { + if containsField(cs.ExporterCounters, dcgmexporter.DCGMXIDErrorsCount) && + !containsField(allCounters, dcgm.DCGM_FI_DEV_XID_ERRORS) { + allCounters = append(allCounters, + dcgmexporter.Counter{ + FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS, + }) + } + } + return allCounters +} + +func containsField(slice []dcgmexporter.Counter, fieldID dcgmexporter.ExporterCounter) bool { + return slices.ContainsFunc(slice, func(counter dcgmexporter.Counter) bool { + return counter.FieldID == dcgm.Short(fieldID) + }) +} + func getCounters(config *dcgmexporter.Config)
*dcgmexporter.CounterSet { cs, err := dcgmexporter.GetCounterSet(config) if err != nil { diff --git a/pkg/cmd/app_test.go b/pkg/cmd/app_test.go new file mode 100644 index 00000000..9035c6bd --- /dev/null +++ b/pkg/cmd/app_test.go @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +import ( + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" + "github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter" +) + +func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) { + config := &dcgmexporter.Config{ + GPUDevices: dcgmexporter.DeviceOptions{}, + SwitchDevices: dcgmexporter.DeviceOptions{}, + CPUDevices: dcgmexporter.DeviceOptions{}, + UseFakeGPUs: true, + } + + tests := []struct { + name string + counterSet *dcgmexporter.CounterSet + assertion func(*testing.T, *dcgmexporter.FieldEntityGroupTypeSystemInfo) + }{ + { + name: "When DCGM_FI_DEV_XID_ERRORS and DCGM_EXP_XID_ERRORS_COUNT enabled", + counterSet: &dcgmexporter.CounterSet{ + DCGMCounters: []dcgmexporter.Counter{ + { + FieldID: 230, + FieldName: "DCGM_FI_DEV_XID_ERRORS", + PromType: "gauge", + Help: "Value of the last XID error encountered.", + }, + }, + ExporterCounters: []dcgmexporter.Counter{ + { + FieldID: 9001, + FieldName: "DCGM_EXP_XID_ERRORS_COUNT", + PromType: 
"gauge", + Help: "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(230), values[0].FieldID) + }, + }, + { + name: "When DCGM_FI_DEV_XID_ERRORS enabled", + counterSet: &dcgmexporter.CounterSet{ + DCGMCounters: []dcgmexporter.Counter{ + { + FieldID: 230, + FieldName: "DCGM_FI_DEV_XID_ERRORS", + PromType: "gauge", + Help: "Value of the last XID error encountered.", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(230), values[0].FieldID) + }, + }, + { + name: "When DCGM_EXP_XID_ERRORS_COUNT enabled", + counterSet: &dcgmexporter.CounterSet{ + ExporterCounters: []dcgmexporter.Counter{ + { + FieldID: 9001, + FieldName: "DCGM_EXP_XID_ERRORS_COUNT", + PromType: "gauge", + Help: "Count of XID Errors within user-specified time window (see xid-count-window-size param).", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(230), values[0].FieldID) + }, + }, + { + name: "When no counters", + counterSet: &dcgmexporter.CounterSet{}, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 0) + }, + }, + { + name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON and 
DCGM_EXP_CLOCK_EVENTS_COUNT enabled", + counterSet: &dcgmexporter.CounterSet{ + DCGMCounters: []dcgmexporter.Counter{ + { + FieldID: 112, + FieldName: "DCGM_FI_DEV_CLOCK_THROTTLE_REASON", + PromType: "gauge", + }, + }, + ExporterCounters: []dcgmexporter.Counter{ + { + FieldID: 9002, + FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", + PromType: "gauge", + Help: "Count of clock events within the user-specified time window (see clock-events-count-window-size param).", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(112), values[0].FieldID) + }, + }, + { + name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON enabled", + counterSet: &dcgmexporter.CounterSet{ + DCGMCounters: []dcgmexporter.Counter{ + { + FieldID: 112, + FieldName: "DCGM_FI_DEV_CLOCK_THROTTLE_REASON", + PromType: "gauge", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(112), values[0].FieldID) + }, + }, + { + name: "When DCGM_EXP_CLOCK_EVENTS_COUNT enabled", + counterSet: &dcgmexporter.CounterSet{ + ExporterCounters: []dcgmexporter.Counter{ + { + FieldID: 9002, + FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT", + PromType: "gauge", + Help: "Count of clock events within the user-specified time window (see clock-events-count-window-size param).", + }, + }, + }, + assertion: func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) { + require.NotNil(t, got) + values := testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters") + require.Len(t, values, 1) + assert.Equal(t, dcgm.Short(112), values[0].FieldID) + }, + }, + } + + 
cleanupDCGM := initDCGM(config) + defer cleanupDCGM() + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := getFieldEntityGroupTypeSystemInfo(tt.counterSet, config) + if tt.assertion == nil { + t.Skip(tt.name) + } + tt.assertion(t, got) + }) + } +} diff --git a/pkg/dcgmexporter/exporter_metrics.go b/pkg/dcgmexporter/exporter_metrics.go index 90a31acd..ecf7ab7f 100644 --- a/pkg/dcgmexporter/exporter_metrics.go +++ b/pkg/dcgmexporter/exporter_metrics.go @@ -28,7 +28,7 @@ type ExporterCounter uint16 const ( DCGMFIUnknown ExporterCounter = 0 DCGMXIDErrorsCount ExporterCounter = iota + 9000 - DCGMClockEventsCount ExporterCounter = iota + DCGMClockEventsCount ExporterCounter = iota + 9000 ) // String method to convert the enum value to a string