Skip to content

Commit

Permalink
Added unit tests
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov authored and bom-d-van committed May 2, 2024
1 parent 30a5db4 commit 425ed56
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 8 deletions.
16 changes: 9 additions & 7 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupT
func NewDCGMCollector(c []Counter,
hostname string,
config *Config,
fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) {

fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem,
) (*DCGMCollector, func(), error) {
if fieldEntityGroupTypeSystemInfo.isEmpty() {
return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty")
}
Expand Down Expand Up @@ -164,7 +164,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
}

func ToSwitchMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -207,8 +208,9 @@ func ToSwitchMetric(metrics MetricsByCounter,
}

func ToCPUMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand Down Expand Up @@ -259,7 +261,7 @@ func ToMetric(
hostname string,
replaceBlanksInModelName bool,
) {
var labels = map[string]string{}
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand All @@ -285,7 +287,7 @@ func ToMetric(
gpuModel := getGPUModel(d, replaceBlanksInModelName)

attrs := map[string]string{}
if counter.FieldName == "DCGM_FI_DEV_XID_ERRORS" {
if counter.FieldID == dcgm.DCGM_FI_DEV_XID_ERRORS {
attrs["err_code"] = strconv.Itoa(int(val.Int64()))
if v := int(val.Int64()); 0 < v && v < len(xidErrCodeToText) && xidErrCodeToText[v] != "" {
attrs["err_msg"] = xidErrCodeToText[val.Int64()]
Expand Down
65 changes: 64 additions & 1 deletion pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,6 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
}

func TestToMetric(t *testing.T) {

fieldValue := [4096]byte{}
fieldValue[0] = 42
values := []dcgm.FieldValue_v1{
Expand Down Expand Up @@ -315,6 +314,70 @@ func TestToMetric(t *testing.T) {
}
}

// TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField checks that ToMetric attaches the
// "err_code" and "err_msg" attributes to a metric whose field is
// dcgm.DCGM_FI_DEV_XID_ERRORS, for both a known and an unknown XID value.
func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
	c := []Counter{
		{
			FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS,
			// NOTE(review): FieldName and Help describe a temperature counter while
			// FieldID is the XID one; presumably deliberate so the test depends only
			// on FieldID — confirm.
			FieldName: "DCGM_FI_DEV_GPU_TEMP",
			PromType:  "gauge",
			Help:      "Temperature Help info",
		},
	}

	d := dcgm.Device{
		UUID: "fake0",
		Identifiers: dcgm.DeviceIdentifiers{
			Model: "NVIDIA T400 4GB",
		},
	}

	// No GPU instance information is passed to ToMetric.
	var instanceInfo *GPUInstanceInfo

	type testCase struct {
		name        string
		fieldValue  byte
		expectedErr string
	}

	testCases := []testCase{
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has known value",
			fieldValue:  42,
			expectedErr: "Video processor exception",
		},
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has unknown value",
			fieldValue:  255,
			expectedErr: "Unknown Error",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Only the first byte of the raw value is set; the assertions below
			// expect the decoded metric value to equal that byte.
			fieldValue := [4096]byte{}
			fieldValue[0] = tc.fieldValue
			values := []dcgm.FieldValue_v1{
				{
					FieldId:   dcgm.DCGM_FI_DEV_XID_ERRORS,
					FieldType: dcgm.DCGM_FT_INT64,
					Value:     fieldValue,
				},
			}

			metrics := make(map[Counter][]Metric)
			ToMetric(metrics, values, c, d, instanceInfo, false, "", false)

			// assert.* does not stop the test on failure, so bail out explicitly:
			// indexing into an empty key list or slice below would panic.
			if !assert.Len(t, metrics, 1) {
				return
			}
			// We get metric value with 0 index
			metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)]
			if !assert.NotEmpty(t, metricValues) {
				return
			}

			metric := metricValues[0]
			wantCode := fmt.Sprint(tc.fieldValue)

			assert.Equal(t, wantCode, metric.Value)
			assert.Contains(t, metric.Attributes, "err_code")
			assert.Equal(t, wantCode, metric.Attributes["err_code"])
			assert.Contains(t, metric.Attributes, "err_msg")
			assert.Equal(t, tc.expectedErr, metric.Attributes["err_msg"])
		})
	}
}

func TestGPUCollector_GetMetrics(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
Expand Down
16 changes: 16 additions & 0 deletions pkg/dcgmexporter/xid_errors.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

// Based on this doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4
Expand Down

0 comments on commit 425ed56

Please sign in to comment.