Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pkg/dcgmexporter/gpu_collector.go: include an err_msg label in metric DCGM_FI_DEV_XID_ERRORS #309

Merged
merged 3 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"os"
"strconv"
"strings"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
Expand All @@ -31,8 +32,8 @@ type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupT
func NewDCGMCollector(c []Counter,
hostname string,
config *Config,
fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) {

fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem,
) (*DCGMCollector, func(), error) {
if fieldEntityGroupTypeSystemInfo.isEmpty() {
return nil, func() {}, errors.New("fieldEntityGroupTypeSystemInfo is empty")
}
Expand Down Expand Up @@ -163,7 +164,8 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
}

func ToSwitchMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
Expand Down Expand Up @@ -206,8 +208,9 @@ func ToSwitchMetric(metrics MetricsByCounter,
}

func ToCPUMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string,
) {
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand Down Expand Up @@ -258,7 +261,7 @@ func ToMetric(
hostname string,
replaceBlanksInModelName bool,
) {
var labels = map[string]string{}
labels := map[string]string{}

for _, val := range values {
v := ToString(val)
Expand All @@ -283,6 +286,17 @@ func ToMetric(

gpuModel := getGPUModel(d, replaceBlanksInModelName)

attrs := map[string]string{}
if counter.FieldID == dcgm.DCGM_FI_DEV_XID_ERRORS {
errCode := int(val.Int64())
attrs["err_code"] = strconv.Itoa(errCode)
if 0 < errCode && errCode < len(xidErrCodeToText) {
attrs["err_msg"] = xidErrCodeToText[errCode]
} else {
attrs["err_msg"] = "Unknown Error"
}
}

m := Metric{
Counter: counter,
Value: v,
Expand All @@ -295,7 +309,7 @@ func ToMetric(
Hostname: hostname,

Labels: labels,
Attributes: map[string]string{},
Attributes: attrs,
}
if instanceInfo != nil {
m.MigProfile = instanceInfo.ProfileName
Expand Down
65 changes: 64 additions & 1 deletion pkg/dcgmexporter/gpu_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,6 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun
}

func TestToMetric(t *testing.T) {

fieldValue := [4096]byte{}
fieldValue[0] = 42
values := []dcgm.FieldValue_v1{
Expand Down Expand Up @@ -315,6 +314,70 @@ func TestToMetric(t *testing.T) {
}
}

// TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField verifies that ToMetric decorates
// DCGM_FI_DEV_XID_ERRORS metrics with two attributes: "err_code" (the numeric
// XID code as a decimal string) and "err_msg" (the text from xidErrCodeToText,
// or "Unknown Error" when the code has no entry in that table).
func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) {
	// The counter describes the XID field itself so that the fixture's name
	// and help text agree with its FieldID.
	c := []Counter{
		{
			FieldID:   dcgm.DCGM_FI_DEV_XID_ERRORS,
			FieldName: "DCGM_FI_DEV_XID_ERRORS",
			PromType:  "gauge",
			Help:      "Value of the last XID error encountered",
		},
	}

	d := dcgm.Device{
		UUID: "fake0",
		Identifiers: dcgm.DeviceIdentifiers{
			Model: "NVIDIA T400 4GB",
		},
	}

	// No MIG instance is involved; the zero value of the pointer is nil.
	var instanceInfo *GPUInstanceInfo

	type testCase struct {
		name        string
		fieldValue  byte
		expectedErr string
	}

	testCases := []testCase{
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has known value",
			fieldValue:  42,
			expectedErr: "Video processor exception",
		},
		{
			// 143 is the last index defined in xidErrCodeToText.
			name:        "when DCGM_FI_DEV_XID_ERRORS has the last known value",
			fieldValue:  143,
			expectedErr: "GPU Initialization Failure",
		},
		{
			// 144 is the first code past the end of xidErrCodeToText.
			name:        "when DCGM_FI_DEV_XID_ERRORS is just past the known range",
			fieldValue:  144,
			expectedErr: "Unknown Error",
		},
		{
			name:        "when DCGM_FI_DEV_XID_ERRORS has unknown value",
			fieldValue:  255,
			expectedErr: "Unknown Error",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Only the first byte of the raw value buffer is set, so the
			// encoded integer equals tc.fieldValue.
			fieldValue := [4096]byte{}
			fieldValue[0] = tc.fieldValue
			values := []dcgm.FieldValue_v1{
				{
					FieldId:   dcgm.DCGM_FI_DEV_XID_ERRORS,
					FieldType: dcgm.DCGM_FT_INT64,
					Value:     fieldValue,
				},
			}

			metrics := make(map[Counter][]Metric)
			ToMetric(metrics, values, c, d, instanceInfo, false, "", false)

			// Stop early on an unexpected shape instead of panicking on the
			// index expressions below.
			if !assert.Len(t, metrics, 1) {
				return
			}

			// The map holds exactly one entry; pick up its metric slice.
			var metricValues []Metric
			for _, perCounter := range metrics {
				metricValues = perCounter
			}
			if !assert.NotEmpty(t, metricValues) {
				return
			}

			got := metricValues[0]
			wantCode := fmt.Sprint(tc.fieldValue)

			assert.Equal(t, wantCode, got.Value)
			assert.Contains(t, got.Attributes, "err_code")
			assert.Equal(t, wantCode, got.Attributes["err_code"])
			assert.Contains(t, got.Attributes, "err_msg")
			assert.Equal(t, tc.expectedErr, got.Attributes["err_msg"])
		})
	}
}

func TestGPUCollector_GetMetrics(t *testing.T) {
teardownTest := setupTest(t)
defer teardownTest(t)
Expand Down
165 changes: 165 additions & 0 deletions pkg/dcgmexporter/xid_errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

// xidErrCodeToText holds a short human-readable description for each NVIDIA
// XID error code, indexed by the numeric code itself (index 0 is "No Error").
// The literal spells out every index from 0 through 143, so len() is 144 and
// any code outside that range has no entry; code that indexes this slice must
// bounds-check first and supply its own fallback text.
//
// Entries labelled "Unused" or "Reserved" are spelled out explicitly, which
// keeps the slice index equal to the XID code.
//
// Based on this doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4
var xidErrCodeToText = []string{
0: "No Error",
1: "Invalid or corrupted push buffer stream",
2: "Invalid or corrupted push buffer stream",
3: "Invalid or corrupted push buffer stream",
4: "Invalid or corrupted push buffer stream",
5: "Unused",
6: "Invalid or corrupted push buffer stream",
7: "Invalid or corrupted push buffer address",
8: "GPU stopped processing",
9: "Driver error programming GPU",
10: "Unused",
11: "Invalid or corrupted push buffer stream",
12: "Driver error handling GPU exception",
13: "Graphics Engine Exception",
14: "Unused",
15: "Unused",
16: "Display engine hung",
17: "Unused",
18: "Bus mastering disabled in PCI Config Space",
19: "Display Engine error",
20: "Invalid or corrupted Mpeg push buffer",
21: "Invalid or corrupted Motion Estimation push buffer",
22: "Invalid or corrupted Video Processor push buffer",
23: "Unused",
24: "GPU semaphore timeout",
25: "Invalid or illegal push buffer stream",
26: "Framebuffer timeout",
27: "Video processor exception",
28: "Video processor exception",
29: "Video processor exception",
30: "GPU semaphore access error",
31: "GPU memory page fault",
32: "Invalid or corrupted push buffer stream",
33: "Internal micro-controller error",
34: "Video processor exception",
35: "Video processor exception",
36: "Video processor exception",
37: "Driver firmware error",
38: "Driver firmware error",
39: "Unused",
40: "Unused",
41: "Unused",
42: "Video processor exception",
43: "GPU stopped processing",
44: "Graphics Engine fault during context switch",
// NOTE(review): the description below ends mid-phrase ("... multiple cuda");
// presumably it was truncated when copied from the source doc — confirm.
45: "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda",
46: "GPU stopped processing",
47: "Video processor exception",
48: "Double Bit ECC Error",
49: "Unused",
50: "Unused",
51: "Unused",
52: "Unused",
53: "Unused",
54: "Auxiliary power is not connected to the GPU board",
55: "Unused",
56: "Display Engine error",
57: "Error programming video memory interface",
58: "Unstable video memory interface detected",
59: "Internal micro-controller error",
60: "Video processor exception",
61: "Internal micro-controller breakpoint/warning",
62: "Internal micro-controller halt",
63: "ECC page retirement or row remapping recording event",
64: "ECC page retirement or row remapper recording failure",
65: "Video processor exception",
66: "Illegal access by driver",
67: "Illegal access by driver",
68: "NVDEC0 Exception",
69: "Graphics Engine class error",
70: "CE3: Unknown Error",
71: "CE4: Unknown Error",
72: "CE5: Unknown Error",
73: "NVENC2 Error",
74: "NVLINK Error",
75: "CE6: Unknown Error",
76: "CE7: Unknown Error",
77: "CE8: Unknown Error",
78: "vGPU Start Error",
79: "GPU has fallen off the bus",
80: "Corrupted data sent to GPU",
81: "VGA Subsystem Error",
82: "NVJPG0 Error",
83: "NVDEC1 Error",
84: "NVDEC2 Error",
85: "CE9: Unknown Error",
86: "OFA Exception",
87: "Reserved",
88: "NVDEC3 Error",
89: "NVDEC4 Error",
90: "Reserved",
91: "Reserved",
92: "High single-bit ECC error rate",
93: "Non-fatal violation of provisioned InfoROM wear limit",
94: "Contained ECC error",
95: "Uncontained ECC error",
96: "NVDEC5 Error",
97: "NVDEC6 Error",
98: "NVDEC7 Error",
99: "NVJPG1 Error",
100: "NVJPG2 Error",
101: "NVJPG3 Error",
102: "NVJPG4 Error",
103: "NVJPG5 Error",
104: "NVJPG6 Error",
105: "NVJPG7 Error",
106: "SMBPBI Test Message",
107: "SMBPBI Test Message Silent",
108: "Reserved",
109: "Context Switch Timeout Error",
110: "Security Fault Error",
111: "Display Bundle Error Event",
112: "Display Supervisor Error",
113: "DP Link Training Error",
114: "Display Pipeline Underflow Error",
115: "Display Core Channel Error",
116: "Display Window Channel Error",
117: "Display Cursor Channel Error",
118: "Display Pixel Pipeline Error",
119: "GSP RPC Timeout",
120: "GSP Error",
121: "C2C Link Error",
122: "SPI PMU RPC Read Failure",
123: "SPI PMU RPC Write Failure",
124: "SPI PMU RPC Erase Failure",
125: "Inforom FS Failure",
126: "Reserved",
127: "Reserved",
128: "Reserved",
129: "Reserved",
130: "Reserved",
131: "Reserved",
132: "Reserved",
133: "Reserved",
134: "Reserved",
135: "Reserved",
136: "Reserved",
137: "Reserved",
138: "Reserved",
139: "Reserved",
140: "Unrecovered ECC Error",
141: "Reserved",
142: "Reserved",
143: "GPU Initialization Failure",
}
Loading