Skip to content

Commit

Permalink
pkg/dcgmexporter/gpu_collector.go: include an err_msg label in metric …
Browse files Browse the repository at this point in the history
…DCGM_FI_DEV_XID_ERRORS

The DCGM_FI_DEV_XID_ERRORS metric reports the XID error code as its value. This commit adds an err_msg
label whose value is taken from this NVIDIA doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4

Signed-off-by: Xiaofan Hu <[email protected]>
  • Loading branch information
bom-d-van committed Apr 8, 2024
1 parent 5121ded commit ad53c4a
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 1 deletion.
11 changes: 10 additions & 1 deletion pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,15 @@ func ToMetric(

gpuModel := getGPUModel(d, replaceBlanksInModelName)

attrs := map[string]string{}
if counter.FieldName == "DCGM_FI_DEV_XID_ERRORS" {
if val.Int64() < 256 && xidErrCodeToText[val.Int64()] != "" {
attrs["err_msg"] = xidErrCodeToText[val.Int64()]
} else {
attrs["err_msg"] = "Unknown Error"
}
}

m := Metric{
Counter: counter,
Value: v,
Expand All @@ -295,7 +304,7 @@ func ToMetric(
Hostname: hostname,

Labels: labels,
Attributes: map[string]string{},
Attributes: attrs,
}
if instanceInfo != nil {
m.MigProfile = instanceInfo.ProfileName
Expand Down
149 changes: 149 additions & 0 deletions pkg/dcgmexporter/xid_errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package dcgmexporter

// xidErrCodeToText maps an NVIDIA XID error code (used as the array index) to
// a short human-readable description of that error.
//
// The descriptions are based on the XID error listing in the NVIDIA docs:
// https://docs.nvidia.com/deploy/xid-errors/#topic_4
//
// The table is a fixed-size array of 256 entries. Codes that are not listed
// below hold the empty string, which callers should treat as "no description
// available". Because this is an array, indexing with a code outside
// [0, 256) panics; callers must bounds-check the code before the lookup.
var xidErrCodeToText = [256]string{
	0:   "No Error",
	1:   "Invalid or corrupted push buffer stream",
	2:   "Invalid or corrupted push buffer stream",
	3:   "Invalid or corrupted push buffer stream",
	4:   "Invalid or corrupted push buffer stream",
	5:   "Unused",
	6:   "Invalid or corrupted push buffer stream",
	7:   "Invalid or corrupted push buffer address",
	8:   "GPU stopped processing",
	9:   "Driver error programming GPU",
	10:  "Unused",
	11:  "Invalid or corrupted push buffer stream",
	12:  "Driver error handling GPU exception",
	13:  "Graphics Engine Exception",
	14:  "Unused",
	15:  "Unused",
	16:  "Display engine hung",
	17:  "Unused",
	18:  "Bus mastering disabled in PCI Config Space",
	19:  "Display Engine error",
	20:  "Invalid or corrupted Mpeg push buffer",
	21:  "Invalid or corrupted Motion Estimation push buffer",
	22:  "Invalid or corrupted Video Processor push buffer",
	23:  "Unused",
	24:  "GPU semaphore timeout",
	25:  "Invalid or illegal push buffer stream",
	26:  "Framebuffer timeout",
	27:  "Video processor exception",
	28:  "Video processor exception",
	29:  "Video processor exception",
	30:  "GPU semaphore access error",
	31:  "GPU memory page fault",
	32:  "Invalid or corrupted push buffer stream",
	33:  "Internal micro-controller error",
	34:  "Video processor exception",
	35:  "Video processor exception",
	36:  "Video processor exception",
	37:  "Driver firmware error",
	38:  "Driver firmware error",
	39:  "Unused",
	40:  "Unused",
	41:  "Unused",
	42:  "Video processor exception",
	43:  "GPU stopped processing",
	44:  "Graphics Engine fault during context switch",
	// The description for 45 was previously cut off after "multiple cuda";
	// the full sentence from the NVIDIA table is restored here.
	45:  "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE",
	46:  "GPU stopped processing",
	47:  "Video processor exception",
	48:  "Double Bit ECC Error",
	49:  "Unused",
	50:  "Unused",
	51:  "Unused",
	52:  "Unused",
	53:  "Unused",
	54:  "Auxiliary power is not connected to the GPU board",
	55:  "Unused",
	56:  "Display Engine error",
	57:  "Error programming video memory interface",
	58:  "Unstable video memory interface detected",
	59:  "Internal micro-controller error",
	60:  "Video processor exception",
	61:  "Internal micro-controller breakpoint/warning",
	62:  "Internal micro-controller halt",
	63:  "ECC page retirement or row remapping recording event",
	64:  "ECC page retirement or row remapper recording failure",
	65:  "Video processor exception",
	66:  "Illegal access by driver",
	67:  "Illegal access by driver",
	68:  "NVDEC0 Exception",
	69:  "Graphics Engine class error",
	70:  "CE3: Unknown Error",
	71:  "CE4: Unknown Error",
	72:  "CE5: Unknown Error",
	73:  "NVENC2 Error",
	74:  "NVLINK Error",
	75:  "CE6: Unknown Error",
	76:  "CE7: Unknown Error",
	77:  "CE8: Unknown Error",
	78:  "vGPU Start Error",
	79:  "GPU has fallen off the bus",
	80:  "Corrupted data sent to GPU",
	81:  "VGA Subsystem Error",
	82:  "NVJPG0 Error",
	83:  "NVDEC1 Error",
	84:  "NVDEC2 Error",
	85:  "CE9: Unknown Error",
	86:  "OFA Exception",
	87:  "Reserved",
	88:  "NVDEC3 Error",
	89:  "NVDEC4 Error",
	90:  "Reserved",
	91:  "Reserved",
	92:  "High single-bit ECC error rate",
	93:  "Non-fatal violation of provisioned InfoROM wear limit",
	94:  "Contained ECC error",
	95:  "Uncontained ECC error",
	96:  "NVDEC5 Error",
	97:  "NVDEC6 Error",
	98:  "NVDEC7 Error",
	99:  "NVJPG1 Error",
	100: "NVJPG2 Error",
	101: "NVJPG3 Error",
	102: "NVJPG4 Error",
	103: "NVJPG5 Error",
	104: "NVJPG6 Error",
	105: "NVJPG7 Error",
	106: "SMBPBI Test Message",
	107: "SMBPBI Test Message Silent",
	108: "Reserved",
	109: "Context Switch Timeout Error",
	110: "Security Fault Error",
	111: "Display Bundle Error Event",
	112: "Display Supervisor Error",
	113: "DP Link Training Error",
	114: "Display Pipeline Underflow Error",
	115: "Display Core Channel Error",
	116: "Display Window Channel Error",
	117: "Display Cursor Channel Error",
	118: "Display Pixel Pipeline Error",
	119: "GSP RPC Timeout",
	120: "GSP Error",
	121: "C2C Link Error",
	122: "SPI PMU RPC Read Failure",
	123: "SPI PMU RPC Write Failure",
	124: "SPI PMU RPC Erase Failure",
	125: "Inforom FS Failure",
	126: "Reserved",
	127: "Reserved",
	128: "Reserved",
	129: "Reserved",
	130: "Reserved",
	131: "Reserved",
	132: "Reserved",
	133: "Reserved",
	134: "Reserved",
	135: "Reserved",
	136: "Reserved",
	137: "Reserved",
	138: "Reserved",
	139: "Reserved",
	140: "Unrecovered ECC Error",
	141: "Reserved",
	142: "Reserved",
	143: "GPU Initialization Failure",
}

0 comments on commit ad53c4a

Please sign in to comment.