-
Notifications
You must be signed in to change notification settings - Fork 166
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pkg/dcgmexporter/gpu_collector.go: include an err_msg label in metric …
…DCGM_FI_DEV_XID_ERRORS. The DCGM_FI_DEV_XID_ERRORS metric reports the XID error code as its value; this commit adds an err_msg label whose value is taken from this NVIDIA doc: https://docs.nvidia.com/deploy/xid-errors/#topic_4 Signed-off-by: Xiaofan Hu <[email protected]>
- Loading branch information
Showing
2 changed files
with
159 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
package dcgmexporter | ||
|
||
// xidErrCodeToText maps an NVIDIA XID error code to a short human-readable
// description. The array index is the XID code itself, so a lookup is a
// plain index expression: xidErrCodeToText[code].
//
// The descriptions are based on the XID error listing in this doc:
// https://docs.nvidia.com/deploy/xid-errors/#topic_4
//
// Only codes 0 through 143 are populated. The array has 256 elements, so
// every index from 144 through 255 holds the empty string; callers must
// keep the index within [0, 255] and should treat "" as "no description
// available".
var xidErrCodeToText = [256]string{
	0:   "No Error",
	1:   "Invalid or corrupted push buffer stream",
	2:   "Invalid or corrupted push buffer stream",
	3:   "Invalid or corrupted push buffer stream",
	4:   "Invalid or corrupted push buffer stream",
	5:   "Unused",
	6:   "Invalid or corrupted push buffer stream",
	7:   "Invalid or corrupted push buffer address",
	8:   "GPU stopped processing",
	9:   "Driver error programming GPU",
	10:  "Unused",
	11:  "Invalid or corrupted push buffer stream",
	12:  "Driver error handling GPU exception",
	13:  "Graphics Engine Exception",
	14:  "Unused",
	15:  "Unused",
	16:  "Display engine hung",
	17:  "Unused",
	18:  "Bus mastering disabled in PCI Config Space",
	19:  "Display Engine error",
	20:  "Invalid or corrupted Mpeg push buffer",
	21:  "Invalid or corrupted Motion Estimation push buffer",
	22:  "Invalid or corrupted Video Processor push buffer",
	23:  "Unused",
	24:  "GPU semaphore timeout",
	25:  "Invalid or illegal push buffer stream",
	26:  "Framebuffer timeout",
	27:  "Video processor exception",
	28:  "Video processor exception",
	29:  "Video processor exception",
	30:  "GPU semaphore access error",
	31:  "GPU memory page fault",
	32:  "Invalid or corrupted push buffer stream",
	33:  "Internal micro-controller error",
	34:  "Video processor exception",
	35:  "Video processor exception",
	36:  "Video processor exception",
	37:  "Driver firmware error",
	38:  "Driver firmware error",
	39:  "Unused",
	40:  "Unused",
	41:  "Unused",
	42:  "Video processor exception",
	43:  "GPU stopped processing",
	44:  "Graphics Engine fault during context switch",
	45:  "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE",
	46:  "GPU stopped processing",
	47:  "Video processor exception",
	48:  "Double Bit ECC Error",
	49:  "Unused",
	50:  "Unused",
	51:  "Unused",
	52:  "Unused",
	53:  "Unused",
	54:  "Auxiliary power is not connected to the GPU board",
	55:  "Unused",
	56:  "Display Engine error",
	57:  "Error programming video memory interface",
	58:  "Unstable video memory interface detected",
	59:  "Internal micro-controller error",
	60:  "Video processor exception",
	61:  "Internal micro-controller breakpoint/warning",
	62:  "Internal micro-controller halt",
	63:  "ECC page retirement or row remapping recording event",
	64:  "ECC page retirement or row remapper recording failure",
	65:  "Video processor exception",
	66:  "Illegal access by driver",
	67:  "Illegal access by driver",
	68:  "NVDEC0 Exception",
	69:  "Graphics Engine class error",
	70:  "CE3: Unknown Error",
	71:  "CE4: Unknown Error",
	72:  "CE5: Unknown Error",
	73:  "NVENC2 Error",
	74:  "NVLINK Error",
	75:  "CE6: Unknown Error",
	76:  "CE7: Unknown Error",
	77:  "CE8: Unknown Error",
	78:  "vGPU Start Error",
	79:  "GPU has fallen off the bus",
	80:  "Corrupted data sent to GPU",
	81:  "VGA Subsystem Error",
	82:  "NVJPG0 Error",
	83:  "NVDEC1 Error",
	84:  "NVDEC2 Error",
	85:  "CE9: Unknown Error",
	86:  "OFA Exception",
	87:  "Reserved",
	88:  "NVDEC3 Error",
	89:  "NVDEC4 Error",
	90:  "Reserved",
	91:  "Reserved",
	92:  "High single-bit ECC error rate",
	93:  "Non-fatal violation of provisioned InfoROM wear limit",
	94:  "Contained ECC error",
	95:  "Uncontained ECC error",
	96:  "NVDEC5 Error",
	97:  "NVDEC6 Error",
	98:  "NVDEC7 Error",
	99:  "NVJPG1 Error",
	100: "NVJPG2 Error",
	101: "NVJPG3 Error",
	102: "NVJPG4 Error",
	103: "NVJPG5 Error",
	104: "NVJPG6 Error",
	105: "NVJPG7 Error",
	106: "SMBPBI Test Message",
	107: "SMBPBI Test Message Silent",
	108: "Reserved",
	109: "Context Switch Timeout Error",
	110: "Security Fault Error",
	111: "Display Bundle Error Event",
	112: "Display Supervisor Error",
	113: "DP Link Training Error",
	114: "Display Pipeline Underflow Error",
	115: "Display Core Channel Error",
	116: "Display Window Channel Error",
	117: "Display Cursor Channel Error",
	118: "Display Pixel Pipeline Error",
	119: "GSP RPC Timeout",
	120: "GSP Error",
	121: "C2C Link Error",
	122: "SPI PMU RPC Read Failure",
	123: "SPI PMU RPC Write Failure",
	124: "SPI PMU RPC Erase Failure",
	125: "Inforom FS Failure",
	126: "Reserved",
	127: "Reserved",
	128: "Reserved",
	129: "Reserved",
	130: "Reserved",
	131: "Reserved",
	132: "Reserved",
	133: "Reserved",
	134: "Reserved",
	135: "Reserved",
	136: "Reserved",
	137: "Reserved",
	138: "Reserved",
	139: "Reserved",
	140: "Unrecovered ECC Error",
	141: "Reserved",
	142: "Reserved",
	143: "GPU Initialization Failure",
}