Skip to content

Commit

Permalink
fix: move driver_version into nvidia_smi_gpu_info labels
Browse files (browse the repository at this point in the history)
Signed-off-by: Utku Ozdemir <[email protected]>
  • Loading branch information
utkuozdemir committed Sep 3, 2021
1 parent 27316b4 commit d7a2ee9
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 19 deletions.
6 changes: 3 additions & 3 deletions internal/exporter/_query-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
uuid, name, driver_model.current, driver_model.pending, vbios_version, fan.speed [%], memory.used [MiB]
GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7a.40.73, 38 %, 575 MiB
GPU-04757e3e-3077-4e2e-b988-7e2d647b52e9, Some Other GPU, DoesntMatter, DoesntMatter, 1a.2b.3c.4d, 50 %, 1234 MiB
uuid, name, driver_model.current, driver_model.pending, vbios_version, driver_version, fan.speed [%], memory.used [MiB]
GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7a.40.73, 466.63, 38 %, 575 MiB
GPU-04757e3e-3077-4e2e-b988-7e2d647b52e9, Some Other GPU, DoesntMatter, DoesntMatter, 1a.2b.3c.4d, 123.45, 50 %, 1234 MiB
13 changes: 10 additions & 3 deletions internal/exporter/csv.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package exporter

import "strings"
import (
"fmt"
"strings"
)

type table struct {
rows []row
Expand All @@ -19,7 +22,7 @@ type cell struct {
rawValue string
}

func parseCSVIntoTable(queryResult string, qFields []qField) table {
func parseCSVIntoTable(queryResult string, qFields []qField) (table, error) {
lines := strings.Split(strings.TrimSpace(queryResult), "\n")
titlesLine := lines[0]
valuesLines := lines[1:]
Expand All @@ -39,6 +42,10 @@ func parseCSVIntoTable(queryResult string, qFields []qField) table {
qFieldToCell := make(map[qField]cell, numCols)
cells := make([]cell, numCols)
rawValues := parseCSVLine(valuesLine)
if len(qFields) != len(rFields) {
return table{}, fmt.Errorf("query fields (%d) and returned fields (%d) have different sizes", len(qFields), len(rFields))
}

for colIndex, rawValue := range rawValues {
q := qFields[colIndex]
r := rFields[colIndex]
Expand All @@ -65,7 +72,7 @@ func parseCSVIntoTable(queryResult string, qFields []qField) table {
rows: rows,
rFields: rFields,
qFieldToCells: qFieldToCells,
}
}, nil
}

func parseCSVLine(line string) []string {
Expand Down
3 changes: 2 additions & 1 deletion internal/exporter/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ Some Dummy GPU, 12.34 W
)

func TestParseCsvIntoTable(t *testing.T) {
parsed := parseCSVIntoTable(testCsv, []qField{"name", "power.draw"})
parsed, err := parseCSVIntoTable(testCsv, []qField{"name", "power.draw"})
assert.NoError(t, err)
assert.Len(t, parsed.rows, 2)
assert.Equal(t, []rField{"name", "power.draw [W]"}, parsed.rFields)

Expand Down
11 changes: 9 additions & 2 deletions internal/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ var (
{qField: driverModelCurrentQField, label: "driver_model_current"},
{qField: driverModelPendingQField, label: "driver_model_pending"},
{qField: vBiosVersionQField, label: "vbios_version"},
{qField: driverVersionQField, label: "driver_version"},
}

runCmd = func(cmd *exec.Cmd) error { return cmd.Run() }
Expand Down Expand Up @@ -156,9 +157,11 @@ func (e *gpuExporter) Collect(ch chan<- prometheus.Metric) {
driverModelCurrent := r.qFieldToCells[driverModelCurrentQField].rawValue
driverModelPending := r.qFieldToCells[driverModelPendingQField].rawValue
vBiosVersion := r.qFieldToCells[vBiosVersionQField].rawValue
driverVersion := r.qFieldToCells[driverVersionQField].rawValue

infoMetric := prometheus.MustNewConstMetric(e.gpuInfoDesc, prometheus.GaugeValue,
1, uuid, name, driverModelCurrent, driverModelPending, vBiosVersion)
1, uuid, name, driverModelCurrent,
driverModelPending, vBiosVersion, driverVersion)
ch <- infoMetric

for _, c := range r.cells {
Expand Down Expand Up @@ -193,7 +196,11 @@ func scrape(qFields []qField, nvidiaSmiCommand string) (*table, error) {
return nil, fmt.Errorf("command failed. stderr: %s err: %w", stderr.String(), err)
}

t := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
t, err := parseCSVIntoTable(strings.TrimSpace(stdout.String()), qFields)
if err != nil {
return nil, err
}

return &t, nil
}

Expand Down
7 changes: 4 additions & 3 deletions internal/exporter/exporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ end:
}
}

assert.Len(t, descStrs, 9)
assert.Len(t, descStrs, 10)
descs := strings.Join(descStrs, "\n")
assert.Contains(t, descs, "aaa_fan_speed")
assert.Contains(t, descs, "aaa_memory_used")
Expand All @@ -168,6 +168,7 @@ end:
assert.Contains(t, descs, "aaa_driver_model_current")
assert.Contains(t, descs, "aaa_driver_model_pending")
assert.Contains(t, descs, "aaa_vbios_version")
assert.Contains(t, descs, "aaa_driver_version")
}

func TestCollect(t *testing.T) {
Expand All @@ -182,7 +183,7 @@ func TestCollect(t *testing.T) {
logger := log.NewNopLogger()
exp, err := New("aaa", "bbb",
"uuid,name,driver_model.current,driver_model.pending,"+
"vbios_version,fan.speed,memory.used", logger)
"vbios_version,driver_version,fan.speed,memory.used", logger)
assert.NoError(t, err)

doneCh := make(chan bool)
Expand All @@ -207,7 +208,7 @@ end:

metricsJoined := strings.Join(metrics, "\n")

assert.Len(t, metrics, 7)
assert.Len(t, metrics, 9)
assert.Contains(t, metricsJoined, "aaa_gpu_info")
assert.Contains(t, metricsJoined, "aaa_name")
assert.Contains(t, metricsJoined, "aaa_fan_speed_ratio")
Expand Down
5 changes: 5 additions & 0 deletions internal/exporter/fields.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package exporter

import (
"bytes"
"errors"
"os/exec"
"regexp"
"strings"
Expand All @@ -13,6 +14,7 @@ const (
driverModelCurrentQField qField = "driver_model.current"
driverModelPendingQField qField = "driver_model.pending"
vBiosVersionQField qField = "vbios_version"
driverVersionQField qField = "driver_version"
qFieldsAuto = "AUTO"
DefaultQField = qFieldsAuto
)
Expand Down Expand Up @@ -153,6 +155,9 @@ func ParseAutoQFields(nvidiaSmiCommand string) ([]qField, error) {

out := stdout.String()
fields := extractQFields(out)
if fields == nil {
return nil, errors.New("could not extract any query fields")
}
return fields, nil
}

Expand Down
9 changes: 2 additions & 7 deletions samples/sample-source.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
#!/usr/bin/env sh




echo "timestamp, driver_version, count, name, serial, uuid, pci.bus_id, pci.domain, pci.bus, pci.device, pci.device_id, pci.sub_device_id, pcie.link.gen.current, pcie.link.gen.max, pcie.link.width.current, pcie.link.width.max, index, display_mode, display_active, persistence_mode, accounting.mode, accounting.buffer_size, driver_model.current, driver_model.pending, vbios_version, inforom.img, inforom.oem, inforom.ecc, inforom.pwr, gom.current, gom.pending, fan.speed [%], pstate, clocks_throttle_reasons.supported, clocks_throttle_reasons.active, clocks_throttle_reasons.gpu_idle, clocks_throttle_reasons.applications_clocks_setting, clocks_throttle_reasons.sw_power_cap, clocks_throttle_reasons.hw_slowdown, clocks_throttle_reasons.hw_thermal_slowdown, clocks_throttle_reasons.hw_power_brake_slowdown, clocks_throttle_reasons.sw_thermal_slowdown, clocks_throttle_reasons.sync_boost, memory.total [MiB], memory.used [MiB], memory.free [MiB], compute_mode, utilization.gpu [%], utilization.memory [%], encoder.stats.sessionCount, encoder.stats.averageFps, encoder.stats.averageLatency, ecc.mode.current, ecc.mode.pending, ecc.errors.corrected.volatile.device_memory, ecc.errors.corrected.volatile.dram, ecc.errors.corrected.volatile.register_file, ecc.errors.corrected.volatile.l1_cache, ecc.errors.corrected.volatile.l2_cache, ecc.errors.corrected.volatile.texture_memory, ecc.errors.corrected.volatile.cbu, ecc.errors.corrected.volatile.sram, ecc.errors.corrected.volatile.total, ecc.errors.corrected.aggregate.device_memory, ecc.errors.corrected.aggregate.dram, ecc.errors.corrected.aggregate.register_file, ecc.errors.corrected.aggregate.l1_cache, ecc.errors.corrected.aggregate.l2_cache, ecc.errors.corrected.aggregate.texture_memory, ecc.errors.corrected.aggregate.cbu, ecc.errors.corrected.aggregate.sram, ecc.errors.corrected.aggregate.total, ecc.errors.uncorrected.volatile.device_memory, ecc.errors.uncorrected.volatile.dram, ecc.errors.uncorrected.volatile.register_file, 
ecc.errors.uncorrected.volatile.l1_cache, ecc.errors.uncorrected.volatile.l2_cache, ecc.errors.uncorrected.volatile.texture_memory, ecc.errors.uncorrected.volatile.cbu, ecc.errors.uncorrected.volatile.sram, ecc.errors.uncorrected.volatile.total, ecc.errors.uncorrected.aggregate.device_memory, ecc.errors.uncorrected.aggregate.dram, ecc.errors.uncorrected.aggregate.register_file, ecc.errors.uncorrected.aggregate.l1_cache, ecc.errors.uncorrected.aggregate.l2_cache, ecc.errors.uncorrected.aggregate.texture_memory, ecc.errors.uncorrected.aggregate.cbu, ecc.errors.uncorrected.aggregate.sram, ecc.errors.uncorrected.aggregate.total, retired_pages.single_bit_ecc.count, retired_pages.double_bit.count, retired_pages.pending, temperature.gpu, temperature.memory, power.management, power.draw [W], power.limit [W], enforced.power.limit [W], power.default_limit [W], power.min_limit [W], power.max_limit [W], clocks.current.graphics [MHz], clocks.current.sm [MHz], clocks.current.memory [MHz], clocks.current.video [MHz], clocks.applications.graphics [MHz], clocks.applications.memory [MHz], clocks.default_applications.graphics [MHz], clocks.default_applications.memory [MHz], clocks.max.graphics [MHz], clocks.max.sm [MHz], clocks.max.memory [MHz], mig.mode.current, mig.mode.pending"
echo "2021/06/09 23:27:38.358, 466.63, 1, NVIDIA GeForce RTX 2080 SUPER, [N/A], GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Enabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 856 MiB, 7336 MiB, Default, 0 %, 10 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 35, N/A, Enabled, 31.12 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2145 MHz, 2145 MHz, 7751 MHz, [N/A], [N/A]"
echo "2042/06/09 23:27:38.358, 123.45, 1, Dummy GPU 1, [N/A], GPU-523e933c-cf12-4d85-95b4-dd144e8fc516, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Enabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 856 MiB, 7336 MiB, Default, 0 %, 10 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 35, N/A, Enabled, 31.12 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2145 MHz, 2145 MHz, 7751 MHz, [N/A], [N/A]"
echo "2021/06/12 18:45:18.358, 466.63, 1, Dummy GPU 2, [N/A], GPU-b2fe4f12-c3dd-4fa4-914b-9e7b975a0faa, 00000000:0C:00.0, 0x0000, 0x0C, 0x00, 0x1E8110DE, 0x40051458, 1, 3, 16, 16, 0, Enabled, Disabled, [N/A], Disabled, 4000, WDDM, WDDM, 90.04.7A.40.73, G001.0000.02.04, 1.1, [N/A], [N/A], [N/A], [N/A], 38 %, P8, 0x00000000000001FF, 0x0000000000000001, Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, Not Active, 8192 MiB, 779 MiB, 7413 MiB, Default, 2 %, 0 %, 0, 0, 0, [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], [N/A], 34, N/A, Enabled, 29.50 W, 250.00 W, 250.00 W, 250.00 W, 105.00 W, 350.00 W, 300 MHz, 300 MHz, 405 MHz, 540 MHz, [N/A], [N/A], [N/A], [N/A], 2235 MHz, 2235 MHz, 7751 MHz, [N/A], [N/A]"
echo "driver_version,uuid,name,driver_model.current,driver_model.pending,vbios_version"
echo "460.91.03,GPU-df6e7a7c-7314-46f8-abc4-b88b36dcf3aa,NVIDIA GeForce RTX 2080 SUPER, WDDM, WDDM, 90.04.7A.40.73"

0 comments on commit d7a2ee9

Please sign in to comment.