From 583e5e99bfc4cf1a8bb8ba73a0f0622a06645523 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Tue, 19 Mar 2024 19:54:01 -0500 Subject: [PATCH 01/18] Update ROCm 6.0/6.1 CHANGELOG.md & README.md * Updates: - [CHANGELOG.md] Add 6.1 and update 6.0 changes - [README.md] Update README.md with ROCm install instructions Change-Id: Ic701ebcb00e5d0af54d8f97707c1cec71a0aac4c Signed-off-by: Charis Poag --- CHANGELOG.md | 408 ++++++++++++++++++++++++++++++++- README.md | 15 +- amdsmi_cli/README.md | 2 +- py-interface/amdsmi_wrapper.py | 3 +- 4 files changed, 420 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37513075..00728e53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,410 @@ # Change Log for AMD SMI Library -Full documentation for amd_smi_lib is available at [https://docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). +Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). + +***All information listed below is for reference and subject to change.*** + +## amd_smi_lib for ROCm 6.1.0 + +### Added +- **Added Monitor Command** +Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. +```shell +$ amd-smi monitor -h +usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] + [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n] + [-d] [-s] [-e] [-v] [-r] + +Monitor a target device for the specified arguments. +If no arguments are provided, all arguments will be enabled. +Use the watch arguments to run continuously + +Monitor Arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:01:00.0 | UUID: 4eff74a0-0000-1000-802d-1d762a397f73 + all | Selects all devices + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + ID: 0 + all | Selects all devices + -O, --core CORE [CORE ...] Select a Core ID from the possible choices: + ID: 0 - 23 + all | Selects all devices + -w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds + -W, --watch_time TIME The total TIME to watch the given command + -i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command + -p, --power-usage Monitor power usage in Watts + -t, --temperature Monitor temperature in Celsius + -u, --gfx Monitor graphics utilization (%) and clock (MHz) + -m, --mem Monitor memory utilization (%) and clock (MHz) + -n, --encoder Monitor encoder utilization (%) and clock (MHz) + -d, --decoder Monitor decoder utilization (%) and clock (MHz) + -s, --throttle-status Monitor thermal throttle status + -e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts + -v, --vram-usage Monitor memory usage in MB + -r, --pcie Monitor PCIe Tx/Rx in MB/s + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). 
+ --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` +```shell +$ amd-smi monitor -ptumv +GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_USED VRAM_TOTAL + 0 171 W 32 °C 33 °C 0 % 114 MHz 0 % 900 MHz 283 MB 196300 MB + 1 175 W 33 °C 34 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 2 177 W 31 °C 33 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 3 172 W 33 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 4 178 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 284 MB 196300 MB + 5 176 W 33 °C 35 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 6 176 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB +``` + +- **Integrated ESMI Tool** +Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: + - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh + - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh + + See a few examples listed below. + +```shell +$ amd-smi static -U all +CPU: 0 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 +``` +```shell +$ amd-smi metric -O 0 1 2 +CORE: 0 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 1 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 2 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A +``` +```shell +$ amd-smi metric -U all +CPU: 0 + POWER_METRICS: + SOCKET POWER: 102675 mW + SOCKET POWER LIMIT: 550000 mW + SOCKET MAX POWER LIMIT: 550000 mW + PROCHOT: + PROCHOT_STATUS: 0 + FREQ_METRICS: + FCLKMEMCLK: + FCLK: 2000 MHz + MCLK: 1300 MHz + CCLKFREQLIMIT: 400 MHz + SOC_CURRENT_ACTIVE_FREQ_LIMIT: + FREQ: 400 MHz + FREQ_SRC: [HSMP Agent] + SOC_FREQ_RANGE: + MAX_SOCKET_FREQ: 3700 MHz + MIN_SOCKET_FREQ: 400 MHz + C0_RESIDENCY: + RESIDENCY: 4 % + SVI_TELEMETRY_ALL_RAILS: + POWER: 102673 mW + METRIC_VERSION: + VERSION: 11 + METRICS_TABLE: + CPU_FAMILY: 25 + CPU_MODEL: 144 + RESPONSE: + MTBL_ACCUMULATION_COUNTER: 2887162626 + MTBL_MAX_SOCKET_TEMPERATURE: 41.0 °C + MTBL_MAX_VR_TEMPERATURE: 39.0 °C + MTBL_MAX_HBM_TEMPERATURE: 40.0 °C + MTBL_MAX_SOCKET_TEMPERATURE_ACC: 108583340881.125 °C + MTBL_MAX_VR_TEMPERATURE_ACC: 109472702595.0 °C + MTBL_MAX_HBM_TEMPERATURE_ACC: 111516663941.0 °C + MTBL_SOCKET_POWER_LIMIT: 550.0 W + MTBL_MAX_SOCKET_POWER_LIMIT: 550.0 W + MTBL_SOCKET_POWER: 102.678 W + MTBL_TIMESTAMP_RAW: 288731677361880 + MTBL_TIMESTAMP_READABLE: Tue Mar 19 12:32:21 2024 + MTBL_SOCKET_ENERGY_ACC: 166127.84 kJ + MTBL_CCD_ENERGY_ACC: 3317.837 kJ + MTBL_XCD_ENERGY_ACC: 21889.147 kJ + MTBL_AID_ENERGY_ACC: 121932.397 kJ + MTBL_HBM_ENERGY_ACC: 18994.108 kJ + MTBL_CCLK_FREQUENCY_LIMIT: 3.7 GHz + MTBL_GFXCLK_FREQUENCY_LIMIT: 0.0 MHz + MTBL_FCLK_FREQUENCY: 1999.988 MHz + MTBL_UCLK_FREQUENCY: 1299.993 MHz + MTBL_SOCCLK_FREQUENCY: [35.716, 35.715, 35.714, 35.714] MHz + MTBL_VCLK_FREQUENCY: [0.0, 53.749, 53.749, 53.749] MHz + MTBL_DCLK_FREQUENCY: [7.143, 44.791, 44.791, 44.791] MHz + MTBL_LCLK_FREQUENCY: [20.872, 18.75, 35.938, 599.558] MHz + MTBL_FCLK_FREQUENCY_TABLE: [1200.0, 1600.0, 1900.0, 2000.0] MHz + MTBL_UCLK_FREQUENCY_TABLE: [900.0, 1100.0, 1200.0, 1300.0] MHz + MTBL_SOCCLK_FREQUENCY_TABLE: [800.0, 1000.0, 1142.857, 1142.857] MHz + 
MTBL_VCLK_FREQUENCY_TABLE: [914.286, 1300.0, 1560.0, 1720.0] MHz + MTBL_DCLK_FREQUENCY_TABLE: [711.111, 975.0, 1300.0, 1433.333] MHz + MTBL_LCLK_FREQUENCY_TABLE: [600.0, 844.444, 1150.0, 1150.0] MHz + MTBL_CCLK_FREQUENCY_ACC: [4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] GHz + MTBL_GFXCLK_FREQUENCY_ACC: [0.0, 0.0, 250534397827.603, 251546257401.82, 250811364089.836, + 249999070486.505, 251622633562.855, 251342375116.05] MHz + MTBL_GFXCLK_FREQUENCY: [0.0, 0.0, 31.091, 31.414, 31.141, 31.478, 31.32, 31.453] + MHz + MTBL_MAX_CCLK_FREQUENCY: 3.7 GHz + MTBL_MIN_CCLK_FREQUENCY: 0.4 GHz + MTBL_MAX_GFXCLK_FREQUENCY: 2100.0 MHz + MTBL_MIN_GFXCLK_FREQUENCY: 500.0 MHz + MTBL_MAX_LCLK_DPM_RANGE: 2 + MTBL_MIN_LCLK_DPM_RANGE: 0 + MTBL_XGMI_WIDTH: 0.0 + MTBL_XGMI_BITRATE: 0.0 Gbps + MTBL_XGMI_READ_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_XGMI_WRITE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_SOCKET_C0_RESIDENCY: 4.329 % + MTBL_SOCKET_GFX_BUSY: 0.0 % + MTBL_HBM_BANDWIDTH_UTILIZATION: 0.001 % + MTBL_SOCKET_C0_RESIDENCY_ACC: 311523106.34 + MTBL_SOCKET_GFX_BUSY_ACC: 84739.281 + MTBL_HBM_BANDWIDTH_ACC: 33231180.073 Gbps + MTBL_MAX_HBM_BANDWIDTH: 5324.801 Gbps + MTBL_DRAM_BANDWIDTH_UTILIZATION_ACC: 612843.699 + MTBL_PCIE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0] Gbps + MTBL_PROCHOT_RESIDENCY_ACC: 0 + MTBL_PPT_RESIDENCY_ACC: 2887162626 + MTBL_SOCKET_THM_RESIDENCY_ACC: 2887162626 + MTBL_VR_THM_RESIDENCY_ACC: 0 + MTBL_HBM_THM_RESIDENCY_ACC: 2887162626 + SOCKET_ENERGY: + RESPONSE: N/A + DDR_BANDWIDTH: + RESPONSE: N/A + CPU_TEMP: + RESPONSE: N/A +``` +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. 
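+In addition to the CLI examples below, these counters can also be read programmatically. A minimal Python sketch (assuming the `amdsmi` Python package that ships with amd-smi-lib, and using `amdsmi_get_pcie_info` as documented in the py-interface README) could look like:
+```python
+from amdsmi import *
+
+try:
+    amdsmi_init()
+    for device in amdsmi_get_processor_handles():
+        # 'pcie_metric' holds the PCIe error counters described above
+        pcie_metric = amdsmi_get_pcie_info(device)["pcie_metric"]
+        print("replay:", pcie_metric["pcie_replay_count"],
+              "| l0_to_recovery:", pcie_metric["pcie_l0_to_recovery_count"],
+              "| nak_sent:", pcie_metric["pcie_nak_sent_count"],
+              "| nak_received:", pcie_metric["pcie_nak_received_count"])
+except AmdSmiException as e:
+    print(e)
+finally:
+    amdsmi_shut_down()
+```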
+ +```shell +$ amd-smi metric -P +GPU: 0 + PCIE: + WIDTH: 16 + SPEED: 16 GT/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A +``` +```shell +$ amd-smi metric --usage +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 + %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, + 0 %, 0 %, 0 %, 0 %] + +``` +- **Added AMDSMI Tool Version** +AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. +The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. +The AMDSMI Library version is the library package version number. +The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. +```shell +$ amd-smi version +AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 +``` + +- **Added XGMI table** +Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). +```shell +$ amd-smi xgmi +LINK METRIC TABLE: + bdf bit_rate max_bandwidth link_type 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +GPU0 0000:0c:00.0 32 Gb/s 512 Gb/s XGMI + Read N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB 2 KB + Write N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU1 0000:22:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB + Write 0 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU2 0000:38:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB N/A 2 KB 1 KB 2 KB 0 KB 0 KB + Write 0 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB +GPU3 0000:5c:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 2 KB N/A 1 KB 0 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB +GPU4 0000:9f:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB 0 KB 0 KB N/A 2 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB +GPU5 0000:af:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 2 KB 0 KB 0 KB 0 KB N/A 2 KB 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB +GPU6 0000:bf:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB +GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A + +``` +- **Added units of measure to JSON output.** +We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. + +Ex. +```shell +amd-smi metric -p --json +[ + { + "gpu": 0, + "power": { + "socket_power": { + "value": 10, + "unit": "W" + }, + "gfx_voltage": { + "value": 6, + "unit": "mV" + }, + "soc_voltage": { + "value": 918, + "unit": "mV" + }, + "mem_voltage": { + "value": 1250, + "unit": "mV" + }, + "power_management": "ENABLED", + "throttle_status": "UNTHROTTLED" + } + } +] +``` + +### Changed + +- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** +We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. 
Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. +```shell +$ amd-smi topology +ACCESS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:22:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:38:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:5c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:9f:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:af:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:bf:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:df:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED + +WEIGHT TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 15 15 15 15 15 15 15 +0000:22:00.0 15 0 15 15 15 15 15 15 +0000:38:00.0 15 15 0 15 15 15 15 15 +0000:5c:00.0 15 15 15 0 15 15 15 15 +0000:9f:00.0 15 15 15 15 0 15 15 15 +0000:af:00.0 15 15 15 15 15 0 15 15 +0000:bf:00.0 15 15 15 15 15 15 0 15 +0000:df:00.0 15 15 15 15 15 15 15 0 + +HOPS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 1 1 1 1 1 1 1 +0000:22:00.0 1 0 1 1 1 1 1 1 +0000:38:00.0 1 1 0 1 1 1 1 1 +0000:5c:00.0 1 1 1 0 1 1 1 1 +0000:9f:00.0 1 1 1 1 0 1 1 1 +0000:af:00.0 1 1 1 1 1 0 1 1 +0000:bf:00.0 1 1 1 1 1 1 0 1 +0000:df:00.0 1 1 1 1 1 1 1 0 + +LINK TYPE TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF XGMI XGMI XGMI XGMI XGMI XGMI XGMI +0000:22:00.0 XGMI SELF XGMI XGMI XGMI XGMI XGMI XGMI +0000:38:00.0 XGMI XGMI SELF XGMI XGMI XGMI XGMI XGMI +0000:5c:00.0 XGMI XGMI XGMI SELF XGMI XGMI XGMI XGMI +0000:9f:00.0 XGMI XGMI XGMI XGMI SELF XGMI XGMI XGMI +0000:af:00.0 XGMI XGMI XGMI XGMI XGMI SELF XGMI XGMI +0000:bf:00.0 XGMI XGMI XGMI XGMI XGMI XGMI SELF XGMI +0000:df:00.0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI SELF + +NUMA BW TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:22:00.0 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:38:00.0 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:5c:00.0 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 +0000:9f:00.0 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 +0000:af:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 +0000:bf:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 +0000:df:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A +``` + +### Optimizations +- N/A + +### Fixed + +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. 
+- **Fix for devices which have an older pyyaml installed**
+On platforms identified as having an older pyyaml or pip version, we now manually update both pip and pyyaml as needed. This corrects the issue shown below. The fix impacts the following CLI commands:
+ - `amd-smi list`
+ - `amd-smi static`
+ - `amd-smi firmware`
+ - `amd-smi metric`
+ - `amd-smi topology`
+```shell
+TypeError: dump_all() got an unexpected keyword argument 'sort_keys'
+```
+- **Fix for crash when user is not a member of video/render groups**
+AMD SMI now uses the same device mutex handler as rocm-smi. This helps avoid crashes when DRM/device data is inaccessible to the logged-in user.
+
+
+
+### Known Issues
+
+- N/A

 ## amd_smi_lib for ROCm 6.0.0

@@ -26,7 +430,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese

 ### Optimizations

-- N/A
+- Updated to C++17, gtest-1.14, and cmake 3.14

 ### Fixed

diff --git a/README.md b/README.md
index 109f890e..e50ac965 100755
--- a/README.md
+++ b/README.md
@@ -26,11 +26,18 @@ installed to query firmware information and hardware IPs.

 ### Installation

-* Install amdgpu driver
-* Install amd-smi-lib package through package manager
+### Install amdgpu using ROCm
+* Install amdgpu driver:
+See the example below; your release and link may differ. Running `amdgpu-install --usecase=rocm` installs both the amdgpu driver and the AMD SMI packages on your device.
+```shell
+sudo apt update
+wget https://repo.radeon.com/amdgpu-install/6.0.2/ubuntu/jammy/amdgpu-install_6.0.60002-1_all.deb
+sudo apt install ./amdgpu-install_6.0.60002-1_all.deb
+sudo amdgpu-install --usecase=rocm
+```
 * amd-smi --help

-### Install Example for Ubuntu 22.04
+### Install Example for Ubuntu 22.04 (without ROCm)

 ``` bash
 apt install amd-smi-lib
@@ -277,4 +284,4 @@ Path to the program `amdsmitst`: build/tests/amd_smi_test/

 The information contained herein is for informational purposes only, and is subject to change without notice. In addition, any stated support is planned and is also subject to change. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein.

-© 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+© 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved.
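Following the installation steps above, the Python interface offers a quick sanity check beyond `amd-smi --help`. A minimal sketch (assuming the `amdsmi` Python package that ships with amd-smi-lib, and using `amdsmi_get_gpu_vram_usage` as documented in the py-interface README):

```python
from amdsmi import *

try:
    amdsmi_init()
    devices = amdsmi_get_processor_handles()
    if len(devices) == 0:
        print("No GPUs on machine")
    else:
        for device in devices:
            # Reports 'vram_used' and 'vram_total' for each detected GPU
            vram = amdsmi_get_gpu_vram_usage(device)
            print("vram_used:", vram["vram_used"], "vram_total:", vram["vram_total"])
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```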
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index cf2b81df..3273f807 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -15,7 +15,7 @@ Recommended: At least one AMD GPU with AMD driver installed ### Installation -* Install amdgpu driver +* [Install amdgpu driver](../README.md#install-amdgpu-using-rocm) * Optionally install amd_hsmp driver for ESMI CPU functions * Install amd-smi-lib package through package manager * amd-smi --help diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 8fcdb375..f718dcfa 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1688,7 +1688,8 @@ class struct_amdsmi_error_count_t(Structure): struct_amdsmi_error_count_t._fields_ = [ ('correctable_count', ctypes.c_uint64), ('uncorrectable_count', ctypes.c_uint64), - ('reserved', ctypes.c_uint64 * 2), + ('deferred_count', ctypes.c_uint64), + ('reserved', ctypes.c_uint64 * 5), ] amdsmi_error_count_t = struct_amdsmi_error_count_t From a3407090c3eaa339a53d8a9cac9f8d51d04337ed Mon Sep 17 00:00:00 2001 From: Deepak Mewar Date: Mon, 18 Mar 2024 04:49:17 -0400 Subject: [PATCH 02/18] Updated README with esmi sample code Change-Id: I50de7926fd76757e5810e8c531bcb6f5770ff454 --- README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e50ac965..bd25e588 100755 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ The only required AMD-SMI call for any program that wants to use AMD-SMI is the When AMD-SMI is no longer being used, `amdsmi_shut_down()` should be called. This provides a way to do any releasing of resources that AMD-SMI may have held. -A simple "Hello World" type program that displays the temperature of detected devices would look like this: +1) A simple "Hello World" type program that displays the temperature of detected devices would look like this: ```c++ #include @@ -184,6 +184,67 @@ int main() { } ``` +2) A sample program that displays the power of detected cpus would look like this: + +```c++ +#include +#include +#include "amd_smi/amdsmi.h" + +int main(int argc, char **argv) { + amdsmi_status_t ret; + uint32_t socket_count = 0; + + // Initialize amdsmi for AMD CPUs + ret = amdsmi_init(AMDSMI_INIT_AMD_CPUS); + + ret = amdsmi_get_socket_handles(&socket_count, nullptr); + + // Allocate the memory for the sockets + std::vector sockets(socket_count); + + // Get the sockets of the system + ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]); + + std::cout << "Total Socket: " << socket_count << std::endl; + + // For each socket, get cpus + for (uint32_t i = 0; i < socket_count; i++) { + uint32_t cpu_count = 0; + + // Set processor type as AMD_CPU + processor_type_t processor_type = AMD_CPU; + ret = amdsmi_get_processor_handles_by_type(sockets[i], processor_type, nullptr, &cpu_count); + + // Allocate the memory for the cpus + std::vector plist(cpu_count); + + // Get the cpus for each socket + ret = amdsmi_get_processor_handles_by_type(sockets[i], processor_type, &plist[0], &cpu_count); + + for (uint32_t index = 0; index < plist.size(); index++) { + uint32_t socket_power; + std::cout<<"CPU "<(socket_power)/1000< Date: Thu, 21 Mar 2024 14:53:35 -0500 Subject: [PATCH 03/18] SWDEV-438593 - Updated proccess output error handling Signed-off-by: Maisam Arif Change-Id: I67747da06362428587dab7467d85d8c9296d442e --- amdsmi_cli/amdsmi_commands.py | 39 ++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 14 deletions(-) 
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 697513f5..27152afc 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2511,6 +2511,8 @@ def process(self, args, multiple_devices=False, watching_output=False, if args.pid: process_pids = [] for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue pid = str(process_info['process_info']['pid']) if str(args.pid) == pid: process_pids.append(process_info) @@ -2520,36 +2522,45 @@ def process(self, args, multiple_devices=False, watching_output=False, if args.name: process_names = [] for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue process_name = str(process_info['process_info']['name']).lower() if str(args.name).lower() == process_name: process_names.append(process_info) filtered_process_values = process_names + logging.debug(f"Process Info for GPU {gpu_id} | {filtered_process_values}") + multiple_devices_csv_override = False # Convert and store output by pid for csv format if self.logger.is_csv_format(): - for process_info in filtered_process_values: - for key, value in process_info['process_info'].items(): - multiple_devices_csv_override = True - - if watching_output: - self.logger.store_output(args.gpu, 'timestamp', int(time.time())) - self.logger.store_output(args.gpu, key, value) + # Check for empty list first + if filtered_process_values == []: + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') + else: + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') + else: + for key, value in process_info['process_info'].items(): + multiple_devices_csv_override = True + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, key, value) - self.logger.store_multiple_device_output() + self.logger.store_multiple_device_output() else: - # Remove brackets if there is only one value - if len(filtered_process_values) == 1: - filtered_process_values = filtered_process_values[0] - if watching_output: self.logger.store_output(args.gpu, 'timestamp', int(time.time())) # Store values in logger.output if filtered_process_values == []: - self.logger.store_output(args.gpu, 'values', {'process_info': 'Not Found'}) + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') else: - self.logger.store_output(args.gpu, 'values', filtered_process_values) + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + process_info['process_info'] = 'No running processes detected' + self.logger.store_output(args.gpu, 'process_info', process_info['process_info']) if multiple_devices: self.logger.store_multiple_device_output() From 1310c767ce6cc600c37256dff156782e125bb868 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 5 Mar 2024 14:01:06 -0600 Subject: [PATCH 04/18] fix: [SWDEV-448201] [rocm/amd_smi_lib] Adds Add PCIE Errors Code changes related to the following: * amdsmi_get_pcie_info() * CLI * examples Change-Id: Ie0b7053e77c88fb18309c16e74bce75d862c45a9 Signed-off-by: Oliveira, Daniel --- amdsmi_cli/amdsmi_commands.py | 107 ++++++++++----------------- example/amd_smi_drm_example.cc | 8 ++ include/amd_smi/amdsmi.h | 2 +- include/amd_smi/impl/amd_smi_utils.h | 54 ++++++++++++++ py-interface/README.md | 58 +++++++-------- 
py-interface/amdsmi_interface.py | 47 +++++++++++- src/amd_smi/amd_smi.cc | 28 ++++++- 7 files changed, 201 insertions(+), 103 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 27152afc..35af8395 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -361,11 +361,11 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) try: - link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) - bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width'] - bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed'] - bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version'] - + pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static'] + bus_info['max_pcie_width'] = pcie_static['max_pcie_width'] + bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed'] + bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version'] + bus_info['slot_type'] = pcie_static['slot_type'] if bus_info['max_pcie_speed'] % 1000 != 0: pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1) else: @@ -373,14 +373,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None bus_info['max_pcie_speed'] = pcie_speed_GTs_value - slot_type = link_caps['pcie_static']['slot_type'] - if isinstance(slot_type, int): - slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues - if slot_type in slot_types: - bus_info['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") - else: - bus_info['slot_type'] = "Unknown" - if bus_info['pcie_interface_version'] > 0: bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}" @@ -636,7 +628,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None except amdsmi_exception.AmdSmiLibraryException as e: policy_info = "N/A" logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) - + static_dict['dpm_policy'] = policy_info if 'numa' in current_platform_args: if args.numa: @@ -1460,6 +1452,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No if args.pcie: pcie_dict = {"width": "N/A", "speed": "N/A", + "bandwidth": "N/A", "replay_count" : "N/A", "l0_to_recovery_count" : "N/A", "replay_roll_over_count" : "N/A", @@ -1470,65 +1463,43 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No "max_packet_size": "N/A"} try: - pcie_link_status = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) - if pcie_link_status['pcie_metric']['pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000, 1) - else: - pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000) + pcie_dict['width'] = pcie_metric['pcie_width'] - pcie_dict['width'] = pcie_link_status['pcie_metric']['pcie_width'] - pcie_dict['speed'] = pcie_speed_GTs_value + if pcie_metric['pcie_speed'] != "N/A": + if pcie_metric['pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) + pcie_dict['speed'] = pcie_speed_GTs_value + + pcie_dict['bandwidth'] = 
pcie_metric['pcie_bandwidth'] + pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] + pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] + pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] + pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] + pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] pcie_speed_unit = 'GT/s' + pcie_bw_unit = 'Mb/s' if self.logger.is_human_readable_format(): - pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" if self.logger.is_json_format(): - pcie_dict['speed'] = {"value" : pcie_dict['speed'], - "unit" : pcie_speed_unit} + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = {"value" : pcie_dict['speed'], + "unit" : pcie_speed_unit} + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], + "unit" : pcie_bw_unit} except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) - try: - pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc'] - if pci_replay_counter == "N/A": - # raising exception here to fall back to sysfs - raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - pcie_dict['replay_count'] = pci_replay_counter - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) - logging.debug("Falling back to sysfs pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) - try: - pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) - pcie_dict['replay_count'] = pci_replay_counter - except amdsmi_exception.AmdSmiLibraryException as err: - pcie_dict['replay_count'] = "N/A" - logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info()) - - try: - l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc'] - pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['l0_to_recovery_count'] = "N/A" - logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc'] - pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['replay_roll_over_count'] = "N/A" - logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc'] - pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc'] - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['nak_sent_count'] = "N/A" - pcie_dict['nak_received_count'] = "N/A" - logging.debug("Failed to get pcie nak info for gpu %s | %s", gpu_id, e.get_error_info()) - try: pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) sent = 
pcie_bw['sent'] * pcie_bw['max_pkt_sz'] @@ -4134,14 +4105,14 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): } try: - pcie_info = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static'] - if pcie_info['max_pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000, 1) + pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static'] + if pcie_static['max_pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1) else: - pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000) + pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000) bitrate = pcie_speed_GTs_value - max_bandwidth = bitrate * pcie_info['max_pcie_width'] + max_bandwidth = bitrate * pcie_static['max_pcie_width'] except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id, e.get_error_info()) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index ea28b8eb..cd9a3a1f 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -411,6 +411,14 @@ int main() { printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width); printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed); + // additional pcie related metrics + printf("\tPCIe bandwidth: %d\n", pcie_info.pcie_metric.pcie_bandwidth); + printf("\tPCIe replay count: %d\n", pcie_info.pcie_metric.pcie_replay_count); + printf("\tPCIe L0 recovery count: %d\n", pcie_info.pcie_metric.pcie_l0_to_recovery_count); + printf("\tPCIe rollover count: %d\n", pcie_info.pcie_metric.pcie_replay_roll_over_count); + printf("\tPCIe nak received count: %d\n", pcie_info.pcie_metric.pcie_nak_received_count); + printf("\tPCIe nak sent count: %d\n", pcie_info.pcie_metric.pcie_nak_sent_count); + // Get VRAM temperature limit int64_t temperature = 0; ret = amdsmi_get_temp_metric( diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index ef58a6ce..861709b9 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -509,7 +509,7 @@ typedef struct { struct pcie_metric_ { uint16_t pcie_width; //!< current PCIe width uint32_t pcie_speed; //!< current PCIe speed in MT/s - uint32_t pcie_bandwidth; //!< current PCIe bandwidth Mb/s + uint32_t pcie_bandwidth; //!< current instantaneous PCIe bandwidth in Mb/s uint64_t pcie_replay_count; //!< total number of the replays issued on the PCIe link uint64_t pcie_l0_to_recovery_count; //!< total number of times the PCIe link transitioned from L0 to the recovery state uint64_t pcie_replay_roll_over_count; //!< total number of replay rollovers issued on the PCIe link diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index 7d2df9b1..30897b34 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -21,6 +21,9 @@ #ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ #define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ +#include +#include + #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_gpu_device.h" #include "rocm_smi/rocm_smi_utils.h" @@ -45,4 +48,55 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(uint32_t device_id, char *market_name); amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled); + +template +constexpr bool is_dependent_false_v = false; + +template +inline 
constexpr bool is_supported_type_v = ( + std::is_same_v>, std::uint8_t> || + std::is_same_v>, std::uint16_t> || + std::is_same_v>, std::uint32_t> || + std::is_same_v>, std::uint64_t> +); + +template +constexpr T get_std_num_limit() +{ + if constexpr (is_supported_type_v) { + return std::numeric_limits::max(); + } + else { + return std::numeric_limits::min(); + static_assert(is_dependent_false_v, "Error: Type not supported..."); + } +} + +template +constexpr bool is_std_num_limit(T value) +{ + return (value == get_std_num_limit()); +} + +template +constexpr T translate_umax_or_assign_value(U source_value, V target_value) +{ + T result{}; + if constexpr (is_supported_type_v && is_supported_type_v) { + // If the source value is uint::max(), then return is uint::max() + if (is_std_num_limit(source_value)) { + result = get_std_num_limit(); + } else { + result = static_cast(target_value); + } + + return result; + } + else { + static_assert(is_dependent_false_v, "Error: Type not supported..."); + } + + return result; +} + #endif // diff --git a/py-interface/README.md b/py-interface/README.md index fbc5450c..7d9fd590 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -580,7 +580,7 @@ Output: Dictionary with fields Field | Description ---|--- -`fw_list`| List of dictionaries that contain information about a certain firmware block +`fw_list` | List of dictionaries that contain information about a certain firmware block Exceptions that can be thrown by `amdsmi_get_fw_info` function: @@ -619,7 +619,7 @@ Output: Dictionary of activites to their respective usage percentage or 'N/A' if Field | Description ---|--- -`gfx_activity`| graphics engine usage percentage (0 - 100) +`gfx_activity` | graphics engine usage percentage (0 - 100) `umc_activity` | memory engine usage percentage (0 - 100) `mm_activity` | average multimedia engine usages in percentage (0 - 100) @@ -659,7 +659,7 @@ Output: Dictionary with fields Field | Description ---|--- -`average_socket_power`| average socket power +`average_socket_power` | average socket power `gfx_voltage` | voltage gfx `power_limit` | power limit @@ -699,7 +699,7 @@ Output: Dictionary with fields Field | Description ---|--- `vram_total` | VRAM total -`vram_used`| VRAM currently in use +`vram_used` | VRAM currently in use Exceptions that can be thrown by `amdsmi_get_gpu_vram_usage` function: @@ -751,7 +751,7 @@ Output: Dictionary with fields Field | Description ---|--- -`cur_clk`| Current clock for given clock type +`cur_clk` | Current clock for given clock type `max_clk` | Maximum clock for given clock type `min_clk` | Minimum clock for given clock type @@ -780,20 +780,19 @@ except AmdSmiException as e: ### amdsmi_get_pcie_info -Description: Returns the pcie link status for the given GPU. +Description: Returns the pcie metric and static information for the given GPU. It is not supported on virtual machine guest Input parameters: * `processor_handle` device which to query -Output: Dictionary with fields +Output: Dictionary with 2 fields `pcie_static` and `pcie_metric` -Field | Description +Fields | Description ---|--- -`pcie_width`| pcie lanes in use -`pcie_speed`| current pcie speed -`pcie_interface_version`| current pcie generation +`pcie_static` |
<table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`max_pcie_width`</td><td>Maximum number of pcie lanes available</td></tr><tr><td>`max_pcie_speed`</td><td>Maximum capable pcie speed in GT/s</td></tr><tr><td>`pcie_interface_version`</td><td>PCIe generation ie. 3,4,5...</td></tr><tr><td>`slot_type`</td><td>The type of form factor of the slot: PCIE, OAM, or Unknown</td></tr></tbody></table>
+`pcie_metric` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`pcie_width`</td><td>Current number of pcie lanes available</td></tr><tr><td>`pcie_speed`</td><td>Current pcie speed capable in GT/s</td></tr><tr><td>`pcie_bandwidth`</td><td>Current instantaneous bandwidth usage in Mb/s</td></tr><tr><td>`pcie_replay_count`</td><td>Total number of PCIe replays (NAKs)</td></tr><tr><td>`pcie_l0_to_recovery_count`</td><td>PCIE L0 to recovery state transition accumulated count</td></tr><tr><td>`pcie_replay_roll_over_count`</td><td>PCIe Replay accumulated count</td></tr><tr><td>`pcie_nak_sent_count`</td><td>PCIe NAK sent accumulated count</td></tr><tr><td>`pcie_nak_received_count`</td><td>PCIe NAK received accumulated count</td></tr></tbody></table>
Exceptions that can be thrown by `amdsmi_get_pcie_info` function: @@ -810,10 +809,9 @@ try: print("No GPUs on machine") else: for device in devices: - pcie_link_status = amdsmi_get_pcie_info(device) - print(pcie_link_status["pcie_width"]) - print(pcie_link_status["pcie_speed"]) - print(pcie_link_status["pcie_interface_version"]) + pcie_info = amdsmi_get_pcie_info(device) + print(pcie_info["pcie_static"]) + print(pcie_info["pcie_metric"]) except AmdSmiException as e: print(e) ``` @@ -949,8 +947,8 @@ Output: Dictionary with fields Field | Description ---|--- -`correctable_count`| Correctable ECC error count -`uncorrectable_count`| Uncorrectable ECC error count +`correctable_count` | Correctable ECC error count +`uncorrectable_count` | Uncorrectable ECC error count Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function: @@ -2021,9 +2019,9 @@ Output: Dictionary with fields Field | Description ---|--- -`num_supported`| The number of supported frequencies -`current`| The current frequency index -`frequency`| List of frequencies, only the first num_supported frequencies are valid +`num_supported` | The number of supported frequencies +`current` | The current frequency index +`frequency` | List of frequencies, only the first num_supported frequencies are valid Exceptions that can be thrown by `amdsmi_get_clk_freq` function: @@ -2062,8 +2060,8 @@ Field | Description `curr_mclk_range` |
<table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range</td></tr></tbody></table>
`sclk_freq_limits` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound sclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound sclk range limit</td></tr></tbody></table>
`mclk_freq_limits` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range limit</td></tr></tbody></table>
-`curve.vc_points`| The number of supported frequencies -`num_regions`| The current frequency index +`curve.vc_points` | The number of supported frequencies +`num_regions` | The current frequency index Exceptions that can be thrown by `amdsmi_get_gpu_od_volt_info` function: @@ -2228,9 +2226,9 @@ Output: Dictionary with fields Field | Description ---|--- -`available_profiles`| Which profiles are supported by this system -`current`| Which power profile is currently active -`num_profiles`| How many power profiles are available +`available_profiles` | Which profiles are supported by this system +`current` | Which power profile is currently active +`num_profiles` | How many power profiles are available Exceptions that can be thrown by `amdsmi_get_gpu_power_profile_presets` function: @@ -2391,9 +2389,9 @@ Output: Dictionary with fields Field | Description ---|--- -`value`| Counter value -`time_enabled`| Time that the counter was enabled in nanoseconds -`time_running`| Time that the counter was running in nanoseconds +`value` | Counter value +`time_enabled` | Time that the counter was enabled in nanoseconds +`time_running` | Time that the counter was running in nanoseconds Exceptions that can be thrown by `amdsmi_gpu_read_counter` function: @@ -2661,8 +2659,8 @@ Output: Dict containing information about error counts Field | Description ---|--- -`correctable_count`| Count of correctable errors -`uncorrectable_count`| Count of uncorrectable errors +`correctable_count` | Count of correctable errors +`uncorrectable_count` | Count of uncorrectable errors Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function: diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 7eb501bb..e27451da 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2134,7 +2134,7 @@ def amdsmi_get_pcie_info( ) ) - return { + pcie_info_dict = { "pcie_static": { "max_pcie_width": pcie_info.pcie_static.max_pcie_width, "max_pcie_speed": pcie_info.pcie_static.max_pcie_speed, @@ -2153,6 +2153,49 @@ def amdsmi_get_pcie_info( } } + # Check pcie static values for uint max + if pcie_info_dict['pcie_static']['max_pcie_width'] == 0xFFFF: + pcie_info_dict['pcie_static']['max_pcie_width'] = "N/A" + if pcie_info_dict['pcie_static']['max_pcie_speed'] == 0xFFFFFFFF: + pcie_info_dict['pcie_static']['max_pcie_speed'] = "N/A" + if pcie_info_dict['pcie_static']['pcie_interface_version'] == 0xFFFFFFFF: + pcie_info_dict['pcie_static']['pcie_interface_version'] = "N/A" + + slot_type = pcie_info_dict['pcie_static']['slot_type'] + if isinstance(slot_type, int): + slot_types = amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues + if slot_type in slot_types: + pcie_info_dict['pcie_static']['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") + else: + pcie_info_dict['pcie_static']['slot_type'] = "Unknown" + else: + pcie_info_dict['pcie_static']['slot_type'] = "N/A" + + # Check pcie metric values for uint max + if pcie_info_dict['pcie_metric']['pcie_width'] == 0xFFFF: + pcie_info_dict['pcie_metric']['pcie_width'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_speed'] == 0xFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_speed'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A" + + # TODO Just Navi 21 has a different uint max size for pcie_bandwidth + # if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF: + # pcie_info_dict['pcie_metric']['pcie_bandwidth'] = 
"N/A" + + if pcie_info_dict['pcie_metric']['pcie_replay_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_replay_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_nak_received_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_nak_received_count'] = "N/A" + + return pcie_info_dict + def amdsmi_get_processor_handle_from_bdf(bdf): bdf = _parse_bdf(bdf) @@ -3275,7 +3318,7 @@ def amdsmi_get_dpm_policy( processor_handle, ctypes.byref(policy) ) ) - + polices = [] for i in range(0, policy.num_supported): id = policy.policies[i].policy_id diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 392b6188..e57ae30c 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -2052,8 +2052,32 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a status = smi_amdgpu_get_pcie_speed_from_pcie_type(metric_info.pcie_link_speed, &info->pcie_metric.pcie_speed); // mapping to MT/s } else { // gpu metrics returns pcie link speed in .1 GT/s ex. 160 vs 16 - info->pcie_metric.pcie_speed = metric_info.pcie_link_speed * 100; - } + info->pcie_metric.pcie_speed = translate_umax_or_assign_valuepcie_metric.pcie_speed)> + (metric_info.pcie_link_speed, (metric_info.pcie_link_speed * 100)); + } + + // additional pcie related metrics + /** + * pcie_metric.pcie_bandwidth: MB/s (uint32_t) + * metric_info.pcie_bandwidth_inst: GB/s (uint64_t) + */ + info->pcie_metric.pcie_bandwidth = translate_umax_or_assign_valuepcie_metric.pcie_bandwidth)> + (metric_info.pcie_bandwidth_inst, metric_info.pcie_bandwidth_inst); + info->pcie_metric.pcie_replay_count = metric_info.pcie_replay_count_acc; + info->pcie_metric.pcie_l0_to_recovery_count = metric_info.pcie_l0_to_recov_count_acc; + info->pcie_metric.pcie_replay_roll_over_count = metric_info.pcie_replay_rover_count_acc; + /** + * pcie_metric.pcie_nak_received_count: (uint64_t) + * metric_info.pcie_nak_rcvd_count_acc: (uint32_t) + */ + info->pcie_metric.pcie_nak_received_count = translate_umax_or_assign_valuepcie_metric.pcie_nak_received_count)> + (metric_info.pcie_nak_rcvd_count_acc, (metric_info.pcie_nak_rcvd_count_acc)); + /** + * pcie_metric.pcie_nak_sent_count: (uint64_t) + * metric_info.pcie_nak_sent_count_acc: (uint32_t) + */ + info->pcie_metric.pcie_nak_sent_count = translate_umax_or_assign_valuepcie_metric.pcie_nak_sent_count)> + (metric_info.pcie_nak_sent_count_acc, (metric_info.pcie_nak_sent_count_acc)); return AMDSMI_STATUS_SUCCESS; } From 1ac1ee4b9abed1ac01094dc0cc22c1ecba21d667 Mon Sep 17 00:00:00 2001 From: Deepak Mewar Date: Mon, 11 Mar 2024 12:44:14 +0000 Subject: [PATCH 05/18] fix for cpu enable apb error Signed-off-by: Deepak Mewar Change-Id: I092b88484046671857c4adbbbeaba78180b103ab --- amdsmi_cli/amdsmi_commands.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 35af8395..dbdc16ac 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3325,18 +3325,13 @@ def set_value(self, args, multiple_devices=False, 
gpu=None, fan=None, perf_level cpu_args_enabled = False cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range", - "cpu_disable_apb", "soc_boost_limit"] + "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] for attr in cpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: cpu_args_enabled = True break - # Check if CPU set argument with store_true has been passed - if hasattr(args, "cpu_enable_apb"): - if getattr(args, attr): - cpu_args_enabled = True - # Check if a Core argument has been set core_args_enabled = False core_attributes = ["core_boost_limit"] From e4085c641431fe9c12ec585a91e39a57db54ecd8 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 20 Mar 2024 12:06:24 -0500 Subject: [PATCH 06/18] Get and set the XGMI PLPD Update the API and CLI to support XGMI Per-Link Power Down Policy. Change-Id: Iaf04a771eb8bb0829a5b3088d803a7355a8dfd0b --- amdsmi_cli/README.md | 67 +++++++++++--- amdsmi_cli/amdsmi_commands.py | 52 ++++++++--- amdsmi_cli/amdsmi_parser.py | 4 + include/amd_smi/amdsmi.h | 43 +++++++++ py-interface/README.md | 72 ++++++++++++++- py-interface/amdsmi_interface.py | 45 ++++++++++ py-interface/amdsmi_wrapper.py | 45 ++++++---- rocm_smi/include/rocm_smi/rocm_smi.h | 39 ++++++++ rocm_smi/src/rocm_smi.cc | 128 ++++++++++++++++++++++++++- rocm_smi/src/rocm_smi_device.cc | 6 +- src/amd_smi/amd_smi.cc | 16 ++++ 11 files changed, 467 insertions(+), 50 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 3273f807..f9c0c067 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -280,7 +280,7 @@ usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE [--core-curr-active-freq-core-limit] [--core-energy] [--json | --csv] [--file FILE] [--loglevel LEVEL] -If no GPU is specified, returns metric information for all GPUs on the system. +If no GPU is specified, returns metric information for all GPUs on the system. If no metric argument is provided all metric information will be displayed. Metric arguments: @@ -325,16 +325,16 @@ CPU Arguments: --cpu-c0-res Displays C0 residency --cpu-lclk-dpm-level NBIOID Displays lclk dpm level range. Requires socket ID and NBOID as inputs --cpu-pwr-svi-telemtry-rails Displays svi based telemetry for all rails - --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. - input parameters are bandwidth type(1) and link ID encodings + --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. + input parameters are bandwidth type(1) and link ID encodings i.e. P2, P3, G0 - G7 - --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU - input parameters are bandwidth type(1,2,4) and link ID encodings + --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU + input parameters are bandwidth type(1,2,4) and link ID encodings i.e. 
P2, P3, G0 - G7 --cpu-metrics-ver Displays metrics table version --cpu-metrics-table Displays metric table --cpu-socket-energy Displays socket energy for the selected CPU socket - --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, + --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, and current utilized ddr bw in percentage --cpu-temp Displays cpu socket temperature --cpu-dimm-temp-range-rate DIMM_ADDR Displays dimm temperature range and refresh rate @@ -437,7 +437,7 @@ usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-a] [-w] [-o] [-t] [-b] -If no GPU is specified, returns information for all GPUs on the system. +If no GPU is specified, returns information for all GPUs on the system. If no topology argument is provided all topology information will be displayed. Topology arguments: @@ -483,7 +483,7 @@ usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ... [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] [--loglevel LEVEL] -A GPU must be specified to set a configuration. +A GPU must be specified to set a configuration. A set argument must be provided; Multiple set arguments are accepted Set Arguments: @@ -513,11 +513,12 @@ Set Arguments: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id + -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. --cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values - --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. + --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Input parameters are die_index, min dpm, max dpm. --cpu-pwr-eff-mode MODE Sets the power efficency mode policy. Input parameter is mode. 
--cpu-gmi3-link-width MIN_LW MAX_LW Sets max and min gmi3 link width range @@ -675,7 +676,7 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -687,6 +688,16 @@ GPU: 0 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 0 AFFINITY: 0 @@ -783,7 +794,7 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -795,6 +806,16 @@ GPU: 1 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 1 AFFINITY: 1 @@ -891,7 +912,7 @@ GPU: 2 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -903,6 +924,16 @@ GPU: 2 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 2 AFFINITY: 2 @@ -999,7 +1030,7 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -1011,6 +1042,16 @@ GPU: 3 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 3 AFFINITY: 3 diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index dbdc16ac..689b3fa5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -244,7 +244,8 @@ def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None) def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): + cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, + policy=None, xgmi_plpd=None): """Get Static information for target gpu Args: @@ -268,6 +269,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. 
Returns: None: Print output via AMDSMILogger to destination """ @@ -302,8 +304,10 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None args.limit = limit if policy: args.policy = policy - current_platform_args += ["ras", "limit", "partition", "policy"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy] + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -630,6 +634,15 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['dpm_policy'] = policy_info + if 'xgmi_plpd' in current_platform_args: + if args.xgmi_plpd: + try: + policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['xgmi_plpd'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -766,7 +779,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None, policy=None): + interface_ver=None, policy=None, xgmi_plpd = None): """Get Static information for target gpu and cpu Args: @@ -790,6 +803,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. 
Raises: IndexError: Index error if gpu list is empty @@ -815,7 +829,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf", "policy"] + "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -859,7 +873,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3090,7 +3104,7 @@ def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None, def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None, dpm_policy=None): + memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None): """Issue reset commands to target gpu(s) Args: @@ -3105,6 +3119,7 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3132,6 +3147,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.power_cap = power_cap if dpm_policy: args.dpm_policy = dpm_policy + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3151,7 +3168,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.memory_partition, args.perf_determinism is not None, args.power_cap, - args.dpm_policy]): + args.dpm_policy, + args.xgmi_plpd]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3225,6 +3243,15 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if args.xgmi_plpd: + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3264,7 +3291,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, 
cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None): """Issue reset commands to target gpu(s) Args: @@ -3294,6 +3321,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3314,7 +3342,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3370,7 +3398,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3389,7 +3417,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 5341b274..adaa91c3 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -544,6 +544,7 @@ def _add_static_parser(self, subparsers, func): cache_help = "All cache information" board_help = "All board information" dpm_policy_help = "The available DPM policy" + xgmi_plpd_help = "The available XGMI per-link power down policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -584,6 +585,7 @@ def _add_static_parser(self, subparsers, func): static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) + static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -966,6 +968,7 @@ def _add_set_value_parser(self, subparsers, func): 
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" + set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -1002,6 +1005,7 @@ def _add_set_value_parser(self, subparsers, func): set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') + set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 861709b9..64bdb125 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -3405,6 +3405,49 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, */ amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, uint32_t policy_id); + +/** + * @brief Get the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle, this function will write + * current xgmi plpd settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the xgmi plpd for this processor. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle and a dpm policy @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] xgmi_plpd_id the xgmi plpd id to set. 
The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_xgmi_plpd() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t plpd_id); + /** @} End PerfCont */ /*****************************************************************************/ diff --git a/py-interface/README.md b/py-interface/README.md index 7d9fd590..82f8ca97 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -909,8 +909,8 @@ Field | Description `name` | Name of process `pid` | Process ID `mem` | Process memory usage -`engine_usage`|
Subfield: `gfx` (GFX engine usage in ns), `enc` (Encode engine usage in ns)
-`memory_usage`| Subfield: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
+`engine_usage` | Subfield: `gfx` (GFX engine usage in ns), `enc` (Encode engine usage in ns)
+`memory_usage` | Subfield: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function: @@ -2612,6 +2612,74 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_set_xgmi_plpd + +Description: Set the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the xgmi plpd id to set. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_xgmi_plpd(device, 0) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_xgmi_plpd + +Description: Get the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device + +Output: Dict containing information about xgmi per-link power down policy + +Field | Description +---|--- +`num_supported` | The number of supported policies +`current_id` | The current policy index +`plpds` | List of policies. + +Exceptions that can be thrown by `amdsmi_get_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + xgmi_plpd = amdsmi_get_xgmi_plpd(device) + print(xgmi_plpd) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_set_gpu_overdrive_level Description: **deprecated** Set the overdrive percent associated with the diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index e27451da..c9e773b8 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2746,6 +2746,20 @@ def amdsmi_set_dpm_policy( ) ) +def amdsmi_set_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_xgmi_plpd( + processor_handle, policy_id + ) + ) + def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int ): @@ -3335,6 +3349,37 @@ def amdsmi_get_dpm_policy( "policies": polices, } +def amdsmi_get_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_xgmi_plpd( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "plpds": polices, + } + def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py 
index f718dcfa..13cd2062 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -746,19 +746,6 @@ class struct_fields_(Structure): class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -777,6 +764,19 @@ class struct_pcie_metric_(Structure): ('reserved', ctypes.c_uint64 * 13), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -2058,6 +2058,12 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy amdsmi_set_dpm_policy.restype = amdsmi_status_t amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_get_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_get_xgmi_plpd +amdsmi_get_xgmi_plpd.restype = amdsmi_status_t +amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd +amdsmi_set_xgmi_plpd.restype = amdsmi_status_t +amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2594,8 +2600,9 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_processor_info', 'amdsmi_get_processor_type', 'amdsmi_get_socket_handles', 'amdsmi_get_socket_info', 'amdsmi_get_temp_metric', 'amdsmi_get_utilization_count', - 'amdsmi_get_xgmi_info', 'amdsmi_gpu_block_t', - 'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter', + 'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd', + 'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t', + 'amdsmi_gpu_control_counter', 'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter', 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t', 'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status', @@ -2636,10 +2643,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', 'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile', - 'amdsmi_set_power_cap', 'amdsmi_shut_down', - 'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle', - 'amdsmi_status_code_to_string', 'amdsmi_status_t', - 'amdsmi_stop_gpu_event_notification', + 'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd', + 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', + 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', + 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 
'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 12654213..e10ab49b 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3364,6 +3364,45 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, uint32_t policy_id); +/** + * @brief Get the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, this function will write + * current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] xgmi_plpd the xgmi_plpd policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, and a dpm policy @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] xgmi_plpd_id the xgmi plpd id to set. The id is the id in + * rsmi_dpm_policy_entry_t, which can be obtained by calling + * rsmi_dev_xgmi_plpd_get() + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id); /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 91c8ddbb..6aa0d86f 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -2038,6 +2038,130 @@ rsmi_dev_dpm_policy_set(uint32_t dv_ind, CATCH } +rsmi_status_t +rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + It will reply on the number but no string as it may vary from soc to soc. 
+ The current xmgi plpd marked with * + xgmi plpd + 0 : plpd_disallow + 1 : plpd_default + 2 : plpd_optimized* + */ + bool see_plpd_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "xgmi plpd") { + see_plpd_pstate = true; + continue; + } + if (see_plpd_pstate == false) continue; + + // Get tokens: : + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // At the end + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: the id is negative or too many plpd policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_plpd_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: cannot find the current plpd policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // Cannot find it + return RSMI_STATUS_SUCCESS; + + CATCH +} + +rsmi_status_t +rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("xgmi "); + value += std::to_string(plpd_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_dpm_policy_t* policy) { @@ -2107,7 +2231,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: the id is negative or too many policies."; + << ", Unexpected pstat data: the id is negative or too many policies."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -2132,7 +2256,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (!see_current) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: cannot find the current policy."; + << ", Unexpected pstat data: cannot find the current policy."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 3e63659c..92de58c6 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -536,8 +536,10 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, - {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, - 
{"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index e57ae30c..1dafee87 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1369,6 +1369,22 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, reinterpret_cast(policy)); } +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, From 72b0a6efe56cc368e02d783efab0a800c30bfff3 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 08:24:12 -0500 Subject: [PATCH 07/18] SWDEV-431924 - Corrected amdsmi_get_gpu_board_info() to return N/A for invalid values Signed-off-by: Maisam Arif Change-Id: I3f7e7c873c24b8f5ddd6784700f193c2fdf199e0 --- py-interface/amdsmi_interface.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index c9e773b8..09d829fe 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1848,7 +1848,7 @@ def amdsmi_get_gpu_board_info( processor_handle, ctypes.byref(board_info)) ) - return { + board_info_dict = { "model_number": board_info.model_number.decode("utf-8").strip(), "product_serial": board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), @@ -1856,6 +1856,12 @@ def amdsmi_get_gpu_board_info( "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } + for key, value in board_info_dict.items(): + if value == "": + board_info_dict[key] = "N/A" + + return board_info_dict + def amdsmi_get_gpu_ras_feature_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, From dad2c430ead1a4214d90d72c07eadc9717ed1bfa Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 08:45:08 -0500 Subject: [PATCH 08/18] SWDEV-435406 - Corrected amdsmi_get_power_info() to return N/A for invalid values Signed-off-by: Maisam Arif Change-Id: I2aeb6f6670f6f47cd496faf7fc41192647f7d58c --- py-interface/amdsmi_interface.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 09d829fe..e3dfa1a4 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2038,7 +2038,7 @@ def amdsmi_get_power_info( ) ) - return { + power_info_dict = { "current_socket_power": power_measure.current_socket_power, "average_socket_power": power_measure.average_socket_power, "gfx_voltage": power_measure.gfx_voltage, @@ -2047,6 +2047,12 @@ def amdsmi_get_power_info( "power_limit" : power_measure.power_limit, } + for key, value in 
power_info_dict.items(): + if value == 0xFFFF: + power_info_dict[key] = "N/A" + + return power_info_dict + def amdsmi_is_gpu_power_management_enabled( processor_handle: amdsmi_wrapper.amdsmi_processor_handle From 8bf2bd4b898c885b0e94247fc84a2c6c8acf2671 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 07:59:50 -0500 Subject: [PATCH 09/18] SWDEV-447333 - Corrected amdsmi_init() python documentation Signed-off-by: Maisam Arif Change-Id: If46e7236316687cd97cf1a69770f87154e2681ff --- py-interface/README.md | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/py-interface/README.md b/py-interface/README.md index 82f8ca97..ae9b3568 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -73,9 +73,9 @@ except AmdSmiException as e: ### amdsmi_init -Description: Dynamically initialize amdsmi with amd_hsmp and amdgpu drivers +Description: Initialize amdsmi with AmdSmiInitFlags -Input parameters: `None` +Input parameters: AmdSmiInitFlags Output: `None` @@ -83,19 +83,37 @@ Exceptions that can be thrown by `amdsmi_init` function: * `AmdSmiLibraryException` -Example: +Initialize GPUs only example: ```python try: + # by default we initalize with AmdSmiInitFlags.INIT_AMD_GPUS init_flag = amdsmi_init() - # Print out integer bitmask of initialized drivers - # 1 is for amd_hsmp - # 2 is for amdgpu - # 3 is for amd_hsmp and amdgpu - print(init_flag) # continue with amdsmi except AmdSmiException as e: - print("Init failed") + print("Init GPUs failed") + print(e) +``` + +Initialize CPUs only example: + +```python +try: + init_flag = amdsmi_init(AmdSmiInitFlags.INIT_AMD_CPUS) + # continue with amdsmi +except AmdSmiException as e: + print("Init CPUs failed") + print(e) +``` + +Initialize both GPUs and CPUs example: + +```python +try: + init_flag = amdsmi_init(AmdSmiInitFlags.INIT_AMD_APUS) + # continue with amdsmi +except AmdSmiException as e: + print("Init both GPUs & CPUs failed") print(e) ``` From 93b81e501250fb6fc885cdd11e56407ea5ef87f5 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 01:01:38 -0500 Subject: [PATCH 10/18] SWDEV-445664 - Aligned metric --clock with Host Change-Id: Ib4dc372aed61f6301680ac746eccf448e9d0ed00 Signed-off-by: Maisam Arif --- CHANGELOG.md | 136 +++++++++++++++++- amdsmi_cli/amdsmi_commands.py | 240 +++++++++++++++++++++++-------- amdsmi_cli/amdsmi_helpers.py | 20 ++- py-interface/README.md | 2 +- py-interface/amdsmi_interface.py | 4 +- src/amd_smi/amd_smi.cc | 6 + src/amd_smi/amd_smi_utils.cc | 6 + 7 files changed, 343 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00728e53..b7a3a85d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,127 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** +## amd_smi_lib for ROCm 6.2.0 + +### Changed + +Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. 
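Sample CLI output follows below. As a rough programmatic counterpart, here is a minimal Python sketch (illustrative only, not code from this patch) that reads the same per-engine values through the py-interface; it assumes the `current_gfxclks` and `current_uclk` gpu-metrics fields that the `metric_gpu()` hunk later in this patch relies on:

```python
# Illustrative sketch only: mirrors what `amd-smi metric --clock` reports,
# reading the same gpu_metrics fields the CLI hunk below uses.
from amdsmi import (
    AmdSmiException,
    amdsmi_init,
    amdsmi_shut_down,
    amdsmi_get_processor_handles,
    amdsmi_get_gpu_metrics_info,
)

amdsmi_init()
try:
    for gpu_index, handle in enumerate(amdsmi_get_processor_handles()):
        metrics = amdsmi_get_gpu_metrics_info(handle)
        # Per-engine GFX clocks; individual entries may come back as "N/A".
        for engine, clk in enumerate(metrics["current_gfxclks"]):
            if clk != "N/A":
                print(f"GPU {gpu_index} gfx_{engine}: {clk} MHz")
        # Single memory clock (uclk) value.
        print(f"GPU {gpu_index} mem_0: {metrics['current_uclk']} MHz")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```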
+ +``` shell +$ amd-smi metric --clock +GPU: 0 + CLOCK: + GFX_0: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_1: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 112 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1200 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED +``` + ## amd_smi_lib for ROCm 6.1.0 ### Added + - **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. + ```shell $ amd-smi monitor -h usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] @@ -52,6 +168,7 @@ Command Modifiers: --loglevel LEVEL Set the logging level from the possible choices: DEBUG, INFO, WARNING, ERROR, CRITICAL ``` + ```shell $ amd-smi monitor -ptumv GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_USED VRAM_TOTAL @@ -80,6 +197,7 @@ CPU: 0 INTERFACE_VERSION: PROTO VERSION: 6 ``` + ```shell $ amd-smi metric -O 0 1 2 CORE: 0 @@ -106,6 +224,7 @@ CORE: 2 CORE_ENERGY: VALUE: N/A ``` + ```shell $ amd-smi metric -U all CPU: 0 @@ -212,6 +331,7 @@ CPU: 0 CPU_TEMP: RESPONSE: N/A ``` + - **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. 
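Sample CLI output for these fields appears in the next hunk. For programmatic access, a minimal sketch (illustrative only, not part of this patch; it assumes the `vcn_activity` gpu-metrics field documented in the py-interface README) might look like:

```python
# Illustrative sketch only: read the per-engine VCN utilization that
# `amd-smi metric --usage` reports as VCN_ACTIVITY.
from amdsmi import (
    AmdSmiException,
    amdsmi_init,
    amdsmi_shut_down,
    amdsmi_get_processor_handles,
    amdsmi_get_gpu_metrics_info,
)

amdsmi_init()
try:
    for handle in amdsmi_get_processor_handles():
        metrics = amdsmi_get_gpu_metrics_info(handle)
        # List of per-VCN-engine utilization percentages; entries may be "N/A".
        print(metrics["vcn_activity"])
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```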
@@ -230,6 +350,7 @@ GPU: 0 CURRENT_BANDWIDTH_RECEIVED: N/A MAX_PACKET_SIZE: N/A ``` + ```shell $ amd-smi metric --usage GPU: 0 @@ -243,11 +364,13 @@ GPU: 0 0 %, 0 %, 0 %, 0 %] ``` + - **Added AMDSMI Tool Version** AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. The AMDSMI Library version is the library package version number. The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. + ```shell $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 @@ -255,6 +378,7 @@ AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6 - **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). + ```shell $ amd-smi xgmi LINK METRIC TABLE: @@ -285,10 +409,12 @@ GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A ``` + - **Added units of measure to JSON output.** We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. Ex. + ```shell amd-smi metric -p --json [ @@ -321,7 +447,8 @@ amd-smi metric -p --json ### Changed - **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** -We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. +We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. + ```shell $ amd-smi topology ACCESS TABLE: @@ -381,6 +508,7 @@ NUMA BW TABLE: ``` ### Optimizations + - N/A ### Fixed @@ -394,14 +522,14 @@ Platforms which are identified as having an older pyyaml version or pip, we no m - `amd-smi firmware` - `amd-smi metric` - `amd-smi topology` + ```shell TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` + - **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. - - ### Known Issues - N/A @@ -419,7 +547,6 @@ You can now query MI300 device metrics to get real-time information. Metrics inc - **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. - ### Changed - **GPU index sorting made consistent with other tools** @@ -437,7 +564,6 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese - **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. 
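Since `amd-smi metric -p --json` now emits explicit `value`/`unit` pairs (see the JSON example earlier in this hunk), a small consumer sketch is shown below; it is illustrative only and assumes the top-level `gpu` and `power` keys from that example:

```python
# Illustrative sketch only: consume the unit-annotated JSON emitted by
# `amd-smi metric -p --json`, where each metric is a {"value", "unit"} pair.
import json
import subprocess

raw = subprocess.run(
    ["amd-smi", "metric", "-p", "--json"],
    capture_output=True, check=True, text=True,
).stdout

for gpu_entry in json.loads(raw):
    print(f"GPU {gpu_entry['gpu']}:")
    for metric, reading in gpu_entry["power"].items():
        if isinstance(reading, dict) and "unit" in reading:
            print(f"  {metric}: {reading['value']} {reading['unit']}")
        else:
            print(f"  {metric}: {reading}")
```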
- ### Known Issues - N/A diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 689b3fa5..fce9e852 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1344,73 +1344,189 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No values_dict['power'] = power_dict if "clock" in current_platform_args: if args.clock: + # Populate Skeleton output with N/A clocks = {} - clock_types = [amdsmi_interface.AmdSmiClkType.GFX, - amdsmi_interface.AmdSmiClkType.MEM, - amdsmi_interface.AmdSmiClkType.VCLK0, - amdsmi_interface.AmdSmiClkType.VCLK1] - for clock_type in clock_types: - clock_name = amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues[clock_type].replace("CLK_TYPE_", "") - # Ensure that gfx is the clock_name instead of another macro - if clock_type == amdsmi_interface.AmdSmiClkType.GFX: - clock_name = "gfx" - - # Store the clock_name for vclk0 - vlck0_clock_name = None - if clock_type == amdsmi_interface.AmdSmiClkType.VCLK0: - vlck0_clock_name = clock_name - try: - clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, clock_type) - clock_info = {"clk" : clock_info_dict["cur_clk"]} - del clock_info_dict["cur_clk"] - clock_info.update(clock_info_dict) - - if clock_info['sleep_clk'] == 0xFFFFFFFF: - clock_info['sleep_clk'] = "N/A" - - clock_freq_unit = 'MHz' - for key, value in clock_info.items(): - if isinstance(value, int): - if self.logger.is_human_readable_format(): - clock_info[key] = f"{value} {clock_freq_unit}" - if self.logger.is_json_format(): - clock_info[key] = {"value" : value, - "unit" : clock_freq_unit} - - clocks[clock_name] = clock_info - except amdsmi_exception.AmdSmiLibraryException as e: - # Handle the case where VCLK1 is not enaled in sysfs on all GPUs - if clock_type == amdsmi_interface.AmdSmiClkType.VCLK1: - # Check if VCLK0 was retrieved successfully - if vlck0_clock_name in clocks: - # Since VCLK0 exists, do not error - logging.debug("VLCK0 exists, not adding %s clock info to output for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) - continue + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clocks["mem_0"] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clock_unit = "MHz" + # TODO make the deepsleep threshold correspond to the * in sysfs for current deep sleep status + deep_sleep_threshold = 140 + + # Populate clock values from gpu_metrics_info + try: + gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + + # Populate GFX clock values + current_gfx_clocks = gpu_metrics_info["current_gfxclks"] + for clock_index, current_gfx_clock in enumerate(current_gfx_clocks): + # If the current clock is N/A then nothing else applies + if current_gfx_clock == "N/A": + continue + + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index]["clk"] = 
self.helpers.unit_format(self.logger, + current_gfx_clock, + clock_unit) + + # Populate clock locked status + if gpu_metrics_info["gfxclk_lock_status"] != "N/A": + gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag + if gpu_metrics_info["gfxclk_lock_status"] & gfx_clock_lock_flag: + clocks[gfx_index]["clk_locked"] = "ENABLED" + else: + clocks[gfx_index]["clk_locked"] = "DISABLED" + + # Populate deep sleep status + if int(current_gfx_clock) <= deep_sleep_threshold: + clocks[gfx_index]["deep_sleep"] = "ENABLED" else: - # Handle all other failed to get clock info - clocks[clock_name] = {"clk": "N/A", - "max_clk": "N/A", - "min_clk": "N/A", - "sleep_clk": "N/A"} - logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) + clocks[gfx_index]["deep_sleep"] = "DISABLED" - try: - gfxclk_lock_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status'] - if gfxclk_lock_status != "N/A": - if gfxclk_lock_status: - gfxclk_lock_status = "ENABLED" + # Populate MEM clock value + current_mem_clock = gpu_metrics_info["current_uclk"] # single value + if current_mem_clock != "N/A": + clocks["mem_0"]["clk"] = self.helpers.unit_format(self.logger, + current_mem_clock, + clock_unit) + + if int(current_mem_clock) <= deep_sleep_threshold: + clocks["mem_0"]["deep_sleep"] = "ENABLED" + else: + clocks["mem_0"]["deep_sleep"] = "DISABLED" + + # Populate VCLK clock values + current_vclk_clocks = gpu_metrics_info["current_vclk0s"] + for clock_index, current_vclk_clock in enumerate(current_vclk_clocks): + # If the current clock is N/A then nothing else applies + if current_vclk_clock == "N/A": + continue + + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_vclk_clock, + clock_unit) + + if int(current_vclk_clock) <= deep_sleep_threshold: + clocks[vclk_index]["deep_sleep"] = "ENABLED" else: - gfxclk_lock_status = "DISABLED" + clocks[vclk_index]["deep_sleep"] = "DISABLED" + + # Populate DCLK clock values + current_dclk_clocks = gpu_metrics_info["current_dclk0s"] + for clock_index, current_dclk_clock in enumerate(current_dclk_clocks): + # If the current clock is N/A then nothing else applies + if current_dclk_clock == "N/A": + continue + + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_dclk_clock, + clock_unit) + + if int(current_dclk_clock) <= deep_sleep_threshold: + clocks[dclk_index]["deep_sleep"] = "ENABLED" + else: + clocks[dclk_index]["deep_sleep"] = "DISABLED" except amdsmi_exception.AmdSmiLibraryException as e: - gfxclk_lock_status = "N/A" - logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get gpu_metrics_info for gpu %s | %s", gpu_id, e.get_error_info()) - if "gfx" in clocks: - if isinstance(clocks['gfx'], dict): - clocks['gfx']['clk_locked'] = gfxclk_lock_status - else: - clocks['gfx'] = {"clk_locked": gfxclk_lock_status} + # Populate the max and min clock values from sysfs + # Min and Max values are per clock type, not per clock engine + + # GFX min and max clocks + try: + gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.GFX) + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + if clocks[gfx_index]["clk"] == "N/A": + # if the current clock is N/A then we shouldn't populate the max and min values + 
continue + + clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["min_clk"], + clock_unit) + clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info()) + + # MEM min and max clocks + try: + mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.MEM) + + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks["mem_0"]["clk"] != "N/A": + clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["min_clk"], + clock_unit) + clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) + + # VCLK & DCLK min and max clocks + try: + vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.VCLK0) + + dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.DCLK0) + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{clock_index}" + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks[vclk_index]["clk"] != "N/A": + clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + vclk0_clock_info_dict["min_clk"], + clock_unit) + clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + vclk0_clock_info_dict["max_clk"], + clock_unit) + + dclk_index = f"dclk_{clock_index}" + if clocks[dclk_index]["clk"] != "N/A": + clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + dclk0_clock_info_dict["min_clk"], + clock_unit) + clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + dclk0_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get vclk and/or dclk clock info for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['clock'] = clocks if "temperature" in current_platform_args: @@ -4116,7 +4232,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): for xgmi_dict in xgmi_values: src_gpu_id = xgmi_dict['gpu'] src_gpu_bdf = xgmi_dict['bdf'] - src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) #TODO VERIFY this is correct + src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) logging.debug("check2 device_handle: %s", src_gpu) # This should be the same order as the check1 @@ -4256,7 +4372,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): self.logger.multiple_device_output = xgmi_values - if self.logger.is_csv_format(): # @TODO Test topology override needed + if self.logger.is_csv_format(): new_output = [] for elem in self.logger.multiple_device_output: new_output.append(self.logger.flatten_dict(elem, topology_override=True)) diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 2083c155..6383969a 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -412,7 +412,7 @@ def get_device_handles_from_core_selections(self, core_selections: List[str], co return True, selected_device_handles - def handle_gpus(self, args,logger, 
subcommand): + def handle_gpus(self, args, logger, subcommand): """This function will run execute the subcommands based on the number of gpus passed in via args. params: @@ -708,3 +708,21 @@ def convert_bytes_to_readable(self, bytes_input): return f"{bytes_input:3.1f} {unit}" bytes_input /= 1024 return f"{bytes_input:.1f} YB" + + + def unit_format(self, logger, value, unit): + """This function will format output with unit based on the logger output format + + params: + args - argparser args to pass to subcommand + logger (AMDSMILogger) - Logger to print out output + value - the value to be formatted + unit - the unit to be formatted with the value + return: + str or dict : formatted output + """ + if logger.is_json_format(): + return {"value": value, "unit": unit} + if logger.is_human_readable_format(): + return f"{value} {unit}" + return f"{value}" diff --git a/py-interface/README.md b/py-interface/README.md index ae9b3568..f8b8b3d2 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -2155,7 +2155,7 @@ Output: Dictionary with fields `indep_throttle_status` | ASIC independent throttle status (see drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h for bit flags) | `current_socket_power` | Current socket power (also known as instant socket power) | W `vcn_activity` | List of VCN encode/decode engine utilization per AID | % -`gfxclk_lock_status` | Clock lock status. Each bit corresponds to clock instance. | +`gfxclk_lock_status` | Clock lock status. Bits 0:7 correspond to each gfx clock engine instance. Bits 0:5 for APU/AID devices | `xgmi_link_width` | XGMI bus width | lanes `xgmi_link_speed` | XGMI bitrate | GB/s `pcie_bandwidth_acc` | PCIe accumulated bandwidth | GB/s diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index e3dfa1a4..98c41f73 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -3519,7 +3519,7 @@ def amdsmi_get_gpu_metrics_info( if gpu_metrics_output[metric] == 0xFFFF: gpu_metrics_output[metric] = "N/A" - uint_32_metrics = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc'] + uint_32_metrics = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc', 'gfxclk_lock_status'] for metric in uint_32_metrics: if gpu_metrics_output[metric] == 0xFFFFFFFF: gpu_metrics_output[metric] = "N/A" @@ -3533,7 +3533,7 @@ def amdsmi_get_gpu_metrics_info( gpu_metrics_output[metric] = "N/A" # Custom validation for metrics in a bool format - uint_32_bool_metrics = ['throttle_status', 'gfxclk_lock_status'] + uint_32_bool_metrics = ['throttle_status'] for metric in uint_32_bool_metrics: if gpu_metrics_output[metric] == 0xFFFFFFFF: gpu_metrics_output[metric] = "N/A" diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 1dafee87..2f56eb45 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1651,6 +1651,12 @@ amdsmi_get_clock_info(amdsmi_processor_handle processor_handle, amdsmi_clk_type_ case CLK_TYPE_VCLK1: info->cur_clk = metrics.current_vclk1; break; + case CLK_TYPE_DCLK0: + info->cur_clk = metrics.current_dclk0; + break; + case CLK_TYPE_DCLK1: + info->cur_clk = metrics.current_dclk1; + break; default: return AMDSMI_STATUS_INVAL; } diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index f73a1a76..13762c38 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -229,6 +229,12 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ case 
CLK_TYPE_VCLK1: fullpath += "/pp_dpm_vclk1"; break; + case CLK_TYPE_DCLK0: + fullpath += "/pp_dpm_dclk"; + break; + case CLK_TYPE_DCLK1: + fullpath += "/pp_dpm_dclk1"; + break; default: return AMDSMI_STATUS_INVAL; } From e2e4349bd245b06791dab7157ca39fa0563c76ff Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 03:33:19 -0500 Subject: [PATCH 11/18] SWDEV-445664 - Aligned metric --ecc & --ecc-blocks with Host Signed-off-by: Maisam Arif Change-Id: I93cf2bdab8c4c066bacf0e910e5620d37b362b07 --- CHANGELOG.md | 57 +++++++++++++++++++++++--------- amdsmi_cli/amdsmi_commands.py | 11 +++--- py-interface/README.md | 2 ++ py-interface/amdsmi_interface.py | 10 +++--- 4 files changed, 56 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7a3a85d..74deff1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log for AMD SMI Library -Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). +Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). ***All information listed below is for reference and subject to change.*** @@ -8,6 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -118,11 +119,35 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` +- **Added deferred ecc counts** +Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` + +```shell +$ amd-smi metric --ecc --ecc-blocks +GPU: 0 + ECC: + TOTAL_CORRECTABLE_COUNT: 0 + TOTAL_UNCORRECTABLE_COUNT: 0 + TOTAL_DEFERRED_COUNT: 0 + CACHE_CORRECTABLE_COUNT: 0 + CACHE_UNCORRECTABLE_COUNT: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + ... +``` + ## amd_smi_lib for ROCm 6.1.0 ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. ```shell @@ -182,10 +207,10 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** -Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh +- **Integrated ESMI Tool** +Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: + - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh + - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh See a few examples listed below. 
@@ -332,7 +357,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -376,7 +401,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). ```shell @@ -513,10 +538,10 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** -Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: +- **Fix for devices which have an older pyyaml installed** +Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` - `amd-smi firmware` @@ -538,18 +563,18 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. -- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. - **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. 
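For reference, the deferred ECC counters added in this patch can also be read through the Python interface's `amdsmi_get_gpu_total_ecc_count`. A minimal sketch follows, assuming the `amdsmi` Python package is importable as shown and the library has already been initialized (neither assumption is part of this patch):

```python
# Hypothetical sketch: print total ECC counts, including the new deferred_count field.
# Assumes `from amdsmi import *` resolves and library initialization has been handled
# by the caller; the returned dict keys follow the py-interface README in this series.
from amdsmi import *

try:
    devices = amdsmi_get_processor_handles()
    for device in devices:
        ecc = amdsmi_get_gpu_total_ecc_count(device)
        # Dict with correctable_count, uncorrectable_count, and the new deferred_count
        print(f"correctable={ecc['correctable_count']} "
              f"uncorrectable={ecc['uncorrectable_count']} "
              f"deferred={ecc['deferred_count']}")
except AmdSmiException as e:
    print(e)
```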
@@ -561,7 +586,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index fce9e852..cc2ab30e 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1500,7 +1500,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No # VCLK & DCLK min and max clocks try: - vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.VCLK0) dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, @@ -1668,6 +1668,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) ecc_count['total_correctable_count'] = ecc_count.pop('correctable_count') ecc_count['total_uncorrectable_count'] = ecc_count.pop('uncorrectable_count') + ecc_count['total_deferred_count'] = ecc_count.pop('deferred_count') except amdsmi_exception.AmdSmiLibraryException as e: ecc_count['total_correctable_count'] = "N/A" ecc_count['total_uncorrectable_count'] = "N/A" @@ -1691,7 +1692,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No if "ecc_blocks" in current_platform_args: if args.ecc_blocks: ecc_dict = {} - uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "MP0", "MP1", "FUSE"] + uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "FUSE"] try: ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) for state in ras_states: @@ -1702,10 +1703,12 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No try: ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block) ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'], - 'uncorrectable_count' : ecc_count['uncorrectable_count']} + 'uncorrectable_count' : ecc_count['uncorrectable_count'], + 'deferred_count' : ecc_count['deferred_count']} except amdsmi_exception.AmdSmiLibraryException as e: ecc_dict[state['block']] = {'correctable_count' : "N/A", - 'uncorrectable_count' : "N/A"} + 'uncorrectable_count' : "N/A", + 'deferred_count' : "N/A"} logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) values_dict['ecc_blocks'] = ecc_dict diff --git a/py-interface/README.md b/py-interface/README.md index f8b8b3d2..87089306 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -967,6 +967,7 @@ Field | Description ---|--- `correctable_count` | Correctable ECC error count `uncorrectable_count` | Uncorrectable ECC error count +`deferred_count` | Deferred ECC error count Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function: @@ -2747,6 +2748,7 @@ Field | Description ---|--- `correctable_count` | Count of correctable errors `uncorrectable_count` | Count of uncorrectable errors +`deferred_count` | Count of deferred errors Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function: diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 98c41f73..bf9fa6a0 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1821,16 +1821,17 @@ def 
amdsmi_get_gpu_total_ecc_count( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - error_count = amdsmi_wrapper.amdsmi_error_count_t() + ec = amdsmi_wrapper.amdsmi_error_count_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_total_ecc_count( - processor_handle, ctypes.byref(error_count) + processor_handle, ctypes.byref(ec) ) ) return { - "correctable_count": error_count.correctable_count, - "uncorrectable_count": error_count.uncorrectable_count, + "correctable_count": ec.correctable_count, + "uncorrectable_count": ec.uncorrectable_count, + "deferred_count": ec.deferred_count, } @@ -3655,6 +3656,7 @@ def amdsmi_get_gpu_ecc_count( return { "correctable_count": ec.correctable_count, "uncorrectable_count": ec.uncorrectable_count, + "deferred_count": ec.deferred_count, } From 51b3f8cccbaff1fe1164215f5eef4d9b77c6fac8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 27 Mar 2024 00:45:53 -0500 Subject: [PATCH 12/18] SWDEV-452739 - Add CEM slot type to amd-smi Updated CHANGELOG.md and re-added spaces after bolded lines Signed-off-by: Maisam Arif Change-Id: Ic728b3e9b083c62fe4c9791b8ede991f5dacc1ca --- CHANGELOG.md | 49 +++++++++++++++++++++++----------- include/amd_smi/amdsmi.h | 1 + py-interface/README.md | 2 +- py-interface/amdsmi_wrapper.py | 14 +++++----- src/amd_smi/amd_smi.cc | 7 +++-- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74deff1b..1c6dc5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** -## amd_smi_lib for ROCm 6.2.0 +## amd_smi_lib for ROCm 6.1.1 ### Changed -- **Updated metrics --clocks** +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -119,7 +119,7 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` -- **Added deferred ecc counts** +- **Added deferred ecc counts** Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` ```shell @@ -143,11 +143,28 @@ GPU: 0 ... ``` +### Fixed + +- **Fix for GPU reset error on non-amdgpu cards** +Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix +updates CLI to target only AMD ASICs. + +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix +provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). + +- **Improved Error handling for `amd-smi process`** +Fixed Attribute Error when getting process in csv format + +### Known issues + +- `amd-smi bad-pages` can results with "ValueError: NULL pointer access" with certain PM FW versions + ## amd_smi_lib for ROCm 6.1.0 ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. 
```shell @@ -207,7 +224,7 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** +- **Integrated ESMI Tool** Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh @@ -357,7 +374,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -401,7 +418,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). ```shell @@ -538,9 +555,9 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** +- **Fix for devices which have an older pyyaml installed** Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` @@ -552,7 +569,7 @@ Platforms which are identified as having an older pyyaml version or pip, we no m TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` -- **Fix for crash when user is not a member of video/render groups** +- **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. ### Known Issues @@ -563,20 +580,20 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. 
-- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. -- **Topology output is now aligned with GPU BDF table** +- **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. Now the information is displayed as a table by each GPU's BDF, which closer resembles rocm-smi output. @@ -586,7 +603,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 64bdb125..834b820c 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -495,6 +495,7 @@ typedef union { typedef enum { AMDSMI_CARD_FORM_FACTOR_PCIE, AMDSMI_CARD_FORM_FACTOR_OAM, + AMDSMI_CARD_FORM_FACTOR_CEM, AMDSMI_CARD_FORM_FACTOR_UNKNOWN } amdsmi_card_form_factor_t; diff --git a/py-interface/README.md b/py-interface/README.md index 87089306..f965fb02 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -809,7 +809,7 @@ Output: Dictionary with 2 fields `pcie_static` and `pcie_metric` Fields | Description ---|--- -`pcie_static` |
Subfield | Description
`max_pcie_width` | Maximum number of pcie lanes available
`max_pcie_speed` | Maximum capable pcie speed in GT/s
`pcie_interface_version` | PCIe generation ie. 3,4,5...
`slot_type` | The type of form factor of the slot: PCIE, OAM, or Unknown
+`pcie_static` |
Subfield | Description
`max_pcie_width` | Maximum number of pcie lanes available
`max_pcie_speed` | Maximum capable pcie speed in GT/s
`pcie_interface_version` | PCIe generation ie. 3,4,5...
`slot_type` | The type of form factor of the slot: OAM, PCIE, CEM, or Unknown
`pcie_metric` |
Subfield | Description
`pcie_width` | Current number of pcie lanes available
`pcie_speed` | Current pcie speed capable in GT/s
`pcie_bandwidth` | Current instantaneous bandwidth usage in Mb/s
`pcie_replay_count` | Total number of PCIe replays (NAKs)
`pcie_l0_to_recovery_count` | PCIE L0 to recovery state transition accumulated count
`pcie_replay_roll_over_count` | PCIe Replay accumulated count
`pcie_nak_sent_count` | PCIe NAK sent accumulated count
`pcie_nak_received_count` | PCIe NAK received accumulated count
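The new `AMDSMI_CARD_FORM_FACTOR_CEM` value introduced by this patch surfaces as the `slot_type` subfield listed above. A minimal sketch of reading it through `amdsmi_get_pcie_info`, assuming the `amdsmi` Python package is importable as shown and the library is already initialized:

```python
# Hypothetical sketch: report the PCIe slot form factor per GPU.
# Assumes `from amdsmi import *` resolves and initialization has been handled elsewhere;
# dictionary keys follow the pcie_static table above.
from amdsmi import *

try:
    for device in amdsmi_get_processor_handles():
        pcie = amdsmi_get_pcie_info(device)
        static = pcie["pcie_static"]
        # slot_type is expected to be one of: OAM, PCIE, CEM, or Unknown
        print(f"slot_type={static['slot_type']} "
              f"max_width={static['max_pcie_width']} "
              f"max_speed={static['max_pcie_speed']} GT/s")
except AmdSmiException as e:
    print(e)
```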
Exceptions that can be thrown by `amdsmi_get_pcie_info` function: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 13cd2062..560590ea 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -737,11 +737,13 @@ class struct_fields_(Structure): amdsmi_card_form_factor_t__enumvalues = { 0: 'AMDSMI_CARD_FORM_FACTOR_PCIE', 1: 'AMDSMI_CARD_FORM_FACTOR_OAM', - 2: 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', + 2: 'AMDSMI_CARD_FORM_FACTOR_CEM', + 3: 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', } AMDSMI_CARD_FORM_FACTOR_PCIE = 0 AMDSMI_CARD_FORM_FACTOR_OAM = 1 -AMDSMI_CARD_FORM_FACTOR_UNKNOWN = 2 +AMDSMI_CARD_FORM_FACTOR_CEM = 2 +AMDSMI_CARD_FORM_FACTOR_UNKNOWN = 3 amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass @@ -2347,10 +2349,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', 'AMDSMI_CACHE_PROPERTY_INST_CACHE', - 'AMDSMI_CACHE_PROPERTY_SIMD_CACHE', 'AMDSMI_CARD_FORM_FACTOR_OAM', - 'AMDSMI_CARD_FORM_FACTOR_PCIE', 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', - 'AMDSMI_CNTR_CMD_START', 'AMDSMI_CNTR_CMD_STOP', - 'AMDSMI_COARSE_GRAIN_GFX_ACTIVITY', + 'AMDSMI_CACHE_PROPERTY_SIMD_CACHE', 'AMDSMI_CARD_FORM_FACTOR_CEM', + 'AMDSMI_CARD_FORM_FACTOR_OAM', 'AMDSMI_CARD_FORM_FACTOR_PCIE', + 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', 'AMDSMI_CNTR_CMD_START', + 'AMDSMI_CNTR_CMD_STOP', 'AMDSMI_COARSE_GRAIN_GFX_ACTIVITY', 'AMDSMI_COARSE_GRAIN_MEM_ACTIVITY', 'AMDSMI_CURRENT_POWER', 'AMDSMI_DEV_PERF_LEVEL_AUTO', 'AMDSMI_DEV_PERF_LEVEL_DETERMINISM', 'AMDSMI_DEV_PERF_LEVEL_FIRST', 'AMDSMI_DEV_PERF_LEVEL_HIGH', diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 2f56eb45..e85646d2 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -2050,11 +2050,14 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a processor_handle, &slot_type); if (status == AMDSMI_STATUS_SUCCESS) { switch (slot_type) { + case RSMI_PCIE_SLOT_PCIE: + info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE; + break; case RSMI_PCIE_SLOT_OAM: info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_OAM; break; - case RSMI_PCIE_SLOT_PCIE: - info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE; + case RSMI_PCIE_SLOT_CEM: + info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_CEM; break; default: info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_UNKNOWN; From 9800156a7a28e7c2c516039994103b447a478a2f Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 27 Mar 2024 01:08:37 -0500 Subject: [PATCH 13/18] Bump Version to 24.5.0.0 Signed-off-by: Maisam Arif Change-Id: I2509c8c2df54f0c5e9376fc0a21c09adc74f0ea8 --- CMakeLists.txt | 2 +- amdsmi_cli/README.md | 2 +- docs/doxygen/Doxyfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d59fe2f4..97dbc610 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ find_program(GIT NAMES git) ## Setup the package version based on git tags. 
set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver") -get_package_version_number("24.4.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("24.5.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index f9c0c067..72028e5b 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -79,7 +79,7 @@ amd-smi will report the version and current platform detected when running the c ~$ amd-smi usage: amd-smi [-h] ... -AMD System Management Interface | Version: 24.4.0.0 | ROCm version: 6.1.0 | Platform: Linux Baremetal +AMD System Management Interface | Version: 24.5.0.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal options: -h, --help show this help message and exit diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index ff7a8a18..de8ab73b 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.4.0.0" +PROJECT_NUMBER = "24.5.0.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a From 08e2e21bab124011e6f556e9335290e567e2efe6 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Thu, 14 Mar 2024 05:53:26 -0500 Subject: [PATCH 14/18] fix: [SWDEV-442525] [rocm/amd_smi_lib] Fixes gpu_process_list Code changes related to the following: * amdsmi_get_gpu_process_list() * CLI * Examples * Unit tests * Changelog * Readme * rocm_smi_lib commit: 677433b367c5738e165c74ac07bdb7ab26d22949 Change-Id: I9210fbca7a5da92d0a8b472b72ca82597c8e4fb5 Signed-off-by: Oliveira, Daniel --- CHANGELOG.md | 51 +++--- amdsmi_cli/amdsmi_commands.py | 8 +- example/amd_smi_drm_example.cc | 87 +++++----- include/amd_smi/amdsmi.h | 49 +++--- include/amd_smi/impl/amd_smi_gpu_device.h | 22 +++ py-interface/README.md | 44 +---- py-interface/amdsmi_interface.py | 40 ++--- py-interface/amdsmi_wrapper.py | 7 +- rocm_smi/include/rocm_smi/rocm_smi.h | 28 ++-- rocm_smi/src/rocm_smi_kfd.cc | 4 +- src/amd_smi/amd_smi.cc | 101 +++++------- src/amd_smi/amd_smi_gpu_device.cc | 155 +++++++++++++++++- src/amd_smi/fdinfo.cc | 11 +- .../functional/process_info_read.cc | 1 + 14 files changed, 371 insertions(+), 237 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c6dc5bd..f606276c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed -- **Updated metrics --clocks** +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -119,7 +119,7 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` -- **Added deferred ecc counts** +- **Added deferred ecc counts** Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` ```shell @@ -149,11 +149,14 @@ GPU: 0 Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. 
-- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). -- **Improved Error handling for `amd-smi process`** +- **Fix for `amd-smi process`** +Fixed output results when getting processes running on a device. + +- **Improved Error handling for `amd-smi process`** Fixed Attribute Error when getting process in csv format ### Known issues @@ -164,7 +167,7 @@ Fixed Attribute Error when getting process in csv format ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. ```shell @@ -224,7 +227,7 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** +- **Integrated ESMI Tool** Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh @@ -374,7 +377,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -407,10 +410,10 @@ GPU: 0 ``` -- **Added AMDSMI Tool Version** -AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. -The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. -The AMDSMI Library version is the library package version number. +- **Added AMDSMI Tool Version** +AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. +The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. +The AMDSMI Library version is the library package version number. The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. ```shell @@ -418,7 +421,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). 
```shell @@ -452,10 +455,10 @@ GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI ``` -- **Added units of measure to JSON output.** +- **Added units of measure to JSON output.** We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. -Ex. +Ex. ```shell amd-smi metric -p --json @@ -488,7 +491,7 @@ amd-smi metric -p --json ### Changed -- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** +- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. ```shell @@ -555,9 +558,9 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** +- **Fix for devices which have an older pyyaml installed** Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` @@ -569,7 +572,7 @@ Platforms which are identified as having an older pyyaml version or pip, we no m TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` -- **Fix for crash when user is not a member of video/render groups** +- **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. ### Known Issues @@ -580,20 +583,20 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. -- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. -- **Topology output is now aligned with GPU BDF table** +- **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. Now the information is displayed as a table by each GPU's BDF, which closer resembles rocm-smi output. 
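Because the JSON output now wraps each reading in `value`/`unit` pairs (as the `unit_format` helper earlier in this series produces), downstream scripts may want to flatten them. The sketch below uses only the Python standard library and assumes nothing about the full `amd-smi metric -p --json` schema beyond that wrapping:

```python
# Hypothetical sketch: run `amd-smi metric -p --json` and flatten {"value", "unit"} pairs.
# Only the value/unit wrapping is assumed; the exact key layout varies by ASIC.
import json
import subprocess

def flatten_units(node):
    """Recursively replace {"value": v, "unit": u} dicts with "v u" strings."""
    if isinstance(node, dict):
        if set(node) == {"value", "unit"}:
            return f"{node['value']} {node['unit']}"
        return {key: flatten_units(val) for key, val in node.items()}
    if isinstance(node, list):
        return [flatten_units(item) for item in node]
    return node

raw = subprocess.run(["amd-smi", "metric", "-p", "--json"],
                     check=True, capture_output=True, text=True).stdout
print(json.dumps(flatten_units(json.loads(raw)), indent=2))
```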
@@ -603,7 +606,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index cc2ab30e..b9bc3a5e 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2570,16 +2570,18 @@ def process(self, args, multiple_devices=False, watching_output=False, try: process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) raise e filtered_process_values = [] - for process_handle in process_list: + for process in process_list: try: - process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process_handle) + process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process) except amdsmi_exception.AmdSmiLibraryException as e: process_info = "N/A" - logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info()) + logging.debug("Failed to get process info for process %s on gpu %s | %s", process, gpu_id, e.get_error_info()) filtered_process_values.append({'process_info': process_info}) continue diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index cd9a3a1f..25ac6ade 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -432,7 +432,9 @@ int main() { ret = amdsmi_get_temp_metric( processor_handles[j], TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CRITICAL, &temperature); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } printf("\tGPU GFX temp limit: %ld\n\n", temperature); // Get temperature measurements @@ -447,7 +449,9 @@ int main() { processor_handles[j], temp_type, AMDSMI_TEMP_CURRENT, &temp_measurements[(int)(temp_type)]); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } } printf(" Output of amdsmi_get_temp_metric:\n"); printf("\tGPU Edge temp measurement: %ld\n", @@ -526,14 +530,13 @@ int main() { }; uint32_t num_process = 0; - ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, - nullptr); + ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, nullptr); CHK_AMDSMI_RET(ret) if (!num_process) { printf("No processes found.\n"); } else { - amdsmi_process_handle_t process_list[num_process]; - amdsmi_proc_info_t info_list[num_process]; + std::cout << "Processes found: " << num_process << "\n"; + amdsmi_proc_info_t process_info_list[num_process]; amdsmi_proc_info_t process = {}; uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0; uint64_t gfx = 0, enc = 0; @@ -544,24 +547,14 @@ int main() { bdf.fields.device_number, bdf.fields.function_number); int num = 0; - ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, - process_list); - CHK_AMDSMI_RET(ret) - for (uint32_t it = 0; it < num_process; it += 1) { - if (getpid() == process_list[it]) { - continue; - } - ret = amdsmi_get_gpu_process_info(processor_handles[j], - process_list[it], &process); - if (ret != AMDSMI_STATUS_SUCCESS) { - 
printf("amdsmi_get_gpu_process_info() failed for " - "process_list[%d], returned %d\n", - it, ret); - continue; - } - info_list[num++] = process; + ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, process_info_list); + std::cout << "Allocation size for process list: " << num_process << "\n"; + CHK_AMDSMI_RET(ret); + for (auto idx = uint32_t(0); idx < num_process; ++idx) { + process = static_cast(process_info_list[idx]); + printf("\t *Process id: %ld / Name: %s / VRAM: %lld \n", process.pid, process.name, process.memory_usage.vram_mem); } - qsort(info_list, num, sizeof(info_list[0]), compare); + printf("+=======+==================+============+==============" "+=============+=============+=============+============" "==+=========================================+\n"); @@ -575,41 +568,41 @@ int main() { printf("+=======+" "+=============+=============+=============+============" "==+=========================================+\n"); - for (int it = 0; it < num; it++) { + for (int it = 0; it < num_process; it++) { char command[30]; struct passwd *pwd = nullptr; struct stat st; - sprintf(command, "/proc/%d", info_list[it].pid); + sprintf(command, "/proc/%d", process_info_list[it].pid); if (stat(command, &st)) continue; pwd = getpwuid(st.st_uid); if (!pwd) printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB " "| %7ld KiB | %7ld KiB | %lu %lu |\n", - info_list[it].pid, info_list[it].name, st.st_uid, - bdf_str, info_list[it].mem / 1024, - info_list[it].memory_usage.gtt_mem / 1024, - info_list[it].memory_usage.cpu_mem / 1024, - info_list[it].memory_usage.vram_mem / 1024, - info_list[it].engine_usage.gfx, - info_list[it].engine_usage.enc); + process_info_list[it].pid, process_info_list[it].name, st.st_uid, + bdf_str, process_info_list[it].mem / 1024, + process_info_list[it].memory_usage.gtt_mem / 1024, + process_info_list[it].memory_usage.cpu_mem / 1024, + process_info_list[it].memory_usage.vram_mem / 1024, + process_info_list[it].engine_usage.gfx, + process_info_list[it].engine_usage.enc); else printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB " "| %7ld KiB | %7ld KiB | %lu %lu |\n", - info_list[it].pid, info_list[it].name, - pwd->pw_name, bdf_str, info_list[it].mem / 1024, - info_list[it].memory_usage.gtt_mem / 1024, - info_list[it].memory_usage.cpu_mem / 1024, - info_list[it].memory_usage.vram_mem / 1024, - info_list[it].engine_usage.gfx, - info_list[it].engine_usage.enc); - mem += info_list[it].mem / 1024; - gtt_mem += info_list[it].memory_usage.gtt_mem / 1024; - cpu_mem += info_list[it].memory_usage.cpu_mem / 1024; - vram_mem += info_list[it].memory_usage.vram_mem / 1024; - gfx = info_list[it].engine_usage.gfx; - enc = info_list[it].engine_usage.enc; + process_info_list[it].pid, process_info_list[it].name, + pwd->pw_name, bdf_str, process_info_list[it].mem / 1024, + process_info_list[it].memory_usage.gtt_mem / 1024, + process_info_list[it].memory_usage.cpu_mem / 1024, + process_info_list[it].memory_usage.vram_mem / 1024, + process_info_list[it].engine_usage.gfx, + process_info_list[it].engine_usage.enc); + mem += process_info_list[it].mem / 1024; + gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024; + cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024; + vram_mem += process_info_list[it].memory_usage.vram_mem / 1024; + gfx = process_info_list[it].engine_usage.gfx; + enc = process_info_list[it].engine_usage.enc; printf( "+-------+------------------+------------+-------------" "-+-------------+-------------+-------------+----------" @@ -644,7 +637,9 
@@ int main() { int64_t val_i64 = 0; ret = amdsmi_get_temp_metric(processor_handles[j], TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CURRENT, &val_i64); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } printf(" Output of amdsmi_get_temp_metric:\n"); std::cout << "\t\tTemperature: " << val_i64 << "C" << "\n\n"; diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 834b820c..3f7bd398 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -657,9 +657,9 @@ typedef struct { uint32_t mm_activity; uint32_t reserved[13]; } amdsmi_engine_usage_t; - typedef uint32_t amdsmi_process_handle_t; + typedef struct { char name[AMDSMI_NORMAL_STRING_LENGTH]; amdsmi_process_handle_t pid; @@ -679,6 +679,7 @@ typedef struct { uint32_t reserved[4]; } amdsmi_proc_info_t; + //! Guaranteed maximum possible number of supported frequencies #define AMDSMI_MAX_NUM_FREQUENCIES 33 @@ -4743,33 +4744,39 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_ * number of processes currently running, * AMDSMI_STATUS_OUT_OF_RESOURCES will be returned. * + * For cases where max_process is not zero (0), it specifies the list's size limit. + * That is, the maximum size this list will be able to hold. After the list is built + * internally, as a return status, we will have AMDSMI_STATUS_OUT_OF_RESOURCES when + * the original size limit is smaller than the actual list of processes running. + * Hence, the caller is aware the list size needs to be resized, or + * AMDSMI_STATUS_SUCCESS otherwise. + * Holding a copy of max_process before it is passed in will be helpful for monitoring + * the allocations done upon each call since the max_process will permanently be changed + * to reflect the actual number of processes running. + * Note: For the specific cases where the return status is AMDSMI_STATUS_NO_PERM only. + * The list of process and size are AMDSMI_STATUS_SUCCESS, however there are + * processes details not fully retrieved due to permissions. + * + * * @param[out] list Reference to a user-provided buffer where the process * list will be returned. This buffer must contain at least - * max_processes entries of type smi_process_handle. Must be allocated + * max_processes entries of type amd_proc_info_list_t. Must be allocated * by user. * - * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail - */ -amdsmi_status_t -amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_process_handle_t *list); - -/** - * @brief Returns the process information of a given process. - * Engine usage show how much time the process spend using these engines in ns. - * - * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf} @platform{guest_windows} - * - * @param[in] processor_handle Device which to query - * - * @param[in] process Handle of process to query. + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, + * | ::AMDSMI_STATUS_NO_PERM on success, but not all details from process retrieved, + * | ::AMDSMI_STATUS_OUT_OF_RESOURCES, filled list buffer with data, but number of + * actual running processes is larger than the size provided. * - * @param[out] info Reference to a process information structure where to return - * information. Must be allocated by user. 
- * - * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ + // Note: If the reserved size for processes is smaller than the number of + // actual processes running. The AMDSMI_STATUS_OUT_OF_RESOURCES is + // an indication the caller should handle the situation (resize). + // The max_processes is always changed to reflect the actual size of + // list of processes running, so the caller knows where it is at. + // amdsmi_status_t -amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_process_handle_t process, amdsmi_proc_info_t *info); +amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list); /** @} End processinfo */ diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 527d5277..9dd39424 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -53,7 +53,20 @@ namespace amd { namespace smi { + +// PID, amdsmi_proc_info_t +using GPUComputeProcessList_t = std::map; +using ComputeProcessListClassType_t = uint16_t; + +enum class ComputeProcessListType_t : ComputeProcessListClassType_t +{ + kAllProcesses, + kAllProcessesOnDevice, +}; + + class AMDSmiGPUDevice: public AMDSmiProcessor { + public: AMDSmiGPUDevice(uint32_t gpu_id, uint32_t fd, std::string path, amdsmi_bdf_t bdf, AMDSmiDrm& drm): AMDSmiProcessor(AMD_GPU), gpu_id_(gpu_id), fd_(fd), path_(path), bdf_(bdf), drm_(drm) {} @@ -73,6 +86,10 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_bdf_t get_bdf(); bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); } uint32_t get_vendor_id(); + const GPUComputeProcessList_t& amdgpu_get_compute_process_list(ComputeProcessListType_t list_type = ComputeProcessListType_t::kAllProcessesOnDevice); + const GPUComputeProcessList_t& amdgpu_get_all_compute_process_list() { + return amdgpu_get_compute_process_list(ComputeProcessListType_t::kAllProcesses); + } amdsmi_status_t amdgpu_query_info(unsigned info_id, unsigned size, void *value) const; @@ -83,6 +100,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_status_t amdgpu_query_vbios(void *info) const; amdsmi_status_t amdgpu_query_driver_name(std::string& name) const; amdsmi_status_t amdgpu_query_driver_date(std::string& date) const; + private: uint32_t gpu_id_; uint32_t fd_; @@ -90,6 +108,10 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_bdf_t bdf_; uint32_t vendor_id_; AMDSmiDrm& drm_; + GPUComputeProcessList_t compute_process_list_; + int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, + ComputeProcessListType_t list_type); + }; diff --git a/py-interface/README.md b/py-interface/README.md index f965fb02..4199f1a9 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -882,43 +882,14 @@ except AmdSmiException as e: ### amdsmi_get_gpu_process_list -Description: Returns the list of processes for the given GPU +Description: Returns the list of processes for the given GPU. +The list is of type `amdsmi_proc_info_t` and holds information about the running process. 
Input parameters: * `processor_handle` device which to query -Output: List of process handles found - -Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: - -* `AmdSmiLibraryException` -* `AmdSmiRetryException` -* `AmdSmiParameterException` - -Example: - -```python -try: - devices = amdsmi_get_processor_handles() - if len(devices) == 0: - print("No GPUs on machine") - else: - for device in devices: - processes = amdsmi_get_gpu_process_list(device) - print(processes) -except AmdSmiException as e: - print(e) -``` - -### amdsmi_get_gpu_process_info - -Description: Returns the info for the given process - -Input parameters: - -* `processor_handle` device which to query -* `process_handle` process which to query +Output: List of process processes with fields Output: Dictionary with fields @@ -930,7 +901,7 @@ Field | Description `engine_usage` |
Subfield | Description
`gfx` | GFX engine usage in ns
`enc` | Encode engine usage in ns
`memory_usage` |
Subfield | Description
`gtt_mem` | GTT memory usage
`cpu_mem` | CPU memory usage
`vram_mem` | VRAM memory usage
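Since `amdsmi_get_gpu_process_list` now returns populated `amdsmi_proc_info_t` entries with the memory subfields listed above, callers can aggregate usage directly; a minimal sketch follows (the import and prior initialization are assumptions, and entries are accessed as structures per this patch), with a fuller example shown in the README diff after this:

```python
# Hypothetical sketch: sum VRAM usage of all processes found on each GPU.
# Assumes `from amdsmi import *` resolves, initialization has been handled elsewhere,
# and list entries are amdsmi_proc_info_t structures (attribute access), per this patch.
from amdsmi import *

try:
    for device in amdsmi_get_processor_handles():
        processes = amdsmi_get_gpu_process_list(device)
        total_vram = sum(proc.memory_usage.vram_mem for proc in processes)
        print(f"{len(processes)} processes, total VRAM: {total_vram / 1024:.0f} KiB")
except AmdSmiException as e:
    print(e)
```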
-Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function: +Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: * `AmdSmiLibraryException` * `AmdSmiRetryException` @@ -946,8 +917,11 @@ try: else: for device in devices: processes = amdsmi_get_gpu_process_list(device) - for process in processes: - print(amdsmi_get_gpu_process_info(device, process)) + if len(processes) == 0: + print("No processes running on this GPU") + else: + for process in processes: + print(process) except AmdSmiException as e: print(e) ``` diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index bf9fa6a0..bf45635d 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1923,15 +1923,16 @@ def amdsmi_get_gpu_ras_block_features_enabled( def amdsmi_get_gpu_process_list( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[amdsmi_wrapper.amdsmi_process_handle_t]: +) -> List[amdsmi_wrapper.amdsmi_proc_info_t]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) + # This will get populated with the number of processes found max_processes = ctypes.c_uint32(MAX_NUM_PROCESSES) - process_list = (amdsmi_wrapper.amdsmi_process_handle_t * + process_list = (amdsmi_wrapper.amdsmi_proc_info_t * max_processes.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_process_list( @@ -1939,42 +1940,37 @@ def amdsmi_get_gpu_process_list( ) ) - return [amdsmi_wrapper.amdsmi_process_handle_t(process_list[x])\ - for x in range(0, max_processes.value)] + result = [] + for index in range(max_processes.value): + result.append(process_list[index]) + return result def amdsmi_get_gpu_process_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, - process: amdsmi_wrapper.amdsmi_process_handle_t, + process: amdsmi_wrapper.amdsmi_proc_info_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - if not isinstance(process, amdsmi_wrapper.amdsmi_process_handle_t): + if not isinstance(process, amdsmi_wrapper.amdsmi_proc_info_t): raise AmdSmiParameterException( - process, amdsmi_wrapper.amdsmi_process_handle_t) - - info = amdsmi_wrapper.amdsmi_proc_info_t() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_process_info( - processor_handle, process, ctypes.byref(info) - ) - ) + process, amdsmi_wrapper.amdsmi_proc_info_t) return { - "name": info.name.decode("utf-8"), - "pid": info.pid, - "mem": info.mem, + "name": process.name.decode("utf-8"), + "pid": process.pid, + "mem": process.mem, "engine_usage": { - "gfx": info.engine_usage.gfx, - "enc": info.engine_usage.enc + "gfx": process.engine_usage.gfx, + "enc": process.engine_usage.enc }, "memory_usage": { - "gtt_mem": info.memory_usage.gtt_mem, - "cpu_mem": info.memory_usage.cpu_mem, - "vram_mem": info.memory_usage.vram_mem, + "gtt_mem": process.memory_usage.gtt_mem, + "cpu_mem": process.memory_usage.cpu_mem, + "vram_mem": process.memory_usage.vram_mem, }, } diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 560590ea..d9116193 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -2212,10 +2212,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_get_gpu_vram_usage.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_vram_usage_t)] amdsmi_get_gpu_process_list = 
_libraries['libamd_smi.so'].amdsmi_get_gpu_process_list amdsmi_get_gpu_process_list.restype = amdsmi_status_t -amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32)] -amdsmi_get_gpu_process_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_info -amdsmi_get_gpu_process_info.restype = amdsmi_status_t -amdsmi_get_gpu_process_info.argtypes = [amdsmi_processor_handle, amdsmi_process_handle_t, ctypes.POINTER(struct_amdsmi_proc_info_t)] +amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)] amdsmi_get_gpu_total_ecc_count = _libraries['libamd_smi.so'].amdsmi_get_gpu_total_ecc_count amdsmi_get_gpu_total_ecc_count.restype = amdsmi_status_t amdsmi_get_gpu_total_ecc_count.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_error_count_t)] @@ -2580,7 +2577,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', 'amdsmi_get_gpu_pm_metrics_info', 'amdsmi_get_gpu_power_profile_presets', - 'amdsmi_get_gpu_process_info', 'amdsmi_get_gpu_process_list', + 'amdsmi_get_gpu_process_list', 'amdsmi_get_gpu_ras_block_features_enabled', 'amdsmi_get_gpu_ras_feature_info', 'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index e10ab49b..b6420d79 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -902,7 +902,7 @@ typedef struct { struct { uint32_t cache_size_kb; /* In KB */ uint32_t cache_level; - /* + /* HSA_CACHE_TYPE_DATA 0x00000001 HSA_CACHE_TYPE_INSTRUCTION 0x00000002 HSA_CACHE_TYPE_CPU 0x00000004 @@ -1248,12 +1248,14 @@ typedef struct { */ typedef struct { uint32_t process_id; //!< Process ID - uint32_t pasid; //!< PASID + uint32_t pasid; //!< PASID: (Process Address Space ID) uint64_t vram_usage; //!< VRAM usage uint64_t sdma_usage; //!< SDMA usage in microseconds uint32_t cu_occupancy; //!< Compute Unit usage in percent } rsmi_process_info_t; +//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method +#define CU_OCCUPANCY_INVALID 0xFFFFFFFF /** * @brief Opaque handle to function-support object @@ -1447,7 +1449,7 @@ rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id); * * @details Given a device index @p dv_ind, a pointer to a caller provided * char buffer @p name, and a length of this buffer @p len, this function will - * write the name of the PCIe vendor (up to @p len characters) buffer @p name. + * write the name of the PCIe vendor (up to @p len characters) buffer @p name. * * If the integer ID associated with the PCIe vendor is not found in one of the * system files containing device name information (e.g. @@ -2294,9 +2296,9 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, /** * @brief Get gpu cache info. - * - * @details Given a device index @p dv_ind, and a pointer to a cache - * info @p info, this function will write the cache size and level + * + * @details Given a device index @p dv_ind, and a pointer to a cache + * info @p info, this function will write the cache size and level * to the location pointed to by @p info. * @param[in] dv_ind a device index * @@ -2930,16 +2932,16 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, * @brief Get the pm metrics table with provided device index. 
* * @details Given a device index @p dv_ind, @p pm_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the pm metrics name value pair * to the array at @p pm_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for pm_metrics, and user must call * free(pm_metrics) to free it after use. - * + * * @param[in] dv_ind a device index * * @param[inout] pm_metrics A pointerto an array to hold multiple PM metrics. On successs, - * the library will allocate memory of pm_metrics and write metrics to this array. + * the library will allocate memory of pm_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. * * @param[inout] num_of_metrics a pointer to uint32_t to which the number of @@ -2964,18 +2966,18 @@ rsmi_status_t rsmi_dev_pm_metrics_info_get(uint32_t dv_ind, * @brief Get the register metrics table with provided device index and registertype. * * @details Given a device index @p dv_ind, @p reg_type, @p reg_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the register metrics name value pair * to the array at @p reg_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for reg_metrics, and user must call * free(reg_metrics) to free it after use. - * + * * @param[in] dv_ind a device index - * + * * @param[in] reg_type The register type * * @param[inout] reg_metrics A pointerto an array to hold multiple register metrics. On successs, - * the library will allocate memory of reg_metrics and write metrics to this array. + * the library will allocate memory of reg_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. 
* * @param[inout] num_of_metrics a pointer to uint32_t to which the number of diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index fb2c2157..13d2c27b 100755 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -526,7 +526,9 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); } else { - return err; + //Some GFX revisions do not provide cu_occupancy debugfs method + proc->cu_occupancy = CU_OCCUPANCY_INVALID; + cu_count = 0; } } diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index e85646d2..7d375fb3 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1785,76 +1785,55 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ } amdsmi_status_t -amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_process_handle_t *list) { +amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) { AMDSMI_CHECK_INIT(); - - if (max_processes == nullptr) { + if (!max_processes) { return AMDSMI_STATUS_INVAL; } - std::vector pids; - uint32_t i = 0; - uint64_t size = 0; - amdsmi_status_t status; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) - return r; - - if (gpu_device->check_if_drm_is_supported()){ - amdsmi_bdf_t bdf = gpu_device->get_bdf(); - status = gpuvsmi_get_pids(bdf, pids, &size); - if (status != AMDSMI_STATUS_SUCCESS) { - return status; - } - if (*max_processes == 0 || (pids.size() == 0)) { - *max_processes = (uint32_t)pids.size(); - return AMDSMI_STATUS_SUCCESS; - } - if (!list) { - return AMDSMI_STATUS_INVAL; - } - if (*max_processes < pids.size()) { - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - for (auto &pid : pids) { - if (i >= *max_processes) { - break; + amdsmi_status_t status_code = get_gpu_device_from_handle(processor_handle, &gpu_device); + if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + return status_code; + } + + auto compute_process_list = gpu_device->amdgpu_get_compute_process_list(); + if ((*max_processes == 0) || compute_process_list.empty()) { + *max_processes = static_cast(compute_process_list.size()); + return amdsmi_status_t::AMDSMI_STATUS_SUCCESS; + } + if (!list) { + return amdsmi_status_t::AMDSMI_STATUS_INVAL; + } + + const auto max_processes_original_size(*max_processes); + auto idx = uint32_t(0); + auto is_required_previlegies_required(false); + for (auto& process : compute_process_list) { + if (idx < *max_processes) { + list[idx++] = static_cast(process.second); + // Note: If we could not read the process info for an existing process, + // that is likely a permission error. 
+ if (!is_required_previlegies_required && std::string(process.second.name).empty()) { + is_required_previlegies_required = true; } - list[i++] = (uint32_t)pid; + } else { + break; } - *max_processes = (uint32_t)pids.size(); - } - else { - // rocm - } - - return AMDSMI_STATUS_SUCCESS; -} - -amdsmi_status_t -amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_process_handle_t process, amdsmi_proc_info_t *info) { - AMDSMI_CHECK_INIT(); - - if (info == nullptr) { - return AMDSMI_STATUS_INVAL; } - amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) - return r; - - amdsmi_status_t status; - if (gpu_device->check_if_drm_is_supported()) { - status = gpuvsmi_get_pid_info(gpu_device->get_bdf(), process, *info); - if (status != AMDSMI_STATUS_SUCCESS) return status; - } - else { - // rocm - } - - return AMDSMI_STATUS_SUCCESS; + // Note: If the reserved size for processes is smaller than the number of + // actual processes running. The AMDSMI_STATUS_OUT_OF_RESOURCES is + // an indication the caller should handle the situation (resize). + // The max_processes is always changed to reflect the actual size of + // list of processes running, so the caller knows where it is at. + // Holding a copy of max_process before it is passed in will be helpful + // for the caller. + status_code = is_required_previlegies_required + ? amdsmi_status_t::AMDSMI_STATUS_NO_PERM : AMDSMI_STATUS_SUCCESS; + *max_processes = static_cast(compute_process_list.size()); + return (max_processes_original_size >= static_cast(compute_process_list.size())) + ? status_code : amdsmi_status_t::AMDSMI_STATUS_OUT_OF_RESOURCES; } amdsmi_status_t diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index 45d419f2..72c5cc4d 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -41,10 +41,16 @@ * */ -#include #include "amd_smi/impl/amd_smi_gpu_device.h" +#include "amd_smi/impl/amd_smi_common.h" +#include "amd_smi/impl/fdinfo.h" +#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_utils.h" +#include +#include +#include +#include namespace amd { namespace smi { @@ -148,6 +154,153 @@ amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_vbios(void *info) const { return drm_.amdgpu_query_vbios(fd, info); } + +int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, + ComputeProcessListType_t list_type) +{ + /** + * The first call to GetProcessInfo() helps to find the size it needs, + * so we can create a tailored size list. + */ + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + auto list_process_running_size = uint32_t(0); + auto list_process_allocation_size = uint32_t(0); + + status_code = rsmi_compute_process_info_get(nullptr, &list_process_running_size); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_process_running_size <= 0)) { + return status_code; + } + + /** + * The second call to GetProcessInfo() helps to set proper sizes for both, + * the raw array of processes (amdsmi_process_info_t) and list of processes (amdsmi_proc_info_t). 
+ */ + using RsmiDeviceList_t = uint32_t[]; + using RsmiProcessList_t = rsmi_process_info_t[]; + std::unique_ptr list_all_processes_ptr = std::make_unique(list_process_running_size); + + list_process_allocation_size = list_process_running_size; + status_code = rsmi_compute_process_info_get(list_all_processes_ptr.get(), &list_process_allocation_size); + if (status_code) { + return status_code; + } + + // Restore the original size to read + list_process_running_size = list_process_allocation_size; + if (list_process_running_size <= 0) { + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + } + + + /** + * Setup for the cases where the process list is by device. + */ + auto list_device_running_size = uint32_t(0); + auto list_device_allocation_size = uint32_t(0); + status_code = rsmi_num_monitor_devices(&list_device_running_size); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_device_running_size <= 0)) { + return status_code; + } + + + /** + * Complete the process information + */ + auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) { + auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info); + // If we cannot get the info from sysfs, save the minimum info + if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + asmi_proc_info.pid = rsmi_proc_info.process_id; + asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage; + } + + return status_code; + }; + + /** + * Get process information + */ + auto update_list_by_running_process = [&](const uint32_t process_id) { + auto status_result(true); + rsmi_process_info_t rsmi_proc_info{}; + auto status_code = rsmi_compute_process_info_by_pid_get(process_id, &rsmi_proc_info); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + status_result = false; + return status_result; + } + + amdsmi_proc_info_t tmp_asmi_proc_info{}; + get_process_info(rsmi_proc_info, tmp_asmi_proc_info); + compute_process_list.emplace(process_id, tmp_asmi_proc_info); + + return status_result; + }; + + + /** + * Devices used by a process. + */ + auto update_list_by_running_device = [&](const uint32_t process_id, + const uint32_t proc_addr_id) { + // Get all devices running this process + auto status_result(true); + std::unique_ptr list_device_ptr = std::make_unique(list_device_running_size); + list_device_allocation_size = list_device_running_size; + auto status_code = rsmi_compute_process_gpus_get(process_id, list_device_ptr.get(), &list_device_allocation_size); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + status_result = false; + return status_result; + } + + for (auto device_idx = uint32_t(0); device_idx < list_device_allocation_size; ++device_idx) { + // Is this device running this process? + if (list_device_ptr[device_idx] == get_gpu_id()) { + rsmi_process_info_t rsmi_dev_proc_info{}; + auto status_code = rsmi_compute_process_info_by_device_get(process_id, list_device_ptr[device_idx], &rsmi_dev_proc_info); + if ((status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) && + ((rsmi_dev_proc_info.process_id == process_id) && (rsmi_dev_proc_info.pasid == proc_addr_id))) { + amdsmi_proc_info_t tmp_asmi_proc_info{}; + get_process_info(rsmi_dev_proc_info, tmp_asmi_proc_info); + compute_process_list.emplace(process_id, tmp_asmi_proc_info); + } + } + } + + return status_result; + }; + + + /** + * Transfer/Save the ones linked to this device. 
+ */ + compute_process_list.clear(); + for (auto process_idx = uint32_t(0); process_idx < list_process_running_size; ++process_idx) { + if (list_type == ComputeProcessListType_t::kAllProcesses) { + if (update_list_by_running_process(list_all_processes_ptr[process_idx].process_id)) { + } + } + + if (list_type == ComputeProcessListType_t::kAllProcessesOnDevice) { + if (update_list_by_running_device(list_all_processes_ptr[process_idx].process_id, + list_all_processes_ptr[process_idx].pasid)) { + } + } + } + + return status_code; +} + +const GPUComputeProcessList_t& AMDSmiGPUDevice::amdgpu_get_compute_process_list(ComputeProcessListType_t list_type) +{ + auto error_code = get_compute_process_list_impl(compute_process_list_, list_type); + if (error_code) { + compute_process_list_.clear(); + } + + return compute_process_list_; +} + + } // namespace smi } // namespace amd diff --git a/src/amd_smi/fdinfo.cc b/src/amd_smi/fdinfo.cc index 997bc225..9b963a8a 100644 --- a/src/amd_smi/fdinfo.cc +++ b/src/amd_smi/fdinfo.cc @@ -220,12 +220,10 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, } } - closedir(d); - if (!pasids.size()) - return AMDSMI_STATUS_NOT_FOUND; - + // Note: If possible at all, try to get the name of the process/container. + // In case the other info fail, get at least something. std::ifstream filename(name_path.c_str()); std::string name; @@ -252,9 +250,12 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, if (strlen(info.container_name) > 0) break; } - info.pid = (uint32_t)pid; + if (!pasids.size()) { + return AMDSMI_STATUS_NOT_FOUND; + } + return AMDSMI_STATUS_SUCCESS; } diff --git a/tests/amd_smi_test/functional/process_info_read.cc b/tests/amd_smi_test/functional/process_info_read.cc index f0394593..d88bbe49 100755 --- a/tests/amd_smi_test/functional/process_info_read.cc +++ b/tests/amd_smi_test/functional/process_info_read.cc @@ -226,4 +226,5 @@ void TestProcInfoRead::Run(void) { } } delete []procs; + } From 08a3e76b269aa2ff72977b4aa7b80a06781ec12f Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 27 Mar 2024 17:10:59 -0500 Subject: [PATCH 15/18] SWDEV-445668 - Align topology JSON Updates: - [CLI] Updated json output to provide format similar to host eg. [ { "gpu": 0, "bdf": "0000:01:00.0", "links": [ { "gpu": 0, "bdf": "0000:01:00.0", "weight": 0, "link_status": "ENABLED", "link_type": "SELF", "num_hops": 0, "bandwidth": "N/A", "fb_sharing": "ENABLED" }, { "gpu": 1, "bdf": "0001:01:00.0", "weight": 15, "link_status": "ENABLED", "link_type": "XGMI", "num_hops": 1, "bandwidth": "50000-100000", "fb_sharing": "ENABLED" }, ... ] }, { ... Change-Id: I63217f63a4d6ebc23a8a84eaac9dbb7aff5f4cb4 Signed-off-by: Charis Poag --- CHANGELOG.md | 140 ++++++++++++++++++++++++++++++++++ amdsmi_cli/amdsmi_commands.py | 135 +++++++++++++++++++++++++++----- 2 files changed, 255 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f606276c..6a5fcc99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ## amd_smi_lib for ROCm 6.1.1 +### Added + +- N/A + ### Changed - **Updated metrics --clocks** @@ -143,6 +147,142 @@ GPU: 0 ... ``` +- **Updated `amd-smi topology --json` to align with host/guest** +Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). 
See examples shown below. + +*Previous format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "link_accessibility": { + "gpu_0": "ENABLED", + "gpu_1": "DISABLED" + }, + "weight": { + "gpu_0": 0, + "gpu_1": 40 + }, + "hops": { + "gpu_0": 0, + "gpu_1": 2 + }, + "link_type": { + "gpu_0": "SELF", + "gpu_1": "PCIE" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + }, + { + "gpu": 1, + "link_accessibility": { + "gpu_0": "DISABLED", + "gpu_1": "ENABLED" + }, + "weight": { + "gpu_0": 40, + "gpu_1": 0 + }, + "hops": { + "gpu_0": 2, + "gpu_1": 0 + }, + "link_type": { + "gpu_0": "PCIE", + "gpu_1": "SELF" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + } +] +``` + +*New format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "weight": 0, + "link_status": "ENABLED", + "link_type": "SELF", + "num_hops": 0, + "bandwidth": "N/A", + "fb_sharing": "ENABLED" + }, + { + "gpu": 1, + "bdf": "0001:01:00.0", + "weight": 15, + "link_status": "ENABLED", + "link_type": "XGMI", + "num_hops": 1, + "bandwidth": "50000-100000", + "fb_sharing": "ENABLED" + }, + ... + ] + }, + ... +] +``` +```shell +$ /opt/rocm/bin/amd-smi topology -a -t --json +[ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + } + ] + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + } + ] + } +] +``` + +### Optimizations + +- N/A + ### Fixed - **Fix for GPU reset error on non-amdgpu cards** diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index b9bc3a5e..431b23fa 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2763,20 +2763,115 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, # Populate the possible gpus topo_values = [] - for gpu in args.gpu: - gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) - topo_values.append({"gpu" : gpu_id}) - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) - self.logger.table_header += gpu_bdf.rjust(13) + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu) + topo_values.append({"gpu" : src_gpu_id}) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + topo_values[src_gpu_index]['bdf'] = src_gpu_bdf + self.logger.table_header += src_gpu_bdf.rjust(13) + + if not self.logger.is_json_format(): + continue # below is for JSON format only + + ########################## + # JSON formatting start # + ########################## + links = [] + # create json obj for data alignment + # dest_gpu_links = { + # "gpu": GPU # + # "bdf": BDF identification + # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) 
+ # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked + # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type + # "num_hops": num_hops - # of hops between devices + # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes + # "N/A" - self node or not connected devices + # "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should + # all have sharing enabled. + # } + + for dest_gpu_index, dest_gpu in enumerate(args.gpu): + link_type = "SELF" + if src_gpu != dest_gpu: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED: + link_type = "UNKNOWN" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS: + link_type = "PCIE" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_XGMI: + link_type = "XGMI" + else: + link_type = "N/A" + + numa_bw = "N/A" + if src_gpu != dest_gpu: + try: + bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu) + numa_bw = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + weight = 0 + num_hops = 0 + if src_gpu != dest_gpu: + weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) + num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] + link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) + if link_status: + link_status = "ENABLED" + else: + link_status = "DISABLED" + + # fb_sharing in BM - in a hive configuration, this is + # link_status = amdsmi_is_P2P_accessible(src,dest) + dest_gpu_links = { + "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), + "bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu), + "weight": weight, + "link_status": link_status, + "link_type": link_type, + "num_hops": num_hops, + "bandwidth": numa_bw, + "fb_sharing": link_status + } + if not args.access: # currently includes fb_sharing + del dest_gpu_links['link_status'] + del dest_gpu_links['fb_sharing'] + if not args.weight: + del dest_gpu_links['weight'] + if not args.link_type: + del dest_gpu_links['link_type'] + if not args.hops: + del dest_gpu_links['num_hops'] + if not args.numa_bw: + del dest_gpu_links['bandwidth'] + links.append(dest_gpu_links) + isEndOfDest = dest_gpu_index+1 == len(args.gpu) + isEndOfSrc = src_gpu_index+1 == len(args.gpu) + if isEndOfDest: + topo_values[src_gpu_index]['links'] = links + continue + if isEndOfSrc: + self.logger.multiple_device_output = topo_values + self.logger.print_output(multiple_device_enabled=True, tabular=True) + return + ########################## + # JSON formatting end # + ########################## if args.access: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + 
tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_links = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2808,11 +2903,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.weight: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_weight = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2845,11 +2940,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.hops: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_hops = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2882,11 +2977,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.link_type: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2924,11 +3019,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.numa_bw: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) From 9758a8bc3318f02d7aa2ef14165f4ace0fca6201 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 5 Apr 2024 01:58:39 -0500 Subject: [PATCH 16/18] Removed fb_sharing fields from Linux BM Signed-off-by: Maisam Arif Change-Id: Ia2942b9d33699ced1683270454c479701bce1246 --- CHANGELOG.md | 2 -- amdsmi_cli/amdsmi_commands.py | 15 +++++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a5fcc99..bea6fc89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -219,7 +219,6 @@ $ amd-smi topology --json "link_type": "SELF", "num_hops": 0, "bandwidth": "N/A", - "fb_sharing": "ENABLED" }, { "gpu": 1, @@ -229,7 +228,6 @@ $ amd-smi topology --json "link_type": "XGMI", "num_hops": 1, 
"bandwidth": "50000-100000", - "fb_sharing": "ENABLED" }, ... ] diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 431b23fa..ee6159e5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2782,13 +2782,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, # "gpu": GPU # # "bdf": BDF identification # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) - # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked + # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked; Correlated to access # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type # "num_hops": num_hops - # of hops between devices # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes # "N/A" - self node or not connected devices - # "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should - # all have sharing enabled. # } for dest_gpu_index, dest_gpu in enumerate(args.gpu): @@ -2818,7 +2816,7 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, weight = 0 num_hops = 0 - if src_gpu != dest_gpu: + if src_gpu != dest_gpu: weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) @@ -2827,7 +2825,6 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, else: link_status = "DISABLED" - # fb_sharing in BM - in a hive configuration, this is # link_status = amdsmi_is_P2P_accessible(src,dest) dest_gpu_links = { "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), @@ -2837,11 +2834,9 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, "link_type": link_type, "num_hops": num_hops, "bandwidth": numa_bw, - "fb_sharing": link_status } - if not args.access: # currently includes fb_sharing + if not args.access: del dest_gpu_links['link_status'] - del dest_gpu_links['fb_sharing'] if not args.weight: del dest_gpu_links['weight'] if not args.link_type: @@ -2851,9 +2846,9 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if not args.numa_bw: del dest_gpu_links['bandwidth'] links.append(dest_gpu_links) - isEndOfDest = dest_gpu_index+1 == len(args.gpu) + dest_end = dest_gpu_index+1 == len(args.gpu) isEndOfSrc = src_gpu_index+1 == len(args.gpu) - if isEndOfDest: + if dest_end: topo_values[src_gpu_index]['links'] = links continue if isEndOfSrc: From 50450a2a69bfe33da51d5398deed381a340530cc Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 5 Apr 2024 02:30:08 -0500 Subject: [PATCH 17/18] Added amdsmi_get_gpu_process_info python library documentation Signed-off-by: Maisam Arif Change-Id: I2218bf664a8a155e6b3085378db0fb20f3be3f70 --- py-interface/README.md | 44 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/py-interface/README.md b/py-interface/README.md index 4199f1a9..19454121 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -882,14 +882,46 @@ except AmdSmiException as e: ### amdsmi_get_gpu_process_list -Description: Returns the list of processes for the given GPU. -The list is of type `amdsmi_proc_info_t` and holds information about the running process. 
+Description: Returns the list of processes running on the target GPU.

Input parameters:

* `processor_handle` device which to query

-Output: List of process processes with fields
+Output: List of `amdsmi_proc_info_t` process objects running on the target GPU; can be empty
+
+Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
+
+* `AmdSmiLibraryException`
+* `AmdSmiRetryException`
+* `AmdSmiParameterException`
+
+Example:
+
+```python
+try:
+    devices = amdsmi_get_processor_handles()
+    if len(devices) == 0:
+        print("No GPUs on machine")
+    else:
+        for device in devices:
+            processes = amdsmi_get_gpu_process_list(device)
+            if len(processes) == 0:
+                print("No processes running on this GPU")
+            else:
+                for process in processes:
+                    print(amdsmi_get_gpu_process_info(device, process))
+except AmdSmiException as e:
+    print(e)
+```
+
+### amdsmi_get_gpu_process_info
+
+Description: Returns info about a process, given the target GPU and the corresponding `amdsmi_proc_info_t` object
+
+Input parameters:
+
+* `processor_handle` device which to query
+* `process` the `amdsmi_proc_info_t` object of the process to query

Output: Dictionary with fields

Field | Description
---|---
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` | Subfields: `gfx` (GFX engine usage in ns), `enc` (encode engine usage in ns)
-`memory_usage` | Subfields: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
+`memory_usage` | Subfields: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
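
The snippet below is an editor-added sketch, not part of the original hunk: it shows how a caller might consume the dictionary described above. It assumes the py-interface is imported wholesale (`from amdsmi import *`) and uses the standard `amdsmi_init()`/`amdsmi_shut_down()` entry points together with the subfield names listed in the table.

```python
from amdsmi import *

amdsmi_init()
try:
    for device in amdsmi_get_processor_handles():
        for process in amdsmi_get_gpu_process_list(device):
            info = amdsmi_get_gpu_process_info(device, process)
            # 'memory_usage' carries the per-heap breakdown documented above
            usage = info['memory_usage']
            print(f"pid={info['pid']} vram={usage['vram_mem']} gtt={usage['gtt_mem']}")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```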
Date: Fri, 5 Apr 2024 02:31:08 -0500
Subject: [PATCH 18/18] Bump Version to 24.5.1.0

Signed-off-by: Maisam Arif
Change-Id: I842e223b78f337a39098f652fa6e7ef51948fbaf
---
 CMakeLists.txt           | 2 +-
 amdsmi_cli/README.md     | 2 +-
 docs/doxygen/Doxyfile    | 2 +-
 include/amd_smi/amdsmi.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97dbc610..6cf0d289 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ find_program(GIT NAMES git)
 ## Setup the package version based on git tags.
 set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver")
-get_package_version_number("24.5.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
+get_package_version_number("24.5.1" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
 message("Package version: ${PKG_VERSION_STR}")
 set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
 set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md
index 72028e5b..06e89147 100644
--- a/amdsmi_cli/README.md
+++ b/amdsmi_cli/README.md
@@ -79,7 +79,7 @@ amd-smi will report the version and current platform detected when running the c
 ~$ amd-smi
 usage: amd-smi [-h] ...
-AMD System Management Interface | Version: 24.5.0.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal
+AMD System Management Interface | Version: 24.5.1.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal
 options:
   -h, --help show this help message and exit
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index de8ab73b..ef62d4d1 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
-PROJECT_NUMBER = "24.5.0.0"
+PROJECT_NUMBER = "24.5.1.0"
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 3f7bd398..ba73c093 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -154,7 +154,7 @@ typedef enum {
 #define AMDSMI_LIB_VERSION_MAJOR 5
 //! Minor version should be updated for each API change, but without changing headers
-#define AMDSMI_LIB_VERSION_MINOR 0
+#define AMDSMI_LIB_VERSION_MINOR 1
 //! Release version should be set to 0 as default and can be updated by the PMs for each CSP point release
 #define AMDSMI_LIB_VERSION_RELEASE 0
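
As an editor-added illustration of the version bump above (not part of the patch): a Python caller can confirm which library build it is talking to before relying on the changed `amdsmi_get_gpu_process_list()` return type. This is a minimal sketch assuming the py-interface exposes `amdsmi_get_lib_version()`; the exact keys of the returned dictionary may vary between releases.

```python
from amdsmi import *

amdsmi_init()
try:
    # With this patch series applied, the library reports minor version 1
    # (AMDSMI_LIB_VERSION_MINOR bumped from 0 to 1 above).
    version = amdsmi_get_lib_version()
    print(f"amd_smi_lib version info: {version}")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```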