From 583e5e99bfc4cf1a8bb8ba73a0f0622a06645523 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Tue, 19 Mar 2024 19:54:01 -0500 Subject: [PATCH 01/18] Update ROCm 6.0/6.1 CHANGELOG.md & README.md * Updates: - [CHANGELOG.md] Add 6.1 and update 6.0 changes - [README.md] Update README.md with ROCm install instructions Change-Id: Ic701ebcb00e5d0af54d8f97707c1cec71a0aac4c Signed-off-by: Charis Poag --- CHANGELOG.md | 408 ++++++++++++++++++++++++++++++++- README.md | 15 +- amdsmi_cli/README.md | 2 +- py-interface/amdsmi_wrapper.py | 3 +- 4 files changed, 420 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37513075..00728e53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,410 @@ # Change Log for AMD SMI Library -Full documentation for amd_smi_lib is available at [https://docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). +Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). + +***All information listed below is for reference and subject to change.*** + +## amd_smi_lib for ROCm 6.1.0 + +### Added +- **Added Monitor Command** +Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. +```shell +$ amd-smi monitor -h +usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] + [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] + [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n] + [-d] [-s] [-e] [-v] [-r] + +Monitor a target device for the specified arguments. +If no arguments are provided, all arguments will be enabled. +Use the watch arguments to run continuously + +Monitor Arguments: + -h, --help show this help message and exit + -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: + ID: 0 | BDF: 0000:01:00.0 | UUID: 4eff74a0-0000-1000-802d-1d762a397f73 + all | Selects all devices + -U, --cpu CPU [CPU ...] Select a CPU ID from the possible choices: + ID: 0 + all | Selects all devices + -O, --core CORE [CORE ...] Select a Core ID from the possible choices: + ID: 0 - 23 + all | Selects all devices + -w, --watch INTERVAL Reprint the command in a loop of INTERVAL seconds + -W, --watch_time TIME The total TIME to watch the given command + -i, --iterations ITERATIONS Total number of ITERATIONS to loop on the given command + -p, --power-usage Monitor power usage in Watts + -t, --temperature Monitor temperature in Celsius + -u, --gfx Monitor graphics utilization (%) and clock (MHz) + -m, --mem Monitor memory utilization (%) and clock (MHz) + -n, --encoder Monitor encoder utilization (%) and clock (MHz) + -d, --decoder Monitor decoder utilization (%) and clock (MHz) + -s, --throttle-status Monitor thermal throttle status + -e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts + -v, --vram-usage Monitor memory usage in MB + -r, --pcie Monitor PCIe Tx/Rx in MB/s + +Command Modifiers: + --json Displays output in JSON format (human readable by default). + --csv Displays output in CSV format (human readable by default). + --file FILE Saves output into a file on the provided path (stdout by default). 
+ --loglevel LEVEL Set the logging level from the possible choices: + DEBUG, INFO, WARNING, ERROR, CRITICAL +``` +```shell +$ amd-smi monitor -ptumv +GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_USED VRAM_TOTAL + 0 171 W 32 °C 33 °C 0 % 114 MHz 0 % 900 MHz 283 MB 196300 MB + 1 175 W 33 °C 34 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 2 177 W 31 °C 33 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 3 172 W 33 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 4 178 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 284 MB 196300 MB + 5 176 W 33 °C 35 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 6 176 W 32 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB + 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB +``` + +- **Integrated ESMI Tool** +Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: + - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh + - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh + + See a few examples listed below. + +```shell +$ amd-smi static -U all +CPU: 0 + SMU: + FW_VERSION: 85.90.0 + INTERFACE_VERSION: + PROTO VERSION: 6 +``` +```shell +$ amd-smi metric -O 0 1 2 +CORE: 0 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 1 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A + +CORE: 2 + BOOST_LIMIT: + VALUE: 400 MHz + CURR_ACTIVE_FREQ_CORE_LIMIT: + VALUE: 400 MHz + CORE_ENERGY: + VALUE: N/A +``` +```shell +$ amd-smi metric -U all +CPU: 0 + POWER_METRICS: + SOCKET POWER: 102675 mW + SOCKET POWER LIMIT: 550000 mW + SOCKET MAX POWER LIMIT: 550000 mW + PROCHOT: + PROCHOT_STATUS: 0 + FREQ_METRICS: + FCLKMEMCLK: + FCLK: 2000 MHz + MCLK: 1300 MHz + CCLKFREQLIMIT: 400 MHz + SOC_CURRENT_ACTIVE_FREQ_LIMIT: + FREQ: 400 MHz + FREQ_SRC: [HSMP Agent] + SOC_FREQ_RANGE: + MAX_SOCKET_FREQ: 3700 MHz + MIN_SOCKET_FREQ: 400 MHz + C0_RESIDENCY: + RESIDENCY: 4 % + SVI_TELEMETRY_ALL_RAILS: + POWER: 102673 mW + METRIC_VERSION: + VERSION: 11 + METRICS_TABLE: + CPU_FAMILY: 25 + CPU_MODEL: 144 + RESPONSE: + MTBL_ACCUMULATION_COUNTER: 2887162626 + MTBL_MAX_SOCKET_TEMPERATURE: 41.0 °C + MTBL_MAX_VR_TEMPERATURE: 39.0 °C + MTBL_MAX_HBM_TEMPERATURE: 40.0 °C + MTBL_MAX_SOCKET_TEMPERATURE_ACC: 108583340881.125 °C + MTBL_MAX_VR_TEMPERATURE_ACC: 109472702595.0 °C + MTBL_MAX_HBM_TEMPERATURE_ACC: 111516663941.0 °C + MTBL_SOCKET_POWER_LIMIT: 550.0 W + MTBL_MAX_SOCKET_POWER_LIMIT: 550.0 W + MTBL_SOCKET_POWER: 102.678 W + MTBL_TIMESTAMP_RAW: 288731677361880 + MTBL_TIMESTAMP_READABLE: Tue Mar 19 12:32:21 2024 + MTBL_SOCKET_ENERGY_ACC: 166127.84 kJ + MTBL_CCD_ENERGY_ACC: 3317.837 kJ + MTBL_XCD_ENERGY_ACC: 21889.147 kJ + MTBL_AID_ENERGY_ACC: 121932.397 kJ + MTBL_HBM_ENERGY_ACC: 18994.108 kJ + MTBL_CCLK_FREQUENCY_LIMIT: 3.7 GHz + MTBL_GFXCLK_FREQUENCY_LIMIT: 0.0 MHz + MTBL_FCLK_FREQUENCY: 1999.988 MHz + MTBL_UCLK_FREQUENCY: 1299.993 MHz + MTBL_SOCCLK_FREQUENCY: [35.716, 35.715, 35.714, 35.714] MHz + MTBL_VCLK_FREQUENCY: [0.0, 53.749, 53.749, 53.749] MHz + MTBL_DCLK_FREQUENCY: [7.143, 44.791, 44.791, 44.791] MHz + MTBL_LCLK_FREQUENCY: [20.872, 18.75, 35.938, 599.558] MHz + MTBL_FCLK_FREQUENCY_TABLE: [1200.0, 1600.0, 1900.0, 2000.0] MHz + MTBL_UCLK_FREQUENCY_TABLE: [900.0, 1100.0, 1200.0, 1300.0] MHz + MTBL_SOCCLK_FREQUENCY_TABLE: [800.0, 1000.0, 1142.857, 1142.857] MHz + 
MTBL_VCLK_FREQUENCY_TABLE: [914.286, 1300.0, 1560.0, 1720.0] MHz + MTBL_DCLK_FREQUENCY_TABLE: [711.111, 975.0, 1300.0, 1433.333] MHz + MTBL_LCLK_FREQUENCY_TABLE: [600.0, 844.444, 1150.0, 1150.0] MHz + MTBL_CCLK_FREQUENCY_ACC: [4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, 4399751656.639, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] GHz + MTBL_GFXCLK_FREQUENCY_ACC: [0.0, 0.0, 250534397827.603, 251546257401.82, 250811364089.836, + 249999070486.505, 251622633562.855, 251342375116.05] MHz + MTBL_GFXCLK_FREQUENCY: [0.0, 0.0, 31.091, 31.414, 31.141, 31.478, 31.32, 31.453] + MHz + MTBL_MAX_CCLK_FREQUENCY: 3.7 GHz + MTBL_MIN_CCLK_FREQUENCY: 0.4 GHz + MTBL_MAX_GFXCLK_FREQUENCY: 2100.0 MHz + MTBL_MIN_GFXCLK_FREQUENCY: 500.0 MHz + MTBL_MAX_LCLK_DPM_RANGE: 2 + MTBL_MIN_LCLK_DPM_RANGE: 0 + MTBL_XGMI_WIDTH: 0.0 + MTBL_XGMI_BITRATE: 0.0 Gbps + MTBL_XGMI_READ_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_XGMI_WRITE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] Gbps + MTBL_SOCKET_C0_RESIDENCY: 4.329 % + MTBL_SOCKET_GFX_BUSY: 0.0 % + MTBL_HBM_BANDWIDTH_UTILIZATION: 0.001 % + MTBL_SOCKET_C0_RESIDENCY_ACC: 311523106.34 + MTBL_SOCKET_GFX_BUSY_ACC: 84739.281 + MTBL_HBM_BANDWIDTH_ACC: 33231180.073 Gbps + MTBL_MAX_HBM_BANDWIDTH: 5324.801 Gbps + MTBL_DRAM_BANDWIDTH_UTILIZATION_ACC: 612843.699 + MTBL_PCIE_BANDWIDTH_ACC: [0.0, 0.0, 0.0, 0.0] Gbps + MTBL_PROCHOT_RESIDENCY_ACC: 0 + MTBL_PPT_RESIDENCY_ACC: 2887162626 + MTBL_SOCKET_THM_RESIDENCY_ACC: 2887162626 + MTBL_VR_THM_RESIDENCY_ACC: 0 + MTBL_HBM_THM_RESIDENCY_ACC: 2887162626 + SOCKET_ENERGY: + RESPONSE: N/A + DDR_BANDWIDTH: + RESPONSE: N/A + CPU_TEMP: + RESPONSE: N/A +``` +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. 
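+In addition to the CLI examples below, these counters can also be read programmatically. A minimal Python sketch (assuming the `amdsmi` Python package that ships with amd-smi-lib, and using `amdsmi_get_pcie_info` as documented in the py-interface README) could look like:
+```python
+from amdsmi import *
+
+try:
+    amdsmi_init()
+    for device in amdsmi_get_processor_handles():
+        # 'pcie_metric' holds the PCIe error counters described above
+        pcie_metric = amdsmi_get_pcie_info(device)["pcie_metric"]
+        print("replay:", pcie_metric["pcie_replay_count"],
+              "| l0_to_recovery:", pcie_metric["pcie_l0_to_recovery_count"],
+              "| nak_sent:", pcie_metric["pcie_nak_sent_count"],
+              "| nak_received:", pcie_metric["pcie_nak_received_count"])
+except AmdSmiException as e:
+    print(e)
+finally:
+    amdsmi_shut_down()
+```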
+ +```shell +$ amd-smi metric -P +GPU: 0 + PCIE: + WIDTH: 16 + SPEED: 16 GT/s + REPLAY_COUNT: 0 + L0_TO_RECOVERY_COUNT: 1 + REPLAY_ROLL_OVER_COUNT: 0 + NAK_SENT_COUNT: 0 + NAK_RECEIVED_COUNT: 0 + CURRENT_BANDWIDTH_SENT: N/A + CURRENT_BANDWIDTH_RECEIVED: N/A + MAX_PACKET_SIZE: N/A +``` +```shell +$ amd-smi metric --usage +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + MM_ACTIVITY: N/A + VCN_ACTIVITY: [0 %, 0 %, 0 %, 0 %] + JPEG_ACTIVITY: [0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 + %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, 0 %, + 0 %, 0 %, 0 %, 0 %] + +``` +- **Added AMDSMI Tool Version** +AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. +The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. +The AMDSMI Library version is the library package version number. +The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. +```shell +$ amd-smi version +AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 +``` + +- **Added XGMI table** +Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). +```shell +$ amd-smi xgmi +LINK METRIC TABLE: + bdf bit_rate max_bandwidth link_type 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +GPU0 0000:0c:00.0 32 Gb/s 512 Gb/s XGMI + Read N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB 2 KB + Write N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU1 0000:22:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB N/A 2 KB 2 KB 1 KB 2 KB 1 KB 2 KB + Write 0 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB +GPU2 0000:38:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB N/A 2 KB 1 KB 2 KB 0 KB 0 KB + Write 0 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB 1 KB +GPU3 0000:5c:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 2 KB N/A 1 KB 0 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB 1 KB +GPU4 0000:9f:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 1 KB 0 KB 0 KB N/A 2 KB 0 KB 2 KB + Write 0 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB 1 KB +GPU5 0000:af:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 2 KB 0 KB 0 KB 0 KB N/A 2 KB 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB 1 KB +GPU6 0000:bf:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A 0 KB + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A 1 KB +GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI + Read 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB 0 KB N/A + Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A + +``` +- **Added units of measure to JSON output.** +We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. + +Ex. +```shell +amd-smi metric -p --json +[ + { + "gpu": 0, + "power": { + "socket_power": { + "value": 10, + "unit": "W" + }, + "gfx_voltage": { + "value": 6, + "unit": "mV" + }, + "soc_voltage": { + "value": 918, + "unit": "mV" + }, + "mem_voltage": { + "value": 1250, + "unit": "mV" + }, + "power_management": "ENABLED", + "throttle_status": "UNTHROTTLED" + } + } +] +``` + +### Changed + +- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** +We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. 
Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. +```shell +$ amd-smi topology +ACCESS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:22:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:38:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:5c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:9f:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:af:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:bf:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED +0000:df:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED + +WEIGHT TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 15 15 15 15 15 15 15 +0000:22:00.0 15 0 15 15 15 15 15 15 +0000:38:00.0 15 15 0 15 15 15 15 15 +0000:5c:00.0 15 15 15 0 15 15 15 15 +0000:9f:00.0 15 15 15 15 0 15 15 15 +0000:af:00.0 15 15 15 15 15 0 15 15 +0000:bf:00.0 15 15 15 15 15 15 0 15 +0000:df:00.0 15 15 15 15 15 15 15 0 + +HOPS TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 0 1 1 1 1 1 1 1 +0000:22:00.0 1 0 1 1 1 1 1 1 +0000:38:00.0 1 1 0 1 1 1 1 1 +0000:5c:00.0 1 1 1 0 1 1 1 1 +0000:9f:00.0 1 1 1 1 0 1 1 1 +0000:af:00.0 1 1 1 1 1 0 1 1 +0000:bf:00.0 1 1 1 1 1 1 0 1 +0000:df:00.0 1 1 1 1 1 1 1 0 + +LINK TYPE TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 SELF XGMI XGMI XGMI XGMI XGMI XGMI XGMI +0000:22:00.0 XGMI SELF XGMI XGMI XGMI XGMI XGMI XGMI +0000:38:00.0 XGMI XGMI SELF XGMI XGMI XGMI XGMI XGMI +0000:5c:00.0 XGMI XGMI XGMI SELF XGMI XGMI XGMI XGMI +0000:9f:00.0 XGMI XGMI XGMI XGMI SELF XGMI XGMI XGMI +0000:af:00.0 XGMI XGMI XGMI XGMI XGMI SELF XGMI XGMI +0000:bf:00.0 XGMI XGMI XGMI XGMI XGMI XGMI SELF XGMI +0000:df:00.0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI SELF + +NUMA BW TABLE: + 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 +0000:0c:00.0 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:22:00.0 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:38:00.0 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 +0000:5c:00.0 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 +0000:9f:00.0 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 +0000:af:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 +0000:bf:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 +0000:df:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A +``` + +### Optimizations +- N/A + +### Fixed + +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. 
+- **Fix for devices which have an older pyyaml installed**
+On platforms identified as having an older pyyaml or pip version, we now manually update both pip and pyyaml as needed. This corrects the issue shown below. The fix impacts the following CLI commands:
+ - `amd-smi list`
+ - `amd-smi static`
+ - `amd-smi firmware`
+ - `amd-smi metric`
+ - `amd-smi topology`
+```shell
+TypeError: dump_all() got an unexpected keyword argument 'sort_keys'
+```
+- **Fix for crash when user is not a member of video/render groups**
+AMD SMI now uses the same device mutex handler as rocm-smi. This helps avoid crashes when DRM/device data is inaccessible to the logged-in user.
+
+
+
+### Known Issues
+
+- N/A

 ## amd_smi_lib for ROCm 6.0.0

@@ -26,7 +430,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese

 ### Optimizations

-- N/A
+- Updated to C++17, gtest-1.14, and cmake 3.14

 ### Fixed

diff --git a/README.md b/README.md
index 109f890e..e50ac965 100755
--- a/README.md
+++ b/README.md
@@ -26,11 +26,18 @@ installed to query firmware information and hardware IPs.

 ### Installation

-* Install amdgpu driver
-* Install amd-smi-lib package through package manager
+### Install amdgpu using ROCm
+* Install amdgpu driver:
+See the example below; your release and link may differ. Running `amdgpu-install --usecase=rocm` installs both the amdgpu driver and the AMD SMI packages on your device.
+```shell
+sudo apt update
+wget https://repo.radeon.com/amdgpu-install/6.0.2/ubuntu/jammy/amdgpu-install_6.0.60002-1_all.deb
+sudo apt install ./amdgpu-install_6.0.60002-1_all.deb
+sudo amdgpu-install --usecase=rocm
+```
 * amd-smi --help

-### Install Example for Ubuntu 22.04
+### Install Example for Ubuntu 22.04 (without ROCm)

 ``` bash
 apt install amd-smi-lib
@@ -277,4 +284,4 @@ Path to the program `amdsmitst`: build/tests/amd_smi_test/

 The information contained herein is for informational purposes only, and is subject to change without notice. In addition, any stated support is planned and is also subject to change. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein.

-© 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+© 2023-2024 Advanced Micro Devices, Inc. All Rights Reserved.
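Following the installation steps above, the Python interface offers a quick sanity check beyond `amd-smi --help`. A minimal sketch (assuming the `amdsmi` Python package that ships with amd-smi-lib, and using `amdsmi_get_gpu_vram_usage` as documented in the py-interface README):

```python
from amdsmi import *

try:
    amdsmi_init()
    devices = amdsmi_get_processor_handles()
    if len(devices) == 0:
        print("No GPUs on machine")
    else:
        for device in devices:
            # Reports 'vram_used' and 'vram_total' for each detected GPU
            vram = amdsmi_get_gpu_vram_usage(device)
            print("vram_used:", vram["vram_used"], "vram_total:", vram["vram_total"])
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```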
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index cf2b81df..3273f807 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -15,7 +15,7 @@ Recommended: At least one AMD GPU with AMD driver installed ### Installation -* Install amdgpu driver +* [Install amdgpu driver](../README.md#install-amdgpu-using-rocm) * Optionally install amd_hsmp driver for ESMI CPU functions * Install amd-smi-lib package through package manager * amd-smi --help diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 8fcdb375..f718dcfa 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -1688,7 +1688,8 @@ class struct_amdsmi_error_count_t(Structure): struct_amdsmi_error_count_t._fields_ = [ ('correctable_count', ctypes.c_uint64), ('uncorrectable_count', ctypes.c_uint64), - ('reserved', ctypes.c_uint64 * 2), + ('deferred_count', ctypes.c_uint64), + ('reserved', ctypes.c_uint64 * 5), ] amdsmi_error_count_t = struct_amdsmi_error_count_t From a3407090c3eaa339a53d8a9cac9f8d51d04337ed Mon Sep 17 00:00:00 2001 From: Deepak Mewar Date: Mon, 18 Mar 2024 04:49:17 -0400 Subject: [PATCH 02/18] Updated README with esmi sample code Change-Id: I50de7926fd76757e5810e8c531bcb6f5770ff454 --- README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e50ac965..bd25e588 100755 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ The only required AMD-SMI call for any program that wants to use AMD-SMI is the When AMD-SMI is no longer being used, `amdsmi_shut_down()` should be called. This provides a way to do any releasing of resources that AMD-SMI may have held. -A simple "Hello World" type program that displays the temperature of detected devices would look like this: +1) A simple "Hello World" type program that displays the temperature of detected devices would look like this: ```c++ #include @@ -184,6 +184,67 @@ int main() { } ``` +2) A sample program that displays the power of detected cpus would look like this: + +```c++ +#include +#include +#include "amd_smi/amdsmi.h" + +int main(int argc, char **argv) { + amdsmi_status_t ret; + uint32_t socket_count = 0; + + // Initialize amdsmi for AMD CPUs + ret = amdsmi_init(AMDSMI_INIT_AMD_CPUS); + + ret = amdsmi_get_socket_handles(&socket_count, nullptr); + + // Allocate the memory for the sockets + std::vector sockets(socket_count); + + // Get the sockets of the system + ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]); + + std::cout << "Total Socket: " << socket_count << std::endl; + + // For each socket, get cpus + for (uint32_t i = 0; i < socket_count; i++) { + uint32_t cpu_count = 0; + + // Set processor type as AMD_CPU + processor_type_t processor_type = AMD_CPU; + ret = amdsmi_get_processor_handles_by_type(sockets[i], processor_type, nullptr, &cpu_count); + + // Allocate the memory for the cpus + std::vector plist(cpu_count); + + // Get the cpus for each socket + ret = amdsmi_get_processor_handles_by_type(sockets[i], processor_type, &plist[0], &cpu_count); + + for (uint32_t index = 0; index < plist.size(); index++) { + uint32_t socket_power; + std::cout<<"CPU "<(socket_power)/1000< Date: Thu, 21 Mar 2024 14:53:35 -0500 Subject: [PATCH 03/18] SWDEV-438593 - Updated proccess output error handling Signed-off-by: Maisam Arif Change-Id: I67747da06362428587dab7467d85d8c9296d442e --- amdsmi_cli/amdsmi_commands.py | 39 ++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 14 deletions(-) 
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 697513f5..27152afc 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2511,6 +2511,8 @@ def process(self, args, multiple_devices=False, watching_output=False, if args.pid: process_pids = [] for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue pid = str(process_info['process_info']['pid']) if str(args.pid) == pid: process_pids.append(process_info) @@ -2520,36 +2522,45 @@ def process(self, args, multiple_devices=False, watching_output=False, if args.name: process_names = [] for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + continue process_name = str(process_info['process_info']['name']).lower() if str(args.name).lower() == process_name: process_names.append(process_info) filtered_process_values = process_names + logging.debug(f"Process Info for GPU {gpu_id} | {filtered_process_values}") + multiple_devices_csv_override = False # Convert and store output by pid for csv format if self.logger.is_csv_format(): - for process_info in filtered_process_values: - for key, value in process_info['process_info'].items(): - multiple_devices_csv_override = True - - if watching_output: - self.logger.store_output(args.gpu, 'timestamp', int(time.time())) - self.logger.store_output(args.gpu, key, value) + # Check for empty list first + if filtered_process_values == []: + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') + else: + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') + else: + for key, value in process_info['process_info'].items(): + multiple_devices_csv_override = True + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, key, value) - self.logger.store_multiple_device_output() + self.logger.store_multiple_device_output() else: - # Remove brackets if there is only one value - if len(filtered_process_values) == 1: - filtered_process_values = filtered_process_values[0] - if watching_output: self.logger.store_output(args.gpu, 'timestamp', int(time.time())) # Store values in logger.output if filtered_process_values == []: - self.logger.store_output(args.gpu, 'values', {'process_info': 'Not Found'}) + self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') else: - self.logger.store_output(args.gpu, 'values', filtered_process_values) + for process_info in filtered_process_values: + if process_info['process_info'] == "N/A": + process_info['process_info'] = 'No running processes detected' + self.logger.store_output(args.gpu, 'process_info', process_info['process_info']) if multiple_devices: self.logger.store_multiple_device_output() From 1310c767ce6cc600c37256dff156782e125bb868 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 5 Mar 2024 14:01:06 -0600 Subject: [PATCH 04/18] fix: [SWDEV-448201] [rocm/amd_smi_lib] Adds Add PCIE Errors Code changes related to the following: * amdsmi_get_pcie_info() * CLI * examples Change-Id: Ie0b7053e77c88fb18309c16e74bce75d862c45a9 Signed-off-by: Oliveira, Daniel --- amdsmi_cli/amdsmi_commands.py | 107 ++++++++++----------------- example/amd_smi_drm_example.cc | 8 ++ include/amd_smi/amdsmi.h | 2 +- include/amd_smi/impl/amd_smi_utils.h | 54 ++++++++++++++ py-interface/README.md | 58 +++++++-------- 
py-interface/amdsmi_interface.py | 47 +++++++++++- src/amd_smi/amd_smi.cc | 28 ++++++- 7 files changed, 201 insertions(+), 103 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 27152afc..35af8395 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -361,11 +361,11 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) try: - link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) - bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width'] - bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed'] - bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version'] - + pcie_static = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_static'] + bus_info['max_pcie_width'] = pcie_static['max_pcie_width'] + bus_info['max_pcie_speed'] = pcie_static['max_pcie_speed'] + bus_info['pcie_interface_version'] = pcie_static['pcie_interface_version'] + bus_info['slot_type'] = pcie_static['slot_type'] if bus_info['max_pcie_speed'] % 1000 != 0: pcie_speed_GTs_value = round(bus_info['max_pcie_speed'] / 1000, 1) else: @@ -373,14 +373,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None bus_info['max_pcie_speed'] = pcie_speed_GTs_value - slot_type = link_caps['pcie_static']['slot_type'] - if isinstance(slot_type, int): - slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues - if slot_type in slot_types: - bus_info['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") - else: - bus_info['slot_type'] = "Unknown" - if bus_info['pcie_interface_version'] > 0: bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}" @@ -636,7 +628,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None except amdsmi_exception.AmdSmiLibraryException as e: policy_info = "N/A" logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) - + static_dict['dpm_policy'] = policy_info if 'numa' in current_platform_args: if args.numa: @@ -1460,6 +1452,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No if args.pcie: pcie_dict = {"width": "N/A", "speed": "N/A", + "bandwidth": "N/A", "replay_count" : "N/A", "l0_to_recovery_count" : "N/A", "replay_roll_over_count" : "N/A", @@ -1470,65 +1463,43 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No "max_packet_size": "N/A"} try: - pcie_link_status = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) + pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric'] + logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric) - if pcie_link_status['pcie_metric']['pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000, 1) - else: - pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000) + pcie_dict['width'] = pcie_metric['pcie_width'] - pcie_dict['width'] = pcie_link_status['pcie_metric']['pcie_width'] - pcie_dict['speed'] = pcie_speed_GTs_value + if pcie_metric['pcie_speed'] != "N/A": + if pcie_metric['pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000, 1) + else: + pcie_speed_GTs_value = round(pcie_metric['pcie_speed'] / 1000) + pcie_dict['speed'] = pcie_speed_GTs_value + + pcie_dict['bandwidth'] = 
pcie_metric['pcie_bandwidth'] + pcie_dict['replay_count'] = pcie_metric['pcie_replay_count'] + pcie_dict['l0_to_recovery_count'] = pcie_metric['pcie_l0_to_recovery_count'] + pcie_dict['replay_roll_over_count'] = pcie_metric['pcie_replay_roll_over_count'] + pcie_dict['nak_received_count'] = pcie_metric['pcie_nak_received_count'] + pcie_dict['nak_sent_count'] = pcie_metric['pcie_nak_sent_count'] pcie_speed_unit = 'GT/s' + pcie_bw_unit = 'Mb/s' if self.logger.is_human_readable_format(): - pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = f"{pcie_dict['speed']} {pcie_speed_unit}" + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = f"{pcie_dict['bandwidth']} {pcie_bw_unit}" if self.logger.is_json_format(): - pcie_dict['speed'] = {"value" : pcie_dict['speed'], - "unit" : pcie_speed_unit} + if pcie_dict['speed'] != "N/A": + pcie_dict['speed'] = {"value" : pcie_dict['speed'], + "unit" : pcie_speed_unit} + if pcie_dict['bandwidth'] != "N/A": + pcie_dict['bandwidth'] = {"value" : pcie_dict['bandwidth'], + "unit" : pcie_bw_unit} except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) - try: - pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_count_acc'] - if pci_replay_counter == "N/A": - # raising exception here to fall back to sysfs - raise amdsmi_exception.AmdSmiLibraryException(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - pcie_dict['replay_count'] = pci_replay_counter - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) - logging.debug("Falling back to sysfs pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) - try: - pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) - pcie_dict['replay_count'] = pci_replay_counter - except amdsmi_exception.AmdSmiLibraryException as err: - pcie_dict['replay_count'] = "N/A" - logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info()) - - try: - l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_l0_to_recov_count_acc'] - pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['l0_to_recovery_count'] = "N/A" - logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['pcie_replay_rover_count_acc'] - pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['replay_roll_over_count'] = "N/A" - logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info()) - - try: - gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - pcie_dict['nak_sent_count'] = gpu_metrics_info['pcie_nak_sent_count_acc'] - pcie_dict['nak_received_count'] = gpu_metrics_info['pcie_nak_rcvd_count_acc'] - except amdsmi_exception.AmdSmiLibraryException as e: - pcie_dict['nak_sent_count'] = "N/A" - pcie_dict['nak_received_count'] = "N/A" - logging.debug("Failed to get pcie nak info for gpu %s | %s", gpu_id, e.get_error_info()) - try: pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) sent = 
pcie_bw['sent'] * pcie_bw['max_pkt_sz'] @@ -4134,14 +4105,14 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): } try: - pcie_info = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static'] - if pcie_info['max_pcie_speed'] % 1000 != 0: - pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000, 1) + pcie_static = amdsmi_interface.amdsmi_get_pcie_info(src_gpu)['pcie_static'] + if pcie_static['max_pcie_speed'] % 1000 != 0: + pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000, 1) else: - pcie_speed_GTs_value = round(pcie_info['max_pcie_speed'] / 1000) + pcie_speed_GTs_value = round(pcie_static['max_pcie_speed'] / 1000) bitrate = pcie_speed_GTs_value - max_bandwidth = bitrate * pcie_info['max_pcie_width'] + max_bandwidth = bitrate * pcie_static['max_pcie_width'] except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get bitrate and bandwidth for GPU %s | %s", src_gpu_id, e.get_error_info()) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index ea28b8eb..cd9a3a1f 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -411,6 +411,14 @@ int main() { printf("\tPCIe max lanes: %d\n", pcie_info.pcie_static.max_pcie_width); printf("\tPCIe max speed: %d\n", pcie_info.pcie_static.max_pcie_speed); + // additional pcie related metrics + printf("\tPCIe bandwidth: %d\n", pcie_info.pcie_metric.pcie_bandwidth); + printf("\tPCIe replay count: %d\n", pcie_info.pcie_metric.pcie_replay_count); + printf("\tPCIe L0 recovery count: %d\n", pcie_info.pcie_metric.pcie_l0_to_recovery_count); + printf("\tPCIe rollover count: %d\n", pcie_info.pcie_metric.pcie_replay_roll_over_count); + printf("\tPCIe nak received count: %d\n", pcie_info.pcie_metric.pcie_nak_received_count); + printf("\tPCIe nak sent count: %d\n", pcie_info.pcie_metric.pcie_nak_sent_count); + // Get VRAM temperature limit int64_t temperature = 0; ret = amdsmi_get_temp_metric( diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index ef58a6ce..861709b9 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -509,7 +509,7 @@ typedef struct { struct pcie_metric_ { uint16_t pcie_width; //!< current PCIe width uint32_t pcie_speed; //!< current PCIe speed in MT/s - uint32_t pcie_bandwidth; //!< current PCIe bandwidth Mb/s + uint32_t pcie_bandwidth; //!< current instantaneous PCIe bandwidth in Mb/s uint64_t pcie_replay_count; //!< total number of the replays issued on the PCIe link uint64_t pcie_l0_to_recovery_count; //!< total number of times the PCIe link transitioned from L0 to the recovery state uint64_t pcie_replay_roll_over_count; //!< total number of replay rollovers issued on the PCIe link diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index 7d2df9b1..30897b34 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -21,6 +21,9 @@ #ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ #define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ +#include +#include + #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_gpu_device.h" #include "rocm_smi/rocm_smi_utils.h" @@ -45,4 +48,55 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(uint32_t device_id, char *market_name); amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled); + +template +constexpr bool is_dependent_false_v = false; + +template +inline 
constexpr bool is_supported_type_v = ( + std::is_same_v>, std::uint8_t> || + std::is_same_v>, std::uint16_t> || + std::is_same_v>, std::uint32_t> || + std::is_same_v>, std::uint64_t> +); + +template +constexpr T get_std_num_limit() +{ + if constexpr (is_supported_type_v) { + return std::numeric_limits::max(); + } + else { + return std::numeric_limits::min(); + static_assert(is_dependent_false_v, "Error: Type not supported..."); + } +} + +template +constexpr bool is_std_num_limit(T value) +{ + return (value == get_std_num_limit()); +} + +template +constexpr T translate_umax_or_assign_value(U source_value, V target_value) +{ + T result{}; + if constexpr (is_supported_type_v && is_supported_type_v) { + // If the source value is uint::max(), then return is uint::max() + if (is_std_num_limit(source_value)) { + result = get_std_num_limit(); + } else { + result = static_cast(target_value); + } + + return result; + } + else { + static_assert(is_dependent_false_v, "Error: Type not supported..."); + } + + return result; +} + #endif // diff --git a/py-interface/README.md b/py-interface/README.md index fbc5450c..7d9fd590 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -580,7 +580,7 @@ Output: Dictionary with fields Field | Description ---|--- -`fw_list`| List of dictionaries that contain information about a certain firmware block +`fw_list` | List of dictionaries that contain information about a certain firmware block Exceptions that can be thrown by `amdsmi_get_fw_info` function: @@ -619,7 +619,7 @@ Output: Dictionary of activites to their respective usage percentage or 'N/A' if Field | Description ---|--- -`gfx_activity`| graphics engine usage percentage (0 - 100) +`gfx_activity` | graphics engine usage percentage (0 - 100) `umc_activity` | memory engine usage percentage (0 - 100) `mm_activity` | average multimedia engine usages in percentage (0 - 100) @@ -659,7 +659,7 @@ Output: Dictionary with fields Field | Description ---|--- -`average_socket_power`| average socket power +`average_socket_power` | average socket power `gfx_voltage` | voltage gfx `power_limit` | power limit @@ -699,7 +699,7 @@ Output: Dictionary with fields Field | Description ---|--- `vram_total` | VRAM total -`vram_used`| VRAM currently in use +`vram_used` | VRAM currently in use Exceptions that can be thrown by `amdsmi_get_gpu_vram_usage` function: @@ -751,7 +751,7 @@ Output: Dictionary with fields Field | Description ---|--- -`cur_clk`| Current clock for given clock type +`cur_clk` | Current clock for given clock type `max_clk` | Maximum clock for given clock type `min_clk` | Minimum clock for given clock type @@ -780,20 +780,19 @@ except AmdSmiException as e: ### amdsmi_get_pcie_info -Description: Returns the pcie link status for the given GPU. +Description: Returns the pcie metric and static information for the given GPU. It is not supported on virtual machine guest Input parameters: * `processor_handle` device which to query -Output: Dictionary with fields +Output: Dictionary with 2 fields `pcie_static` and `pcie_metric` -Field | Description +Fields | Description ---|--- -`pcie_width`| pcie lanes in use -`pcie_speed`| current pcie speed -`pcie_interface_version`| current pcie generation +`pcie_static` |
<table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`max_pcie_width`</td><td>Maximum number of pcie lanes available</td></tr><tr><td>`max_pcie_speed`</td><td>Maximum capable pcie speed in GT/s</td></tr><tr><td>`pcie_interface_version`</td><td>PCIe generation ie. 3,4,5...</td></tr><tr><td>`slot_type`</td><td>The type of form factor of the slot: PCIE, OAM, or Unknown</td></tr></tbody></table>
+`pcie_metric` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`pcie_width`</td><td>Current number of pcie lanes available</td></tr><tr><td>`pcie_speed`</td><td>Current pcie speed capable in GT/s</td></tr><tr><td>`pcie_bandwidth`</td><td>Current instantaneous bandwidth usage in Mb/s</td></tr><tr><td>`pcie_replay_count`</td><td>Total number of PCIe replays (NAKs)</td></tr><tr><td>`pcie_l0_to_recovery_count`</td><td>PCIE L0 to recovery state transition accumulated count</td></tr><tr><td>`pcie_replay_roll_over_count`</td><td>PCIe Replay accumulated count</td></tr><tr><td>`pcie_nak_sent_count`</td><td>PCIe NAK sent accumulated count</td></tr><tr><td>`pcie_nak_received_count`</td><td>PCIe NAK received accumulated count</td></tr></tbody></table>
Exceptions that can be thrown by `amdsmi_get_pcie_info` function: @@ -810,10 +809,9 @@ try: print("No GPUs on machine") else: for device in devices: - pcie_link_status = amdsmi_get_pcie_info(device) - print(pcie_link_status["pcie_width"]) - print(pcie_link_status["pcie_speed"]) - print(pcie_link_status["pcie_interface_version"]) + pcie_info = amdsmi_get_pcie_info(device) + print(pcie_info["pcie_static"]) + print(pcie_info["pcie_metric"]) except AmdSmiException as e: print(e) ``` @@ -949,8 +947,8 @@ Output: Dictionary with fields Field | Description ---|--- -`correctable_count`| Correctable ECC error count -`uncorrectable_count`| Uncorrectable ECC error count +`correctable_count` | Correctable ECC error count +`uncorrectable_count` | Uncorrectable ECC error count Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function: @@ -2021,9 +2019,9 @@ Output: Dictionary with fields Field | Description ---|--- -`num_supported`| The number of supported frequencies -`current`| The current frequency index -`frequency`| List of frequencies, only the first num_supported frequencies are valid +`num_supported` | The number of supported frequencies +`current` | The current frequency index +`frequency` | List of frequencies, only the first num_supported frequencies are valid Exceptions that can be thrown by `amdsmi_get_clk_freq` function: @@ -2062,8 +2060,8 @@ Field | Description `curr_mclk_range` |
<table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range</td></tr></tbody></table>
`sclk_freq_limits` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound sclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound sclk range limit</td></tr></tbody></table>
`mclk_freq_limits` | <table><thead><tr><th> Subfield </th><th> Description </th></tr></thead><tbody><tr><td>`lower_bound`</td><td>lower bound mclk range limit</td></tr><tr><td>`upper_bound`</td><td>upper bound mclk range limit</td></tr></tbody></table>
-`curve.vc_points`| The number of supported frequencies -`num_regions`| The current frequency index +`curve.vc_points` | The number of supported frequencies +`num_regions` | The current frequency index Exceptions that can be thrown by `amdsmi_get_gpu_od_volt_info` function: @@ -2228,9 +2226,9 @@ Output: Dictionary with fields Field | Description ---|--- -`available_profiles`| Which profiles are supported by this system -`current`| Which power profile is currently active -`num_profiles`| How many power profiles are available +`available_profiles` | Which profiles are supported by this system +`current` | Which power profile is currently active +`num_profiles` | How many power profiles are available Exceptions that can be thrown by `amdsmi_get_gpu_power_profile_presets` function: @@ -2391,9 +2389,9 @@ Output: Dictionary with fields Field | Description ---|--- -`value`| Counter value -`time_enabled`| Time that the counter was enabled in nanoseconds -`time_running`| Time that the counter was running in nanoseconds +`value` | Counter value +`time_enabled` | Time that the counter was enabled in nanoseconds +`time_running` | Time that the counter was running in nanoseconds Exceptions that can be thrown by `amdsmi_gpu_read_counter` function: @@ -2661,8 +2659,8 @@ Output: Dict containing information about error counts Field | Description ---|--- -`correctable_count`| Count of correctable errors -`uncorrectable_count`| Count of uncorrectable errors +`correctable_count` | Count of correctable errors +`uncorrectable_count` | Count of uncorrectable errors Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function: diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 7eb501bb..e27451da 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2134,7 +2134,7 @@ def amdsmi_get_pcie_info( ) ) - return { + pcie_info_dict = { "pcie_static": { "max_pcie_width": pcie_info.pcie_static.max_pcie_width, "max_pcie_speed": pcie_info.pcie_static.max_pcie_speed, @@ -2153,6 +2153,49 @@ def amdsmi_get_pcie_info( } } + # Check pcie static values for uint max + if pcie_info_dict['pcie_static']['max_pcie_width'] == 0xFFFF: + pcie_info_dict['pcie_static']['max_pcie_width'] = "N/A" + if pcie_info_dict['pcie_static']['max_pcie_speed'] == 0xFFFFFFFF: + pcie_info_dict['pcie_static']['max_pcie_speed'] = "N/A" + if pcie_info_dict['pcie_static']['pcie_interface_version'] == 0xFFFFFFFF: + pcie_info_dict['pcie_static']['pcie_interface_version'] = "N/A" + + slot_type = pcie_info_dict['pcie_static']['slot_type'] + if isinstance(slot_type, int): + slot_types = amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues + if slot_type in slot_types: + pcie_info_dict['pcie_static']['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") + else: + pcie_info_dict['pcie_static']['slot_type'] = "Unknown" + else: + pcie_info_dict['pcie_static']['slot_type'] = "N/A" + + # Check pcie metric values for uint max + if pcie_info_dict['pcie_metric']['pcie_width'] == 0xFFFF: + pcie_info_dict['pcie_metric']['pcie_width'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_speed'] == 0xFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_speed'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_bandwidth'] = "N/A" + + # TODO Just Navi 21 has a different uint max size for pcie_bandwidth + # if pcie_info_dict['pcie_metric']['pcie_bandwidth'] == 0xFFFFFFFF: + # pcie_info_dict['pcie_metric']['pcie_bandwidth'] = 
"N/A" + + if pcie_info_dict['pcie_metric']['pcie_replay_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_replay_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_l0_to_recovery_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_replay_roll_over_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_nak_sent_count'] = "N/A" + if pcie_info_dict['pcie_metric']['pcie_nak_received_count'] == 0xFFFFFFFFFFFFFFFF: + pcie_info_dict['pcie_metric']['pcie_nak_received_count'] = "N/A" + + return pcie_info_dict + def amdsmi_get_processor_handle_from_bdf(bdf): bdf = _parse_bdf(bdf) @@ -3275,7 +3318,7 @@ def amdsmi_get_dpm_policy( processor_handle, ctypes.byref(policy) ) ) - + polices = [] for i in range(0, policy.num_supported): id = policy.policies[i].policy_id diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 392b6188..e57ae30c 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -2052,8 +2052,32 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a status = smi_amdgpu_get_pcie_speed_from_pcie_type(metric_info.pcie_link_speed, &info->pcie_metric.pcie_speed); // mapping to MT/s } else { // gpu metrics returns pcie link speed in .1 GT/s ex. 160 vs 16 - info->pcie_metric.pcie_speed = metric_info.pcie_link_speed * 100; - } + info->pcie_metric.pcie_speed = translate_umax_or_assign_valuepcie_metric.pcie_speed)> + (metric_info.pcie_link_speed, (metric_info.pcie_link_speed * 100)); + } + + // additional pcie related metrics + /** + * pcie_metric.pcie_bandwidth: MB/s (uint32_t) + * metric_info.pcie_bandwidth_inst: GB/s (uint64_t) + */ + info->pcie_metric.pcie_bandwidth = translate_umax_or_assign_valuepcie_metric.pcie_bandwidth)> + (metric_info.pcie_bandwidth_inst, metric_info.pcie_bandwidth_inst); + info->pcie_metric.pcie_replay_count = metric_info.pcie_replay_count_acc; + info->pcie_metric.pcie_l0_to_recovery_count = metric_info.pcie_l0_to_recov_count_acc; + info->pcie_metric.pcie_replay_roll_over_count = metric_info.pcie_replay_rover_count_acc; + /** + * pcie_metric.pcie_nak_received_count: (uint64_t) + * metric_info.pcie_nak_rcvd_count_acc: (uint32_t) + */ + info->pcie_metric.pcie_nak_received_count = translate_umax_or_assign_valuepcie_metric.pcie_nak_received_count)> + (metric_info.pcie_nak_rcvd_count_acc, (metric_info.pcie_nak_rcvd_count_acc)); + /** + * pcie_metric.pcie_nak_sent_count: (uint64_t) + * metric_info.pcie_nak_sent_count_acc: (uint32_t) + */ + info->pcie_metric.pcie_nak_sent_count = translate_umax_or_assign_valuepcie_metric.pcie_nak_sent_count)> + (metric_info.pcie_nak_sent_count_acc, (metric_info.pcie_nak_sent_count_acc)); return AMDSMI_STATUS_SUCCESS; } From 1ac1ee4b9abed1ac01094dc0cc22c1ecba21d667 Mon Sep 17 00:00:00 2001 From: Deepak Mewar Date: Mon, 11 Mar 2024 12:44:14 +0000 Subject: [PATCH 05/18] fix for cpu enable apb error Signed-off-by: Deepak Mewar Change-Id: I092b88484046671857c4adbbbeaba78180b103ab --- amdsmi_cli/amdsmi_commands.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 35af8395..dbdc16ac 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3325,18 +3325,13 @@ def set_value(self, args, multiple_devices=False, 
gpu=None, fan=None, perf_level cpu_args_enabled = False cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range", - "cpu_disable_apb", "soc_boost_limit"] + "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] for attr in cpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: cpu_args_enabled = True break - # Check if CPU set argument with store_true has been passed - if hasattr(args, "cpu_enable_apb"): - if getattr(args, attr): - cpu_args_enabled = True - # Check if a Core argument has been set core_args_enabled = False core_attributes = ["core_boost_limit"] From e4085c641431fe9c12ec585a91e39a57db54ecd8 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 20 Mar 2024 12:06:24 -0500 Subject: [PATCH 06/18] Get and set the XGMI PLPD Update the API and CLI to support XGMI Per-Link Power Down Policy. Change-Id: Iaf04a771eb8bb0829a5b3088d803a7355a8dfd0b --- amdsmi_cli/README.md | 67 +++++++++++--- amdsmi_cli/amdsmi_commands.py | 52 ++++++++--- amdsmi_cli/amdsmi_parser.py | 4 + include/amd_smi/amdsmi.h | 43 +++++++++ py-interface/README.md | 72 ++++++++++++++- py-interface/amdsmi_interface.py | 45 ++++++++++ py-interface/amdsmi_wrapper.py | 45 ++++++---- rocm_smi/include/rocm_smi/rocm_smi.h | 39 ++++++++ rocm_smi/src/rocm_smi.cc | 128 ++++++++++++++++++++++++++- rocm_smi/src/rocm_smi_device.cc | 6 +- src/amd_smi/amd_smi.cc | 16 ++++ 11 files changed, 467 insertions(+), 50 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 3273f807..f9c0c067 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -280,7 +280,7 @@ usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE [--core-curr-active-freq-core-limit] [--core-energy] [--json | --csv] [--file FILE] [--loglevel LEVEL] -If no GPU is specified, returns metric information for all GPUs on the system. +If no GPU is specified, returns metric information for all GPUs on the system. If no metric argument is provided all metric information will be displayed. Metric arguments: @@ -325,16 +325,16 @@ CPU Arguments: --cpu-c0-res Displays C0 residency --cpu-lclk-dpm-level NBIOID Displays lclk dpm level range. Requires socket ID and NBOID as inputs --cpu-pwr-svi-telemtry-rails Displays svi based telemetry for all rails - --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. - input parameters are bandwidth type(1) and link ID encodings + --cpu-io-bandwidth IO_BW LINKID_NAME Displays current IO bandwidth for the selected CPU. + input parameters are bandwidth type(1) and link ID encodings i.e. P2, P3, G0 - G7 - --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU - input parameters are bandwidth type(1,2,4) and link ID encodings + --cpu-xgmi-bandwidth XGMI_BW LINKID_NAME Displays current XGMI bandwidth for the selected CPU + input parameters are bandwidth type(1,2,4) and link ID encodings i.e. 
P2, P3, G0 - G7 --cpu-metrics-ver Displays metrics table version --cpu-metrics-table Displays metric table --cpu-socket-energy Displays socket energy for the selected CPU socket - --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, + --cpu-ddr-bandwidth Displays per socket max ddr bw, current utilized bw, and current utilized ddr bw in percentage --cpu-temp Displays cpu socket temperature --cpu-dimm-temp-range-rate DIMM_ADDR Displays dimm temperature range and refresh rate @@ -437,7 +437,7 @@ usage: amd-smi topology [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-a] [-w] [-o] [-t] [-b] -If no GPU is specified, returns information for all GPUs on the system. +If no GPU is specified, returns information for all GPUs on the system. If no topology argument is provided all topology information will be displayed. Topology arguments: @@ -483,7 +483,7 @@ usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ... [--core-boost-limit BOOST_LIMIT] [--json | --csv] [--file FILE] [--loglevel LEVEL] -A GPU must be specified to set a configuration. +A GPU must be specified to set a configuration. A set argument must be provided; Multiple set arguments are accepted Set Arguments: @@ -513,11 +513,12 @@ Set Arguments: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id + -x, --xgmi-plpd POLICY_ID Set the GPU XGMI per-link power down policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. --cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values - --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. + --cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Input parameters are die_index, min dpm, max dpm. --cpu-pwr-eff-mode MODE Sets the power efficency mode policy. Input parameter is mode. 
--cpu-gmi3-link-width MIN_LW MAX_LW Sets max and min gmi3 link width range @@ -675,7 +676,7 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -687,6 +688,16 @@ GPU: 0 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 0 AFFINITY: 0 @@ -783,7 +794,7 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -795,6 +806,16 @@ GPU: 1 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 1 AFFINITY: 1 @@ -891,7 +912,7 @@ GPU: 2 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -903,6 +924,16 @@ GPU: 2 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 2 AFFINITY: 2 @@ -999,7 +1030,7 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 - POLICY: + DPM_POLICY: NUM_SUPPORTED: 4 CURRENT_ID: 1 POLICIES: @@ -1011,6 +1042,16 @@ GPU: 3 POLICY_DESCRIPTION: soc_pstate_1 POLICY_ID: 3 POLICY_DESCRIPTION: soc_pstate_2 + XGMI_PLPD: + NUM_SUPPORTED: 3 + CURRENT_ID: 1 + PLPDS: + POLICY_ID: 0 + POLICY_DESCRIPTION: plpd_disallow + POLICY_ID: 1 + POLICY_DESCRIPTION: plpd_default + POLICY_ID: 2 + POLICY_DESCRIPTION: plpd_optimized NUMA: NODE: 3 AFFINITY: 3 diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index dbdc16ac..689b3fa5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -244,7 +244,8 @@ def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None) def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): + cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, + policy=None, xgmi_plpd=None): """Get Static information for target gpu Args: @@ -268,6 +269,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. 
Returns: None: Print output via AMDSMILogger to destination """ @@ -302,8 +304,10 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None args.limit = limit if policy: args.policy = policy - current_platform_args += ["ras", "limit", "partition", "policy"] - current_platform_values += [args.ras, args.limit, args.partition, args.policy] + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd + current_platform_args += ["ras", "limit", "partition", "policy", "xgmi_plpd"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -630,6 +634,15 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['dpm_policy'] = policy_info + if 'xgmi_plpd' in current_platform_args: + if args.xgmi_plpd: + try: + policy_info = amdsmi_interface.amdsmi_get_xgmi_plpd(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get xgmi_plpd info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['xgmi_plpd'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -766,7 +779,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None, policy=None): + interface_ver=None, policy=None, xgmi_plpd = None): """Get Static information for target gpu and cpu Args: @@ -790,6 +803,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None policy (bool, optional): Value override for args.policy. Defaults to None. + xgmi_plpd (bool, optional): Value override for args.xgmi_plpd. Defaults to None. 
Raises: IndexError: Index error if gpu list is empty @@ -815,7 +829,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf", "policy"] + "dfc_ucode", "fb_info", "num_vf", "policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -859,7 +873,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf, policy) + dfc_ucode, fb_info, num_vf, policy, xgmi_plpd) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3090,7 +3104,7 @@ def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None, def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None, dpm_policy=None): + memory_partition=None, power_cap=None, dpm_policy=None, xgmi_plpd = None): """Issue reset commands to target gpu(s) Args: @@ -3105,6 +3119,7 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3132,6 +3147,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.power_cap = power_cap if dpm_policy: args.dpm_policy = dpm_policy + if xgmi_plpd: + args.xgmi_plpd = xgmi_plpd # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3151,7 +3168,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.memory_partition, args.perf_determinism is not None, args.power_cap, - args.dpm_policy]): + args.dpm_policy, + args.xgmi_plpd]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3225,6 +3243,15 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if args.xgmi_plpd: + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.dpm_policy}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3264,7 +3291,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, 
cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None, xgmi_plpd=None): """Issue reset commands to target gpu(s) Args: @@ -3294,6 +3321,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. + xgmi_plpd (int, optional): Value override for args.xgmi_plpd. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3314,7 +3342,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap", "dpm_policy"] + "memory_partition", "power_cap", "dpm_policy", "xgmi_plpd"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3370,7 +3398,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3389,7 +3417,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap, dpm_policy) + memory_partition, power_cap, dpm_policy, xgmi_plpd) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 5341b274..adaa91c3 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -544,6 +544,7 @@ def _add_static_parser(self, subparsers, func): cache_help = "All cache information" board_help = "All board information" dpm_policy_help = "The available DPM policy" + xgmi_plpd_help = "The available XGMI per-link power down policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -584,6 +585,7 @@ def _add_static_parser(self, subparsers, func): static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) + static_parser.add_argument('-x', '--xgmi-plpd', action='store_true', required=False, help=xgmi_plpd_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -966,6 +968,7 @@ def _add_set_value_parser(self, subparsers, func): 
set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" + set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -1002,6 +1005,7 @@ def _add_set_value_parser(self, subparsers, func): set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') + set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 861709b9..64bdb125 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -3405,6 +3405,49 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, */ amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, uint32_t policy_id); + +/** + * @brief Get the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle, this function will write + * current xgmi plpd settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the xgmi plpd for this processor. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for the processor + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle and a dpm policy @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] xgmi_plpd_id the xgmi plpd id to set. 
The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_xgmi_plpd() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t plpd_id); + /** @} End PerfCont */ /*****************************************************************************/ diff --git a/py-interface/README.md b/py-interface/README.md index 7d9fd590..82f8ca97 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -909,8 +909,8 @@ Field | Description `name` | Name of process `pid` | Process ID `mem` | Process memory usage -`engine_usage`|
Subfield: `gfx` (GFX engine usage in ns), `enc` (Encode engine usage in ns)
-`memory_usage`| Subfield: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
+`engine_usage` | Subfield: `gfx` (GFX engine usage in ns), `enc` (Encode engine usage in ns)
+`memory_usage` | Subfield: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function: @@ -2612,6 +2612,74 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_set_xgmi_plpd + +Description: Set the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device +* `policy_id` the xgmi plpd id to set. + +Output: None + +Exceptions that can be thrown by `amdsmi_set_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_set_xgmi_plpd(device, 0) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_xgmi_plpd + +Description: Get the xgmi per-link power down policy parameter for the processor + +Input parameters: + +* `processor_handle` handle for the given device + +Output: Dict containing information about xgmi per-link power down policy + +Field | Description +---|--- +`num_supported` | The number of supported policies +`current_id` | The current policy index +`plpds` | List of policies. + +Exceptions that can be thrown by `amdsmi_get_xgmi_plpd` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + xgmi_plpd = amdsmi_get_xgmi_plpd(device) + print(xgmi_plpd) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_set_gpu_overdrive_level Description: **deprecated** Set the overdrive percent associated with the diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index e27451da..c9e773b8 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2746,6 +2746,20 @@ def amdsmi_set_dpm_policy( ) ) +def amdsmi_set_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_xgmi_plpd( + processor_handle, policy_id + ) + ) + def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int ): @@ -3335,6 +3349,37 @@ def amdsmi_get_dpm_policy( "policies": polices, } +def amdsmi_get_xgmi_plpd( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_xgmi_plpd( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "plpds": polices, + } + def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py 
index f718dcfa..13cd2062 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -746,19 +746,6 @@ class struct_fields_(Structure): class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -777,6 +764,19 @@ class struct_pcie_metric_(Structure): ('reserved', ctypes.c_uint64 * 13), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -2058,6 +2058,12 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy amdsmi_set_dpm_policy.restype = amdsmi_status_t amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] +amdsmi_get_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_get_xgmi_plpd +amdsmi_get_xgmi_plpd.restype = amdsmi_status_t +amdsmi_get_xgmi_plpd.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_xgmi_plpd = _libraries['libamd_smi.so'].amdsmi_set_xgmi_plpd +amdsmi_set_xgmi_plpd.restype = amdsmi_status_t +amdsmi_set_xgmi_plpd.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2594,8 +2600,9 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_processor_info', 'amdsmi_get_processor_type', 'amdsmi_get_socket_handles', 'amdsmi_get_socket_info', 'amdsmi_get_temp_metric', 'amdsmi_get_utilization_count', - 'amdsmi_get_xgmi_info', 'amdsmi_gpu_block_t', - 'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter', + 'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd', + 'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t', + 'amdsmi_gpu_control_counter', 'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter', 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t', 'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status', @@ -2636,10 +2643,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', 'amdsmi_set_gpu_perf_level', 'amdsmi_set_gpu_power_profile', - 'amdsmi_set_power_cap', 'amdsmi_shut_down', - 'amdsmi_smu_fw_version_t', 'amdsmi_socket_handle', - 'amdsmi_status_code_to_string', 'amdsmi_status_t', - 'amdsmi_stop_gpu_event_notification', + 'amdsmi_set_power_cap', 'amdsmi_set_xgmi_plpd', + 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', + 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', + 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 
'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 12654213..e10ab49b 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3364,6 +3364,45 @@ rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, uint32_t policy_id); +/** + * @brief Get the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, this function will write + * current xgmi plpd settings to @p xgmi_plpd. All the processors at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] xgmi_plpd the xgmi_plpd policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* xgmi_plpd); + +/** + * @brief Set the xgmi per-link power down policy parameter for a device + * + * + * @details Given a device index @p dv_ind, and a dpm policy @p plpd_id, + * this function will set the xgmi plpd for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] xgmi_plpd_id the xgmi plpd id to set. The id is the id in + * rsmi_dpm_policy_entry_t, which can be obtained by calling + * rsmi_dev_xgmi_plpd_get() + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id); /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 91c8ddbb..6aa0d86f 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -2038,6 +2038,130 @@ rsmi_dev_dpm_policy_set(uint32_t dv_ind, CATCH } +rsmi_status_t +rsmi_dev_xgmi_plpd_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + It will reply on the number but no string as it may vary from soc to soc. 
+ The current xmgi plpd marked with * + xgmi plpd + 0 : plpd_disallow + 1 : plpd_default + 2 : plpd_optimized* + */ + bool see_plpd_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "xgmi plpd") { + see_plpd_pstate = true; + continue; + } + if (see_plpd_pstate == false) continue; + + // Get tokens: : + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // At the end + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: the id is negative or too many plpd policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_plpd_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpected pstat data: cannot find the current plpd policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // Cannot find it + return RSMI_STATUS_SUCCESS; + + CATCH +} + +rsmi_status_t +rsmi_dev_xgmi_plpd_set(uint32_t dv_ind, + uint32_t plpd_id) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("xgmi "); + value += std::to_string(plpd_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, rsmi_dpm_policy_t* policy) { @@ -2107,7 +2231,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: the id is negative or too many policies."; + << ", Unexpected pstat data: the id is negative or too many policies."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } @@ -2132,7 +2256,7 @@ rsmi_dev_dpm_policy_get(uint32_t dv_ind, if (!see_current) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", Unexpeced pstat data: cannot find the current policy."; + << ", Unexpected pstat data: cannot find the current policy."; LOG_ERROR(ss); return RSMI_STATUS_UNEXPECTED_DATA; } diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 3e63659c..92de58c6 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -536,8 +536,10 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, - {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, - 
{"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_xgmi_plpd_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index e57ae30c..1dafee87 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1369,6 +1369,22 @@ amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, reinterpret_cast(policy)); } +amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, From 72b0a6efe56cc368e02d783efab0a800c30bfff3 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 08:24:12 -0500 Subject: [PATCH 07/18] SWDEV-431924 - Corrected amdsmi_get_gpu_board_info() to return N/A for invalid values Signed-off-by: Maisam Arif Change-Id: I3f7e7c873c24b8f5ddd6784700f193c2fdf199e0 --- py-interface/amdsmi_interface.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index c9e773b8..09d829fe 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1848,7 +1848,7 @@ def amdsmi_get_gpu_board_info( processor_handle, ctypes.byref(board_info)) ) - return { + board_info_dict = { "model_number": board_info.model_number.decode("utf-8").strip(), "product_serial": board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), @@ -1856,6 +1856,12 @@ def amdsmi_get_gpu_board_info( "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } + for key, value in board_info_dict.items(): + if value == "": + board_info_dict[key] = "N/A" + + return board_info_dict + def amdsmi_get_gpu_ras_feature_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, From dad2c430ead1a4214d90d72c07eadc9717ed1bfa Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 08:45:08 -0500 Subject: [PATCH 08/18] SWDEV-435406 - Corrected amdsmi_get_power_info() to return N/A for invalid values Signed-off-by: Maisam Arif Change-Id: I2aeb6f6670f6f47cd496faf7fc41192647f7d58c --- py-interface/amdsmi_interface.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 09d829fe..e3dfa1a4 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2038,7 +2038,7 @@ def amdsmi_get_power_info( ) ) - return { + power_info_dict = { "current_socket_power": power_measure.current_socket_power, "average_socket_power": power_measure.average_socket_power, "gfx_voltage": power_measure.gfx_voltage, @@ -2047,6 +2047,12 @@ def amdsmi_get_power_info( "power_limit" : power_measure.power_limit, } + for key, value in 
power_info_dict.items(): + if value == 0xFFFF: + power_info_dict[key] = "N/A" + + return power_info_dict + def amdsmi_is_gpu_power_management_enabled( processor_handle: amdsmi_wrapper.amdsmi_processor_handle From 8bf2bd4b898c885b0e94247fc84a2c6c8acf2671 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 07:59:50 -0500 Subject: [PATCH 09/18] SWDEV-447333 - Corrected amdsmi_init() python documentation Signed-off-by: Maisam Arif Change-Id: If46e7236316687cd97cf1a69770f87154e2681ff --- py-interface/README.md | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/py-interface/README.md b/py-interface/README.md index 82f8ca97..ae9b3568 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -73,9 +73,9 @@ except AmdSmiException as e: ### amdsmi_init -Description: Dynamically initialize amdsmi with amd_hsmp and amdgpu drivers +Description: Initialize amdsmi with AmdSmiInitFlags -Input parameters: `None` +Input parameters: AmdSmiInitFlags Output: `None` @@ -83,19 +83,37 @@ Exceptions that can be thrown by `amdsmi_init` function: * `AmdSmiLibraryException` -Example: +Initialize GPUs only example: ```python try: + # by default we initalize with AmdSmiInitFlags.INIT_AMD_GPUS init_flag = amdsmi_init() - # Print out integer bitmask of initialized drivers - # 1 is for amd_hsmp - # 2 is for amdgpu - # 3 is for amd_hsmp and amdgpu - print(init_flag) # continue with amdsmi except AmdSmiException as e: - print("Init failed") + print("Init GPUs failed") + print(e) +``` + +Initialize CPUs only example: + +```python +try: + init_flag = amdsmi_init(AmdSmiInitFlags.INIT_AMD_CPUS) + # continue with amdsmi +except AmdSmiException as e: + print("Init CPUs failed") + print(e) +``` + +Initialize both GPUs and CPUs example: + +```python +try: + init_flag = amdsmi_init(AmdSmiInitFlags.INIT_AMD_APUS) + # continue with amdsmi +except AmdSmiException as e: + print("Init both GPUs & CPUs failed") print(e) ``` From 93b81e501250fb6fc885cdd11e56407ea5ef87f5 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 01:01:38 -0500 Subject: [PATCH 10/18] SWDEV-445664 - Aligned metric --clock with Host Change-Id: Ib4dc372aed61f6301680ac746eccf448e9d0ed00 Signed-off-by: Maisam Arif --- CHANGELOG.md | 136 +++++++++++++++++- amdsmi_cli/amdsmi_commands.py | 240 +++++++++++++++++++++++-------- amdsmi_cli/amdsmi_helpers.py | 20 ++- py-interface/README.md | 2 +- py-interface/amdsmi_interface.py | 4 +- src/amd_smi/amd_smi.cc | 6 + src/amd_smi/amd_smi_utils.cc | 6 + 7 files changed, 343 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00728e53..b7a3a85d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,127 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** +## amd_smi_lib for ROCm 6.2.0 + +### Changed + +Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. 
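Sample CLI output follows below. As a rough programmatic counterpart, here is a minimal Python sketch (illustrative only, not code from this patch) that reads the same per-engine values through the py-interface; it assumes the `current_gfxclks` and `current_uclk` gpu-metrics fields that the `metric_gpu()` hunk later in this patch relies on:

```python
# Illustrative sketch only: mirrors what `amd-smi metric --clock` reports,
# reading the same gpu_metrics fields the CLI hunk below uses.
from amdsmi import (
    AmdSmiException,
    amdsmi_init,
    amdsmi_shut_down,
    amdsmi_get_processor_handles,
    amdsmi_get_gpu_metrics_info,
)

amdsmi_init()
try:
    for gpu_index, handle in enumerate(amdsmi_get_processor_handles()):
        metrics = amdsmi_get_gpu_metrics_info(handle)
        # Per-engine GFX clocks; individual entries may come back as "N/A".
        for engine, clk in enumerate(metrics["current_gfxclks"]):
            if clk != "N/A":
                print(f"GPU {gpu_index} gfx_{engine}: {clk} MHz")
        # Single memory clock (uclk) value.
        print(f"GPU {gpu_index} mem_0: {metrics['current_uclk']} MHz")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```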
+ +``` shell +$ amd-smi metric --clock +GPU: 0 + CLOCK: + GFX_0: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_1: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_2: + CLK: 112 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_3: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_4: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_5: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_6: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + GFX_7: + CLK: 113 MHz + MIN_CLK: 500 MHz + MAX_CLK: 1800 MHz + CLK_LOCKED: DISABLED + DEEP_SLEEP: ENABLED + MEM_0: + CLK: 900 MHz + MIN_CLK: 900 MHz + MAX_CLK: 1200 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: DISABLED + VCLK_0: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_1: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_2: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + VCLK_3: + CLK: 29 MHz + MIN_CLK: 914 MHz + MAX_CLK: 1480 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_0: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_1: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_2: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED + DCLK_3: + CLK: 22 MHz + MIN_CLK: 711 MHz + MAX_CLK: 1233 MHz + CLK_LOCKED: N/A + DEEP_SLEEP: ENABLED +``` + ## amd_smi_lib for ROCm 6.1.0 ### Added + - **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. + ```shell $ amd-smi monitor -h usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] @@ -52,6 +168,7 @@ Command Modifiers: --loglevel LEVEL Set the logging level from the possible choices: DEBUG, INFO, WARNING, ERROR, CRITICAL ``` + ```shell $ amd-smi monitor -ptumv GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_USED VRAM_TOTAL @@ -80,6 +197,7 @@ CPU: 0 INTERFACE_VERSION: PROTO VERSION: 6 ``` + ```shell $ amd-smi metric -O 0 1 2 CORE: 0 @@ -106,6 +224,7 @@ CORE: 2 CORE_ENERGY: VALUE: N/A ``` + ```shell $ amd-smi metric -U all CPU: 0 @@ -212,6 +331,7 @@ CPU: 0 CPU_TEMP: RESPONSE: N/A ``` + - **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. 
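Sample CLI output for these fields appears in the next hunk. For programmatic access, a minimal sketch (illustrative only, not part of this patch; it assumes the `vcn_activity` gpu-metrics field documented in the py-interface README) might look like:

```python
# Illustrative sketch only: read the per-engine VCN utilization that
# `amd-smi metric --usage` reports as VCN_ACTIVITY.
from amdsmi import (
    AmdSmiException,
    amdsmi_init,
    amdsmi_shut_down,
    amdsmi_get_processor_handles,
    amdsmi_get_gpu_metrics_info,
)

amdsmi_init()
try:
    for handle in amdsmi_get_processor_handles():
        metrics = amdsmi_get_gpu_metrics_info(handle)
        # List of per-VCN-engine utilization percentages; entries may be "N/A".
        print(metrics["vcn_activity"])
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```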
@@ -230,6 +350,7 @@ GPU: 0 CURRENT_BANDWIDTH_RECEIVED: N/A MAX_PACKET_SIZE: N/A ``` + ```shell $ amd-smi metric --usage GPU: 0 @@ -243,11 +364,13 @@ GPU: 0 0 %, 0 %, 0 %, 0 %] ``` + - **Added AMDSMI Tool Version** AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. The AMDSMI Library version is the library package version number. The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. + ```shell $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 @@ -255,6 +378,7 @@ AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6 - **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). + ```shell $ amd-smi xgmi LINK METRIC TABLE: @@ -285,10 +409,12 @@ GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI Write 0 KB 1 KB 1 KB 1 KB 1 KB 1 KB 1 KB N/A ``` + - **Added units of measure to JSON output.** We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. Ex. + ```shell amd-smi metric -p --json [ @@ -321,7 +447,8 @@ amd-smi metric -p --json ### Changed - **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** -We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. +We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. + ```shell $ amd-smi topology ACCESS TABLE: @@ -381,6 +508,7 @@ NUMA BW TABLE: ``` ### Optimizations + - N/A ### Fixed @@ -394,14 +522,14 @@ Platforms which are identified as having an older pyyaml version or pip, we no m - `amd-smi firmware` - `amd-smi metric` - `amd-smi topology` + ```shell TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` + - **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. - - ### Known Issues - N/A @@ -419,7 +547,6 @@ You can now query MI300 device metrics to get real-time information. Metrics inc - **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. - ### Changed - **GPU index sorting made consistent with other tools** @@ -437,7 +564,6 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese - **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. 
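Since `amd-smi metric -p --json` now emits explicit `value`/`unit` pairs (see the JSON example earlier in this hunk), a small consumer sketch is shown below; it is illustrative only and assumes the top-level `gpu` and `power` keys from that example:

```python
# Illustrative sketch only: consume the unit-annotated JSON emitted by
# `amd-smi metric -p --json`, where each metric is a {"value", "unit"} pair.
import json
import subprocess

raw = subprocess.run(
    ["amd-smi", "metric", "-p", "--json"],
    capture_output=True, check=True, text=True,
).stdout

for gpu_entry in json.loads(raw):
    print(f"GPU {gpu_entry['gpu']}:")
    for metric, reading in gpu_entry["power"].items():
        if isinstance(reading, dict) and "unit" in reading:
            print(f"  {metric}: {reading['value']} {reading['unit']}")
        else:
            print(f"  {metric}: {reading}")
```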
- ### Known Issues - N/A diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 689b3fa5..fce9e852 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1344,73 +1344,189 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No values_dict['power'] = power_dict if "clock" in current_platform_args: if args.clock: + # Populate Skeleton output with N/A clocks = {} - clock_types = [amdsmi_interface.AmdSmiClkType.GFX, - amdsmi_interface.AmdSmiClkType.MEM, - amdsmi_interface.AmdSmiClkType.VCLK0, - amdsmi_interface.AmdSmiClkType.VCLK1] - for clock_type in clock_types: - clock_name = amdsmi_interface.amdsmi_wrapper.amdsmi_clk_type_t__enumvalues[clock_type].replace("CLK_TYPE_", "") - # Ensure that gfx is the clock_name instead of another macro - if clock_type == amdsmi_interface.AmdSmiClkType.GFX: - clock_name = "gfx" - - # Store the clock_name for vclk0 - vlck0_clock_name = None - if clock_type == amdsmi_interface.AmdSmiClkType.VCLK0: - vlck0_clock_name = clock_name - try: - clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, clock_type) - clock_info = {"clk" : clock_info_dict["cur_clk"]} - del clock_info_dict["cur_clk"] - clock_info.update(clock_info_dict) - - if clock_info['sleep_clk'] == 0xFFFFFFFF: - clock_info['sleep_clk'] = "N/A" - - clock_freq_unit = 'MHz' - for key, value in clock_info.items(): - if isinstance(value, int): - if self.logger.is_human_readable_format(): - clock_info[key] = f"{value} {clock_freq_unit}" - if self.logger.is_json_format(): - clock_info[key] = {"value" : value, - "unit" : clock_freq_unit} - - clocks[clock_name] = clock_info - except amdsmi_exception.AmdSmiLibraryException as e: - # Handle the case where VCLK1 is not enaled in sysfs on all GPUs - if clock_type == amdsmi_interface.AmdSmiClkType.VCLK1: - # Check if VCLK0 was retrieved successfully - if vlck0_clock_name in clocks: - # Since VCLK0 exists, do not error - logging.debug("VLCK0 exists, not adding %s clock info to output for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) - continue + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clocks["mem_0"] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index] = {"clk" : "N/A", + "min_clk" : "N/A", + "max_clk" : "N/A", + "clk_locked" : "N/A", + "deep_sleep" : "N/A"} + + clock_unit = "MHz" + # TODO make the deepsleep threshold correspond to the * in sysfs for current deep sleep status + deep_sleep_threshold = 140 + + # Populate clock values from gpu_metrics_info + try: + gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) + + # Populate GFX clock values + current_gfx_clocks = gpu_metrics_info["current_gfxclks"] + for clock_index, current_gfx_clock in enumerate(current_gfx_clocks): + # If the current clock is N/A then nothing else applies + if current_gfx_clock == "N/A": + continue + + gfx_index = f"gfx_{clock_index}" + clocks[gfx_index]["clk"] = 
self.helpers.unit_format(self.logger, + current_gfx_clock, + clock_unit) + + # Populate clock locked status + if gpu_metrics_info["gfxclk_lock_status"] != "N/A": + gfx_clock_lock_flag = 1 << clock_index # This is the position of the clock lock flag + if gpu_metrics_info["gfxclk_lock_status"] & gfx_clock_lock_flag: + clocks[gfx_index]["clk_locked"] = "ENABLED" + else: + clocks[gfx_index]["clk_locked"] = "DISABLED" + + # Populate deep sleep status + if int(current_gfx_clock) <= deep_sleep_threshold: + clocks[gfx_index]["deep_sleep"] = "ENABLED" else: - # Handle all other failed to get clock info - clocks[clock_name] = {"clk": "N/A", - "max_clk": "N/A", - "min_clk": "N/A", - "sleep_clk": "N/A"} - logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) + clocks[gfx_index]["deep_sleep"] = "DISABLED" - try: - gfxclk_lock_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status'] - if gfxclk_lock_status != "N/A": - if gfxclk_lock_status: - gfxclk_lock_status = "ENABLED" + # Populate MEM clock value + current_mem_clock = gpu_metrics_info["current_uclk"] # single value + if current_mem_clock != "N/A": + clocks["mem_0"]["clk"] = self.helpers.unit_format(self.logger, + current_mem_clock, + clock_unit) + + if int(current_mem_clock) <= deep_sleep_threshold: + clocks["mem_0"]["deep_sleep"] = "ENABLED" + else: + clocks["mem_0"]["deep_sleep"] = "DISABLED" + + # Populate VCLK clock values + current_vclk_clocks = gpu_metrics_info["current_vclk0s"] + for clock_index, current_vclk_clock in enumerate(current_vclk_clocks): + # If the current clock is N/A then nothing else applies + if current_vclk_clock == "N/A": + continue + + vclk_index = f"vclk_{clock_index}" + clocks[vclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_vclk_clock, + clock_unit) + + if int(current_vclk_clock) <= deep_sleep_threshold: + clocks[vclk_index]["deep_sleep"] = "ENABLED" else: - gfxclk_lock_status = "DISABLED" + clocks[vclk_index]["deep_sleep"] = "DISABLED" + + # Populate DCLK clock values + current_dclk_clocks = gpu_metrics_info["current_dclk0s"] + for clock_index, current_dclk_clock in enumerate(current_dclk_clocks): + # If the current clock is N/A then nothing else applies + if current_dclk_clock == "N/A": + continue + + dclk_index = f"dclk_{clock_index}" + clocks[dclk_index]["clk"] = self.helpers.unit_format(self.logger, + current_dclk_clock, + clock_unit) + + if int(current_dclk_clock) <= deep_sleep_threshold: + clocks[dclk_index]["deep_sleep"] = "ENABLED" + else: + clocks[dclk_index]["deep_sleep"] = "DISABLED" except amdsmi_exception.AmdSmiLibraryException as e: - gfxclk_lock_status = "N/A" - logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get gpu_metrics_info for gpu %s | %s", gpu_id, e.get_error_info()) - if "gfx" in clocks: - if isinstance(clocks['gfx'], dict): - clocks['gfx']['clk_locked'] = gfxclk_lock_status - else: - clocks['gfx'] = {"clk_locked": gfxclk_lock_status} + # Populate the max and min clock values from sysfs + # Min and Max values are per clock type, not per clock engine + + # GFX min and max clocks + try: + gfx_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.GFX) + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_GFX_CLKS): + gfx_index = f"gfx_{clock_index}" + if clocks[gfx_index]["clk"] == "N/A": + # if the current clock is N/A then we shouldn't populate the max and min values + 
continue + + clocks[gfx_index]["min_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["min_clk"], + clock_unit) + clocks[gfx_index]["max_clk"] = self.helpers.unit_format(self.logger, + gfx_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info()) + + # MEM min and max clocks + try: + mem_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.MEM) + + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks["mem_0"]["clk"] != "N/A": + clocks["mem_0"]["min_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["min_clk"], + clock_unit) + clocks["mem_0"]["max_clk"] = self.helpers.unit_format(self.logger, + mem_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) + + # VCLK & DCLK min and max clocks + try: + vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.VCLK0) + + dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + amdsmi_interface.AmdSmiClkType.DCLK0) + + for clock_index in range(amdsmi_interface.AMDSMI_MAX_NUM_CLKS): + vclk_index = f"vclk_{clock_index}" + # if the current clock is N/A then we shouldn't populate the max and min values + if clocks[vclk_index]["clk"] != "N/A": + clocks[vclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + vclk0_clock_info_dict["min_clk"], + clock_unit) + clocks[vclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + vclk0_clock_info_dict["max_clk"], + clock_unit) + + dclk_index = f"dclk_{clock_index}" + if clocks[dclk_index]["clk"] != "N/A": + clocks[dclk_index]["min_clk"] = self.helpers.unit_format(self.logger, + dclk0_clock_info_dict["min_clk"], + clock_unit) + clocks[dclk_index]["max_clk"] = self.helpers.unit_format(self.logger, + dclk0_clock_info_dict["max_clk"], + clock_unit) + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get vclk and/or dclk clock info for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['clock'] = clocks if "temperature" in current_platform_args: @@ -4116,7 +4232,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): for xgmi_dict in xgmi_values: src_gpu_id = xgmi_dict['gpu'] src_gpu_bdf = xgmi_dict['bdf'] - src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) #TODO VERIFY this is correct + src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) logging.debug("check2 device_handle: %s", src_gpu) # This should be the same order as the check1 @@ -4256,7 +4372,7 @@ def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): self.logger.multiple_device_output = xgmi_values - if self.logger.is_csv_format(): # @TODO Test topology override needed + if self.logger.is_csv_format(): new_output = [] for elem in self.logger.multiple_device_output: new_output.append(self.logger.flatten_dict(elem, topology_override=True)) diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 2083c155..6383969a 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -412,7 +412,7 @@ def get_device_handles_from_core_selections(self, core_selections: List[str], co return True, selected_device_handles - def handle_gpus(self, args,logger, 
subcommand): + def handle_gpus(self, args, logger, subcommand): """This function will run execute the subcommands based on the number of gpus passed in via args. params: @@ -708,3 +708,21 @@ def convert_bytes_to_readable(self, bytes_input): return f"{bytes_input:3.1f} {unit}" bytes_input /= 1024 return f"{bytes_input:.1f} YB" + + + def unit_format(self, logger, value, unit): + """This function will format output with unit based on the logger output format + + params: + args - argparser args to pass to subcommand + logger (AMDSMILogger) - Logger to print out output + value - the value to be formatted + unit - the unit to be formatted with the value + return: + str or dict : formatted output + """ + if logger.is_json_format(): + return {"value": value, "unit": unit} + if logger.is_human_readable_format(): + return f"{value} {unit}" + return f"{value}" diff --git a/py-interface/README.md b/py-interface/README.md index ae9b3568..f8b8b3d2 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -2155,7 +2155,7 @@ Output: Dictionary with fields `indep_throttle_status` | ASIC independent throttle status (see drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h for bit flags) | `current_socket_power` | Current socket power (also known as instant socket power) | W `vcn_activity` | List of VCN encode/decode engine utilization per AID | % -`gfxclk_lock_status` | Clock lock status. Each bit corresponds to clock instance. | +`gfxclk_lock_status` | Clock lock status. Bits 0:7 correspond to each gfx clock engine instance. Bits 0:5 for APU/AID devices | `xgmi_link_width` | XGMI bus width | lanes `xgmi_link_speed` | XGMI bitrate | GB/s `pcie_bandwidth_acc` | PCIe accumulated bandwidth | GB/s diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index e3dfa1a4..98c41f73 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -3519,7 +3519,7 @@ def amdsmi_get_gpu_metrics_info( if gpu_metrics_output[metric] == 0xFFFF: gpu_metrics_output[metric] = "N/A" - uint_32_metrics = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc'] + uint_32_metrics = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc', 'gfxclk_lock_status'] for metric in uint_32_metrics: if gpu_metrics_output[metric] == 0xFFFFFFFF: gpu_metrics_output[metric] = "N/A" @@ -3533,7 +3533,7 @@ def amdsmi_get_gpu_metrics_info( gpu_metrics_output[metric] = "N/A" # Custom validation for metrics in a bool format - uint_32_bool_metrics = ['throttle_status', 'gfxclk_lock_status'] + uint_32_bool_metrics = ['throttle_status'] for metric in uint_32_bool_metrics: if gpu_metrics_output[metric] == 0xFFFFFFFF: gpu_metrics_output[metric] = "N/A" diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 1dafee87..2f56eb45 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1651,6 +1651,12 @@ amdsmi_get_clock_info(amdsmi_processor_handle processor_handle, amdsmi_clk_type_ case CLK_TYPE_VCLK1: info->cur_clk = metrics.current_vclk1; break; + case CLK_TYPE_DCLK0: + info->cur_clk = metrics.current_dclk0; + break; + case CLK_TYPE_DCLK1: + info->cur_clk = metrics.current_dclk1; + break; default: return AMDSMI_STATUS_INVAL; } diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index f73a1a76..13762c38 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -229,6 +229,12 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ case 
CLK_TYPE_VCLK1: fullpath += "/pp_dpm_vclk1"; break; + case CLK_TYPE_DCLK0: + fullpath += "/pp_dpm_dclk"; + break; + case CLK_TYPE_DCLK1: + fullpath += "/pp_dpm_dclk1"; + break; default: return AMDSMI_STATUS_INVAL; } From e2e4349bd245b06791dab7157ca39fa0563c76ff Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Mar 2024 03:33:19 -0500 Subject: [PATCH 11/18] SWDEV-445664 - Aligned metric --ecc & --ecc-blocks with Host Signed-off-by: Maisam Arif Change-Id: I93cf2bdab8c4c066bacf0e910e5620d37b362b07 --- CHANGELOG.md | 57 +++++++++++++++++++++++--------- amdsmi_cli/amdsmi_commands.py | 11 +++--- py-interface/README.md | 2 ++ py-interface/amdsmi_interface.py | 10 +++--- 4 files changed, 56 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7a3a85d..74deff1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log for AMD SMI Library -Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). +Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). ***All information listed below is for reference and subject to change.*** @@ -8,6 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -118,11 +119,35 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` +- **Added deferred ecc counts** +Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` + +```shell +$ amd-smi metric --ecc --ecc-blocks +GPU: 0 + ECC: + TOTAL_CORRECTABLE_COUNT: 0 + TOTAL_UNCORRECTABLE_COUNT: 0 + TOTAL_DEFERRED_COUNT: 0 + CACHE_CORRECTABLE_COUNT: 0 + CACHE_UNCORRECTABLE_COUNT: 0 + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + SDMA: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + DEFERRED_COUNT: 0 + ... +``` + ## amd_smi_lib for ROCm 6.1.0 ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. ```shell @@ -182,10 +207,10 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** -Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh +- **Integrated ESMI Tool** +Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: + - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh + - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh See a few examples listed below. 
@@ -332,7 +357,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -376,7 +401,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). ```shell @@ -513,10 +538,10 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** -Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: +- **Fix for devices which have an older pyyaml installed** +Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` - `amd-smi firmware` @@ -538,18 +563,18 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. -- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. - **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. 
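For reference, the deferred ECC counters added in this patch can also be read through the Python interface's `amdsmi_get_gpu_total_ecc_count`. A minimal sketch follows, assuming the `amdsmi` Python package is importable as shown and the library has already been initialized (neither assumption is part of this patch):

```python
# Hypothetical sketch: print total ECC counts, including the new deferred_count field.
# Assumes `from amdsmi import *` resolves and library initialization has been handled
# by the caller; the returned dict keys follow the py-interface README in this series.
from amdsmi import *

try:
    devices = amdsmi_get_processor_handles()
    for device in devices:
        ecc = amdsmi_get_gpu_total_ecc_count(device)
        # Dict with correctable_count, uncorrectable_count, and the new deferred_count
        print(f"correctable={ecc['correctable_count']} "
              f"uncorrectable={ecc['uncorrectable_count']} "
              f"deferred={ecc['deferred_count']}")
except AmdSmiException as e:
    print(e)
```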
@@ -561,7 +586,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index fce9e852..cc2ab30e 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1500,7 +1500,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No # VCLK & DCLK min and max clocks try: - vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, + vclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.VCLK0) dclk0_clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, @@ -1668,6 +1668,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) ecc_count['total_correctable_count'] = ecc_count.pop('correctable_count') ecc_count['total_uncorrectable_count'] = ecc_count.pop('uncorrectable_count') + ecc_count['total_deferred_count'] = ecc_count.pop('deferred_count') except amdsmi_exception.AmdSmiLibraryException as e: ecc_count['total_correctable_count'] = "N/A" ecc_count['total_uncorrectable_count'] = "N/A" @@ -1691,7 +1692,7 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No if "ecc_blocks" in current_platform_args: if args.ecc_blocks: ecc_dict = {} - uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "MP0", "MP1", "FUSE"] + uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "FUSE"] try: ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) for state in ras_states: @@ -1702,10 +1703,12 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No try: ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block) ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'], - 'uncorrectable_count' : ecc_count['uncorrectable_count']} + 'uncorrectable_count' : ecc_count['uncorrectable_count'], + 'deferred_count' : ecc_count['deferred_count']} except amdsmi_exception.AmdSmiLibraryException as e: ecc_dict[state['block']] = {'correctable_count' : "N/A", - 'uncorrectable_count' : "N/A"} + 'uncorrectable_count' : "N/A", + 'deferred_count' : "N/A"} logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) values_dict['ecc_blocks'] = ecc_dict diff --git a/py-interface/README.md b/py-interface/README.md index f8b8b3d2..87089306 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -967,6 +967,7 @@ Field | Description ---|--- `correctable_count` | Correctable ECC error count `uncorrectable_count` | Uncorrectable ECC error count +`deferred_count` | Deferred ECC error count Exceptions that can be thrown by `amdsmi_get_gpu_total_ecc_count` function: @@ -2747,6 +2748,7 @@ Field | Description ---|--- `correctable_count` | Count of correctable errors `uncorrectable_count` | Count of uncorrectable errors +`deferred_count` | Count of deferred errors Exceptions that can be thrown by `amdsmi_get_gpu_ecc_count` function: diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 98c41f73..bf9fa6a0 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1821,16 +1821,17 @@ def 
amdsmi_get_gpu_total_ecc_count( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - error_count = amdsmi_wrapper.amdsmi_error_count_t() + ec = amdsmi_wrapper.amdsmi_error_count_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_total_ecc_count( - processor_handle, ctypes.byref(error_count) + processor_handle, ctypes.byref(ec) ) ) return { - "correctable_count": error_count.correctable_count, - "uncorrectable_count": error_count.uncorrectable_count, + "correctable_count": ec.correctable_count, + "uncorrectable_count": ec.uncorrectable_count, + "deferred_count": ec.deferred_count, } @@ -3655,6 +3656,7 @@ def amdsmi_get_gpu_ecc_count( return { "correctable_count": ec.correctable_count, "uncorrectable_count": ec.uncorrectable_count, + "deferred_count": ec.deferred_count, } From 51b3f8cccbaff1fe1164215f5eef4d9b77c6fac8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 27 Mar 2024 00:45:53 -0500 Subject: [PATCH 12/18] SWDEV-452739 - Add CEM slot type to amd-smi Updated CHANGELOG.md and re-added spaces after bolded lines Signed-off-by: Maisam Arif Change-Id: Ic728b3e9b083c62fe4c9791b8ede991f5dacc1ca --- CHANGELOG.md | 49 +++++++++++++++++++++++----------- include/amd_smi/amdsmi.h | 1 + py-interface/README.md | 2 +- py-interface/amdsmi_wrapper.py | 14 +++++----- src/amd_smi/amd_smi.cc | 7 +++-- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74deff1b..1c6dc5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** -## amd_smi_lib for ROCm 6.2.0 +## amd_smi_lib for ROCm 6.1.1 ### Changed -- **Updated metrics --clocks** +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -119,7 +119,7 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` -- **Added deferred ecc counts** +- **Added deferred ecc counts** Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` ```shell @@ -143,11 +143,28 @@ GPU: 0 ... ``` +### Fixed + +- **Fix for GPU reset error on non-amdgpu cards** +Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix +updates CLI to target only AMD ASICs. + +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix +provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). + +- **Improved Error handling for `amd-smi process`** +Fixed Attribute Error when getting process in csv format + +### Known issues + +- `amd-smi bad-pages` can results with "ValueError: NULL pointer access" with certain PM FW versions + ## amd_smi_lib for ROCm 6.1.0 ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. 
```shell @@ -207,7 +224,7 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** +- **Integrated ESMI Tool** Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh @@ -357,7 +374,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -401,7 +418,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). ```shell @@ -538,9 +555,9 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** +- **Fix for devices which have an older pyyaml installed** Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` @@ -552,7 +569,7 @@ Platforms which are identified as having an older pyyaml version or pip, we no m TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` -- **Fix for crash when user is not a member of video/render groups** +- **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. ### Known Issues @@ -563,20 +580,20 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. 
-- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. -- **Topology output is now aligned with GPU BDF table** +- **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. Now the information is displayed as a table by each GPU's BDF, which closer resembles rocm-smi output. @@ -586,7 +603,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 64bdb125..834b820c 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -495,6 +495,7 @@ typedef union { typedef enum { AMDSMI_CARD_FORM_FACTOR_PCIE, AMDSMI_CARD_FORM_FACTOR_OAM, + AMDSMI_CARD_FORM_FACTOR_CEM, AMDSMI_CARD_FORM_FACTOR_UNKNOWN } amdsmi_card_form_factor_t; diff --git a/py-interface/README.md b/py-interface/README.md index 87089306..f965fb02 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -809,7 +809,7 @@ Output: Dictionary with 2 fields `pcie_static` and `pcie_metric` Fields | Description ---|--- -`pcie_static` |
Subfield | Description
`max_pcie_width` | Maximum number of pcie lanes available
`max_pcie_speed` | Maximum capable pcie speed in GT/s
`pcie_interface_version` | PCIe generation ie. 3,4,5...
`slot_type` | The type of form factor of the slot: PCIE, OAM, or Unknown
+`pcie_static` |
Subfield | Description
`max_pcie_width` | Maximum number of pcie lanes available
`max_pcie_speed` | Maximum capable pcie speed in GT/s
`pcie_interface_version` | PCIe generation ie. 3,4,5...
`slot_type` | The type of form factor of the slot: OAM, PCIE, CEM, or Unknown
`pcie_metric` |
Subfield | Description
`pcie_width` | Current number of pcie lanes available
`pcie_speed` | Current pcie speed capable in GT/s
`pcie_bandwidth` | Current instantaneous bandwidth usage in Mb/s
`pcie_replay_count` | Total number of PCIe replays (NAKs)
`pcie_l0_to_recovery_count` | PCIE L0 to recovery state transition accumulated count
`pcie_replay_roll_over_count` | PCIe Replay accumulated count
`pcie_nak_sent_count` | PCIe NAK sent accumulated count
`pcie_nak_received_count` | PCIe NAK received accumulated count
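The new `AMDSMI_CARD_FORM_FACTOR_CEM` value introduced by this patch surfaces as the `slot_type` subfield listed above. A minimal sketch of reading it through `amdsmi_get_pcie_info`, assuming the `amdsmi` Python package is importable as shown and the library is already initialized:

```python
# Hypothetical sketch: report the PCIe slot form factor per GPU.
# Assumes `from amdsmi import *` resolves and initialization has been handled elsewhere;
# dictionary keys follow the pcie_static table above.
from amdsmi import *

try:
    for device in amdsmi_get_processor_handles():
        pcie = amdsmi_get_pcie_info(device)
        static = pcie["pcie_static"]
        # slot_type is expected to be one of: OAM, PCIE, CEM, or Unknown
        print(f"slot_type={static['slot_type']} "
              f"max_width={static['max_pcie_width']} "
              f"max_speed={static['max_pcie_speed']} GT/s")
except AmdSmiException as e:
    print(e)
```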
Exceptions that can be thrown by `amdsmi_get_pcie_info` function: diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 13cd2062..560590ea 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -737,11 +737,13 @@ class struct_fields_(Structure): amdsmi_card_form_factor_t__enumvalues = { 0: 'AMDSMI_CARD_FORM_FACTOR_PCIE', 1: 'AMDSMI_CARD_FORM_FACTOR_OAM', - 2: 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', + 2: 'AMDSMI_CARD_FORM_FACTOR_CEM', + 3: 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', } AMDSMI_CARD_FORM_FACTOR_PCIE = 0 AMDSMI_CARD_FORM_FACTOR_OAM = 1 -AMDSMI_CARD_FORM_FACTOR_UNKNOWN = 2 +AMDSMI_CARD_FORM_FACTOR_CEM = 2 +AMDSMI_CARD_FORM_FACTOR_UNKNOWN = 3 amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass @@ -2347,10 +2349,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', 'AMDSMI_CACHE_PROPERTY_INST_CACHE', - 'AMDSMI_CACHE_PROPERTY_SIMD_CACHE', 'AMDSMI_CARD_FORM_FACTOR_OAM', - 'AMDSMI_CARD_FORM_FACTOR_PCIE', 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', - 'AMDSMI_CNTR_CMD_START', 'AMDSMI_CNTR_CMD_STOP', - 'AMDSMI_COARSE_GRAIN_GFX_ACTIVITY', + 'AMDSMI_CACHE_PROPERTY_SIMD_CACHE', 'AMDSMI_CARD_FORM_FACTOR_CEM', + 'AMDSMI_CARD_FORM_FACTOR_OAM', 'AMDSMI_CARD_FORM_FACTOR_PCIE', + 'AMDSMI_CARD_FORM_FACTOR_UNKNOWN', 'AMDSMI_CNTR_CMD_START', + 'AMDSMI_CNTR_CMD_STOP', 'AMDSMI_COARSE_GRAIN_GFX_ACTIVITY', 'AMDSMI_COARSE_GRAIN_MEM_ACTIVITY', 'AMDSMI_CURRENT_POWER', 'AMDSMI_DEV_PERF_LEVEL_AUTO', 'AMDSMI_DEV_PERF_LEVEL_DETERMINISM', 'AMDSMI_DEV_PERF_LEVEL_FIRST', 'AMDSMI_DEV_PERF_LEVEL_HIGH', diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 2f56eb45..e85646d2 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -2050,11 +2050,14 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a processor_handle, &slot_type); if (status == AMDSMI_STATUS_SUCCESS) { switch (slot_type) { + case RSMI_PCIE_SLOT_PCIE: + info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE; + break; case RSMI_PCIE_SLOT_OAM: info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_OAM; break; - case RSMI_PCIE_SLOT_PCIE: - info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE; + case RSMI_PCIE_SLOT_CEM: + info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_CEM; break; default: info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_UNKNOWN; From 9800156a7a28e7c2c516039994103b447a478a2f Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 27 Mar 2024 01:08:37 -0500 Subject: [PATCH 13/18] Bump Version to 24.5.0.0 Signed-off-by: Maisam Arif Change-Id: I2509c8c2df54f0c5e9376fc0a21c09adc74f0ea8 --- CMakeLists.txt | 2 +- amdsmi_cli/README.md | 2 +- docs/doxygen/Doxyfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d59fe2f4..97dbc610 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ find_program(GIT NAMES git) ## Setup the package version based on git tags. 
set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver") -get_package_version_number("24.4.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("24.5.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index f9c0c067..72028e5b 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -79,7 +79,7 @@ amd-smi will report the version and current platform detected when running the c ~$ amd-smi usage: amd-smi [-h] ... -AMD System Management Interface | Version: 24.4.0.0 | ROCm version: 6.1.0 | Platform: Linux Baremetal +AMD System Management Interface | Version: 24.5.0.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal options: -h, --help show this help message and exit diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index ff7a8a18..de8ab73b 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.4.0.0" +PROJECT_NUMBER = "24.5.0.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a From 08e2e21bab124011e6f556e9335290e567e2efe6 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Thu, 14 Mar 2024 05:53:26 -0500 Subject: [PATCH 14/18] fix: [SWDEV-442525] [rocm/amd_smi_lib] Fixes gpu_process_list Code changes related to the following: * amdsmi_get_gpu_process_list() * CLI * Examples * Unit tests * Changelog * Readme * rocm_smi_lib commit: 677433b367c5738e165c74ac07bdb7ab26d22949 Change-Id: I9210fbca7a5da92d0a8b472b72ca82597c8e4fb5 Signed-off-by: Oliveira, Daniel --- CHANGELOG.md | 51 +++--- amdsmi_cli/amdsmi_commands.py | 8 +- example/amd_smi_drm_example.cc | 87 +++++----- include/amd_smi/amdsmi.h | 49 +++--- include/amd_smi/impl/amd_smi_gpu_device.h | 22 +++ py-interface/README.md | 44 +---- py-interface/amdsmi_interface.py | 40 ++--- py-interface/amdsmi_wrapper.py | 7 +- rocm_smi/include/rocm_smi/rocm_smi.h | 28 ++-- rocm_smi/src/rocm_smi_kfd.cc | 4 +- src/amd_smi/amd_smi.cc | 101 +++++------- src/amd_smi/amd_smi_gpu_device.cc | 155 +++++++++++++++++- src/amd_smi/fdinfo.cc | 11 +- .../functional/process_info_read.cc | 1 + 14 files changed, 371 insertions(+), 237 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c6dc5bd..f606276c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed -- **Updated metrics --clocks** +- **Updated metrics --clocks** Output for `amd-smi metric --clock` is updated to reflect each engine and bug fixes for the clock lock status and deep sleep status. ``` shell @@ -119,7 +119,7 @@ GPU: 0 DEEP_SLEEP: ENABLED ``` -- **Added deferred ecc counts** +- **Added deferred ecc counts** Added deferred error correctable counts to `amd-smi metric --ecc --ecc-blocks` ```shell @@ -149,11 +149,14 @@ GPU: 0 Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. 
-- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). -- **Improved Error handling for `amd-smi process`** +- **Fix for `amd-smi process`** +Fixed output results when getting processes running on a device. + +- **Improved Error handling for `amd-smi process`** Fixed Attribute Error when getting process in csv format ### Known issues @@ -164,7 +167,7 @@ Fixed Attribute Error when getting process in csv format ### Added -- **Added Monitor Command** +- **Added Monitor Command** Provides users the ability to customize GPU metrics to capture, collect, and observe. Output is provided in a table view. This aligns closer to ROCm SMI `rocm-smi` (no argument), additionally allows uers to customize what data is helpful for their use-case. ```shell @@ -224,7 +227,7 @@ GPU POWER GPU_TEMP MEM_TEMP GFX_UTIL GFX_CLOCK MEM_UTIL MEM_CLOCK VRAM_U 7 175 W 34 °C 32 °C 0 % 113 MHz 0 % 900 MHz 283 MB 196300 MB ``` -- **Integrated ESMI Tool** +- **Integrated ESMI Tool** Users can get CPU metrics and telemetry through our API and CLI tools. This information can be seen in `amd-smi static` and `amd-smi metric` commands. Only available for limited target processors. As of ROCm 6.0.2, this is listed as: - AMD Zen3 based CPU Family 19h Models 0h-Fh and 30h-3Fh - AMD Zen4 based CPU Family 19h Models 10h-1Fh and A0-AFh @@ -374,7 +377,7 @@ CPU: 0 RESPONSE: N/A ``` -- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** +- **Added support for new metrics: VCN, JPEG engines, and PCIe errors** Using the AMD SMI tool, users can retreive VCN, JPEG engines, and PCIe errors by calling `amd-smi metric -P` or `amd-smi metric --usage`. Depending on device support, `VCN_ACTIVITY` will update for MI3x ASICs (with 4 separate VCN engine activities) for older asics `MM_ACTIVITY` with UVD/VCN engine activity (average of all engines). `JPEG_ACTIVITY` is a new field for MI3x ASICs, where device can support up to 32 JPEG engine activities. See our documentation for more in-depth understanding of these new fields. ```shell @@ -407,10 +410,10 @@ GPU: 0 ``` -- **Added AMDSMI Tool Version** -AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. -The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. -The AMDSMI Library version is the library package version number. +- **Added AMDSMI Tool Version** +AMD SMI will report ***three versions***: AMDSMI Tool, AMDSMI Library version, and ROCm version. +The AMDSMI Tool version is the CLI/tool version number with commit ID appended after `+` sign. +The AMDSMI Library version is the library package version number. The ROCm version is the system's installed ROCm version, if ROCm is not installed it will report N/A. ```shell @@ -418,7 +421,7 @@ $ amd-smi version AMDSMI Tool: 23.4.2+505b858 | AMDSMI Library version: 24.2.0.0 | ROCm version: 6.1.0 ``` -- **Added XGMI table** +- **Added XGMI table** Displays XGMI information for AMD GPU devices in a table format. Only available on supported ASICs (eg. MI300). Here users can view read/write data XGMI or PCIe accumulated data transfer size (in KiloBytes). 
```shell @@ -452,10 +455,10 @@ GPU7 0000:df:00.0 32 Gb/s 512 Gb/s XGMI ``` -- **Added units of measure to JSON output.** +- **Added units of measure to JSON output.** We added unit of measure to JSON/CSV `amd-smi metric`, `amd-smi static`, and `amd-smi monitor` commands. -Ex. +Ex. ```shell amd-smi metric -p --json @@ -488,7 +491,7 @@ amd-smi metric -p --json ### Changed -- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** +- **Topology is now left-aligned with BDF of each device listed individual table's row/coloumns.** We provided each device's BDF for every table's row/columns, then left aligned data. We want AMD SMI Tool output to be easy to understand and digest for our users. Having users scroll up to find this information made it difficult to follow, especially for devices which have many devices associated with one ASIC. ```shell @@ -555,9 +558,9 @@ NUMA BW TABLE: ### Fixed -- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** +- **Fix for Navi3X/Navi2X/MI100 `amdsmi_get_gpu_pci_bandwidth()` in frequencies_read tests** Devices which do not report (eg. Navi3X/Navi2X/MI100) we have added checks to confirm these devices return AMDSMI_STATUS_NOT_SUPPORTED. Otherwise, tests now display a return string. -- **Fix for devices which have an older pyyaml installed** +- **Fix for devices which have an older pyyaml installed** Platforms which are identified as having an older pyyaml version or pip, we no manually update both pip and pyyaml as needed. This corrects issues identified below. Fix impacts the following CLI commands: - `amd-smi list` - `amd-smi static` @@ -569,7 +572,7 @@ Platforms which are identified as having an older pyyaml version or pip, we no m TypeError: dump_all() got an unexpected keyword argument 'sort_keys' ``` -- **Fix for crash when user is not a member of video/render groups** +- **Fix for crash when user is not a member of video/render groups** AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid crashes when DRM/device data is inaccessable to the logged in user. ### Known Issues @@ -580,20 +583,20 @@ AMD SMI now uses same mutex handler for devices as rocm-smi. This helps avoid cr ### Added -- **Integrated the E-SMI (EPYC-SMI) library** +- **Integrated the E-SMI (EPYC-SMI) library** You can now query CPU-related information directly through AMD SMI. Metrics include power, energy, performance, and other system details. -- **Added support for gfx942 metrics** +- **Added support for gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. -- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. -- **Topology output is now aligned with GPU BDF table** +- **Topology output is now aligned with GPU BDF table** Earlier versions of the topology output were difficult to read since each GPU was displayed linearly. Now the information is displayed as a table by each GPU's BDF, which closer resembles rocm-smi output. 
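Because the JSON output now wraps each reading in `value`/`unit` pairs (as the `unit_format` helper earlier in this series produces), downstream scripts may want to flatten them. The sketch below uses only the Python standard library and assumes nothing about the full `amd-smi metric -p --json` schema beyond that wrapping:

```python
# Hypothetical sketch: run `amd-smi metric -p --json` and flatten {"value", "unit"} pairs.
# Only the value/unit wrapping is assumed; the exact key layout varies by ASIC.
import json
import subprocess

def flatten_units(node):
    """Recursively replace {"value": v, "unit": u} dicts with "v u" strings."""
    if isinstance(node, dict):
        if set(node) == {"value", "unit"}:
            return f"{node['value']} {node['unit']}"
        return {key: flatten_units(val) for key, val in node.items()}
    if isinstance(node, list):
        return [flatten_units(item) for item in node]
    return node

raw = subprocess.run(["amd-smi", "metric", "-p", "--json"],
                     check=True, capture_output=True, text=True).stdout
print(json.dumps(flatten_units(json.loads(raw)), indent=2))
```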
@@ -603,7 +606,7 @@ Now the information is displayed as a table by each GPU's BDF, which closer rese ### Fixed -- **Fix for driver not initialized** +- **Fix for driver not initialized** If driver module is not loaded, user retrieve error reponse indicating amdgpu module is not loaded. ### Known Issues diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index cc2ab30e..b9bc3a5e 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2570,16 +2570,18 @@ def process(self, args, multiple_devices=False, watching_output=False, try: process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) raise e filtered_process_values = [] - for process_handle in process_list: + for process in process_list: try: - process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process_handle) + process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process) except amdsmi_exception.AmdSmiLibraryException as e: process_info = "N/A" - logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info()) + logging.debug("Failed to get process info for process %s on gpu %s | %s", process, gpu_id, e.get_error_info()) filtered_process_values.append({'process_info': process_info}) continue diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc index cd9a3a1f..25ac6ade 100644 --- a/example/amd_smi_drm_example.cc +++ b/example/amd_smi_drm_example.cc @@ -432,7 +432,9 @@ int main() { ret = amdsmi_get_temp_metric( processor_handles[j], TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CRITICAL, &temperature); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } printf("\tGPU GFX temp limit: %ld\n\n", temperature); // Get temperature measurements @@ -447,7 +449,9 @@ int main() { processor_handles[j], temp_type, AMDSMI_TEMP_CURRENT, &temp_measurements[(int)(temp_type)]); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } } printf(" Output of amdsmi_get_temp_metric:\n"); printf("\tGPU Edge temp measurement: %ld\n", @@ -526,14 +530,13 @@ int main() { }; uint32_t num_process = 0; - ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, - nullptr); + ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, nullptr); CHK_AMDSMI_RET(ret) if (!num_process) { printf("No processes found.\n"); } else { - amdsmi_process_handle_t process_list[num_process]; - amdsmi_proc_info_t info_list[num_process]; + std::cout << "Processes found: " << num_process << "\n"; + amdsmi_proc_info_t process_info_list[num_process]; amdsmi_proc_info_t process = {}; uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0; uint64_t gfx = 0, enc = 0; @@ -544,24 +547,14 @@ int main() { bdf.fields.device_number, bdf.fields.function_number); int num = 0; - ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, - process_list); - CHK_AMDSMI_RET(ret) - for (uint32_t it = 0; it < num_process; it += 1) { - if (getpid() == process_list[it]) { - continue; - } - ret = amdsmi_get_gpu_process_info(processor_handles[j], - process_list[it], &process); - if (ret != AMDSMI_STATUS_SUCCESS) { - 
printf("amdsmi_get_gpu_process_info() failed for " - "process_list[%d], returned %d\n", - it, ret); - continue; - } - info_list[num++] = process; + ret = amdsmi_get_gpu_process_list(processor_handles[j], &num_process, process_info_list); + std::cout << "Allocation size for process list: " << num_process << "\n"; + CHK_AMDSMI_RET(ret); + for (auto idx = uint32_t(0); idx < num_process; ++idx) { + process = static_cast(process_info_list[idx]); + printf("\t *Process id: %ld / Name: %s / VRAM: %lld \n", process.pid, process.name, process.memory_usage.vram_mem); } - qsort(info_list, num, sizeof(info_list[0]), compare); + printf("+=======+==================+============+==============" "+=============+=============+=============+============" "==+=========================================+\n"); @@ -575,41 +568,41 @@ int main() { printf("+=======+" "+=============+=============+=============+============" "==+=========================================+\n"); - for (int it = 0; it < num; it++) { + for (int it = 0; it < num_process; it++) { char command[30]; struct passwd *pwd = nullptr; struct stat st; - sprintf(command, "/proc/%d", info_list[it].pid); + sprintf(command, "/proc/%d", process_info_list[it].pid); if (stat(command, &st)) continue; pwd = getpwuid(st.st_uid); if (!pwd) printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB " "| %7ld KiB | %7ld KiB | %lu %lu |\n", - info_list[it].pid, info_list[it].name, st.st_uid, - bdf_str, info_list[it].mem / 1024, - info_list[it].memory_usage.gtt_mem / 1024, - info_list[it].memory_usage.cpu_mem / 1024, - info_list[it].memory_usage.vram_mem / 1024, - info_list[it].engine_usage.gfx, - info_list[it].engine_usage.enc); + process_info_list[it].pid, process_info_list[it].name, st.st_uid, + bdf_str, process_info_list[it].mem / 1024, + process_info_list[it].memory_usage.gtt_mem / 1024, + process_info_list[it].memory_usage.cpu_mem / 1024, + process_info_list[it].memory_usage.vram_mem / 1024, + process_info_list[it].engine_usage.gfx, + process_info_list[it].engine_usage.enc); else printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB " "| %7ld KiB | %7ld KiB | %lu %lu |\n", - info_list[it].pid, info_list[it].name, - pwd->pw_name, bdf_str, info_list[it].mem / 1024, - info_list[it].memory_usage.gtt_mem / 1024, - info_list[it].memory_usage.cpu_mem / 1024, - info_list[it].memory_usage.vram_mem / 1024, - info_list[it].engine_usage.gfx, - info_list[it].engine_usage.enc); - mem += info_list[it].mem / 1024; - gtt_mem += info_list[it].memory_usage.gtt_mem / 1024; - cpu_mem += info_list[it].memory_usage.cpu_mem / 1024; - vram_mem += info_list[it].memory_usage.vram_mem / 1024; - gfx = info_list[it].engine_usage.gfx; - enc = info_list[it].engine_usage.enc; + process_info_list[it].pid, process_info_list[it].name, + pwd->pw_name, bdf_str, process_info_list[it].mem / 1024, + process_info_list[it].memory_usage.gtt_mem / 1024, + process_info_list[it].memory_usage.cpu_mem / 1024, + process_info_list[it].memory_usage.vram_mem / 1024, + process_info_list[it].engine_usage.gfx, + process_info_list[it].engine_usage.enc); + mem += process_info_list[it].mem / 1024; + gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024; + cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024; + vram_mem += process_info_list[it].memory_usage.vram_mem / 1024; + gfx = process_info_list[it].engine_usage.gfx; + enc = process_info_list[it].engine_usage.enc; printf( "+-------+------------------+------------+-------------" "-+-------------+-------------+-------------+----------" @@ -644,7 +637,9 
@@ int main() { int64_t val_i64 = 0; ret = amdsmi_get_temp_metric(processor_handles[j], TEMPERATURE_TYPE_EDGE, AMDSMI_TEMP_CURRENT, &val_i64); - CHK_AMDSMI_RET(ret) + if (ret != amdsmi_status_t::AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + } printf(" Output of amdsmi_get_temp_metric:\n"); std::cout << "\t\tTemperature: " << val_i64 << "C" << "\n\n"; diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 834b820c..3f7bd398 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -657,9 +657,9 @@ typedef struct { uint32_t mm_activity; uint32_t reserved[13]; } amdsmi_engine_usage_t; - typedef uint32_t amdsmi_process_handle_t; + typedef struct { char name[AMDSMI_NORMAL_STRING_LENGTH]; amdsmi_process_handle_t pid; @@ -679,6 +679,7 @@ typedef struct { uint32_t reserved[4]; } amdsmi_proc_info_t; + //! Guaranteed maximum possible number of supported frequencies #define AMDSMI_MAX_NUM_FREQUENCIES 33 @@ -4743,33 +4744,39 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_ * number of processes currently running, * AMDSMI_STATUS_OUT_OF_RESOURCES will be returned. * + * For cases where max_process is not zero (0), it specifies the list's size limit. + * That is, the maximum size this list will be able to hold. After the list is built + * internally, as a return status, we will have AMDSMI_STATUS_OUT_OF_RESOURCES when + * the original size limit is smaller than the actual list of processes running. + * Hence, the caller is aware the list size needs to be resized, or + * AMDSMI_STATUS_SUCCESS otherwise. + * Holding a copy of max_process before it is passed in will be helpful for monitoring + * the allocations done upon each call since the max_process will permanently be changed + * to reflect the actual number of processes running. + * Note: For the specific cases where the return status is AMDSMI_STATUS_NO_PERM only. + * The list of process and size are AMDSMI_STATUS_SUCCESS, however there are + * processes details not fully retrieved due to permissions. + * + * * @param[out] list Reference to a user-provided buffer where the process * list will be returned. This buffer must contain at least - * max_processes entries of type smi_process_handle. Must be allocated + * max_processes entries of type amd_proc_info_list_t. Must be allocated * by user. * - * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail - */ -amdsmi_status_t -amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_process_handle_t *list); - -/** - * @brief Returns the process information of a given process. - * Engine usage show how much time the process spend using these engines in ns. - * - * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf} @platform{guest_windows} - * - * @param[in] processor_handle Device which to query - * - * @param[in] process Handle of process to query. + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, + * | ::AMDSMI_STATUS_NO_PERM on success, but not all details from process retrieved, + * | ::AMDSMI_STATUS_OUT_OF_RESOURCES, filled list buffer with data, but number of + * actual running processes is larger than the size provided. * - * @param[out] info Reference to a process information structure where to return - * information. Must be allocated by user. 
- * - * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ + // Note: If the reserved size for processes is smaller than the number of + // actual processes running. The AMDSMI_STATUS_OUT_OF_RESOURCES is + // an indication the caller should handle the situation (resize). + // The max_processes is always changed to reflect the actual size of + // list of processes running, so the caller knows where it is at. + // amdsmi_status_t -amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_process_handle_t process, amdsmi_proc_info_t *info); +amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list); /** @} End processinfo */ diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 527d5277..9dd39424 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -53,7 +53,20 @@ namespace amd { namespace smi { + +// PID, amdsmi_proc_info_t +using GPUComputeProcessList_t = std::map; +using ComputeProcessListClassType_t = uint16_t; + +enum class ComputeProcessListType_t : ComputeProcessListClassType_t +{ + kAllProcesses, + kAllProcessesOnDevice, +}; + + class AMDSmiGPUDevice: public AMDSmiProcessor { + public: AMDSmiGPUDevice(uint32_t gpu_id, uint32_t fd, std::string path, amdsmi_bdf_t bdf, AMDSmiDrm& drm): AMDSmiProcessor(AMD_GPU), gpu_id_(gpu_id), fd_(fd), path_(path), bdf_(bdf), drm_(drm) {} @@ -73,6 +86,10 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_bdf_t get_bdf(); bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); } uint32_t get_vendor_id(); + const GPUComputeProcessList_t& amdgpu_get_compute_process_list(ComputeProcessListType_t list_type = ComputeProcessListType_t::kAllProcessesOnDevice); + const GPUComputeProcessList_t& amdgpu_get_all_compute_process_list() { + return amdgpu_get_compute_process_list(ComputeProcessListType_t::kAllProcesses); + } amdsmi_status_t amdgpu_query_info(unsigned info_id, unsigned size, void *value) const; @@ -83,6 +100,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_status_t amdgpu_query_vbios(void *info) const; amdsmi_status_t amdgpu_query_driver_name(std::string& name) const; amdsmi_status_t amdgpu_query_driver_date(std::string& date) const; + private: uint32_t gpu_id_; uint32_t fd_; @@ -90,6 +108,10 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { amdsmi_bdf_t bdf_; uint32_t vendor_id_; AMDSmiDrm& drm_; + GPUComputeProcessList_t compute_process_list_; + int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, + ComputeProcessListType_t list_type); + }; diff --git a/py-interface/README.md b/py-interface/README.md index f965fb02..4199f1a9 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -882,43 +882,14 @@ except AmdSmiException as e: ### amdsmi_get_gpu_process_list -Description: Returns the list of processes for the given GPU +Description: Returns the list of processes for the given GPU. +The list is of type `amdsmi_proc_info_t` and holds information about the running process. 
Input parameters: * `processor_handle` device which to query -Output: List of process handles found - -Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: - -* `AmdSmiLibraryException` -* `AmdSmiRetryException` -* `AmdSmiParameterException` - -Example: - -```python -try: - devices = amdsmi_get_processor_handles() - if len(devices) == 0: - print("No GPUs on machine") - else: - for device in devices: - processes = amdsmi_get_gpu_process_list(device) - print(processes) -except AmdSmiException as e: - print(e) -``` - -### amdsmi_get_gpu_process_info - -Description: Returns the info for the given process - -Input parameters: - -* `processor_handle` device which to query -* `process_handle` process which to query +Output: List of process processes with fields Output: Dictionary with fields @@ -930,7 +901,7 @@ Field | Description `engine_usage` |
Subfield | Description
`gfx` | GFX engine usage in ns
`enc` | Encode engine usage in ns
`memory_usage` |
Subfield | Description
`gtt_mem` | GTT memory usage
`cpu_mem` | CPU memory usage
`vram_mem` | VRAM memory usage
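Since `amdsmi_get_gpu_process_list` now returns populated `amdsmi_proc_info_t` entries with the memory subfields listed above, callers can aggregate usage directly; a minimal sketch follows (the import and prior initialization are assumptions, and entries are accessed as structures per this patch), with a fuller example shown in the README diff after this:

```python
# Hypothetical sketch: sum VRAM usage of all processes found on each GPU.
# Assumes `from amdsmi import *` resolves, initialization has been handled elsewhere,
# and list entries are amdsmi_proc_info_t structures (attribute access), per this patch.
from amdsmi import *

try:
    for device in amdsmi_get_processor_handles():
        processes = amdsmi_get_gpu_process_list(device)
        total_vram = sum(proc.memory_usage.vram_mem for proc in processes)
        print(f"{len(processes)} processes, total VRAM: {total_vram / 1024:.0f} KiB")
except AmdSmiException as e:
    print(e)
```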
-Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function: +Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: * `AmdSmiLibraryException` * `AmdSmiRetryException` @@ -946,8 +917,11 @@ try: else: for device in devices: processes = amdsmi_get_gpu_process_list(device) - for process in processes: - print(amdsmi_get_gpu_process_info(device, process)) + if len(processes) == 0: + print("No processes running on this GPU") + else: + for process in processes: + print(process) except AmdSmiException as e: print(e) ``` diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index bf9fa6a0..bf45635d 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1923,15 +1923,16 @@ def amdsmi_get_gpu_ras_block_features_enabled( def amdsmi_get_gpu_process_list( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[amdsmi_wrapper.amdsmi_process_handle_t]: +) -> List[amdsmi_wrapper.amdsmi_proc_info_t]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) + # This will get populated with the number of processes found max_processes = ctypes.c_uint32(MAX_NUM_PROCESSES) - process_list = (amdsmi_wrapper.amdsmi_process_handle_t * + process_list = (amdsmi_wrapper.amdsmi_proc_info_t * max_processes.value)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_process_list( @@ -1939,42 +1940,37 @@ def amdsmi_get_gpu_process_list( ) ) - return [amdsmi_wrapper.amdsmi_process_handle_t(process_list[x])\ - for x in range(0, max_processes.value)] + result = [] + for index in range(max_processes.value): + result.append(process_list[index]) + return result def amdsmi_get_gpu_process_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, - process: amdsmi_wrapper.amdsmi_process_handle_t, + process: amdsmi_wrapper.amdsmi_proc_info_t, ) -> Dict[str, Any]: if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - if not isinstance(process, amdsmi_wrapper.amdsmi_process_handle_t): + if not isinstance(process, amdsmi_wrapper.amdsmi_proc_info_t): raise AmdSmiParameterException( - process, amdsmi_wrapper.amdsmi_process_handle_t) - - info = amdsmi_wrapper.amdsmi_proc_info_t() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_process_info( - processor_handle, process, ctypes.byref(info) - ) - ) + process, amdsmi_wrapper.amdsmi_proc_info_t) return { - "name": info.name.decode("utf-8"), - "pid": info.pid, - "mem": info.mem, + "name": process.name.decode("utf-8"), + "pid": process.pid, + "mem": process.mem, "engine_usage": { - "gfx": info.engine_usage.gfx, - "enc": info.engine_usage.enc + "gfx": process.engine_usage.gfx, + "enc": process.engine_usage.enc }, "memory_usage": { - "gtt_mem": info.memory_usage.gtt_mem, - "cpu_mem": info.memory_usage.cpu_mem, - "vram_mem": info.memory_usage.vram_mem, + "gtt_mem": process.memory_usage.gtt_mem, + "cpu_mem": process.memory_usage.cpu_mem, + "vram_mem": process.memory_usage.vram_mem, }, } diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 560590ea..d9116193 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -2212,10 +2212,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_get_gpu_vram_usage.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_vram_usage_t)] amdsmi_get_gpu_process_list = 
_libraries['libamd_smi.so'].amdsmi_get_gpu_process_list amdsmi_get_gpu_process_list.restype = amdsmi_status_t -amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32)] -amdsmi_get_gpu_process_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_info -amdsmi_get_gpu_process_info.restype = amdsmi_status_t -amdsmi_get_gpu_process_info.argtypes = [amdsmi_processor_handle, amdsmi_process_handle_t, ctypes.POINTER(struct_amdsmi_proc_info_t)] +amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)] amdsmi_get_gpu_total_ecc_count = _libraries['libamd_smi.so'].amdsmi_get_gpu_total_ecc_count amdsmi_get_gpu_total_ecc_count.restype = amdsmi_status_t amdsmi_get_gpu_total_ecc_count.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_error_count_t)] @@ -2580,7 +2577,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', 'amdsmi_get_gpu_pm_metrics_info', 'amdsmi_get_gpu_power_profile_presets', - 'amdsmi_get_gpu_process_info', 'amdsmi_get_gpu_process_list', + 'amdsmi_get_gpu_process_list', 'amdsmi_get_gpu_ras_block_features_enabled', 'amdsmi_get_gpu_ras_feature_info', 'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index e10ab49b..b6420d79 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -902,7 +902,7 @@ typedef struct { struct { uint32_t cache_size_kb; /* In KB */ uint32_t cache_level; - /* + /* HSA_CACHE_TYPE_DATA 0x00000001 HSA_CACHE_TYPE_INSTRUCTION 0x00000002 HSA_CACHE_TYPE_CPU 0x00000004 @@ -1248,12 +1248,14 @@ typedef struct { */ typedef struct { uint32_t process_id; //!< Process ID - uint32_t pasid; //!< PASID + uint32_t pasid; //!< PASID: (Process Address Space ID) uint64_t vram_usage; //!< VRAM usage uint64_t sdma_usage; //!< SDMA usage in microseconds uint32_t cu_occupancy; //!< Compute Unit usage in percent } rsmi_process_info_t; +//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method +#define CU_OCCUPANCY_INVALID 0xFFFFFFFF /** * @brief Opaque handle to function-support object @@ -1447,7 +1449,7 @@ rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id); * * @details Given a device index @p dv_ind, a pointer to a caller provided * char buffer @p name, and a length of this buffer @p len, this function will - * write the name of the PCIe vendor (up to @p len characters) buffer @p name. + * write the name of the PCIe vendor (up to @p len characters) buffer @p name. * * If the integer ID associated with the PCIe vendor is not found in one of the * system files containing device name information (e.g. @@ -2294,9 +2296,9 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, /** * @brief Get gpu cache info. - * - * @details Given a device index @p dv_ind, and a pointer to a cache - * info @p info, this function will write the cache size and level + * + * @details Given a device index @p dv_ind, and a pointer to a cache + * info @p info, this function will write the cache size and level * to the location pointed to by @p info. * @param[in] dv_ind a device index * @@ -2930,16 +2932,16 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, * @brief Get the pm metrics table with provided device index. 
* * @details Given a device index @p dv_ind, @p pm_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the pm metrics name value pair * to the array at @p pm_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for pm_metrics, and user must call * free(pm_metrics) to free it after use. - * + * * @param[in] dv_ind a device index * * @param[inout] pm_metrics A pointerto an array to hold multiple PM metrics. On successs, - * the library will allocate memory of pm_metrics and write metrics to this array. + * the library will allocate memory of pm_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. * * @param[inout] num_of_metrics a pointer to uint32_t to which the number of @@ -2964,18 +2966,18 @@ rsmi_status_t rsmi_dev_pm_metrics_info_get(uint32_t dv_ind, * @brief Get the register metrics table with provided device index and registertype. * * @details Given a device index @p dv_ind, @p reg_type, @p reg_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the register metrics name value pair * to the array at @p reg_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for reg_metrics, and user must call * free(reg_metrics) to free it after use. - * + * * @param[in] dv_ind a device index - * + * * @param[in] reg_type The register type * * @param[inout] reg_metrics A pointerto an array to hold multiple register metrics. On successs, - * the library will allocate memory of reg_metrics and write metrics to this array. + * the library will allocate memory of reg_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. 
* * @param[inout] num_of_metrics a pointer to uint32_t to which the number of diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index fb2c2157..13d2c27b 100755 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -526,7 +526,9 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); } else { - return err; + //Some GFX revisions do not provide cu_occupancy debugfs method + proc->cu_occupancy = CU_OCCUPANCY_INVALID; + cu_count = 0; } } diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index e85646d2..7d375fb3 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1785,76 +1785,55 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ } amdsmi_status_t -amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_process_handle_t *list) { +amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) { AMDSMI_CHECK_INIT(); - - if (max_processes == nullptr) { + if (!max_processes) { return AMDSMI_STATUS_INVAL; } - std::vector pids; - uint32_t i = 0; - uint64_t size = 0; - amdsmi_status_t status; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) - return r; - - if (gpu_device->check_if_drm_is_supported()){ - amdsmi_bdf_t bdf = gpu_device->get_bdf(); - status = gpuvsmi_get_pids(bdf, pids, &size); - if (status != AMDSMI_STATUS_SUCCESS) { - return status; - } - if (*max_processes == 0 || (pids.size() == 0)) { - *max_processes = (uint32_t)pids.size(); - return AMDSMI_STATUS_SUCCESS; - } - if (!list) { - return AMDSMI_STATUS_INVAL; - } - if (*max_processes < pids.size()) { - return AMDSMI_STATUS_OUT_OF_RESOURCES; - } - for (auto &pid : pids) { - if (i >= *max_processes) { - break; + amdsmi_status_t status_code = get_gpu_device_from_handle(processor_handle, &gpu_device); + if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + return status_code; + } + + auto compute_process_list = gpu_device->amdgpu_get_compute_process_list(); + if ((*max_processes == 0) || compute_process_list.empty()) { + *max_processes = static_cast(compute_process_list.size()); + return amdsmi_status_t::AMDSMI_STATUS_SUCCESS; + } + if (!list) { + return amdsmi_status_t::AMDSMI_STATUS_INVAL; + } + + const auto max_processes_original_size(*max_processes); + auto idx = uint32_t(0); + auto is_required_previlegies_required(false); + for (auto& process : compute_process_list) { + if (idx < *max_processes) { + list[idx++] = static_cast(process.second); + // Note: If we could not read the process info for an existing process, + // that is likely a permission error. 
+ if (!is_required_previlegies_required && std::string(process.second.name).empty()) { + is_required_previlegies_required = true; } - list[i++] = (uint32_t)pid; + } else { + break; } - *max_processes = (uint32_t)pids.size(); - } - else { - // rocm - } - - return AMDSMI_STATUS_SUCCESS; -} - -amdsmi_status_t -amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_process_handle_t process, amdsmi_proc_info_t *info) { - AMDSMI_CHECK_INIT(); - - if (info == nullptr) { - return AMDSMI_STATUS_INVAL; } - amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) - return r; - - amdsmi_status_t status; - if (gpu_device->check_if_drm_is_supported()) { - status = gpuvsmi_get_pid_info(gpu_device->get_bdf(), process, *info); - if (status != AMDSMI_STATUS_SUCCESS) return status; - } - else { - // rocm - } - - return AMDSMI_STATUS_SUCCESS; + // Note: If the reserved size for processes is smaller than the number of + // actual processes running. The AMDSMI_STATUS_OUT_OF_RESOURCES is + // an indication the caller should handle the situation (resize). + // The max_processes is always changed to reflect the actual size of + // list of processes running, so the caller knows where it is at. + // Holding a copy of max_process before it is passed in will be helpful + // for the caller. + status_code = is_required_previlegies_required + ? amdsmi_status_t::AMDSMI_STATUS_NO_PERM : AMDSMI_STATUS_SUCCESS; + *max_processes = static_cast(compute_process_list.size()); + return (max_processes_original_size >= static_cast(compute_process_list.size())) + ? status_code : amdsmi_status_t::AMDSMI_STATUS_OUT_OF_RESOURCES; } amdsmi_status_t diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index 45d419f2..72c5cc4d 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -41,10 +41,16 @@ * */ -#include #include "amd_smi/impl/amd_smi_gpu_device.h" +#include "amd_smi/impl/amd_smi_common.h" +#include "amd_smi/impl/fdinfo.h" +#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_utils.h" +#include +#include +#include +#include namespace amd { namespace smi { @@ -148,6 +154,153 @@ amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_vbios(void *info) const { return drm_.amdgpu_query_vbios(fd, info); } + +int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, + ComputeProcessListType_t list_type) +{ + /** + * The first call to GetProcessInfo() helps to find the size it needs, + * so we can create a tailored size list. + */ + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + auto list_process_running_size = uint32_t(0); + auto list_process_allocation_size = uint32_t(0); + + status_code = rsmi_compute_process_info_get(nullptr, &list_process_running_size); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_process_running_size <= 0)) { + return status_code; + } + + /** + * The second call to GetProcessInfo() helps to set proper sizes for both, + * the raw array of processes (amdsmi_process_info_t) and list of processes (amdsmi_proc_info_t). 
+ */ + using RsmiDeviceList_t = uint32_t[]; + using RsmiProcessList_t = rsmi_process_info_t[]; + std::unique_ptr list_all_processes_ptr = std::make_unique(list_process_running_size); + + list_process_allocation_size = list_process_running_size; + status_code = rsmi_compute_process_info_get(list_all_processes_ptr.get(), &list_process_allocation_size); + if (status_code) { + return status_code; + } + + // Restore the original size to read + list_process_running_size = list_process_allocation_size; + if (list_process_running_size <= 0) { + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + } + + + /** + * Setup for the cases where the process list is by device. + */ + auto list_device_running_size = uint32_t(0); + auto list_device_allocation_size = uint32_t(0); + status_code = rsmi_num_monitor_devices(&list_device_running_size); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_device_running_size <= 0)) { + return status_code; + } + + + /** + * Complete the process information + */ + auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) { + auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info); + // If we cannot get the info from sysfs, save the minimum info + if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + asmi_proc_info.pid = rsmi_proc_info.process_id; + asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage; + } + + return status_code; + }; + + /** + * Get process information + */ + auto update_list_by_running_process = [&](const uint32_t process_id) { + auto status_result(true); + rsmi_process_info_t rsmi_proc_info{}; + auto status_code = rsmi_compute_process_info_by_pid_get(process_id, &rsmi_proc_info); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + status_result = false; + return status_result; + } + + amdsmi_proc_info_t tmp_asmi_proc_info{}; + get_process_info(rsmi_proc_info, tmp_asmi_proc_info); + compute_process_list.emplace(process_id, tmp_asmi_proc_info); + + return status_result; + }; + + + /** + * Devices used by a process. + */ + auto update_list_by_running_device = [&](const uint32_t process_id, + const uint32_t proc_addr_id) { + // Get all devices running this process + auto status_result(true); + std::unique_ptr list_device_ptr = std::make_unique(list_device_running_size); + list_device_allocation_size = list_device_running_size; + auto status_code = rsmi_compute_process_gpus_get(process_id, list_device_ptr.get(), &list_device_allocation_size); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + status_result = false; + return status_result; + } + + for (auto device_idx = uint32_t(0); device_idx < list_device_allocation_size; ++device_idx) { + // Is this device running this process? + if (list_device_ptr[device_idx] == get_gpu_id()) { + rsmi_process_info_t rsmi_dev_proc_info{}; + auto status_code = rsmi_compute_process_info_by_device_get(process_id, list_device_ptr[device_idx], &rsmi_dev_proc_info); + if ((status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) && + ((rsmi_dev_proc_info.process_id == process_id) && (rsmi_dev_proc_info.pasid == proc_addr_id))) { + amdsmi_proc_info_t tmp_asmi_proc_info{}; + get_process_info(rsmi_dev_proc_info, tmp_asmi_proc_info); + compute_process_list.emplace(process_id, tmp_asmi_proc_info); + } + } + } + + return status_result; + }; + + + /** + * Transfer/Save the ones linked to this device. 
+ */ + compute_process_list.clear(); + for (auto process_idx = uint32_t(0); process_idx < list_process_running_size; ++process_idx) { + if (list_type == ComputeProcessListType_t::kAllProcesses) { + if (update_list_by_running_process(list_all_processes_ptr[process_idx].process_id)) { + } + } + + if (list_type == ComputeProcessListType_t::kAllProcessesOnDevice) { + if (update_list_by_running_device(list_all_processes_ptr[process_idx].process_id, + list_all_processes_ptr[process_idx].pasid)) { + } + } + } + + return status_code; +} + +const GPUComputeProcessList_t& AMDSmiGPUDevice::amdgpu_get_compute_process_list(ComputeProcessListType_t list_type) +{ + auto error_code = get_compute_process_list_impl(compute_process_list_, list_type); + if (error_code) { + compute_process_list_.clear(); + } + + return compute_process_list_; +} + + } // namespace smi } // namespace amd diff --git a/src/amd_smi/fdinfo.cc b/src/amd_smi/fdinfo.cc index 997bc225..9b963a8a 100644 --- a/src/amd_smi/fdinfo.cc +++ b/src/amd_smi/fdinfo.cc @@ -220,12 +220,10 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, } } - closedir(d); - if (!pasids.size()) - return AMDSMI_STATUS_NOT_FOUND; - + // Note: If possible at all, try to get the name of the process/container. + // In case the other info fail, get at least something. std::ifstream filename(name_path.c_str()); std::string name; @@ -252,9 +250,12 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, if (strlen(info.container_name) > 0) break; } - info.pid = (uint32_t)pid; + if (!pasids.size()) { + return AMDSMI_STATUS_NOT_FOUND; + } + return AMDSMI_STATUS_SUCCESS; } diff --git a/tests/amd_smi_test/functional/process_info_read.cc b/tests/amd_smi_test/functional/process_info_read.cc index f0394593..d88bbe49 100755 --- a/tests/amd_smi_test/functional/process_info_read.cc +++ b/tests/amd_smi_test/functional/process_info_read.cc @@ -226,4 +226,5 @@ void TestProcInfoRead::Run(void) { } } delete []procs; + } From 08a3e76b269aa2ff72977b4aa7b80a06781ec12f Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 27 Mar 2024 17:10:59 -0500 Subject: [PATCH 15/18] SWDEV-445668 - Align topology JSON Updates: - [CLI] Updated json output to provide format similar to host eg. [ { "gpu": 0, "bdf": "0000:01:00.0", "links": [ { "gpu": 0, "bdf": "0000:01:00.0", "weight": 0, "link_status": "ENABLED", "link_type": "SELF", "num_hops": 0, "bandwidth": "N/A", "fb_sharing": "ENABLED" }, { "gpu": 1, "bdf": "0001:01:00.0", "weight": 15, "link_status": "ENABLED", "link_type": "XGMI", "num_hops": 1, "bandwidth": "50000-100000", "fb_sharing": "ENABLED" }, ... ] }, { ... Change-Id: I63217f63a4d6ebc23a8a84eaac9dbb7aff5f4cb4 Signed-off-by: Charis Poag --- CHANGELOG.md | 140 ++++++++++++++++++++++++++++++++++ amdsmi_cli/amdsmi_commands.py | 135 +++++++++++++++++++++++++++----- 2 files changed, 255 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f606276c..6a5fcc99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ## amd_smi_lib for ROCm 6.1.1 +### Added + +- N/A + ### Changed - **Updated metrics --clocks** @@ -143,6 +147,142 @@ GPU: 0 ... ``` +- **Updated `amd-smi topology --json` to align with host/guest** +Topology's `--json` output now is changed to align with output reported bt host/guest systems. Additionally, users can select/filter specific topology details as desired (refer to `amd-smi topology -h` for full list). 
See examples shown below. + +*Previous format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "link_accessibility": { + "gpu_0": "ENABLED", + "gpu_1": "DISABLED" + }, + "weight": { + "gpu_0": 0, + "gpu_1": 40 + }, + "hops": { + "gpu_0": 0, + "gpu_1": 2 + }, + "link_type": { + "gpu_0": "SELF", + "gpu_1": "PCIE" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + }, + { + "gpu": 1, + "link_accessibility": { + "gpu_0": "DISABLED", + "gpu_1": "ENABLED" + }, + "weight": { + "gpu_0": 40, + "gpu_1": 0 + }, + "hops": { + "gpu_0": 2, + "gpu_1": 0 + }, + "link_type": { + "gpu_0": "PCIE", + "gpu_1": "SELF" + }, + "numa_bandwidth": { + "gpu_0": "N/A", + "gpu_1": "N/A" + } + } +] +``` + +*New format:* +```shell +$ amd-smi topology --json +[ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "weight": 0, + "link_status": "ENABLED", + "link_type": "SELF", + "num_hops": 0, + "bandwidth": "N/A", + "fb_sharing": "ENABLED" + }, + { + "gpu": 1, + "bdf": "0001:01:00.0", + "weight": 15, + "link_status": "ENABLED", + "link_type": "XGMI", + "num_hops": 1, + "bandwidth": "50000-100000", + "fb_sharing": "ENABLED" + }, + ... + ] + }, + ... +] +``` +```shell +$ /opt/rocm/bin/amd-smi topology -a -t --json +[ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + } + ] + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "links": [ + { + "gpu": 0, + "bdf": "0000:08:00.0", + "link_status": "DISABLED", + "link_type": "PCIE" + }, + { + "gpu": 1, + "bdf": "0000:44:00.0", + "link_status": "ENABLED", + "link_type": "SELF" + } + ] + } +] +``` + +### Optimizations + +- N/A + ### Fixed - **Fix for GPU reset error on non-amdgpu cards** diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index b9bc3a5e..431b23fa 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2763,20 +2763,115 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, # Populate the possible gpus topo_values = [] - for gpu in args.gpu: - gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) - topo_values.append({"gpu" : gpu_id}) - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(gpu) - self.logger.table_header += gpu_bdf.rjust(13) + for src_gpu_index, src_gpu in enumerate(args.gpu): + src_gpu_id = self.helpers.get_gpu_id_from_device_handle(src_gpu) + topo_values.append({"gpu" : src_gpu_id}) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + topo_values[src_gpu_index]['bdf'] = src_gpu_bdf + self.logger.table_header += src_gpu_bdf.rjust(13) + + if not self.logger.is_json_format(): + continue # below is for JSON format only + + ########################## + # JSON formatting start # + ########################## + links = [] + # create json obj for data alignment + # dest_gpu_links = { + # "gpu": GPU # + # "bdf": BDF identification + # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) 
+ # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked + # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type + # "num_hops": num_hops - # of hops between devices + # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes + # "N/A" - self node or not connected devices + # "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should + # all have sharing enabled. + # } + + for dest_gpu_index, dest_gpu in enumerate(args.gpu): + link_type = "SELF" + if src_gpu != dest_gpu: + link_type = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['type'] + if isinstance(link_type, int): + if link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_UNDEFINED: + link_type = "UNKNOWN" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_PCIEXPRESS: + link_type = "PCIE" + elif link_type == amdsmi_interface.amdsmi_wrapper.AMDSMI_IOLINK_TYPE_XGMI: + link_type = "XGMI" + else: + link_type = "N/A" + + numa_bw = "N/A" + if src_gpu != dest_gpu: + try: + bw_dict = amdsmi_interface.amdsmi_get_minmax_bandwidth_between_processors(src_gpu, dest_gpu) + numa_bw = f"{bw_dict['min_bandwidth']}-{bw_dict['max_bandwidth']}" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get min max bandwidth for %s to %s | %s", + self.helpers.get_gpu_id_from_device_handle(src_gpu), + self.helpers.get_gpu_id_from_device_handle(dest_gpu), + e.get_error_info()) + + weight = 0 + num_hops = 0 + if src_gpu != dest_gpu: + weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) + num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] + link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) + if link_status: + link_status = "ENABLED" + else: + link_status = "DISABLED" + + # fb_sharing in BM - in a hive configuration, this is + # link_status = amdsmi_is_P2P_accessible(src,dest) + dest_gpu_links = { + "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), + "bdf": amdsmi_interface.amdsmi_get_gpu_device_bdf(dest_gpu), + "weight": weight, + "link_status": link_status, + "link_type": link_type, + "num_hops": num_hops, + "bandwidth": numa_bw, + "fb_sharing": link_status + } + if not args.access: # currently includes fb_sharing + del dest_gpu_links['link_status'] + del dest_gpu_links['fb_sharing'] + if not args.weight: + del dest_gpu_links['weight'] + if not args.link_type: + del dest_gpu_links['link_type'] + if not args.hops: + del dest_gpu_links['num_hops'] + if not args.numa_bw: + del dest_gpu_links['bandwidth'] + links.append(dest_gpu_links) + isEndOfDest = dest_gpu_index+1 == len(args.gpu) + isEndOfSrc = src_gpu_index+1 == len(args.gpu) + if isEndOfDest: + topo_values[src_gpu_index]['links'] = links + continue + if isEndOfSrc: + self.logger.multiple_device_output = topo_values + self.logger.print_output(multiple_device_enabled=True, tabular=True) + return + ########################## + # JSON formatting end # + ########################## if args.access: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + 
tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_links = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2808,11 +2903,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.weight: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_weight = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2845,11 +2940,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.hops: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_hops = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2882,11 +2977,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.link_type: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) @@ -2924,11 +3019,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if args.numa_bw: tabular_output = [] for src_gpu_index, src_gpu in enumerate(args.gpu): - gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) + src_gpu_bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(src_gpu) if self.logger.is_human_readable_format(): - tabular_output_dict = {'gpu' : f"{gpu_bdf} "} + tabular_output_dict = {'gpu' : f"{src_gpu_bdf} "} else: - tabular_output_dict = {'gpu' : gpu_bdf} + tabular_output_dict = {'gpu' : src_gpu_bdf} src_gpu_link_type = {} for dest_gpu in args.gpu: dest_gpu_id = self.helpers.get_gpu_id_from_device_handle(dest_gpu) From 9758a8bc3318f02d7aa2ef14165f4ace0fca6201 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 5 Apr 2024 01:58:39 -0500 Subject: [PATCH 16/18] Removed fb_sharing fields from Linux BM Signed-off-by: Maisam Arif Change-Id: Ia2942b9d33699ced1683270454c479701bce1246 --- CHANGELOG.md | 2 -- amdsmi_cli/amdsmi_commands.py | 15 +++++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a5fcc99..bea6fc89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -219,7 +219,6 @@ $ amd-smi topology --json "link_type": "SELF", "num_hops": 0, "bandwidth": "N/A", - "fb_sharing": "ENABLED" }, { "gpu": 1, @@ -229,7 +228,6 @@ $ amd-smi topology --json "link_type": "XGMI", "num_hops": 1, 
"bandwidth": "50000-100000", - "fb_sharing": "ENABLED" }, ... ] diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 431b23fa..ee6159e5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -2782,13 +2782,11 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, # "gpu": GPU # # "bdf": BDF identification # "weight": 0 - self (current node); weight >= 0 correlated with hops (GPU-CPU, GPU-GPU, GPU-CPU-CPU-GPU, etc..) - # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked + # "link_status": "ENABLED" - devices linked; "DISABLED" - devices not linked; Correlated to access # "link_type": "SELF" - current node, "PCIE", "XGMI", "N/A" - no link,"UNKNOWN" - unidentified link type # "num_hops": num_hops - # of hops between devices # "bandwidth": numa_bw - The NUMA "minimum bandwidth-maximum bandwidth" beween src and dest nodes # "N/A" - self node or not connected devices - # "fb_sharing": "ENABLED/DISABLED" - same output as defined in link_status. Devices in a hive setup should - # all have sharing enabled. # } for dest_gpu_index, dest_gpu in enumerate(args.gpu): @@ -2818,7 +2816,7 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, weight = 0 num_hops = 0 - if src_gpu != dest_gpu: + if src_gpu != dest_gpu: weight = amdsmi_interface.amdsmi_topo_get_link_weight(src_gpu, dest_gpu) num_hops = amdsmi_interface.amdsmi_topo_get_link_type(src_gpu, dest_gpu)['hops'] link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) @@ -2827,7 +2825,6 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, else: link_status = "DISABLED" - # fb_sharing in BM - in a hive configuration, this is # link_status = amdsmi_is_P2P_accessible(src,dest) dest_gpu_links = { "gpu": self.helpers.get_gpu_id_from_device_handle(dest_gpu), @@ -2837,11 +2834,9 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, "link_type": link_type, "num_hops": num_hops, "bandwidth": numa_bw, - "fb_sharing": link_status } - if not args.access: # currently includes fb_sharing + if not args.access: del dest_gpu_links['link_status'] - del dest_gpu_links['fb_sharing'] if not args.weight: del dest_gpu_links['weight'] if not args.link_type: @@ -2851,9 +2846,9 @@ def topology(self, args, multiple_devices=False, gpu=None, access=None, if not args.numa_bw: del dest_gpu_links['bandwidth'] links.append(dest_gpu_links) - isEndOfDest = dest_gpu_index+1 == len(args.gpu) + dest_end = dest_gpu_index+1 == len(args.gpu) isEndOfSrc = src_gpu_index+1 == len(args.gpu) - if isEndOfDest: + if dest_end: topo_values[src_gpu_index]['links'] = links continue if isEndOfSrc: From 50450a2a69bfe33da51d5398deed381a340530cc Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 5 Apr 2024 02:30:08 -0500 Subject: [PATCH 17/18] Added amdsmi_get_gpu_process_info python library documentation Signed-off-by: Maisam Arif Change-Id: I2218bf664a8a155e6b3085378db0fb20f3be3f70 --- py-interface/README.md | 44 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/py-interface/README.md b/py-interface/README.md index 4199f1a9..19454121 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -882,14 +882,46 @@ except AmdSmiException as e: ### amdsmi_get_gpu_process_list -Description: Returns the list of processes for the given GPU. -The list is of type `amdsmi_proc_info_t` and holds information about the running process. 
+Description: Returns the list of processes running on the target GPU.

Input parameters:

* `processor_handle` device which to query

-Output: List of process processes with fields
+Output: List of `amdsmi_proc_info_t` process objects running on the target GPU; can be empty
+
+Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
+
+* `AmdSmiLibraryException`
+* `AmdSmiRetryException`
+* `AmdSmiParameterException`
+
+Example:
+
+```python
+try:
+    devices = amdsmi_get_processor_handles()
+    if len(devices) == 0:
+        print("No GPUs on machine")
+    else:
+        for device in devices:
+            processes = amdsmi_get_gpu_process_list(device)
+            if len(processes) == 0:
+                print("No processes running on this GPU")
+            else:
+                for process in processes:
+                    print(amdsmi_get_gpu_process_info(device, process))
+except AmdSmiException as e:
+    print(e)
+```
+
+### amdsmi_get_gpu_process_info
+
+Description: Returns info about a process, given the target GPU and the corresponding `amdsmi_proc_info_t` object
+
+Input parameters:
+
+* `processor_handle` device which to query
+* `process` the `amdsmi_proc_info_t` object of the process to query

Output: Dictionary with fields

Field | Description
---|---
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` | Subfields: `gfx` (GFX engine usage in ns), `enc` (encode engine usage in ns)
-`memory_usage` | Subfields: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
+`memory_usage` | Subfields: `gtt_mem` (GTT memory usage), `cpu_mem` (CPU memory usage), `vram_mem` (VRAM memory usage)
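
The snippet below is an editor-added sketch, not part of the original hunk: it shows how a caller might consume the dictionary described above. It assumes the py-interface is imported wholesale (`from amdsmi import *`) and uses the standard `amdsmi_init()`/`amdsmi_shut_down()` entry points together with the subfield names listed in the table.

```python
from amdsmi import *

amdsmi_init()
try:
    for device in amdsmi_get_processor_handles():
        for process in amdsmi_get_gpu_process_list(device):
            info = amdsmi_get_gpu_process_info(device, process)
            # 'memory_usage' carries the per-heap breakdown documented above
            usage = info['memory_usage']
            print(f"pid={info['pid']} vram={usage['vram_mem']} gtt={usage['gtt_mem']}")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```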
Date: Fri, 5 Apr 2024 02:31:08 -0500
Subject: [PATCH 18/18] Bump Version to 24.5.1.0

Signed-off-by: Maisam Arif
Change-Id: I842e223b78f337a39098f652fa6e7ef51948fbaf
---
 CMakeLists.txt           | 2 +-
 amdsmi_cli/README.md     | 2 +-
 docs/doxygen/Doxyfile    | 2 +-
 include/amd_smi/amdsmi.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97dbc610..6cf0d289 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ find_program(GIT NAMES git)
 ## Setup the package version based on git tags.
 set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver")
-get_package_version_number("24.5.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
+get_package_version_number("24.5.1" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
 message("Package version: ${PKG_VERSION_STR}")
 set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
 set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md
index 72028e5b..06e89147 100644
--- a/amdsmi_cli/README.md
+++ b/amdsmi_cli/README.md
@@ -79,7 +79,7 @@ amd-smi will report the version and current platform detected when running the c
 ~$ amd-smi
 usage: amd-smi [-h] ...
-AMD System Management Interface | Version: 24.5.0.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal
+AMD System Management Interface | Version: 24.5.1.0 | ROCm version: 6.1.1 | Platform: Linux Baremetal
 options:
   -h, --help show this help message and exit
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index de8ab73b..ef62d4d1 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
-PROJECT_NUMBER = "24.5.0.0"
+PROJECT_NUMBER = "24.5.1.0"
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 3f7bd398..ba73c093 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -154,7 +154,7 @@ typedef enum {
 #define AMDSMI_LIB_VERSION_MAJOR 5
 //! Minor version should be updated for each API change, but without changing headers
-#define AMDSMI_LIB_VERSION_MINOR 0
+#define AMDSMI_LIB_VERSION_MINOR 1
 //! Release version should be set to 0 as default and can be updated by the PMs for each CSP point release
 #define AMDSMI_LIB_VERSION_RELEASE 0
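
As an editor-added illustration of the version bump above (not part of the patch): a Python caller can confirm which library build it is talking to before relying on the changed `amdsmi_get_gpu_process_list()` return type. This is a minimal sketch assuming the py-interface exposes `amdsmi_get_lib_version()`; the exact keys of the returned dictionary may vary between releases.

```python
from amdsmi import *

amdsmi_init()
try:
    # With this patch series applied, the library reports minor version 1
    # (AMDSMI_LIB_VERSION_MINOR bumped from 0 to 1 above).
    version = amdsmi_get_lib_version()
    print(f"amd_smi_lib version info: {version}")
except AmdSmiException as e:
    print(e)
finally:
    amdsmi_shut_down()
```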