diff --git a/CHANGELOG.md b/CHANGELOG.md
index cbd9fc2a..11ca692f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Additions
+- **`amd-smi dmon` is now available as an alias to `amd-smi monitor`**.
+
- **Added optional process table under `amd-smi monitor -q`**.
The monitor subcommand within the CLI Tool now has the `-q` option to enable an optional process table underneath the original monitored output.
@@ -40,6 +42,48 @@ Added `AMDSMI_EVT_NOTIF_RING_HANG` to the possible events in the `amdsmi_evt_not
### Optimizations
+- **Updated CLI error strings to specify invalid device type queried**
+
+```shell
+$ amd-smi static --asic --gpu 123123
+Can not find a device: GPU '123123' Error code: -3
+```
+
+- **Removed elevated permission requirements for `amdsmi_get_gpu_process_list()`**.
+Previously, if a process with elevated permissions was running, amd-smi required sudo to display all output. Now amd-smi populates all process data and returns N/A for elevated process names instead. However, if run with sudo, you will be able to see the name like so:
+
+```shell
+$ amd-smi process
+GPU: 0
+ PROCESS_INFO:
+ NAME: N/A
+ PID: 1693982
+ MEMORY_USAGE:
+ GTT_MEM: 0.0 B
+ CPU_MEM: 0.0 B
+ VRAM_MEM: 10.1 GB
+ MEM_USAGE: 0.0 B
+ USAGE:
+ GFX: 0 ns
+ ENC: 0 ns
+```
+
+```shell
+$ sudo amd-smi process
+GPU: 0
+ PROCESS_INFO:
+ NAME: TransferBench
+ PID: 1693982
+ MEMORY_USAGE:
+ GTT_MEM: 0.0 B
+ CPU_MEM: 0.0 B
+ VRAM_MEM: 10.1 GB
+ MEM_USAGE: 0.0 B
+ USAGE:
+ GFX: 0 ns
+ ENC: 0 ns
+```
+
- **Updated naming for `amdsmi_set_gpu_clear_sram_data()` to `amdsmi_clean_gpu_local_data()`**.
Changed the naming to be more accurate to what the function was doing. This change also extends to the CLI where we changed the `clear-sram-data` command to `clean_local_data`.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67cc63ce..882ab7a1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ find_program(GIT NAMES git)
## Setup the package version based on git tags.
set(PKG_VERSION_GIT_TAG_PREFIX "amdsmi_pkg_ver")
-get_package_version_number("24.6.1" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
+get_package_version_number("24.6.2" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
message("Package version: ${PKG_VERSION_STR}")
set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md
index f0dc69c3..12af1461 100644
--- a/amdsmi_cli/README.md
+++ b/amdsmi_cli/README.md
@@ -73,13 +73,13 @@ Type "help", "copyright", "credits" or "license" for more information.
## Usage
-amd-smi will report the version and current platform detected when running the command without arguments:
+AMD-SMI reports the version and current platform detected when running the command line interface (CLI) without arguments:
``` bash
~$ amd-smi
usage: amd-smi [-h] ...
-AMD System Management Interface | Version: 24.6.1.0 | ROCm version: 6.2.0 | Platform: Linux Baremetal
+AMD System Management Interface | Version: 24.6.2.0 | ROCm version: 6.2.0 | Platform: Linux Baremetal
options:
-h, --help show this help message and exit
@@ -97,7 +97,7 @@ AMD-SMI Commands:
topology Displays topology information of the devices
set Set options for devices
reset Reset options for devices
- monitor Monitor metrics for target devices
+ monitor (dmon) Monitor metrics for target devices
xgmi Displays xgmi information of the devices
```
@@ -594,7 +594,7 @@ Command Modifiers:
usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]]
[-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n]
- [-d] [-e] [-v] [-r]
+ [-d] [-e] [-v] [-r] [-q]
Monitor a target device for the specified arguments.
If no arguments are provided, all arguments will be enabled.
@@ -629,6 +629,7 @@ Monitor Arguments:
-e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts
-v, --vram-usage Monitor memory usage in MB
-r, --pcie Monitor PCIe bandwidth in Mb/s
+ -q, --process Enable Process information table below monitor output
Command Modifiers:
--json Displays output in JSON format (human readable by default).
diff --git a/amdsmi_cli/amdsmi_cli_exceptions.py b/amdsmi_cli/amdsmi_cli_exceptions.py
index c61e7d94..fe6cc79f 100644
--- a/amdsmi_cli/amdsmi_cli_exceptions.py
+++ b/amdsmi_cli/amdsmi_cli_exceptions.py
@@ -69,6 +69,7 @@ def __init__(self):
self.stdout_message = ''
self.message = ''
self.output_format = ''
+ self.device_type = ''
def __str__(self):
# Return message according to the current output format
@@ -83,7 +84,7 @@ def __str__(self):
class AmdSmiInvalidCommandException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -1
self.command = command
@@ -98,7 +99,7 @@ def __init__(self, command, outputformat):
class AmdSmiInvalidParameterException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -2
self.command = command
@@ -113,13 +114,22 @@ def __init__(self, command, outputformat):
class AmdSmiDeviceNotFoundException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str, gpu: bool, cpu: bool, core: bool):
super().__init__()
self.value = -3
self.command = command
self.output_format = outputformat
- common_message = f"Can not find a device with the corresponding identifier: '{self.command}'"
+ # Handle different devices
+ self.device_type = ""
+ if gpu:
+ self.device_type = "GPU"
+ elif cpu:
+ self.device_type = "CPU"
+ elif core:
+ self.device_type = "CPU CORE"
+
+ common_message = f"Can not find a device: {self.device_type} '{self.command}'"
self.json_message["error"] = common_message
self.json_message["code"] = self.value
@@ -128,7 +138,7 @@ def __init__(self, command, outputformat):
class AmdSmiInvalidFilePathException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -4
self.command = command
@@ -143,7 +153,7 @@ def __init__(self, command, outputformat):
class AmdSmiInvalidParameterValueException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -5
self.command = command
@@ -158,7 +168,7 @@ def __init__(self, command, outputformat):
class AmdSmiMissingParameterValueException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -6
self.command = command
@@ -172,8 +182,23 @@ def __init__(self, command, outputformat):
self.stdout_message = f"{common_message} Error code: {self.value}"
+class AmdSmiCommandNotSupportedException(AmdSmiException):
+ def __init__(self, command, outputformat: str):
+ super().__init__()
+ self.value = -7
+ self.command = command
+ self.output_format = outputformat
+
+ common_message = f"Command '{self.command}' is not supported on the system. Run '--help' for more info."
+
+ self.json_message["error"] = common_message
+ self.json_message["code"] = self.value
+ self.csv_message = f"error,code\n{common_message}, {self.value}"
+ self.stdout_message = f"{common_message} Error code: {self.value}"
+
+
class AmdSmiParameterNotSupportedException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -8
self.command = command
@@ -188,7 +213,7 @@ def __init__(self, command, outputformat):
class AmdSmiRequiredCommandException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -9
self.command = command
@@ -203,7 +228,7 @@ def __init__(self, command, outputformat):
class AmdSmiUnknownErrorException(AmdSmiException):
- def __init__(self, command, outputformat):
+ def __init__(self, command, outputformat: str):
super().__init__()
self.value = -100
self.command = command
@@ -218,7 +243,7 @@ def __init__(self, command, outputformat):
class AmdSmiAMDSMIErrorException(AmdSmiException):
- def __init__(self, outputformat, error_code):
+ def __init__(self, outputformat: str, error_code):
super().__init__()
self.value = -1000 - abs(error_code)
self.smilibcode = error_code
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
index 6a078ffb..16823c4b 100644
--- a/amdsmi_cli/amdsmi_commands.py
+++ b/amdsmi_cli/amdsmi_commands.py
@@ -115,9 +115,15 @@ def version(self, args):
self.logger.output['rocm_version'] = f'{rocm_version_str}'
if self.logger.is_human_readable_format():
- print(f'AMDSMI Tool: {__version__} | '\
- f'AMDSMI Library version: {amdsmi_lib_version_str} | ' \
- f'ROCm version: {rocm_version_str}')
+ human_readable_output = f"AMDSMI Tool: {__version__} | " \
+ f"AMDSMI Library version: {amdsmi_lib_version_str} | " \
+ f"ROCm version: {rocm_version_str}"
+ # Custom human readable handling for version
+ if self.logger.destination == 'stdout':
+ print(human_readable_output)
+ else:
+ with self.logger.destination.open('a') as output_file:
+ output_file.write(human_readable_output + '\n')
elif self.logger.is_json_format() or self.logger.is_csv_format():
self.logger.print_output()
@@ -2631,8 +2637,6 @@ def process(self, args, multiple_devices=False, watching_output=False,
try:
process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
- if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
- raise PermissionError('Command requires elevation') from e
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
raise e
diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py
index c0ffe5a8..b54d6698 100644
--- a/amdsmi_cli/amdsmi_logger.py
+++ b/amdsmi_cli/amdsmi_logger.py
@@ -465,7 +465,7 @@ def print_output(self, multiple_device_enabled=False, watching_output=False, tab
self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output)
else:
self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled,
- watching_output=watching_output)
+ watching_output=watching_output)
def _print_json_output(self, multiple_device_enabled=False, watching_output=False):
diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
index 10cebf4b..4d2984d6 100644
--- a/amdsmi_cli/amdsmi_parser.py
+++ b/amdsmi_cli/amdsmi_parser.py
@@ -111,6 +111,11 @@ def __init__(self, version, list, static, firmware, bad_pages, metric,
help="Descriptions:",
metavar='')
+ # Store possible subcommands & aliases for later errors
+ self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
+ 'metric', 'process', 'profile', 'event', 'topology', 'set',
+ 'reset', 'monitor', 'dmon', 'xgmi']
+
# Add all subparsers
self._add_version_parser(self.subparsers, version)
self._add_list_parser(self.subparsers, list)
@@ -257,7 +262,9 @@ def __call__(self, parser, args, values, option_string=None):
if selected_device_handles == '':
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--gpu", _GPUSelectAction.ouputformat)
else:
- raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles, _GPUSelectAction.ouputformat)
+ raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles,
+ _GPUSelectAction.ouputformat,
+ True, False, False)
return _GPUSelectAction
@@ -283,7 +290,8 @@ def __call__(self, parser, args, values, option_string=None):
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--cpu", _CPUSelectAction.ouputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles,
- _CPUSelectAction.ouputformat)
+ _CPUSelectAction.ouputformat,
+ False, True, False)
return _CPUSelectAction
@@ -308,7 +316,8 @@ def __call__(self, parser, args, values, option_string=None):
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException("--core", _CoreSelectAction.ouputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiDeviceNotFoundException(selected_device_handles,
- _CoreSelectAction.ouputformat)
+ _CoreSelectAction.ouputformat,
+ False, False, True)
return _CoreSelectAction
@@ -1129,7 +1138,7 @@ def _add_monitor_parser(self, subparsers, func):
process_help = "Enable Process information table below monitor output"
# Create monitor subparser
- monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help)
+ monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help, aliases=["dmon"])
monitor_parser._optionals.title = monitor_optionals_title
monitor_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog)
monitor_parser.set_defaults(func=func)
@@ -1232,6 +1241,9 @@ def error(self, message):
l = len("argument : invalid choice: ") + 1
message = message[l:]
message = message.split("'")[0]
+ # Check if the command is possible in other system configurations and error accordingly
+ if message in self.possible_commands:
+ raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(message, outputformat)
raise amdsmi_cli_exceptions.AmdSmiInvalidCommandException(message, outputformat)
elif "unrecognized arguments: " in message:
l = len("unrecognized arguments: ")
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 8f1b03c9..ed6ff05a 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -48,7 +48,7 @@ PROJECT_NAME = AMD SMI
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = "24.6.1.0"
+PROJECT_NUMBER = "24.6.2.0"
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/docs/how-to/using-AMD-SMI-CLI-tool.md b/docs/how-to/using-AMD-SMI-CLI-tool.md
index 6c22596f..4438c321 100644
--- a/docs/how-to/using-AMD-SMI-CLI-tool.md
+++ b/docs/how-to/using-AMD-SMI-CLI-tool.md
@@ -6,7 +6,7 @@ AMD-SMI reports the version and current platform detected when running the comma
~$ amd-smi
usage: amd-smi [-h] ...
-AMD System Management Interface | Version: 24.6.1.0 | ROCm version: 6.2.0 | Platform: Linux Baremetal
+AMD System Management Interface | Version: 24.6.2.0 | ROCm version: 6.2.0 | Platform: Linux Baremetal
options:
-h, --help show this help message and exit
@@ -521,7 +521,7 @@ Command Modifiers:
usage: amd-smi monitor [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]]
[-w INTERVAL] [-W TIME] [-i ITERATIONS] [-p] [-t] [-u] [-m] [-n]
- [-d] [-e] [-v] [-r]
+ [-d] [-e] [-v] [-r] [-q]
Monitor a target device for the specified arguments.
If no arguments are provided, all arguments will be enabled.
@@ -556,6 +556,7 @@ Monitor Arguments:
-e, --ecc Monitor ECC single bit, ECC double bit, and PCIe replay error counts
-v, --vram-usage Monitor memory usage in MB
-r, --pcie Monitor PCIe bandwidth in Mb/s
+ -q, --process Enable Process information table below monitor output
Command Modifiers:
--json Displays output in JSON format (human readable by default).
diff --git a/docs/how-to/using-amdsmi-for-python.md b/docs/how-to/using-amdsmi-for-python.md
index 18d4246f..5ab542f9 100644
--- a/docs/how-to/using-amdsmi-for-python.md
+++ b/docs/how-to/using-amdsmi-for-python.md
@@ -1,9 +1,7 @@
-
# AMD SMI Python Library
## Requirements
-
* Python 3.6+ 64-bit
* Driver must be loaded for amdsmi_init() to pass
@@ -11,7 +9,6 @@
### Folder structure
-
File Name | Note
---|---
`__init__.py` | Python package initialization file
@@ -20,7 +17,7 @@ File Name | Note
`amdsmi_exception.py` | Amdsmi exceptions python file
`README.md` | Documentation
-## Usage
+### Usage
`amdsmi` folder should be copied and placed next to importing script. It should be imported as:
@@ -45,17 +42,15 @@ To initialize amdsmi lib, amdsmi_init() must be called before all other calls to
To close connection to driver, amdsmi_shut_down() must be the last call.
-## Exceptions
+### Exceptions
All exceptions are in `amdsmi_exception.py` file.
Exceptions that can be thrown are:
* `AmdSmiException`: base amdsmi exception class
* `AmdSmiLibraryException`: derives base `AmdSmiException` class and represents errors that can occur in amdsmi-lib.
-
When this exception is thrown, `err_code` and `err_info` are set. `err_code` is an integer that corresponds to errors that can occur
in amdsmi-lib and `err_info` is a string that explains the error that occurred.
-
Example:
```python
@@ -124,7 +119,6 @@ except AmdSmiException as e:
### amdsmi_shut_down
-
Description: Finalize and close connection to driver
Input parameters: `None`
@@ -226,7 +220,6 @@ except AmdSmiException as e:
Description: Return socket name
Input parameters:
-
`socket_handle` socket handle
Output: Socket name
@@ -423,13 +416,13 @@ Input parameters:
Output: Dictionary with fields
-Field | Description
----|---
-`power_cap` | power capability
-`dpm_cap` | dynamic power management capability
-`default_power_cap` | default power capability
-`min_power_cap` | min power capability
-`max_power_cap` | max power capability
+Field | Description | Units
+---|---|---
+`power_cap` | power capability | uW
+`dpm_cap` | dynamic power management capability | MHz
+`default_power_cap` | default power capability | uW
+`min_power_cap` | min power capability | uW
+`max_power_cap` | max power capability | uW
Exceptions that can be thrown by `amdsmi_get_power_cap_info` function:
@@ -504,10 +497,9 @@ Input parameters:
* `processor_handle` device which to query
Output: List of Dictionaries containing cache information following the schema below:
-
Schema:
-```
+```JSON
{
cache_properties:
{
@@ -519,7 +511,6 @@ Schema:
max_num_cu_shared: {"type" : "number"},
num_cache_instance: {"type" : "number"}
}
-
```
Field | Description
@@ -688,8 +679,11 @@ Output: Dictionary with fields
Field | Description
---|---
+`current_socket_power` | current socket power
`average_socket_power` | average socket power
`gfx_voltage` | voltage gfx
+`soc_voltage` | voltage soc
+`mem_voltage` | voltage mem
`power_limit` | power limit
Exceptions that can be thrown by `amdsmi_get_power_info` function:
@@ -708,8 +702,11 @@ try:
else:
for device in devices:
power_measure = amdsmi_get_power_info(device)
+ print(power_measure['current_socket_power'])
print(power_measure['average_socket_power'])
print(power_measure['gfx_voltage'])
+ print(power_measure['soc_voltage'])
+ print(power_measure['mem_voltage'])
print(power_measure['power_limit'])
except AmdSmiException as e:
print(e)
@@ -780,9 +777,11 @@ Output: Dictionary with fields
Field | Description
---|---
-`cur_clk` | Current clock for given clock type
-`max_clk` | Maximum clock for given clock type
+`clk` | Current clock for given clock type
`min_clk` | Minimum clock for given clock type
+`max_clk` | Maximum clock for given clock type
+`clk_locked` | flag only supported on GFX clock domain
+`clk_deep_sleep` | clock deep sleep mode flag
Exceptions that can be thrown by `amdsmi_get_clock_info` function:
@@ -800,9 +799,11 @@ try:
else:
for device in devices:
clock_measure = amdsmi_get_clock_info(device, AmdSmiClkType.GFX)
- print(clock_measure['cur_clk'])
+ print(clock_measure['clk'])
print(clock_measure['min_clk'])
print(clock_measure['max_clk'])
+ print(clock_measure['clk_locked'])
+ print(clock_measure['clk_deep_sleep'])
except AmdSmiException as e:
print(e)
```
@@ -854,7 +855,7 @@ Input parameters:
* `processor_handle` device which to query
-Output: List consisting of dictionaries with fields for each bad page found
+Output: List consisting of dictionaries with fields for each bad page found; can be an empty list
Field | Description
---|---
@@ -879,7 +880,7 @@ try:
else:
for device in devices:
bad_page_info = amdsmi_get_gpu_bad_page_info(device)
- if not len(bad_page_info):
+ if not bad_page_info: # Can be empty list
print("No bad pages found")
continue
for bad_page in bad_page_info:
@@ -891,9 +892,56 @@ except AmdSmiException as e:
print(e)
```
+### amdsmi_get_gpu_memory_reserved_pages
+
+Description: Returns reserved memory page info for the given GPU.
+It is not supported on virtual machine guest
+
+Input parameters:
+
+* `processor_handle` device which to query
+
+Output: List consisting of dictionaries with fields for each reserved memory page found; can be an empty list
+
+Field | Description
+---|---
+`value` | Value of memory reserved page
+`page_address` | Address of memory reserved page
+`page_size` | Size of memory reserved page
+`status` | Status of memory reserved page
+
+Exceptions that can be thrown by `amdsmi_get_gpu_memory_reserved_pages` function:
+
+* `AmdSmiLibraryException`
+* `AmdSmiRetryException`
+* `AmdSmiParameterException`
+
+Example:
+
+```python
+try:
+ devices = amdsmi_get_processor_handles()
+ if len(devices) == 0:
+ print("No GPUs on machine")
+ else:
+ for device in devices:
+ reserved_memory_page_info = amdsmi_get_gpu_memory_reserved_pages(device)
+ if not reserved_memory_page_info: # Can be empty list
+ print("No memory reserved pages found")
+ continue
+ for reserved_memory_page in reserved_memory_page_info:
+ print(reserved_memory_page["value"])
+ print(reserved_memory_page["page_address"])
+ print(reserved_memory_page["page_size"])
+ print(reserved_memory_page["status"])
+except AmdSmiException as e:
+ print(e)
+```
+
+
### amdsmi_get_gpu_process_list
-Description: Returns the list of processes running on the target GPU; May require root level access
+Description: Returns the list of processes running on the target GPU. Root-level access is required to display the names of elevated (root) processes; without it, those names are returned as "N/A"
Input parameters:
@@ -903,7 +951,7 @@ Output: List of Dictionaries with the corresponding fields; empty list if no run
Field | Description
---|---
-`name` | Name of process
+`name` | Name of process. If user does not have permission this will be "N/A"
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` |
Subfield | Description |
`gfx` | GFX engine usage in ns |
`enc` | Encode engine usage in ns |
@@ -1109,8 +1157,9 @@ Event Type | Description
---|------
`VMFAULT` | VM page fault
`THERMAL_THROTTLE` | thermal throttle
-`GPU_PRE_RESET` | gpu pre reset
+`GPU_PRE_RESET` | gpu pre reset
`GPU_POST_RESET` | gpu post reset
+`RING_HANG` | ring hang event
#### read
@@ -1187,7 +1236,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_set_gpu_pci_bandwidth(device, 0)
+ amdsmi_set_gpu_pci_bandwidth(device, 0)
except AmdSmiException as e:
print(e)
```
@@ -1547,8 +1596,12 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- memory = amdsmi_get_gpu_memory_total(device)
- print(memory)
+ vram_memory_total = amdsmi_get_gpu_memory_total(device, amdsmi_interface.AmdSmiMemoryType.VRAM)
+ print(vram_memory_total)
+ vis_vram_memory_total = amdsmi_get_gpu_memory_total(device, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
+ print(vis_vram_memory_total)
+ gtt_memory_total = amdsmi_get_gpu_memory_total(device, amdsmi_interface.AmdSmiMemoryType.GTT)
+ print(gtt_memory_total)
except AmdSmiException as e:
print(e)
```
@@ -1583,7 +1636,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_set_gpu_od_clk_info(
+ amdsmi_set_gpu_od_clk_info(
device,
AmdSmiFreqInd.AMDSMI_FREQ_IND_MAX,
1000,
@@ -1619,8 +1672,12 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- memory = amdsmi_get_gpu_memory_usage(device)
- print(memory)
+ vram_memory_usage = amdsmi_get_gpu_memory_usage(device, amdsmi_interface.AmdSmiMemoryType.VRAM)
+ print(vram_memory_usage)
+ vis_vram_memory_usage = amdsmi_get_gpu_memory_usage(device, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM)
+ print(vis_vram_memory_usage)
+ gtt_memory_usage = amdsmi_get_gpu_memory_usage(device, amdsmi_interface.AmdSmiMemoryType.GTT)
+ print(gtt_memory_usage)
except AmdSmiException as e:
print(e)
```
@@ -1654,7 +1711,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_set_gpu_od_volt_info(device, 1, 1000, 980)
+ amdsmi_set_gpu_od_volt_info(device, 1, 1000, 980)
except AmdSmiException as e:
print(e)
```
@@ -2036,7 +2093,7 @@ except AmdSmiException as e:
```
### amdsmi_clean_gpu_local_data
-Description: Clear the local data of the given device. This can be called between user logins to prevent information leak.
+Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leak.
Input parameters:
@@ -2130,15 +2187,16 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_get_clk_freq(device, AmdSmiClkType.SYS)
+ amdsmi_get_clk_freq(device, AmdSmiClkType.SYS)
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_od_volt_info
-Description: This function retrieves the voltage/frequency curve information
-It is not supported on virtual machine guest
+Description: This function retrieves the voltage/frequency curve information.
+If the num_regions is 0 then the voltage curve is not supported.
+It is not supported on virtual machine guest.
Input parameters:
@@ -2152,8 +2210,8 @@ Field | Description
`curr_mclk_range` | Subfield | Description |
---|
`lower_bound` | lower bound mclk range |
`upper_bound` | upper bound mclk range |
`sclk_freq_limits` | Subfield | Description |
---|
`lower_bound` | lower bound sclk range limt |
`upper_bound` | upper bound sclk range limit |
`mclk_freq_limits` | Subfield | Description |
---|
`lower_bound` | lower bound mclk range limit |
`upper_bound` | upper bound mclk range limit |
-`curve.vc_points` | The number of supported frequencies
-`num_regions` | The current frequency index
+`curve.vc_points` | List of voltage curve points
+`num_regions` | The number of voltage curve regions
Exceptions that can be thrown by `amdsmi_get_gpu_od_volt_info` function:
@@ -2170,7 +2228,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_get_gpu_od_volt_info(dev)
+ amdsmi_get_gpu_od_volt_info(device)
except AmdSmiException as e:
print(e)
```
@@ -2214,7 +2272,7 @@ Output: Dictionary with fields
`current_dclk0` | Current dclk0 | MHz
`current_vclk1` | Current vclk1 | MHz
`current_dclk1` | Current dclk1 | MHz
-`throttle_status` | Current throttle status | MHz
+`throttle_status` | Current throttle status | bool
`current_fan_speed` | Current fan speed | RPM
`pcie_link_width` | PCIe link width (number of lanes) | lanes
`pcie_link_speed` | PCIe link speed in 0.1 GT/s (Giga Transfers per second) | GT/s
@@ -2262,7 +2320,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_get_gpu_metrics_info(dev)
+ amdsmi_get_gpu_metrics_info(device)
except AmdSmiException as e:
print(e)
```
@@ -2299,7 +2357,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_get_gpu_od_volt_curve_regions(device, 3)
+ amdsmi_get_gpu_od_volt_curve_regions(device, 3)
except AmdSmiException as e:
print(e)
```
@@ -2337,7 +2395,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_get_gpu_power_profile_presets(device, 0)
+ amdsmi_get_gpu_power_profile_presets(device, 0)
except AmdSmiException as e:
print(e)
```
@@ -2566,7 +2624,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_set_gpu_perf_level(device, AmdSmiDevPerfLevel.STABLE_PEAK)
+ amdsmi_set_gpu_perf_level(device, AmdSmiDevPerfLevel.STABLE_PEAK)
except AmdSmiException as e:
print(e)
```
@@ -2869,7 +2927,7 @@ try:
print("No GPUs on machine")
else:
for device in devices:
- amdsmi_set_gpu_overdrive_level(device, 0)
+ amdsmi_set_gpu_overdrive_level(device, 0)
except AmdSmiException as e:
print(e)
```
@@ -3330,13 +3388,8 @@ Example:
```python
try:
- devices = amdsmi_get_processor_handles()
- if len(devices) == 0:
- print("No GPUs on machine")
- else:
- for device in devices:
- version = amdsmi_get_lib_version()
- print(version)
+ version = amdsmi_get_lib_version()
+ print(version)
except AmdSmiException as e:
print(e)
```
@@ -3748,6 +3801,7 @@ except AmdSmiException as e:
### amdsmi_get_processor_info
**Note: CURRENTLY HARDCODED TO RETURN EMPTY VALUES**
+
Description: Return processor name
Input parameters:
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 33ac03e1..2ab20a5b 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -154,7 +154,7 @@ typedef enum {
#define AMDSMI_LIB_VERSION_MAJOR 6
//! Minor version should be updated for each API change, but without changing headers
-#define AMDSMI_LIB_VERSION_MINOR 1
+#define AMDSMI_LIB_VERSION_MINOR 2
//! Release version should be set to 0 as default and can be updated by the PMs for each CSP point release
#define AMDSMI_LIB_VERSION_RELEASE 0
@@ -4830,10 +4830,6 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_
* Holding a copy of max_process before it is passed in will be helpful for monitoring
* the allocations done upon each call since the max_process will permanently be changed
* to reflect the actual number of processes running.
- * Note: For the specific cases where the return status is AMDSMI_STATUS_NO_PERM only.
- * The list of process and size are AMDSMI_STATUS_SUCCESS, however there are
- * processes details not fully retrieved due to permissions.
- *
*
* @param[out] list Reference to a user-provided buffer where the process
* list will be returned. This buffer must contain at least
@@ -4841,17 +4837,16 @@ amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_
* by user.
*
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success,
- * | ::AMDSMI_STATUS_NO_PERM on success, but not all details from process retrieved,
* | ::AMDSMI_STATUS_OUT_OF_RESOURCES, filled list buffer with data, but number of
* actual running processes is larger than the size provided.
*
*/
- // Note: If the reserved size for processes is smaller than the number of
- // actual processes running. The AMDSMI_STATUS_OUT_OF_RESOURCES is
- // an indication the caller should handle the situation (resize).
- // The max_processes is always changed to reflect the actual size of
- // list of processes running, so the caller knows where it is at.
- //
+ // Note: If the reserved size for processes is smaller than the number of
+ // actual processes running, AMDSMI_STATUS_OUT_OF_RESOURCES is
+ // an indication the caller should handle the situation (resize).
+ // The max_processes is always changed to reflect the actual size of
+ // list of processes running, so the caller knows where it is at.
+ //
amdsmi_status_t
amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list);
@@ -4923,6 +4918,17 @@ amdsmi_status_t amdsmi_get_cpu_socket_energy(amdsmi_processor_handle processor_h
* @{
*/
+/**
+ * @brief Get Number of threads Per Core.
+ *
+ * @platform{cpu_bm}
+ *
+ * @param[in,out] threads_per_core - Input buffer to return the Number of threads Per Core
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_get_threads_per_core(uint32_t *threads_per_core);
+
/**
* @brief Get SMU Firmware Version.
*
diff --git a/py-interface/README.md b/py-interface/README.md
index fdfdbefe..5ab542f9 100644
--- a/py-interface/README.md
+++ b/py-interface/README.md
@@ -2,12 +2,12 @@
## Requirements
-* python 3.6+ 64-bit
-* driver must be loaded for amdsmi_init() to pass
+* Python 3.6+ 64-bit
+* Driver must be loaded for amdsmi_init() to pass
## Overview
-## Folder structure
+### Folder structure
File Name | Note
---|---
@@ -17,7 +17,7 @@ File Name | Note
`amdsmi_exception.py` | Amdsmi exceptions python file
`README.md` | Documentation
-## Usage
+### Usage
`amdsmi` folder should be copied and placed next to importing script. It should be imported as:
@@ -42,7 +42,7 @@ To initialize amdsmi lib, amdsmi_init() must be called before all other calls to
To close connection to driver, amdsmi_shut_down() must be the last call.
-## Exceptions
+### Exceptions
All exceptions are in `amdsmi_exception.py` file.
Exceptions that can be thrown are:
@@ -192,6 +192,7 @@ except AmdSmiException as e:
### amdsmi_get_socket_handles
**Note: CURRENTLY HARDCODED TO RETURN DUMMY DATA**
+
Description: Returns list of socket device handle objects on current machine
Input parameters: `None`
@@ -215,6 +216,7 @@ except AmdSmiException as e:
### amdsmi_get_socket_info
**Note: CURRENTLY HARDCODED TO RETURN EMPTY VALUES**
+
Description: Return socket name
Input parameters:
@@ -939,7 +941,7 @@ except AmdSmiException as e:
### amdsmi_get_gpu_process_list
-Description: Returns the list of processes running on the target GPU; May require root level access
+Description: Returns the list of processes running on the target GPU. Root-level access is required to display the names of processes running with elevated permissions; without it, those names are returned as "N/A"
Input parameters:
@@ -949,7 +951,7 @@ Output: List of Dictionaries with the corresponding fields; empty list if no run
Field | Description
---|---
-`name` | Name of process
+`name` | Name of process; "N/A" if the user does not have permission to read it
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` | Subfield | Description |
`gfx` | GFX engine usage in ns |
`enc` | Encode engine usage in ns |
@@ -3799,6 +3801,7 @@ except AmdSmiException as e:
### amdsmi_get_processor_info
**Note: CURRENTLY HARDCODED TO RETURN EMPTY VALUES**
+
Description: Return processor name
Input parameters:
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py
index 928fc1b4..391a7422 100644
--- a/py-interface/amdsmi_interface.py
+++ b/py-interface/amdsmi_interface.py
@@ -1991,8 +1991,7 @@ def amdsmi_get_gpu_process_list(
# This will get populated with the number of processes found
max_processes = ctypes.c_uint32(MAX_NUM_PROCESSES)
- process_list = (amdsmi_wrapper.amdsmi_proc_info_t *
- max_processes.value)()
+ process_list = (amdsmi_wrapper.amdsmi_proc_info_t * max_processes.value)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_process_list(
processor_handle, ctypes.byref(max_processes), process_list
@@ -2001,8 +2000,11 @@ def amdsmi_get_gpu_process_list(
result = []
for index in range(max_processes.value):
+ process_name = process_list[index].name.decode("utf-8").strip()
+ if process_name == "":
+ process_name = "N/A"
result.append({
- "name": process_list[index].name.decode("utf-8"),
+ "name": process_name,
"pid": process_list[index].pid,
"mem": process_list[index].mem,
"engine_usage": {
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index 570dc741..870512cc 100644
--- a/py-interface/amdsmi_wrapper.py
+++ b/py-interface/amdsmi_wrapper.py
@@ -2259,6 +2259,9 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure):
amdsmi_get_cpu_socket_energy = _libraries['libamd_smi.so'].amdsmi_get_cpu_socket_energy
amdsmi_get_cpu_socket_energy.restype = amdsmi_status_t
amdsmi_get_cpu_socket_energy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint64)]
+amdsmi_get_threads_per_core = _libraries['libamd_smi.so'].amdsmi_get_threads_per_core
+amdsmi_get_threads_per_core.restype = amdsmi_status_t
+amdsmi_get_threads_per_core.argtypes = [ctypes.POINTER(ctypes.c_uint32)]
amdsmi_get_cpu_smu_fw_version = _libraries['libamd_smi.so'].amdsmi_get_cpu_smu_fw_version
amdsmi_get_cpu_smu_fw_version.restype = amdsmi_status_t
amdsmi_get_cpu_smu_fw_version.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_smu_fw_version_t)]
@@ -2668,9 +2671,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure):
'amdsmi_get_processor_info', 'amdsmi_get_processor_type',
'amdsmi_get_soc_pstate', 'amdsmi_get_socket_handles',
'amdsmi_get_socket_info', 'amdsmi_get_temp_metric',
- 'amdsmi_get_utilization_count', 'amdsmi_get_xgmi_info',
- 'amdsmi_get_xgmi_plpd', 'amdsmi_gpu_block_t',
- 'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
+ 'amdsmi_get_threads_per_core', 'amdsmi_get_utilization_count',
+ 'amdsmi_get_xgmi_info', 'amdsmi_get_xgmi_plpd',
+ 'amdsmi_gpu_block_t', 'amdsmi_gpu_cache_info_t',
+ 'amdsmi_gpu_control_counter',
'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
'amdsmi_gpu_read_counter', 'amdsmi_gpu_xgmi_error_status',
diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h
index 00b553d2..426a9ad0 100755
--- a/rocm_smi/include/rocm_smi/rocm_smi_device.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h
@@ -261,6 +261,7 @@ class Device {
AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics();
static const std::map devInfoTypesStrings;
+ static const char* get_type_string(DevInfoTypes type);
private:
std::shared_ptr monitor_;
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index 28131249..12e26959 100755
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -83,7 +83,6 @@ using amd::smi::monitorTypesToString;
using amd::smi::getRSMIStatusString;
using amd::smi::AMDGpuMetricsUnitType_t;
using amd::smi::AMDGpuMetricTypeId_t;
-auto &devInfoTypesStrings = amd::smi::Device::devInfoTypesStrings;
static const uint32_t kMaxOverdriveLevel = 20;
static const float kEnergyCounterResolution = 15.3F;
@@ -3849,7 +3848,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
ss << __PRETTY_FUNCTION__
<< " | inside success fallback... "
<< " | Device #: " << std::to_string(dv_ind)
- << " | Type = " << devInfoTypesStrings.at(mem_type_file)
+ << " | Type = " << amd::smi::Device::get_type_string(mem_type_file)
<< " | Data: total = " << std::to_string(*total)
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
LOG_DEBUG(ss);
@@ -3860,7 +3859,7 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
ss << __PRETTY_FUNCTION__
<< " | after fallback... "
<< " | Device #: " << std::to_string(dv_ind)
- << " | Type = " << devInfoTypesStrings.at(mem_type_file)
+ << " | Type = " << amd::smi::Device::get_type_string(mem_type_file)
<< " | Data: total = " << std::to_string(*total)
<< " | ret = " << getRSMIStatusString(ret);
LOG_DEBUG(ss);
@@ -3929,7 +3928,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
ss << __PRETTY_FUNCTION__
<< " no fallback needed! - "
<< " | Device #: " << std::to_string(dv_ind)
- << " | Type = " << devInfoTypesStrings.at(mem_type_file)
+ << " | Type = " << amd::smi::Device::get_type_string(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | Data: total = " << std::to_string(total)
<< " | ret = " << getRSMIStatusString(ret);
@@ -3940,7 +3939,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
ss << __PRETTY_FUNCTION__
<< " | in fallback == success ..."
<< " | Device #: " << std::to_string(dv_ind)
- << " | Type = " << devInfoTypesStrings.at(mem_type_file)
+ << " | Type = " << amd::smi::Device::get_type_string(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | Data: total = " << std::to_string(total)
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
@@ -3951,7 +3950,7 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
ss << __PRETTY_FUNCTION__
<< " | at end!!!! after fallback ..."
<< " | Device #: " << std::to_string(dv_ind)
- << " | Type = " << devInfoTypesStrings.at(mem_type_file)
+ << " | Type = " << amd::smi::Device::get_type_string(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | ret = " << getRSMIStatusString(ret);
LOG_DEBUG(ss);
@@ -5234,7 +5233,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Cause: len was 0 or compute_partition variable was null"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
@@ -5253,7 +5252,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Cause: could not retrieve current compute partition"
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5270,7 +5269,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Cause: requested size was insufficient"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |";
@@ -5282,7 +5281,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << compute_partition
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5342,7 +5341,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << newComputePartitionStr
<< " | Cause: requested setting was invalid"
<< " | Returning = "
@@ -5361,7 +5360,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << newComputePartitionStr
<< " | Cause: not an available compute partition setting"
<< " | Returning = "
@@ -5381,7 +5380,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Cause: could retrieve current compute partition or retrieved"
<< " unexpected data"
<< " | Returning = "
@@ -5397,7 +5396,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< " | Success - compute partition was already set at requested value"
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << newComputePartitionStr
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
@@ -5423,7 +5422,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << newComputePartitionStr
<< " | Returning = "
<< getRSMIStatusString(returnResponse) << " |";
@@ -5495,7 +5494,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: device board name does not support this action"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
@@ -5516,7 +5515,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: requested setting was invalid"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
@@ -5537,7 +5536,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: could retrieve current memory partition or retrieved"
<< " unexpected data"
<< " | Returning = "
@@ -5554,7 +5553,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " setting"
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Data: " << newMemoryPartition
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
@@ -5576,7 +5575,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: issue writing reqested setting of " + newMemoryPartition
<< " | Returning = "
<< getRSMIStatusString(err) << " |";
@@ -5590,7 +5589,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Success - if restart completed successfully"
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Data: " << newMemoryPartition
<< " | Returning = "
<< getRSMIStatusString(restartRet) << " |";
@@ -5612,7 +5611,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: user sent invalid arguments, len = 0 or memory partition"
<< " was a null ptr"
<< " | Returning = "
@@ -5632,7 +5631,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: could not successfully retrieve current memory partition "
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5650,7 +5649,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: could not successfully retrieve current memory partition "
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5662,7 +5661,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Data: " << memory_partition
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5701,7 +5700,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
<< " | Success - if original boot state was not unknown or valid setting"
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevComputePartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevComputePartition)
<< " | Data: " << bootState
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
@@ -5740,7 +5739,7 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
<< " | Success - if original boot state was not unknown or valid setting"
<< " | Device #: " << dv_ind
<< " | Type: "
- << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
+ << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Data: " << bootState
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc
index 5eafc455..e0ebe8a0 100755
--- a/rocm_smi/src/rocm_smi_device.cc
+++ b/rocm_smi/src/rocm_smi_device.cc
@@ -746,7 +746,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
if (ret != 0) {
ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file ("
<< sysfs_path
- << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type)
+ << ") for DevInfoInfoType (" << get_type_string(type)
<< "), returning " << std::to_string(ret);
LOG_ERROR(ss);
return ret;
@@ -755,7 +755,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
ss << __PRETTY_FUNCTION__
<< " | Issue: File is not a regular file - SYSFS file ("
<< sysfs_path << ") for "
- << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << "),"
+ << "DevInfoInfoType (" << get_type_string(type) << "),"
<< " returning ENOENT (" << std::strerror(ENOENT) << ")";
LOG_ERROR(ss);
return ENOENT;
@@ -766,7 +766,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
if (!fs->is_open()) {
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for "
- << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), "
+ << "DevInfoInfoType (" << get_type_string(type) << "), "
<< ", returning " << std::to_string(errno) << " ("
<< std::strerror(errno) << ")";
LOG_ERROR(ss);
@@ -775,7 +775,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file ("
<< sysfs_path
- << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type)
+ << ") for DevInfoInfoType (" << get_type_string(type)
<< ")";
LOG_INFO(ss);
return 0;
@@ -792,7 +792,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
ret = openDebugFileStream(type, &fs);
if (ret != 0) {
ss << "Could not read debugInfoStr for DevInfoType ("
- << devInfoTypesStrings.at(type)<< "), returning "
+ << get_type_string(type)<< "), returning "
<< std::to_string(ret);
LOG_ERROR(ss);
return ret;
@@ -806,7 +806,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
fs.close();
ss << "Successfully read debugInfoStr for DevInfoType ("
- << devInfoTypesStrings.at(type)<< "), retString= " << *retStr;
+ << get_type_string(type)<< "), retString= " << *retStr;
LOG_INFO(ss);
return 0;
@@ -822,7 +822,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
ret = openSysfsFileStream(type, &fs);
if (ret != 0) {
ss << "Could not read device info string for DevInfoType ("
- << devInfoTypesStrings.at(type) << "), returning "
+ << get_type_string(type) << "), returning "
<< std::to_string(ret);
LOG_ERROR(ss);
return ret;
@@ -832,7 +832,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
fs.close();
ss << __PRETTY_FUNCTION__
<< "Successfully read device info string for DevInfoType (" <<
- devInfoTypesStrings.at(type) << "): " + *retStr
+ get_type_string(type) << "): " + *retStr
<< " | "
<< (fs.is_open() ? " File stream is opened" : " File stream is closed")
<< " | " << (fs.bad() ? "[ERROR] Bad read operation" :
@@ -867,7 +867,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
fs.close();
ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; "
<< "Could not write device info string (" << valStr
- << ") for DevInfoType (" << devInfoTypesStrings.at(type)
+ << ") for DevInfoType (" << get_type_string(type)
<< "), returning " << std::to_string(ret);
LOG_ERROR(ss);
return ret;
@@ -878,7 +878,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
fs.flush();
fs.close();
ss << "Successfully wrote device info string (" << valStr
- << ") for DevInfoType (" << devInfoTypesStrings.at(type)
+ << ") for DevInfoType (" << get_type_string(type)
<< "), returning RSMI_STATUS_SUCCESS";
LOG_INFO(ss);
ret = RSMI_STATUS_SUCCESS;
@@ -892,7 +892,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
fs.close();
ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; "
<< "Could not write device info string (" << valStr
- << ") for DevInfoType (" << devInfoTypesStrings.at(type)
+ << ") for DevInfoType (" << get_type_string(type)
<< "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret));
ss << " | "
<< (fs.is_open() ? "[ERROR] File stream open" :
@@ -983,20 +983,29 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) {
ret = openSysfsFileStream(type, &fs);
if (ret != 0) {
ss << "Could not read DevInfoLine for DevInfoType ("
- << devInfoTypesStrings.at(type) << ")";
+ << get_type_string(type) << ")";
LOG_ERROR(ss);
return ret;
}
std::getline(fs, *line);
ss << "Successfully read DevInfoLine for DevInfoType ("
- << devInfoTypesStrings.at(type) << "), returning *line = "
+ << get_type_string(type) << "), returning *line = "
<< *line;
LOG_INFO(ss);
return 0;
}
+// Looks up the display string registered for `type` in devInfoTypesStrings.
+// Returns "Unknown" when the map has no entry for that value.
+const char* Device::get_type_string(DevInfoTypes type) {
+  const auto entry = devInfoTypesStrings.find(type);
+  if (entry == devInfoTypesStrings.end()) {
+    return "Unknown";
+  }
+  return entry->second;
+}
int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
void *p_binary_data) {
auto sysfs_path = path_;
@@ -1009,7 +1018,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
ptr = fopen(sysfs_path.c_str(), "rb");
if (!ptr) {
ss << "Could not read DevInfoBinary for DevInfoType ("
- << devInfoTypesStrings.at(type) << ")"
+ << get_type_string(type) << ")"
<< " - SYSFS (" << sysfs_path << ")"
<< ", returning " << std::to_string(errno) << " ("
<< std::strerror(errno) << ")";
@@ -1021,7 +1030,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
fclose(ptr);
if ((num*b_size) != b_size) {
ss << "Could not read DevInfoBinary for DevInfoType ("
- << devInfoTypesStrings.at(type) << ") - SYSFS ("
+ << get_type_string(type) << ") - SYSFS ("
<< sysfs_path << "), binary size error; "
<< "[buff: "
<< p_binary_data
@@ -1035,7 +1044,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
return ENOENT;
}
ss << "Successfully read DevInfoBinary for DevInfoType ("
- << devInfoTypesStrings.at(type) << ") - SYSFS ("
+ << get_type_string(type) << ") - SYSFS ("
<< sysfs_path << "), returning binaryData = " << p_binary_data
<< "; byte_size = " << std::dec << static_cast(b_size);
@@ -1067,7 +1076,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
if (retVec->empty()) {
ss << "Read devInfoMultiLineStr for DevInfoType ("
- << devInfoTypesStrings.at(type) << ")"
+ << get_type_string(type) << ")"
<< ", but contained no string lines";
LOG_ERROR(ss);
return ENXIO;
@@ -1085,12 +1094,12 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
if (!allLines.empty()) {
ss << "Successfully read devInfoMultiLineStr for DevInfoType ("
- << devInfoTypesStrings.at(type) << ") "
+ << get_type_string(type) << ") "
<< ", returning lines read = " << allLines;
LOG_INFO(ss);
} else {
ss << "Read devInfoMultiLineStr for DevInfoType ("
- << devInfoTypesStrings.at(type) << ")"
+ << get_type_string(type) << ")"
<< ", but lines were empty";
LOG_INFO(ss);
return ENXIO;
diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc
index 3b27d6ae..4c6b0190 100755
--- a/rocm_smi/src/rocm_smi_main.cc
+++ b/rocm_smi/src/rocm_smi_main.cc
@@ -560,7 +560,7 @@ std::string RocmSMI::getRSMIEnvVarInfo(void) {
for (auto it=env_vars_.enum_overrides.begin();
it != env_vars_.enum_overrides.end(); ++it) {
DevInfoTypes type = static_cast(*it);
- ss << (std::to_string(*it) + " (" + Device::devInfoTypesStrings.at(type) + ")");
+ ss << (std::to_string(*it) + " (" + Device::get_type_string(type) + ")");
auto temp_it = it;
if(++temp_it != env_vars_.enum_overrides.end()) {
ss << ", ";
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 79c82d52..0ae21147 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -1889,15 +1889,9 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
const auto max_processes_original_size(*max_processes);
auto idx = uint32_t(0);
- auto is_required_previlegies_required(false);
for (auto& process : compute_process_list) {
if (idx < *max_processes) {
list[idx++] = static_cast(process.second);
- // Note: If we could not read the process info for an existing process,
- // that is likely a permission error.
- if (!is_required_previlegies_required && std::string(process.second.name).empty()) {
- is_required_previlegies_required = true;
- }
} else {
break;
}
@@ -1910,11 +1904,9 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
// list of processes running, so the caller knows where it is at.
// Holding a copy of max_process before it is passed in will be helpful
// for the caller.
- status_code = is_required_previlegies_required
- ? amdsmi_status_t::AMDSMI_STATUS_NO_PERM : AMDSMI_STATUS_SUCCESS;
*max_processes = static_cast(compute_process_list.size());
return (max_processes_original_size >= static_cast(compute_process_list.size()))
- ? status_code : amdsmi_status_t::AMDSMI_STATUS_OUT_OF_RESOURCES;
+ ? AMDSMI_STATUS_SUCCESS : amdsmi_status_t::AMDSMI_STATUS_OUT_OF_RESOURCES;
}
amdsmi_status_t
@@ -2231,6 +2223,22 @@ static amdsmi_status_t amdsmi_errno_to_esmi_status(amdsmi_status_t status)
return AMDSMI_STATUS_SUCCESS;
}
+// Queries esmi_threads_per_core_get() and stores the result in
+// *threads_per_core.
+//
+// Returns AMDSMI_STATUS_INVAL when threads_per_core is null, the status
+// mapped through amdsmi_errno_to_esmi_status() when the query fails, and
+// AMDSMI_STATUS_SUCCESS otherwise. *threads_per_core is written only on
+// success.
+amdsmi_status_t amdsmi_get_threads_per_core(uint32_t *threads_per_core)
+{
+    AMDSMI_CHECK_INIT();
+
+    // Reject a null output pointer up front instead of dereferencing it below.
+    if (threads_per_core == nullptr)
+        return AMDSMI_STATUS_INVAL;
+
+    uint32_t esmi_threads_per_core = 0;
+    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
+        esmi_threads_per_core_get(&esmi_threads_per_core));
+    if (status != AMDSMI_STATUS_SUCCESS)
+        return amdsmi_errno_to_esmi_status(status);
+
+    *threads_per_core = esmi_threads_per_core;
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
amdsmi_status_t amdsmi_get_cpu_hsmp_proto_ver(amdsmi_processor_handle processor_handle,
uint32_t *proto_ver)
{
diff --git a/tests/amd_smi_test/amdsmitst.exclude b/tests/amd_smi_test/amdsmitst.exclude
index 8f955bae..c49791c1 100644
--- a/tests/amd_smi_test/amdsmitst.exclude
+++ b/tests/amd_smi_test/amdsmitst.exclude
@@ -56,18 +56,6 @@ FILTER[sienna_cichlid]=\
$BLACKLIST_ALL_ASICS\
"amdsmitstReadWrite.TestPerfLevelReadWrite"
-# SWDEV-391407
-# aqua_vanjaram and later systems show 'ip discovery' in
-# /sys/class/kfd/kfd/topology/nodes/*/name
-#
-# For those systems gfx_target_version must be used. It can be found in
-# /sys/class/kfd/kfd/topology/nodes/*/properties
-FILTER[90400]=\
-$BLACKLIST_ALL_ASICS\
-# "amdsmitstReadOnly.TestVoltCurvRead"
-FILTER[90401]=${FILTER[90400]}
-FILTER[90402]=${FILTER[90400]}
-
# SWDEV-321166
FILTER[virtualization]=\
$BLACKLIST_ALL_ASICS\
@@ -77,3 +65,14 @@ $BLACKLIST_ALL_ASICS\
"amdsmitstReadWrite.TestOverdriveReadWrite:"\
"amdsmitstReadWrite.TestPowerReadWrite:"\
"amdsmitstReadWrite.TestPowerCapReadWrite"
+
+# aqua_vanjaram and later systems show 'ip discovery' in
+# /sys/class/kfd/kfd/topology/nodes/*/name
+#
+# For those systems gfx_target_version must be used. It can be found in
+# /sys/class/kfd/kfd/topology/nodes/*/properties
+#
+# ex.
+# FILTER[90400]=\
+# $BLACKLIST_ALL_ASICS\
+# "amdsmitstReadOnly.TestVoltCurvRead"