Skip to content

Commit

Permalink
Merge amd-master into release/rocm-rel-6.2 20240719
Browse files Browse the repository at this point in the history
Signed-off-by: Maisam Arif <[email protected]>
Change-Id: Ic02d5721c3970ce237515a5455f7941af2229172
  • Loading branch information
Maisam Arif authored and Maisam Arif committed Jul 19, 2024
2 parents 32e3fda + 23fc9e4 commit 2b02a07
Show file tree
Hide file tree
Showing 13 changed files with 211 additions and 75 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ ASIC products. This requires users to update any ABIs using this structure.

### Fixes

- **Fixed Leftover Mutex deadlock when running multiple instances of the CLI tool**.
When running `amd-smi reset --gpureset --gpu all` and then running an instance of `amd-smi static` (or any other subcommand that accesses the GPUs), a mutex would lock and never return, requiring either clearing the mutex in /dev/shm or rebooting the machine.

- **Fixed multiple processes not being registered in `amd-smi process` with json and csv format**.
Multiple process outputs in the CLI tool were not being registered correctly. The json output did not handle multiple processes and is now in a new valid json format:

Expand Down
2 changes: 1 addition & 1 deletion amdsmi_cli/amdsmi_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _print_error(e, destination):
if destination in ['stdout', 'json', 'csv']:
print(e)
else:
f = open(destination, "w")
f = open(destination, "w", encoding="utf-8")
f.write(e)
f.close()
print("Error occured. Result written to " + str(destination) + " file")
Expand Down
16 changes: 12 additions & 4 deletions amdsmi_cli/amdsmi_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def version(self, args):
if self.logger.destination == 'stdout':
print(human_readable_output)
else:
with self.logger.destination.open('a') as output_file:
with self.logger.destination.open('a', encoding="utf-8") as output_file:
output_file.write(human_readable_output + '\n')
elif self.logger.is_json_format() or self.logger.is_csv_format():
self.logger.print_output()
Expand Down Expand Up @@ -4300,11 +4300,19 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
self.logger.table_header += 'DOUBLE_ECC'.rjust(12)

try:
pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
monitor_values['pcie_replay'] = pcie_replay
pcie_metric = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)['pcie_metric']
logging.debug("PCIE Metric for %s | %s", gpu_id, pcie_metric)
monitor_values['pcie_replay'] = pcie_metric['pcie_replay_count']
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pcie_replay'] = "N/A"
logging.debug("Failed to get pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())
logging.debug("Failed to get gpu_metrics pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())

if monitor_values['pcie_replay'] == "N/A":
try:
pcie_replay = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
monitor_values['pcie_replay'] = pcie_replay
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get sysfs pcie replay counter on gpu %s | %s", gpu_id, e.get_error_info())

self.logger.table_header += 'PCIE_REPLAY'.rjust(13)
if args.vram_usage:
Expand Down
40 changes: 30 additions & 10 deletions amdsmi_cli/amdsmi_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import logging
import math
import os
import platform
import sys
import time
Expand All @@ -30,6 +31,7 @@
from subprocess import PIPE, STDOUT
from typing import List
from enum import Enum
from typing import Set

from amdsmi_init import *
from BDF import BDF
Expand Down Expand Up @@ -58,19 +60,23 @@ def __init__(self) -> None:
self._is_linux = True
logging.debug(f"AMDSMIHelpers: Platform is linux:{self._is_linux}")

output = run(["lscpu"], stdout=PIPE, stderr=STDOUT, encoding="UTF-8").stdout
if "hypervisor" not in output:
self._is_baremetal = True
else:
self._is_virtual_os = True
try:
with open('/proc/cpuinfo', 'r') as f:
if 'hypervisor' in f.read():
self._is_virtual_os = True
except IOError:
pass

self._is_baremetal = not self._is_virtual_os

# Check for passthrough system filtering by device id
output = run(["lspci", "-nn"], stdout=PIPE, stderr=STDOUT, encoding="UTF-8").stdout
output = self.get_pci_device_ids()
passthrough_device_ids = ["7460", "73c8", "74a0", "74a1", "74a2"]
if any(device_id in output for device_id in passthrough_device_ids):
self._is_baremetal = True
self._is_virtual_os = False
self._is_passthrough = True
if any(('0x' + device_id) in output for device_id in passthrough_device_ids):
if self._is_virtual_os:
self._is_baremetal = True
self._is_virtual_os = False
self._is_passthrough = True


def os_info(self, string_format=True):
Expand Down Expand Up @@ -783,3 +789,17 @@ def convert_SI_unit(val: int, unit_in: SI_Unit, unit_out=SI_Unit.BASE) -> int:
int : converted SI unit of value requested
"""
return int(float(val) * unit_in / unit_out)

def get_pci_device_ids(self) -> Set[str]:
    """Collect PCI identifier strings exposed through sysfs.

    Reads the ``subsystem_device`` attribute of every entry under
    ``/sys/bus/pci/devices`` and gathers the whitespace-stripped contents.

    Returns:
        Set[str]: the unique attribute values that could be read. The set is
        empty when the sysfs directory cannot be listed, so callers can
        treat "no PCI information" and "no matching device" the same way.
    """
    # NOTE(review): the attribute read here is "subsystem_device", not
    # "device" -- confirm this is the id callers intend to match against.
    pci_devices_path = "/sys/bus/pci/devices"

    try:
        device_names = os.listdir(pci_devices_path)
    except OSError as e:
        # Best effort: a missing or unreadable sysfs tree must not abort the caller.
        logging.debug("Unable to list %s | %s", pci_devices_path, e)
        return set()

    pci_devices: Set[str] = set()
    for device in device_names:
        subsystem_device_path = os.path.join(pci_devices_path, device, "subsystem_device")
        try:
            with open(subsystem_device_path, 'r', encoding="utf-8") as f:
                pci_devices.add(f.read().strip())
        except (OSError, ValueError):
            # Entry vanished, is unreadable, or holds undecodable data: skip it.
            continue
    return pci_devices

20 changes: 10 additions & 10 deletions amdsmi_cli/amdsmi_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,10 @@ def _print_json_output(self, multiple_device_enabled=False, watching_output=Fals
print(json_std_output)
else: # Write output to file
if watching_output: # Flush the full JSON output to the file on watch command completion
with self.destination.open('w') as output_file:
with self.destination.open('w', encoding="utf-8") as output_file:
json.dump(self.watch_output, output_file, indent=4)
else:
with self.destination.open('a') as output_file:
with self.destination.open('a', encoding="utf-8") as output_file:
json.dump(json_output, output_file, indent=4)


Expand Down Expand Up @@ -516,7 +516,7 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
print(str(csv_stdout_output))
else:
if watching_output:
with self.destination.open('w', newline = '') as output_file:
with self.destination.open('w', newline = '', encoding="utf-8") as output_file:
if self.watch_output:
csv_keys = set()
for output in self.watch_output:
Expand All @@ -534,7 +534,7 @@ def _print_csv_output(self, multiple_device_enabled=False, watching_output=False
writer.writeheader()
writer.writerows(self.watch_output)
else:
with self.destination.open('a', newline = '') as output_file:
with self.destination.open('a', newline = '', encoding="utf-8") as output_file:
# Get the header as a list of the first element to maintain order
csv_header = stored_csv_output[0].keys()
writer = csv.DictWriter(output_file, csv_header)
Expand Down Expand Up @@ -622,7 +622,7 @@ def _print_dual_csv_output(self, multiple_device_enabled=False, watching_output=
print()
else:
if watching_output:
with self.destination.open('w', newline = '') as output_file:
with self.destination.open('w', newline = '', encoding="utf-8") as output_file:
primary_csv_output = []
secondary_csv_output = []
if self.watch_output:
Expand Down Expand Up @@ -687,7 +687,7 @@ def _print_dual_csv_output(self, multiple_device_enabled=False, watching_output=
writer.writeheader()
writer.writerows(secondary_csv_output)
else:
with self.destination.open('a', newline = '') as output_file:
with self.destination.open('a', newline = '', encoding="utf-8") as output_file:
if primary_csv_output:
# Get the header as a list of the first element to maintain order
csv_header = primary_csv_output[0].keys()
Expand Down Expand Up @@ -724,13 +724,13 @@ def _print_human_readable_output(self, multiple_device_enabled=False, watching_o
print(human_readable_output.encode('ascii', 'ignore').decode('ascii'))
else:
if watching_output:
with self.destination.open('w') as output_file:
with self.destination.open('w', encoding="utf-8") as output_file:
human_readable_output = ''
for output in self.watch_output:
human_readable_output += self._convert_json_to_human_readable(output)
output_file.write(human_readable_output + '\n')
else:
with self.destination.open('a') as output_file:
with self.destination.open('a', encoding="utf-8") as output_file:
output_file.write(human_readable_output + '\n')


Expand Down Expand Up @@ -806,7 +806,7 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
print("\n")
else:
if watching_output: # Write all stored watched output to a file
with self.destination.open('w') as output_file:
with self.destination.open('w', encoding="utf-8") as output_file:
primary_table = ''
secondary_table = ''
# Add process_list to the secondary_table
Expand Down Expand Up @@ -851,6 +851,6 @@ def _print_tabular_output(self, multiple_device_enabled=False, watching_output=F
if secondary_table:
output_file.write("\n" + secondary_table)
else: # Write all singular output to a file
with self.destination.open('a') as output_file:
with self.destination.open('a', encoding="utf-8") as output_file:
output_file.write(primary_table + '\n')
output_file.write(secondary_table)
16 changes: 12 additions & 4 deletions amdsmi_cli/amdsmi_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,10 @@ def _not_negative_int(self, int_value):
return int(int_value)

outputformat = self.helpers.get_output_format()
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
if int_value == "":
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(int_value, outputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)


def _positive_int(self, int_value):
Expand All @@ -150,7 +153,10 @@ def _positive_int(self, int_value):
return int(int_value)

outputformat = self.helpers.get_output_format()
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)
if int_value == "":
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(int_value, outputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat)


def _is_valid_string(self, string_value):
Expand All @@ -160,8 +166,10 @@ def _is_valid_string(self, string_value):
return string_value

outputformat = self.helpers.get_output_format()
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)

if string_value == "":
raise amdsmi_cli_exceptions.AmdSmiMissingParameterValueException(string_value, outputformat)
else:
raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat)

def _check_output_file_path(self):
""" Argument action validator:
Expand Down
15 changes: 8 additions & 7 deletions py-interface/amdsmi_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,13 +442,14 @@ def read(self, timestamp, num_elem=10):
for i in range(0, num_elem):
unique_event_values = set(event.value for event in AmdSmiEvtNotificationType)
if self.event_info[i].event in unique_event_values:
ret.append(
{
"processor_handle": self.event_info[i].processor_handle,
"event": AmdSmiEvtNotificationType(self.event_info[i].event).name,
"message": self.event_info[i].message.decode("utf-8"),
}
)
if AmdSmiEvtNotificationType(self.event_info[i].event).name != "NONE":
ret.append(
{
"processor_handle": self.event_info[i].processor_handle,
"event": AmdSmiEvtNotificationType(self.event_info[i].event).name,
"message": self.event_info[i].message.decode("utf-8"),
}
)

return ret

Expand Down
1 change: 1 addition & 0 deletions rocm_smi/include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ typedef enum {
//!< information can be retrieved. By
//!< default, only AMD devices are
//!< enumerated by RSMI.
RSMI_INIT_FLAG_THRAD_ONLY_MUTEX = 0x400000000000000, //!< The mutex limit to thread
RSMI_INIT_FLAG_RESRV_TEST1 = 0x800000000000000, //!< Reserved for test
} rsmi_init_flags_t;

Expand Down
3 changes: 3 additions & 0 deletions rocm_smi/include/rocm_smi/rocm_smi_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ class RocmSMI {

void set_init_options(uint64_t options) {init_options_ = options;}
uint64_t init_options() const {return init_options_;}
// Reports whether the thread-only mutex init flag is set.
// Returns init_options_ masked with RSMI_INIT_FLAG_THRAD_ONLY_MUTEX:
// non-zero when that bit is present in the init options, zero otherwise.
uint64_t is_thread_only_mutex() const {
  const uint64_t thread_only_bits =
      init_options_ & RSMI_INIT_FLAG_THRAD_ONLY_MUTEX;
  return thread_only_bits;
}

uint32_t euid() const {return euid_;}

Expand Down
2 changes: 1 addition & 1 deletion rocm_smi/include/rocm_smi/rocm_smi_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ class ScopeGuard {
__forceinline ~ScopeGuard() {
if (!dismiss_) release_();
}
__forceinline ScopeGuard& operator=(const ScopeGuard& rhs) {
__forceinline ScopeGuard& operator=(ScopeGuard& rhs) {
dismiss_ = rhs.dismiss_;
release_ = rhs.release_;
rhs.dismiss_ = true;
Expand Down
3 changes: 2 additions & 1 deletion rocm_smi/src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3435,7 +3435,8 @@ rsmi_dev_gpu_reset(uint32_t dv_ind) {
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
// No longer using DEVICE_MUTEX as it blocks long running processes
// DEVICE_MUTEX

rsmi_status_t ret;
uint64_t status_code = 0;
Expand Down
2 changes: 1 addition & 1 deletion tests/amd_smi_test/amdsmitst.exclude
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ $BLACKLIST_ALL_ASICS\
#
# For those systems gfx_target_version must be used. It can be found in
# /sys/class/kfd/kfd/topology/nodes/*/properties
#
#
# ex.
# FILTER[90400]=\
# $BLACKLIST_ALL_ASICS\
Expand Down
Loading

0 comments on commit 2b02a07

Please sign in to comment.