diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ffdc4f48..cf1eca3d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1,6 @@ * @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan + +docs/* @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation + diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..cd0b8a4f --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,103 @@ +# Contributing to AMD SMI # + +We welcome contributions to AMD SMI. +Please follow these details to help ensure your contributions will be successfully accepted. + +## Issue Discussion ## + +Please use the GitHub Issues tab to notify us of issues. + +* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and + comment or post to provide additional details, such as how you reproduced this issue. +* If you're not sure if your issue is the same, err on the side of caution and file your issue. + You can add a comment to include the issue number (and link) for the similar issue. If we evaluate + your issue as being the same as the existing issue, we'll close the duplicate. +* If your issue doesn't exist, use the issue template to file a new issue. + * When filing an issue, be sure to provide as much information as possible, including script output so + we can collect information about your configuration. This helps reduce the time required to + reproduce your issue. + * Check your issue regularly, as we may require additional information to successfully reproduce the + issue. +* You may also open an issue to ask questions to the maintainers about whether a proposed change + meets the acceptance criteria, or to discuss an idea pertaining to the library. + +## Acceptance Criteria ## + +The goal of AMD SMI project is to provide a simple CLI interface and a library +for interacting with AMD GPUs. 
+ +## Coding Style ## + +Please refer to `.clang-format`. It is suggested you use the `pre-commit` tool. +It mostly follows Google C++ formatting with a 100 character line limit. + +## Pull Request Guidelines ## + +When you create a pull request, you should target the default branch. Our +current default branch is the **develop** branch, which serves as our +integration branch. + +### Deliverables ### + +For each new file in the repository, +please include the licensing header + + /* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-20XX, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of <Name of Development Group, Name of Institution>, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +### Process ### + +* Reviewers are listed in the CODEOWNERS file +* Code format guidelines + +AMD SMI uses the clang-format tool for formatting code in source files. +The formatting style is captured in .clang-format which is located at +the root of AMD SMI. These are different options to follow: + + 1. Using pre-commit and docker - `pre-commit run` + 1. Using only clang-format - `clang-format -i <path-to-the-source-file>` + +## References ## + +1. [pre-commit](https://github.com/pre-commit/pre-commit) +1. [clang-format](https://clang.llvm.org/docs/ClangFormat.html) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae080f87..d59fe2f4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,13 +69,15 @@ set(CMAKE_CXX_EXTENSIONS OFF) include(GNUInstallDirs) option(BUILD_TESTS "Build test suite" OFF) -# TODO: Enable once virtualenv is installed on CI machines -option(BUILD_WRAPPER "Rebuild AMDSMI-wrapper" OFF) -option(BUILD_CLI "Build AMDSMI-CLI and install" ON) -option(ENABLE_LDCONFIG "Set library links and caches using ldconfig." ON) option(ENABLE_ASAN_PACKAGING "" OFF) option(ENABLE_ESMI_LIB "" ON) +include(CMakeDependentOption) +# these options don't work without BUILD_SHARED_LIBS +cmake_dependent_option(BUILD_WRAPPER "Rebuild AMDSMI-wrapper" OFF "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(BUILD_CLI "Build AMDSMI-CLI and install" ON "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(ENABLE_LDCONFIG "Set library links and caches using ldconfig." 
ON "BUILD_SHARED_LIBS" OFF) + # Set share path here because project name != amd_smi set(SHARE_INSTALL_PREFIX "share/${AMD_SMI}" CACHE STRING "Tests and Example install directory") @@ -116,7 +118,7 @@ if(ENABLE_ESMI_LIB) if(NOT EXISTS ${PROJECT_SOURCE_DIR}/esmi_ib_library/src) # TODO: use ExternalProject_Add instead or a submodule # as of 2023.10.16 CI builds are broken with an updated submodule - execute_process(COMMAND git clone --depth=1 -b esmi_so_ver-3.0 https://github.com/amd/esmi_ib_library.git ${PROJECT_SOURCE_DIR}/esmi_ib_library) + execute_process(COMMAND git clone --depth=1 -b esmi_pkg_ver-3.0.3 https://github.com/amd/esmi_ib_library.git ${PROJECT_SOURCE_DIR}/esmi_ib_library) endif() if(NOT EXISTS ${PROJECT_SOURCE_DIR}/esmi_ib_library/include/asm/amd_hsmp.h) file(DOWNLOAD @@ -185,10 +187,12 @@ if(BUILD_TESTS) add_subdirectory("tests/amd_smi_test") endif() -add_subdirectory("py-interface") - -if(BUILD_CLI) - add_subdirectory("amdsmi_cli") +# python interface and CLI depend on shared libraries +if(BUILD_SHARED_LIBS) + add_subdirectory("py-interface") + if(BUILD_CLI) + add_subdirectory("amdsmi_cli") + endif() endif() include(CMakePackageConfigHelpers) diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index d991cf6b..3d0535d8 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -138,18 +138,30 @@ do_install_amdsmi_python_lib() { echo "Removed old AMD-SMI python library (amdsmi)..." fi + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + # upgrade pip if it's an ancient version # otherwise the amdsmi install will fail local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') if [[ "$pip_version" -lt 19 ]]; then - echo "Detected ancient pip version ($pip_version)... Upgrading..." - python3 -m pip install --upgrade pip --quiet --disable-pip-version-check + echo "Detected ancient pip version ($pip_version)... Upgrading..." 
+ python3 -m pip install --upgrade pip --quiet --disable-pip-version-check fi unset pip_version - # install PyYAML dependency - python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + # Check PyYAML dependency + local pyyaml_version + pyyaml_version=$(python3 -m pip show pyyaml 2>/dev/null | grep -Po '(?<=Version: )[0-9]+') + if [[ "$pyyaml_version" -lt 5 ]]; then + echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." + python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + fi + unset pyyaml_version + # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check diff --git a/README.md b/README.md index 2dc3a07c..109f890e 100755 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ and [esmi_ib_library](https://github.com/amd/esmi_ib_library) At initial release, the AMD SMI library will support Linux bare metal and Linux virtual machine guest for AMD GPUs. In the future release, the library will be extended to support AMD EPYC™ CPUs. -AMD SMI library can run on AMD ROCm supported platforms, please refer to [List of Supported Operating Systems and GPUs](https://rocm.docs.amd.com/en/latest/release/gpu_os_support.html) +AMD SMI library can run on AMD ROCm supported platforms, refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for more information. To run the AMD SMI library, the amdgpu driver and the hsmp driver needs to be installed. Optionally, the libdrm can be installed to query firmware information and hardware IPs. @@ -195,7 +195,7 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html The output will be in `docs/_build/html`. 
-For additional details, see the [ROCm Contributing Guide](https://rocm.docs.amd.com/en/latest/contributing.html#building-documentation) +For additional details, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). ## Building AMD SMI diff --git a/RPM/post.in b/RPM/post.in index a58a5f4e..653b365f 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -137,18 +137,30 @@ do_install_amdsmi_python_lib() { echo "Removed old AMD-SMI python library (amdsmi)..." fi + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + # upgrade pip if it's an ancient version # otherwise the amdsmi install will fail local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') if [[ "$pip_version" -lt 19 ]]; then - echo "Detected ancient pip version ($pip_version)... Upgrading..." - python3 -m pip install --upgrade pip --quiet --disable-pip-version-check + echo "Detected ancient pip version ($pip_version)... Upgrading..." + python3 -m pip install --upgrade pip --quiet --disable-pip-version-check fi unset pip_version - # install PyYAML dependency - python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + # Check PyYAML dependency + local pyyaml_version + pyyaml_version=$(python3 -m pip show pyyaml 2>/dev/null | grep -Po '(?<=Version: )[0-9]+') + if [[ "$pyyaml_version" -lt 5 ]]; then + echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." 
+ python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + fi + unset pyyaml_version + # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 1e6d8784..f85da62e 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -474,7 +474,7 @@ Command Modifiers: ```bash usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [--cpu-pwr-limit PWR_LIMIT] + [-o WATTS] [-p POLICY] [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] @@ -512,6 +512,7 @@ Set Arguments: -M, --memory-partition PARTITION Set one of the following the memory partition modes: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit + -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. 
@@ -674,6 +675,18 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 0 AFFINITY: 0 @@ -770,6 +783,18 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 1 AFFINITY: 1 @@ -866,6 +891,18 @@ GPU: 2 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 2 AFFINITY: 2 @@ -962,6 +999,18 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 3 AFFINITY: 3 diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 9a531d72..697513f5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -161,12 +161,13 @@ def list(self, args, multiple_devices=False, gpu=None): except amdsmi_exception.AmdSmiLibraryException as e: uuid = e.get_error_info() - # Store values based on format - if self.logger.is_human_readable_format(): - self.logger.store_output(args.gpu, 'AMDSMI_SPACING_REMOVAL', {'gpu_bdf':bdf, 'gpu_uuid':uuid}) - else: 
+ # CSV format is intentionally aligned with Host + if self.logger.is_csv_format(): self.logger.store_output(args.gpu, 'gpu_bdf', bdf) self.logger.store_output(args.gpu, 'gpu_uuid', uuid) + else: + self.logger.store_output(args.gpu, 'bdf', bdf) + self.logger.store_output(args.gpu, 'uuid', uuid) if multiple_devices: self.logger.store_multiple_device_output() @@ -243,7 +244,7 @@ def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None) def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None): + cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): """Get Static information for target gpu Args: @@ -266,7 +267,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None. fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. - + policy (bool, optional): Value override for args.policy. Defaults to None. 
Returns: None: Print output via AMDSMILogger to destination """ @@ -299,8 +300,10 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None args.partition = partition if limit: args.limit = limit - current_platform_args += ["ras", "limit", "partition"] - current_platform_values += [args.ras, args.limit, args.partition] + if policy: + args.policy = policy + current_platform_args += ["ras", "limit", "partition", "policy"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -343,7 +346,13 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None static_dict['asic'] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) if args.bus: - bus_info = {} + bus_info = { + 'bdf': "N/A", + 'max_pcie_width': "N/A", + 'max_pcie_speed': "N/A", + 'pcie_interface_version': "N/A", + 'slot_type': "N/A" + } try: bus_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) @@ -355,7 +364,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width'] bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed'] - bus_info['pcie_slot_type'] = link_caps['pcie_static']['slot_type'] bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version'] if bus_info['max_pcie_speed'] % 1000 != 0: @@ -365,15 +373,13 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None bus_info['max_pcie_speed'] = pcie_speed_GTs_value - slot_type = bus_info.pop('pcie_slot_type') + slot_type = link_caps['pcie_static']['slot_type'] if isinstance(slot_type, int): slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues if slot_type in slot_types: bus_info['slot_type'] = 
slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") else: bus_info['slot_type'] = "Unknown" - else: - bus_info['slot_type'] = "N/A" if bus_info['pcie_interface_version'] > 0: bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}" @@ -388,7 +394,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None "unit" : pcie_speed_unit} except amdsmi_exception.AmdSmiLibraryException as e: - bus_info = "N/A" logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['bus'] = bus_info @@ -483,6 +488,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None shutdown_temp_vram_limit = "N/A" logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + # Assign units power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' @@ -623,6 +629,15 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None static_dict['partition'] = {"compute_partition": compute_partition, "memory_partition": memory_partition} + if 'policy' in current_platform_args: + if args.policy: + try: + policy_info = amdsmi_interface.amdsmi_get_dpm_policy(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['dpm_policy'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -759,7 +774,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None): + interface_ver=None, policy=None): """Get Static information for target gpu and cpu Args: @@ -782,7 +797,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, num_vf (bool, optional): Value 
override for args.num_vf. Defaults to None. cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None - + policy (bool, optional): Value override for args.policy. Defaults to None. Raises: IndexError: Index error if gpu list is empty @@ -800,7 +815,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, cpu_attributes = ["smu", "interface_ver"] for attr in cpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): cpu_args_enabled = True break @@ -808,10 +823,10 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf"] + "dfc_ucode", "fb_info", "num_vf", "policy"] for attr in gpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): gpu_args_enabled = True break @@ -838,7 +853,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None: args.cpu = self.cpu_handles @@ -852,7 +867,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -1123,14 +1138,14 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No args.temperature = temperature if ecc: args.ecc = ecc + if ecc_blocks: + 
args.ecc_blocks = ecc_blocks if pcie: args.pcie = pcie - current_platform_args += ["usage", "power", "clock", "temperature", "ecc", "pcie"] - current_platform_values += [args.usage, args.power, args.clock, args.temperature, args.ecc, args.pcie] + current_platform_args += ["usage", "power", "clock", "temperature", "ecc", "ecc_blocks", "pcie"] + current_platform_values += [args.usage, args.power, args.clock, args.temperature, args.ecc, args.ecc_blocks, args.pcie] if self.helpers.is_baremetal() and self.helpers.is_linux(): - if ecc_blocks: - args.ecc_blocks = ecc_blocks if fan: args.fan = fan if voltage_curve: @@ -1143,8 +1158,8 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No args.xgmi_err = xgmi_err if energy: args.energy = energy - current_platform_args += ["ecc_blocks", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] - current_platform_values += [args.ecc_blocks, args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] + current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] + current_platform_values += [args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] if self.helpers.is_hypervisor(): if schedule: @@ -2255,7 +2270,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, "guard", "guest_data", "fb_usage", "xgmi"] for attr in gpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): gpu_args_enabled = True break @@ -2268,7 +2283,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"] for attr in cpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): cpu_args_enabled = True break @@ -2277,7 +2292,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, 
core_attributes = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"] for attr in core_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): core_args_enabled = True break @@ -3093,7 +3108,7 @@ def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None, def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None): + memory_partition=None, power_cap=None, dpm_policy=None): """Issue reset commands to target gpu(s) Args: @@ -3107,6 +3122,7 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. 
Raises: ValueError: Value error if no gpu value is provided @@ -3132,7 +3148,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.memory_partition = memory_partition if power_cap: args.power_cap = power_cap - + if dpm_policy is not None: + args.dpm_policy = dpm_policy # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3151,7 +3168,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.compute_partition, args.memory_partition, args.perf_determinism is not None, - args.power_cap]): + args.power_cap, + args.dpm_policy is not None]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3215,6 +3233,16 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + + if isinstance(args.dpm_policy, int): + try: + amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3254,7 +3282,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, 
cpu_gmi3_link_width=None, 
cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): """Issue reset commands to target gpu(s) Args: @@ -3283,6 +3311,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3303,7 +3332,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap"] + "memory_partition", "power_cap", "dpm_policy"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3314,13 +3343,18 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu_args_enabled = False cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range", - "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] + "cpu_disable_apb", "soc_boost_limit"] for attr in cpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: cpu_args_enabled = True break + # Check if CPU set argument with store_true has been passed + if hasattr(args, "cpu_enable_apb"): + if getattr(args, "cpu_enable_apb"): + cpu_args_enabled = True + # Check if a Core argument has been set core_args_enabled = False core_attributes = ["core_boost_limit"] @@ -3359,7 +3393,7 @@ def set_value(self, args, multiple_devices=False, 
gpu=None, fan=None, 
perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3378,7 +3412,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index a685a7e8..2083c155 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -617,8 +617,11 @@ def is_amd_device(self, device_handle): """ # Get card vendor id asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(device_handle) - return asic_info['vendor_id'] == AMD_VENDOR_ID - + try: + vendor_value = int(asic_info['vendor_id'], 16) + return vendor_value == AMD_VENDOR_ID + except (KeyError, TypeError, ValueError): + return False def get_perf_levels(self): perf_levels_str = [clock.name for clock in amdsmi_interface.AmdSmiDevPerfLevel] @@ -632,7 +635,6 @@ def get_compute_partition_types(self): compute_partitions_str.remove('INVALID') return compute_partitions_str - def get_memory_partition_types(self): memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] if 'UNKNOWN' in memory_partitions_str: diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 3140cd52..5341b274 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -543,6 +543,7 @@ def _add_static_parser(self, subparsers, 
func): vram_help = "All vram 
information" cache_help = "All cache information" board_help = "All board information" + dpm_policy_help = "The available DPM policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -582,6 +583,7 @@ def _add_static_parser(self, subparsers, func): static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) + static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -759,10 +761,10 @@ def _add_metric_parser(self, subparsers, func): metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): - metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) @@ -963,6 +965,7 @@ def _add_set_value_parser(self, subparsers, func): set_compute_partition_help = f"Set one of the following 
the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" + set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." @@ -998,6 +1001,7 @@ def _add_set_value_parser(self, subparsers, func): set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') + set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc index 0f829375..e6d37ced 100644 --- a/example/amd_smi_nodrm_example.cc +++ b/example/amd_smi_nodrm_example.cc @@ -331,6 +331,18 @@ int main() { printf(" Output of amdsmi_get_power_cap_info:\n"); std::cout << "\t\t Power Cap: " << cap_info.power_cap / 1000000 << "W\n\n"; + + amdsmi_dpm_policy_t policy; + ret = amdsmi_get_dpm_policy(processor_handles[j], &policy); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + std::cout << "\t amdsmi_get_dpm_policy total:" << policy.num_supported + <<" current:" << policy.current << "\n"; + for (int x=0; x < policy.num_supported; x++) { + std::cout << x 
<<": (" << policy.policies[x].policy_id + <<"," << policy.policies[x].policy_description << ")\n"; + } + } } } diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 57c83ead..ef58a6ce 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -151,7 +151,7 @@ typedef enum { #define AMDSMI_LIB_VERSION_YEAR 24 //! Major version should be changed for every header change (adding/deleting APIs, changing names, fields of structures, etc.) -#define AMDSMI_LIB_VERSION_MAJOR 4 +#define AMDSMI_LIB_VERSION_MAJOR 5 //! Minor version should be updated for each API change, but without changing headers #define AMDSMI_LIB_VERSION_MINOR 0 @@ -1151,6 +1151,37 @@ typedef struct { uint64_t frequency[AMDSMI_MAX_NUM_FREQUENCIES]; } amdsmi_frequencies_t; +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[AMDSMI_MAX_NAME]; +} amdsmi_dpm_policy_entry_t; + +#define AMDSMI_MAX_NUM_PM_POLICIES 32 + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + amdsmi_dpm_policy_entry_t policies[AMDSMI_MAX_NUM_PM_POLICIES]; +} amdsmi_dpm_policy_t; + /** * @brief This structure holds information about the possible PCIe * bandwidths. Specifically, the possible transfer rates and their @@ -1429,9 +1460,10 @@ typedef struct { * @brief This structure holds error counts. 
*/ typedef struct { - uint64_t correctable_count; //!< Accumulated correctable errors - uint64_t uncorrectable_count; //!< Accumulated uncorrectable errors - uint64_t reserved[2]; + uint64_t correctable_count; //!< Accumulated correctable errors + uint64_t uncorrectable_count; //!< Accumulated uncorrectable errors + uint64_t deferred_count; //!< Accumulated deferred errors + uint64_t reserved[5]; } amdsmi_error_count_t; /** @@ -3332,6 +3364,47 @@ amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, this function will write + * current dpm policy settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the dpm policy for this processor. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a dpm policy @p policy_id, + * this function will set the dpm policy for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] policy_id the dpm policy id to set. 
The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_dpm_policy() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy_id); /** @} End PerfCont */ /*****************************************************************************/ @@ -4662,8 +4735,8 @@ amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_pro */ /** - * @brief Returns the total number of ECC errors (correctable and - * uncorrectable) in the given GPU. It is not supported on + * @brief Returns the total number of ECC errors (correctable, + * uncorrectable and deferred) in the given GPU. It is not supported on * virtual machine guest * * @platform{gpu_bm_linux} @platform{host} diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 74a190da..7eb501bb 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -360,7 +360,6 @@ class AmdSmiProcessorType(IntEnum): NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU - class AmdSmiEventReader: def __init__( self, processor_handle: amdsmi_wrapper.amdsmi_processor_handle, @@ -2690,6 +2689,19 @@ def amdsmi_set_clk_freq( ) ) +def amdsmi_set_dpm_policy( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + amdsmi_wrapper.amdsmi_set_dpm_policy( + processor_handle, policy_id + ) + ) def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int @@ -3249,6 +3261,36 @@ def amdsmi_get_clk_freq( "frequency": list(freq.frequency)[: freq.num_supported - 1], } +def amdsmi_get_dpm_policy( + processor_handle: 
amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_dpm_policy( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "policies": polices, + } def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 91bdc8bd..8fcdb375 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -746,6 +746,19 @@ class struct_fields_(Structure): class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -764,19 +777,6 @@ class struct_pcie_metric_(Structure): ('reserved', ctypes.c_uint64 * 13), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', 
amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1480,6 +1480,27 @@ class struct_amdsmi_frequencies_t(Structure): ] amdsmi_frequencies_t = struct_amdsmi_frequencies_t +class struct_amdsmi_dpm_policy_entry_t(Structure): + pass + +struct_amdsmi_dpm_policy_entry_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_entry_t._fields_ = [ + ('policy_id', ctypes.c_uint32), + ('policy_description', ctypes.c_char * 32), +] + +amdsmi_dpm_policy_entry_t = struct_amdsmi_dpm_policy_entry_t +class struct_amdsmi_dpm_policy_t(Structure): + pass + +struct_amdsmi_dpm_policy_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_t._fields_ = [ + ('num_supported', ctypes.c_uint32), + ('current', ctypes.c_uint32), + ('policies', struct_amdsmi_dpm_policy_entry_t * 32), +] + +amdsmi_dpm_policy_t = struct_amdsmi_dpm_policy_t class struct_amdsmi_pcie_bandwidth_t(Structure): pass @@ -2030,6 +2051,12 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_set_clk_freq = _libraries['libamd_smi.so'].amdsmi_set_clk_freq amdsmi_set_clk_freq.restype = amdsmi_status_t amdsmi_set_clk_freq.argtypes = [amdsmi_processor_handle, amdsmi_clk_type_t, uint64_t] +amdsmi_get_dpm_policy = _libraries['libamd_smi.so'].amdsmi_get_dpm_policy +amdsmi_get_dpm_policy.restype = amdsmi_status_t +amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy +amdsmi_set_dpm_policy.restype = amdsmi_status_t +amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2486,7 +2513,8 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 
'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_driver_info_t', + 'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t', + 'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', 'amdsmi_event_group_t', 'amdsmi_event_handle_t', 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', @@ -2516,10 +2544,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_cpu_socket_power', 'amdsmi_get_cpu_socket_power_cap', 'amdsmi_get_cpu_socket_power_cap_max', 'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles', - 'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count', - 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', - 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', - 'amdsmi_get_gpu_available_counters', + 'amdsmi_get_cpusocket_handles', 'amdsmi_get_dpm_policy', + 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', + 'amdsmi_get_fw_info', 'amdsmi_get_gpu_activity', + 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', 'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', 'amdsmi_get_gpu_board_info', 'amdsmi_get_gpu_cache_info', 'amdsmi_get_gpu_compute_partition', @@ -2599,7 +2627,8 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_set_cpu_socket_boostlimit', 'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', - 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', + 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range', + 'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', @@ -2625,6 +2654,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 
'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', 'struct_amdsmi_dimm_thermal_t', 'struct_amdsmi_dpm_level_t', + 'struct_amdsmi_dpm_policy_entry_t', 'struct_amdsmi_dpm_policy_t', 'struct_amdsmi_driver_info_t', 'struct_amdsmi_engine_usage_t', 'struct_amdsmi_error_count_t', 'struct_amdsmi_evt_notification_data_t', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 8d292930..12654213 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -192,6 +192,39 @@ typedef enum { RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 //!< Unknown performance level } rsmi_dev_perf_level_t; + + +#define RSMI_MAX_NUM_PM_POLICIES 32 +#define RSMI_MAX_POLICY_NAME 32 +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[RSMI_MAX_POLICY_NAME]; +} rsmi_dpm_policy_entry_t; + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + rsmi_dpm_policy_entry_t policies[RSMI_MAX_NUM_PM_POLICIES]; +} rsmi_dpm_policy_t; + /// \cond Ignore in docs. typedef rsmi_dev_perf_level_t rsmi_dev_perf_level; /// \endcond @@ -1194,8 +1227,10 @@ typedef enum { * @brief This structure holds error counts. 
*/ typedef struct { - uint64_t correctable_err; //!< Accumulated correctable errors - uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors + uint64_t correctable_err; //!< Accumulated correctable errors + uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors + uint64_t deferred_err; //!< Accumulated deferred errors + uint64_t reserved[5]; } rsmi_error_count_t; /** @@ -3293,6 +3328,42 @@ rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od); rsmi_status_t rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for a device + * + * @details Given a device index @p dv_ind, this function will write + * current dpm policy settings to @p policy. All the devices at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] policy the dpm policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for a device + * + * @details Given a device index @p dv_ind and a dpm policy @p policy_id, + * this function will set the DPM policy for this device. All the devices at + * the same socket will be set to the same policy. 
+ * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] policy_id the dpm policy will be modified + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id); + /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index e699eec3..3df15f2e 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -173,6 +173,7 @@ enum DevInfoTypes { kDevNumaNode, kDevGpuMetrics, kDevPmMetrics, + kDevDPMPolicy, kDevRegMetrics, kDevGpuReset, kDevAvailableComputePartition, @@ -255,6 +256,7 @@ class Device { rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); + static const std::map devInfoTypesStrings; private: std::shared_ptr monitor_; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index b67f90c7..b6cccdc6 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -724,7 +724,7 @@ class GpuMetricsBase_t }; using GpuMetricsBasePtr = std::shared_ptr; -using AMDGpuMetricFactories_t = std::map; +using AMDGpuMetricFactories_t = const std::map; class GpuMetricsBase_v11_t final : public GpuMetricsBase_t diff --git a/rocm_smi/include/rocm_smi/rocm_smi_main.h b/rocm_smi/include/rocm_smi/rocm_smi_main.h index 1cd2ec34..c957f512 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -117,7 +117,6 @@ class RocmSMI { void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); - static const std::map devInfoTypesStrings; 
private: std::vector> devices_; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index a6c3e80c..67d9d8b8 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -78,6 +78,7 @@ int isRegularFile(std::string fname, bool *is_reg); int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); bool IsInteger(const std::string & n_str); +bool stringToInteger(const std::string & n_str, int& value); std::pair executeCommand(std::string command, bool stdOut = true); rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index fdf64a93..91c8ddbb 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -82,7 +82,7 @@ using amd::smi::monitorTypesToString; using amd::smi::getRSMIStatusString; using amd::smi::AMDGpuMetricsUnitType_t; using amd::smi::AMDGpuMetricTypeId_t; -auto &devInfoTypesStrings = amd::smi::RocmSMI::devInfoTypesStrings; +auto &devInfoTypesStrings = amd::smi::Device::devInfoTypesStrings; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3F; @@ -145,6 +145,7 @@ static uint64_t get_multiplier_from_str(char units_char) { return multiplier; } + /** * Parse a string of the form: * ": <|*>" @@ -766,6 +767,20 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, assert(junk == "ce:"); fs2 >> ec->correctable_err; + ec->deferred_err = 0; + if (val_vec.size() > 2) { + std::istringstream fs3(val_vec[2]); + fs3 >> junk; + if (junk == "de:") { + fs3 >> ec->deferred_err; + } else { + ss << __PRETTY_FUNCTION__ + << "Trying to get the de count, but got " << junk + << " ignore the defer count"; + LOG_ERROR(ss); + } + } + ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", reporting " << amd::smi::getRSMIStatusString(ret);; LOG_TRACE(ss); @@ -2000,6 +2015,133 @@ 
rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, CATCH } + + +rsmi_status_t +rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id) { + rsmi_status_t ret; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("soc_pstate "); + value += std::to_string(policy_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + +rsmi_status_t +rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + It will reply on the number but no string as it may vary from soc to soc. 
+ The current pstate marked with * + soc pstate + 0 : soc_pstate_default + 1 : soc_pstate_0 + 2 : soc_pstate_1* + 3 : soc_pstate_2 + */ + bool see_soc_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "soc pstate") { + see_soc_pstate = true; + continue; + } + if (see_soc_pstate == false) continue; + + // Get tokens: : + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // At the end + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: the id is negative or too many policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_soc_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: cannot find the current policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // Cannot find it + return RSMI_STATUS_SUCCESS; + + CATCH +} + static std::vector pci_name_files = { "/usr/share/misc/pci.ids", "/usr/share/hwdata/pci.ids", @@ -2013,7 +2155,7 @@ enum eNameStrType { NAME_STR_SUBSYS }; -std::map +static const std::map 
mapStringToRSMIComputePartitionTypes { {"CPX", RSMI_COMPUTE_PARTITION_CPX}, {"SPX", RSMI_COMPUTE_PARTITION_SPX}, @@ -2022,7 +2164,7 @@ mapStringToRSMIComputePartitionTypes { {"QPX", RSMI_COMPUTE_PARTITION_QPX} }; -std::map +static const std::map mapRSMIToStringComputePartitionTypes { {RSMI_COMPUTE_PARTITION_INVALID, "UNKNOWN"}, {RSMI_COMPUTE_PARTITION_CPX, "CPX"}, @@ -2032,7 +2174,7 @@ mapRSMIToStringComputePartitionTypes { {RSMI_COMPUTE_PARTITION_QPX, "QPX"} }; -std::map +static const std::map mapRSMIToStringMemoryPartitionTypes { {RSMI_MEMORY_PARTITION_UNKNOWN, "UNKNOWN"}, {RSMI_MEMORY_PARTITION_NPS1, "NPS1"}, @@ -2041,7 +2183,7 @@ mapRSMIToStringMemoryPartitionTypes { {RSMI_MEMORY_PARTITION_NPS8, "NPS8"} }; -std::map +static const std::map mapStringToMemoryPartitionTypes { {"NPS1", RSMI_MEMORY_PARTITION_NPS1}, {"NPS2", RSMI_MEMORY_PARTITION_NPS2}, diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 6e79c29b..3e63659c 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -136,6 +136,7 @@ static const char *kDevAvailableComputePartitionFName = "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; +static const char* kDevDPMPolicyFName = "pm_policy"; // The PM policy for pstat and XGMI // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -315,6 +316,7 @@ static const std::map kDevAttribNameMap = { {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, + {kDevDPMPolicy, kDevDPMPolicyFName}, {kDevRegMetrics, kDevRegMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, @@ -336,7 +338,7 @@ static const std::map kDevPerfLvlMap = { {RSMI_DEV_PERF_LEVEL_UNKNOWN, kDevPerfLevelUnknownStr}, }; -static std::map 
kDevInfoVarTypeToRSMIVariant = { +static const std::map kDevInfoVarTypeToRSMIVariant = { // rsmi_memory_type_t {kDevMemTotGTT, RSMI_MEM_TYPE_GTT}, {kDevMemTotVisVRAM, RSMI_MEM_TYPE_VIS_VRAM}, @@ -391,6 +393,90 @@ static std::map kDevInfoVarTypeToRSMIVariant = { {kDevDFCountersAvailable, RSMI_EVNT_GRP_XGMI} }; +const std::map +Device::devInfoTypesStrings = { + {kDevPerfLevel, "kDevPerfLevel"}, + {kDevOverDriveLevel, "kDevOverDriveLevel"}, + {kDevMemOverDriveLevel, "kDevMemOverDriveLevel"}, + {kDevDevID, "kDevDevID"}, + {kDevXGMIPhysicalID, "kDevXGMIPhysicalID"}, + {kDevDevRevID, "kDevDevRevID"}, + {kDevDevProdName, "kDevDevProdName"}, + {kDevBoardInfo, "kDevBoardInfo"}, + {kDevDevProdNum, "kDevDevProdNum"}, + {kDevVendorID, "kDevVendorID"}, + {kDevSubSysDevID, "kDevSubSysDevID"}, + {kDevSubSysVendorID, "kDevSubSysVendorID"}, + {kDevGPUMClk, "kDevGPUMClk"}, + {kDevGPUSClk, "kDevGPUSClk"}, + {kDevDCEFClk, "kDevDCEFClk"}, + {kDevFClk, "kDevFClk"}, + {kDevSOCClk, "kDevSOCClk"}, + {kDevPCIEClk, "kDevPCIEClk"}, + {kDevPowerProfileMode, "kDevPowerProfileMode"}, + {kDevUsage, "kDevUsage"}, + {kDevPowerODVoltage, "kDevPowerODVoltage"}, + {kDevVBiosVer, "kDevVBiosVer"}, + {kDevPCIEThruPut, "kDevPCIEThruPut"}, + {kDevErrCntSDMA, "kDevErrCntSDMA"}, + {kDevErrCntUMC, "kDevErrCntUMC"}, + {kDevErrCntGFX, "kDevErrCntGFX"}, + {kDevErrCntMMHUB, "kDevErrCntMMHUB"}, + {kDevErrCntPCIEBIF, "kDevErrCntPCIEBIF"}, + {kDevErrCntHDP, "kDevErrCntHDP"}, + {kDevErrCntXGMIWAFL, "kDevErrCntXGMIWAFL"}, + {kDevErrCntFeatures, "kDevErrCntFeatures"}, + {kDevErrRASSchema, "kDevErrRASSchema"}, + {kDevErrTableVersion, "kDevErrTableVersion"}, + {kDevMemTotGTT, "kDevMemTotGTT"}, + {kDevMemTotVisVRAM, "kDevMemTotVisVRAM"}, + {kDevMemTotVRAM, "kDevMemTotVRAM"}, + {kDevMemUsedGTT, "kDevMemUsedGTT"}, + {kDevMemUsedVisVRAM, "kDevMemUsedVisVRAM"}, + {kDevMemUsedVRAM, "kDevMemUsedVRAM"}, + {kDevVramVendor, "kDevVramVendor"}, + {kDevPCIEReplayCount, "kDevPCIEReplayCount"}, + {kDevUniqueId, "kDevUniqueId"}, + 
{kDevDFCountersAvailable, "kDevDFCountersAvailable"}, + {kDevMemBusyPercent, "kDevMemBusyPercent"}, + {kDevXGMIError, "kDevXGMIError"}, + {kDevFwVersionAsd, "kDevFwVersionAsd"}, + {kDevFwVersionCe, "kDevFwVersionCe"}, + {kDevFwVersionDmcu, "kDevFwVersionDmcu"}, + {kDevFwVersionMc, "kDevFwVersionMc"}, + {kDevFwVersionMe, "kDevFwVersionMe"}, + {kDevFwVersionMec, "kDevFwVersionMec"}, + {kDevFwVersionMec2, "kDevFwVersionMec2"}, + {kDevFwVersionMes, "kDevFwVersionMes"}, + {kDevFwVersionMesKiq, "kDevFwVersionMesKiq"}, + {kDevFwVersionPfp, "kDevFwVersionPfp"}, + {kDevFwVersionRlc, "kDevFwVersionRlc"}, + {kDevFwVersionRlcSrlc, "kDevFwVersionRlcSrlc"}, + {kDevFwVersionRlcSrlg, "kDevFwVersionRlcSrlg"}, + {kDevFwVersionRlcSrls, "kDevFwVersionRlcSrls"}, + {kDevFwVersionSdma, "kDevFwVersionSdma"}, + {kDevFwVersionSdma2, "kDevFwVersionSdma2"}, + {kDevFwVersionSmc, "kDevFwVersionSmc"}, + {kDevFwVersionSos, "kDevFwVersionSos"}, + {kDevFwVersionTaRas, "kDevFwVersionTaRas"}, + {kDevFwVersionTaXgmi, "kDevFwVersionTaXgmi"}, + {kDevFwVersionUvd, "kDevFwVersionUvd"}, + {kDevFwVersionVce, "kDevFwVersionVce"}, + {kDevFwVersionVcn, "kDevFwVersionVcn"}, + {kDevSerialNumber, "kDevSerialNumber"}, + {kDevMemPageBad, "kDevMemPageBad"}, + {kDevNumaNode, "kDevNumaNode"}, + {kDevGpuMetrics, "kDevGpuMetrics"}, + {kDevPmMetrics, "kDevPmMetrics"}, + {kDevRegMetrics, "kDevRegMetrics"}, + {kDevGpuReset, "kDevGpuReset"}, + {kDevAvailableComputePartition, "kDevAvailableComputePartition"}, + {kDevComputePartition, "kDevComputePartition"}, + {kDevMemoryPartition, "kDevMemoryPartition"}, + {kDevPCieVendorID, "kDevPCieVendorID"}, + {kDevDPMPolicy, "kDevDPMPolicy"}, +}; + static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, @@ -450,6 +536,8 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, 
{"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, @@ -644,7 +732,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (ret != 0) { ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -653,7 +741,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Issue: File is not a regular file - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")," + << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << ")," << " returning ENOENT (" << std::strerror(ENOENT) << ")"; LOG_ERROR(ss); return ENOENT; @@ -664,7 +752,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (!fs->is_open()) { ss << __PRETTY_FUNCTION__ << " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), " + << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), " << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; LOG_ERROR(ss); @@ -673,7 +761,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type) << ")"; LOG_INFO(ss); 
return 0; @@ -690,7 +778,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { ret = openDebugFileStream(type, &fs); if (ret != 0) { ss << "Could not read debugInfoStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type)<< "), returning " + << devInfoTypesStrings.at(type)<< "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -704,7 +792,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { fs.close(); ss << "Successfully read debugInfoStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type)<< "), retString= " << *retStr; + << devInfoTypesStrings.at(type)<< "), retString= " << *retStr; LOG_INFO(ss); return 0; @@ -720,7 +808,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read device info string for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << "), returning " + << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -729,8 +817,8 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { fs >> *retStr; fs.close(); ss << __PRETTY_FUNCTION__ - << "Successfully read device info string for DevInfoType (" + - RocmSMI::devInfoTypesStrings.at(type) + "): " + *retStr + << "Successfully read device info string for DevInfoType (" << + devInfoTypesStrings.at(type) << "): " + *retStr << " | " << (fs.is_open() ? " File stream is opened" : " File stream is closed") << " | " << (fs.bad() ? 
"[ERROR] Bad read operation" : @@ -765,7 +853,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -776,7 +864,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.flush(); fs.close(); ss << "Successfully wrote device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning RSMI_STATUS_SUCCESS"; LOG_INFO(ss); ret = RSMI_STATUS_SUCCESS; @@ -790,7 +878,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret)); ss << " | " << (fs.is_open() ? 
"[ERROR] File stream open" : @@ -855,6 +943,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { case kDevPCIEClk: case kDevPowerODVoltage: case kDevSOCClk: + case kDevDPMPolicy: return writeDevInfoStr(type, val); case kDevComputePartition: case kDevMemoryPartition: @@ -877,14 +966,14 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read DevInfoLine for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")"; + << devInfoTypesStrings.at(type) << ")"; LOG_ERROR(ss); return ret; } std::getline(fs, *line); ss << "Successfully read DevInfoLine for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << "), returning *line = " + << devInfoTypesStrings.at(type) << "), returning *line = " << *line; LOG_INFO(ss); @@ -903,7 +992,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { ss << "Could not read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << " - SYSFS (" << sysfs_path << ")" << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; @@ -915,7 +1004,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, fclose(ptr); if ((num*b_size) != b_size) { ss << "Could not read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << devInfoTypesStrings.at(type) << ") - SYSFS (" << sysfs_path << "), binary size error; " << "[buff: " << p_binary_data @@ -929,7 +1018,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, return ENOENT; } ss << "Successfully read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << devInfoTypesStrings.at(type) << ") - SYSFS (" << sysfs_path << "), returning binaryData = " << p_binary_data << "; byte_size = " << std::dec << 
static_cast(b_size); @@ -961,7 +1050,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << ", but contained no string lines"; LOG_ERROR(ss); return ENXIO; @@ -979,12 +1068,12 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (!allLines.empty()) { ss << "Successfully read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") " + << devInfoTypesStrings.at(type) << ") " << ", returning lines read = " << allLines; LOG_INFO(ss); } else { ss << "Read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << ", but lines were empty"; LOG_INFO(ss); return ENXIO; @@ -1136,6 +1225,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntHDP: case kDevErrCntXGMIWAFL: case kDevMemPageBad: + case kDevDPMPolicy: return readDevInfoMultiLineStr(type, val); break; diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index 37b65776..f6d7f80e 100755 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -368,7 +368,7 @@ GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_v << " |"; LOG_TRACE(ostrstream); - return (amd_gpu_metrics_factory_table[gpu_metric_version]); + return (amd_gpu_metrics_factory_table.at(gpu_metric_version)); } ostrstream << __PRETTY_FUNCTION__ diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 612482ae..c078712e 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -57,10 +57,8 @@ #include #include #include -#include #include #include -#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" @@ -79,91 +77,6 @@ static const char *kDeviceNamePrefix = "card"; static const char 
*kAMDMonitorTypes[] = {"radeon", "amdgpu", ""}; -static const std::string amdSMI = "amd::smi::"; -const std::map -amd::smi::RocmSMI::devInfoTypesStrings = { - {amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"}, - {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, - {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, - {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, - {amd::smi::kDevXGMIPhysicalID, amdSMI + "kDevXGMIPhysicalID"}, - {amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"}, - {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, - {amd::smi::kDevBoardInfo, amdSMI + "kDevBoardInfo"}, - {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, - {amd::smi::kDevVendorID, amdSMI + "kDevVendorID"}, - {amd::smi::kDevSubSysDevID, amdSMI + "kDevSubSysDevID"}, - {amd::smi::kDevSubSysVendorID, amdSMI + "kDevSubSysVendorID"}, - {amd::smi::kDevGPUMClk, amdSMI + "kDevGPUMClk"}, - {amd::smi::kDevGPUSClk, amdSMI + "kDevGPUSClk"}, - {amd::smi::kDevDCEFClk, amdSMI + "kDevDCEFClk"}, - {amd::smi::kDevFClk, amdSMI + "kDevFClk"}, - {amd::smi::kDevSOCClk, amdSMI + "kDevSOCClk"}, - {amd::smi::kDevPCIEClk, amdSMI + "kDevPCIEClk"}, - {amd::smi::kDevPowerProfileMode, amdSMI + "kDevPowerProfileMode"}, - {amd::smi::kDevUsage, amdSMI + "kDevUsage"}, - {amd::smi::kDevPowerODVoltage, amdSMI + "kDevPowerODVoltage"}, - {amd::smi::kDevVBiosVer, amdSMI + "kDevVBiosVer"}, - {amd::smi::kDevPCIEThruPut, amdSMI + "kDevPCIEThruPut"}, - {amd::smi::kDevErrCntSDMA, amdSMI + "kDevErrCntSDMA"}, - {amd::smi::kDevErrCntUMC, amdSMI + "kDevErrCntUMC"}, - {amd::smi::kDevErrCntGFX, amdSMI + "kDevErrCntGFX"}, - {amd::smi::kDevErrCntMMHUB, amdSMI + "kDevErrCntMMHUB"}, - {amd::smi::kDevErrCntPCIEBIF, amdSMI + "kDevErrCntPCIEBIF"}, - {amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"}, - {amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"}, - {amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"}, - {amd::smi::kDevErrRASSchema, amdSMI + "kDevErrRASSchema"}, - 
{amd::smi::kDevErrTableVersion, amdSMI + "kDevErrTableVersion"}, - {amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"}, - {amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"}, - {amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"}, - {amd::smi::kDevMemUsedGTT, amdSMI + "kDevMemUsedGTT"}, - {amd::smi::kDevMemUsedVisVRAM, amdSMI + "kDevMemUsedVisVRAM"}, - {amd::smi::kDevMemUsedVRAM, amdSMI + "kDevMemUsedVRAM"}, - {amd::smi::kDevVramVendor, amdSMI + "kDevVramVendor"}, - {amd::smi::kDevPCIEReplayCount, amdSMI + "kDevPCIEReplayCount"}, - {amd::smi::kDevUniqueId, amdSMI + "kDevUniqueId"}, - {amd::smi::kDevDFCountersAvailable, amdSMI + "kDevDFCountersAvailable"}, - {amd::smi::kDevMemBusyPercent, amdSMI + "kDevMemBusyPercent"}, - {amd::smi::kDevXGMIError, amdSMI + "kDevXGMIError"}, - {amd::smi::kDevFwVersionAsd, amdSMI + "kDevFwVersionAsd"}, - {amd::smi::kDevFwVersionCe, amdSMI + "kDevFwVersionCe"}, - {amd::smi::kDevFwVersionDmcu, amdSMI + "kDevFwVersionDmcu"}, - {amd::smi::kDevFwVersionMc, amdSMI + "kDevFwVersionMc"}, - {amd::smi::kDevFwVersionMe, amdSMI + "kDevFwVersionMe"}, - {amd::smi::kDevFwVersionMec, amdSMI + "kDevFwVersionMec"}, - {amd::smi::kDevFwVersionMec2, amdSMI + "kDevFwVersionMec2"}, - {amd::smi::kDevFwVersionMes, amdSMI + "kDevFwVersionMes"}, - {amd::smi::kDevFwVersionMesKiq, amdSMI + "kDevFwVersionMesKiq"}, - {amd::smi::kDevFwVersionPfp, amdSMI + "kDevFwVersionPfp"}, - {amd::smi::kDevFwVersionRlc, amdSMI + "kDevFwVersionRlc"}, - {amd::smi::kDevFwVersionRlcSrlc, amdSMI + "kDevFwVersionRlcSrlc"}, - {amd::smi::kDevFwVersionRlcSrlg, amdSMI + "kDevFwVersionRlcSrlg"}, - {amd::smi::kDevFwVersionRlcSrls, amdSMI + "kDevFwVersionRlcSrls"}, - {amd::smi::kDevFwVersionSdma, amdSMI + "kDevFwVersionSdma"}, - {amd::smi::kDevFwVersionSdma2, amdSMI + "kDevFwVersionSdma2"}, - {amd::smi::kDevFwVersionSmc, amdSMI + "kDevFwVersionSmc"}, - {amd::smi::kDevFwVersionSos, amdSMI + "kDevFwVersionSos"}, - {amd::smi::kDevFwVersionTaRas, amdSMI + "kDevFwVersionTaRas"}, - 
{amd::smi::kDevFwVersionTaXgmi, amdSMI + "kDevFwVersionTaXgmi"}, - {amd::smi::kDevFwVersionUvd, amdSMI + "kDevFwVersionUvd"}, - {amd::smi::kDevFwVersionVce, amdSMI + "kDevFwVersionVce"}, - {amd::smi::kDevFwVersionVcn, amdSMI + "kDevFwVersionVcn"}, - {amd::smi::kDevSerialNumber, amdSMI + "kDevSerialNumber"}, - {amd::smi::kDevMemPageBad, amdSMI + "kDevMemPageBad"}, - {amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"}, - {amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"}, - {amd::smi::kDevPmMetrics, amdSMI + "kDevPmMetrics"}, - {amd::smi::kDevRegMetrics, amdSMI + "kDevRegMetrics"}, - {amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"}, - {amd::smi::kDevAvailableComputePartition, amdSMI + - "kDevAvailableComputePartition"}, - {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}, - {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"}, - {amd::smi::kDevPCieVendorID, amdSMI + "kDevPCieVendorID"}, -}; - namespace amd { namespace smi { @@ -647,7 +560,7 @@ std::string RocmSMI::getRSMIEnvVarInfo(void) { for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); + ss << (std::to_string(*it) + " (" + Device::devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { ss << ", "; diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index f9589be8..61ec4243 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -257,6 +257,16 @@ bool IsInteger(const std::string & n_str) { return (*tmp == 0); } +bool stringToInteger(const std::string & n_str, int& value) { + try { + value = std::stoi(trim(n_str), nullptr); + return true; + } catch (...) 
{ + return false; + } + return false; +} + rsmi_status_t handleException() { try { throw; @@ -503,6 +513,7 @@ std::vector getListOfAppTmpFiles() { continue; } } + closedir(dir); return tmpFiles; } diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 7032b1a5..392b6188 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1352,6 +1352,23 @@ amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, static_cast(clk_type), freq_bitmask); } + +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, @@ -1733,6 +1750,7 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ // Increase the total ecc counts ec->correctable_count += block_ec.correctable_count; ec->uncorrectable_count += block_ec.uncorrectable_count; + ec->deferred_count += block_ec.deferred_count; } } }