From 463817f34477d9e2a1f73227d6e25ceea0b0c49d Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 1 Mar 2024 09:08:48 -0600 Subject: [PATCH 01/15] SWDEV-448626 - Removed gpu prefix in non-csv formats Signed-off-by: Maisam Arif Change-Id: I77fc58828a978080482e6ab01ff89f1f5a554dc5 --- amdsmi_cli/amdsmi_commands.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 9a531d72..1168b5f9 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -161,12 +161,13 @@ def list(self, args, multiple_devices=False, gpu=None): except amdsmi_exception.AmdSmiLibraryException as e: uuid = e.get_error_info() - # Store values based on format - if self.logger.is_human_readable_format(): - self.logger.store_output(args.gpu, 'AMDSMI_SPACING_REMOVAL', {'gpu_bdf':bdf, 'gpu_uuid':uuid}) - else: + # CSV format is intentionally aligned with Host + if self.logger.is_csv_format(): self.logger.store_output(args.gpu, 'gpu_bdf', bdf) self.logger.store_output(args.gpu, 'gpu_uuid', uuid) + else: + self.logger.store_output(args.gpu, 'bdf', bdf) + self.logger.store_output(args.gpu, 'uuid', uuid) if multiple_devices: self.logger.store_multiple_device_output() From cfb9b5e7503a08933da3fc445b287843db746185 Mon Sep 17 00:00:00 2001 From: Deepak Mewar Date: Mon, 4 Mar 2024 03:29:03 -0500 Subject: [PATCH 02/15] Updated as per latest esmi library changes in github Change-Id: I949e1f2dcffc223274505764c84f2c6b9a533c98 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae080f87..e290230e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,7 +116,7 @@ if(ENABLE_ESMI_LIB) if(NOT EXISTS ${PROJECT_SOURCE_DIR}/esmi_ib_library/src) # TODO: use ExternalProject_Add instead or a submodule # as of 2023.10.16 CI builds are broken with an updated submodule - execute_process(COMMAND git clone --depth=1 -b esmi_so_ver-3.0 
https://github.com/amd/esmi_ib_library.git ${PROJECT_SOURCE_DIR}/esmi_ib_library) + execute_process(COMMAND git clone --depth=1 -b esmi_pkg_ver-3.0.3 https://github.com/amd/esmi_ib_library.git ${PROJECT_SOURCE_DIR}/esmi_ib_library) endif() if(NOT EXISTS ${PROJECT_SOURCE_DIR}/esmi_ib_library/include/asm/amd_hsmp.h) file(DOWNLOAD From c8c03dfab08ecdf6cf2aec2606ca46f7325c396e Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Mon, 4 Mar 2024 11:02:50 -0600 Subject: [PATCH 03/15] Revert is not None check for static & metric arugment checks Signed-off-by: Maisam Arif Change-Id: I351c88d53c9a626ad4305a7c61dc18b976b853f2 --- amdsmi_cli/amdsmi_commands.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 1168b5f9..5f28c4a1 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -801,7 +801,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, cpu_attributes = ["smu", "interface_ver"] for attr in cpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): cpu_args_enabled = True break @@ -812,7 +812,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, "dfc_ucode", "fb_info", "num_vf"] for attr in gpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): gpu_args_enabled = True break @@ -2256,7 +2256,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, "guard", "guest_data", "fb_usage", "xgmi"] for attr in gpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): gpu_args_enabled = True break @@ -2269,7 +2269,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"] for attr in cpu_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if 
getattr(args, attr): cpu_args_enabled = True break @@ -2278,7 +2278,7 @@ def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, core_attributes = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"] for attr in core_attributes: if hasattr(args, attr): - if getattr(args, attr) is not None: + if getattr(args, attr): core_args_enabled = True break @@ -3315,13 +3315,18 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu_args_enabled = False cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range", - "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] + "cpu_disable_apb", "soc_boost_limit"] for attr in cpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: cpu_args_enabled = True break + # Check if CPU set argument with store_true has been passed + if hasattr(args, "cpu_enable_apb"): + if getattr(args, "cpu_enable_apb"): + cpu_args_enabled = True + # Check if a Core argument has been set core_args_enabled = False core_attributes = ["core_boost_limit"] From c489cb8f3f1ea4c370e9b5d32f56bafbfadc5b48 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 4 Mar 2024 09:27:28 -0600 Subject: [PATCH 04/15] Add support for deferred RAS errors in API The API will support the deferred errors Change-Id: I221a146f09fefde1fc31e5f746d0870e07c93561 --- include/amd_smi/amdsmi.h | 11 ++++++----- rocm_smi/include/rocm_smi/rocm_smi.h | 6 ++++-- rocm_smi/src/rocm_smi.cc | 14 ++++++++++++++ src/amd_smi/amd_smi.cc | 1 + 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 57c83ead..7e283bfb 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -1429,9 +1429,10 @@ typedef struct { * @brief This structure holds error counts. 
*/ typedef struct { - uint64_t correctable_count; //!< Accumulated correctable errors - uint64_t uncorrectable_count; //!< Accumulated uncorrectable errors - uint64_t reserved[2]; + uint64_t correctable_count; //!< Accumulated correctable errors + uint64_t uncorrectable_count; //!< Accumulated uncorrectable errors + uint64_t deferred_count; //!< Accumulated deferred errors + uint64_t reserved[5]; } amdsmi_error_count_t; /** @@ -4662,8 +4663,8 @@ amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_pro */ /** - * @brief Returns the total number of ECC errors (correctable and - * uncorrectable) in the given GPU. It is not supported on + * @brief Returns the total number of ECC errors (correctable, + * uncorrectable and deferred) in the given GPU. It is not supported on * virtual machine guest * * @platform{gpu_bm_linux} @platform{host} diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 8d292930..d42ac466 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1194,8 +1194,10 @@ typedef enum { * @brief This structure holds error counts. 
*/ typedef struct { - uint64_t correctable_err; //!< Accumulated correctable errors - uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors + uint64_t correctable_err; //!< Accumulated correctable errors + uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors + uint64_t deferred_err; //!< Accumulated deferred errors + uint64_t reserved[5]; } rsmi_error_count_t; /** diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index fdf64a93..35082a98 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -766,6 +766,20 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, assert(junk == "ce:"); fs2 >> ec->correctable_err; + ec->deferred_err = 0; + if (val_vec.size() > 2) { + std::istringstream fs3(val_vec[2]); + fs3 >> junk; + if (junk == "de:") { + fs3 >> ec->deferred_err; + } else { + ss << __PRETTY_FUNCTION__ + << "Trying to get the de count, but got " << junk + << " ignore the defer count"; + LOG_ERROR(ss); + } + } + ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", reporting " << amd::smi::getRSMIStatusString(ret);; LOG_TRACE(ss); diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 7032b1a5..dc5f9509 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1733,6 +1733,7 @@ amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_ // Increase the total ecc counts ec->correctable_count += block_ec.correctable_count; ec->uncorrectable_count += block_ec.uncorrectable_count; + ec->deferred_count += block_ec.deferred_count; } } } From 1b0e01d50466393a17c3ef2c3728ffa9451f069f Mon Sep 17 00:00:00 2001 From: David Galiffi Date: Tue, 5 Mar 2024 14:59:55 -0500 Subject: [PATCH 05/15] Add Doc team to CODEOWNERS file Signed-off-by: David Galiffi Change-Id: Iad8eea0645b63bddb835ed22080facc7d25c1bc0 --- .github/CODEOWNERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ffdc4f48..cf1eca3d 100644 --- 
a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1,6 @@ * @marifamd @bill-shuzhou-liu @dmitrii-galantsev @charis-poag-amd @oliveiradan + +docs/* @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation + From 50740a3e9117d14b4cd5aa7d4849600cd845f0eb Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 6 Mar 2024 18:55:40 -0600 Subject: [PATCH 06/15] Add .github/CONTRIBUTING.md Change-Id: Ia7a2272516f2fed37dd38debad09b79484f04684 Signed-off-by: Galantsev, Dmitrii --- .github/CONTRIBUTING.md | 103 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 .github/CONTRIBUTING.md diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..cd0b8a4f --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,103 @@ +# Contributing to AMD SMI # + +We welcome contributions to AMD SMI. +Please follow these details to help ensure your contributions will be successfully accepted. + +## Issue Discussion ## + +Please use the GitHub Issues tab to notify us of issues. + +* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and + comment or post to provide additional details, such as how you reproduced this issue. +* If you're not sure if your issue is the same, err on the side of caution and file your issue. + You can add a comment to include the issue number (and link) for the similar issue. If we evaluate + your issue as being the same as the existing issue, we'll close the duplicate. +* If your issue doesn't exist, use the issue template to file a new issue. + * When filing an issue, be sure to provide as much information as possible, including script output so + we can collect information about your configuration. This helps reduce the time required to + reproduce your issue. + * Check your issue regularly, as we may require additional information to successfully reproduce the + issue. 
+* You may also open an issue to ask the maintainers whether a proposed change + meets the acceptance criteria, or to discuss an idea pertaining to the library. + +## Acceptance Criteria ## + +The goal of the AMD SMI project is to provide a simple CLI and a library +for interacting with AMD GPUs. + +## Coding Style ## + +Please refer to `.clang-format`. It is suggested you use the `pre-commit` tool. +It mostly follows Google C++ formatting with a 100-character line limit. + +## Pull Request Guidelines ## + +When you create a pull request, you should target the default branch. Our +current default branch is the **develop** branch, which serves as our +integration branch. + +### Deliverables ### + +For each new file in the repository, +please include the licensing header: + + /* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-20XX, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. 
+ * - Neither the names of <Name of Development Group, Name of Institution>, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +### Process ### + +* Reviewers are listed in the CODEOWNERS file +* Code format guidelines + +AMD SMI uses the clang-format tool for formatting code in source files. +The formatting style is captured in .clang-format which is located at +the root of AMD SMI. There are two ways to apply it: + + 1. Using pre-commit and docker - `pre-commit run` + 1. Using only clang-format - `clang-format -i <path-to-source-file>` + +## References ## + +1. [pre-commit](https://github.com/pre-commit/pre-commit) +1. 
[clang-format](https://clang.llvm.org/docs/ClangFormat.html) From 01552193899a6fefe729b6d2bba85b5cb770bfb4 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 7 Mar 2024 03:36:43 -0600 Subject: [PATCH 07/15] Fix memory leak created by hanging opendir Change-Id: I01e372c6a6b427f21e89cb5e4217f876346a35be Signed-off-by: Galantsev, Dmitrii --- rocm_smi/src/rocm_smi_utils.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index f9589be8..ffa4ef70 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -503,6 +503,7 @@ std::vector getListOfAppTmpFiles() { continue; } } + closedir(dir); return tmpFiles; } From 44c189b9f5a642ddf3ac47fb578c53316bf81a5c Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 7 Mar 2024 04:56:16 -0600 Subject: [PATCH 08/15] Fix misc memory leaks Change-Id: I3dbf56e98d8c1312f9081956ed590962b2bdace3 Signed-off-by: Galantsev, Dmitrii --- rocm_smi/include/rocm_smi/rocm_smi_device.h | 1 + .../include/rocm_smi/rocm_smi_gpu_metrics.h | 2 +- rocm_smi/include/rocm_smi/rocm_smi_main.h | 1 - rocm_smi/src/rocm_smi.cc | 10 +- rocm_smi/src/rocm_smi_device.cc | 125 +++++++++++++++--- rocm_smi/src/rocm_smi_gpu_metrics.cc | 2 +- rocm_smi/src/rocm_smi_main.cc | 89 +------------ 7 files changed, 113 insertions(+), 117 deletions(-) diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index e699eec3..5ca5193b 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -255,6 +255,7 @@ class Device { rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); + static const std::map devInfoTypesStrings; private: std::shared_ptr monitor_; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index 
b67f90c7..b6cccdc6 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -724,7 +724,7 @@ class GpuMetricsBase_t }; using GpuMetricsBasePtr = std::shared_ptr; -using AMDGpuMetricFactories_t = std::map; +using AMDGpuMetricFactories_t = const std::map; class GpuMetricsBase_v11_t final : public GpuMetricsBase_t diff --git a/rocm_smi/include/rocm_smi/rocm_smi_main.h b/rocm_smi/include/rocm_smi/rocm_smi_main.h index 1cd2ec34..c957f512 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -117,7 +117,6 @@ class RocmSMI { void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); - static const std::map devInfoTypesStrings; private: std::vector> devices_; diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 35082a98..db5dbcc7 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -82,7 +82,7 @@ using amd::smi::monitorTypesToString; using amd::smi::getRSMIStatusString; using amd::smi::AMDGpuMetricsUnitType_t; using amd::smi::AMDGpuMetricTypeId_t; -auto &devInfoTypesStrings = amd::smi::RocmSMI::devInfoTypesStrings; +auto &devInfoTypesStrings = amd::smi::Device::devInfoTypesStrings; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3F; @@ -2027,7 +2027,7 @@ enum eNameStrType { NAME_STR_SUBSYS }; -std::map +static const std::map mapStringToRSMIComputePartitionTypes { {"CPX", RSMI_COMPUTE_PARTITION_CPX}, {"SPX", RSMI_COMPUTE_PARTITION_SPX}, @@ -2036,7 +2036,7 @@ mapStringToRSMIComputePartitionTypes { {"QPX", RSMI_COMPUTE_PARTITION_QPX} }; -std::map +static const std::map mapRSMIToStringComputePartitionTypes { {RSMI_COMPUTE_PARTITION_INVALID, "UNKNOWN"}, {RSMI_COMPUTE_PARTITION_CPX, "CPX"}, @@ -2046,7 +2046,7 @@ mapRSMIToStringComputePartitionTypes { {RSMI_COMPUTE_PARTITION_QPX, "QPX"} }; -std::map +static const std::map mapRSMIToStringMemoryPartitionTypes { 
{RSMI_MEMORY_PARTITION_UNKNOWN, "UNKNOWN"}, {RSMI_MEMORY_PARTITION_NPS1, "NPS1"}, @@ -2055,7 +2055,7 @@ mapRSMIToStringMemoryPartitionTypes { {RSMI_MEMORY_PARTITION_NPS8, "NPS8"} }; -std::map +static const std::map mapStringToMemoryPartitionTypes { {"NPS1", RSMI_MEMORY_PARTITION_NPS1}, {"NPS2", RSMI_MEMORY_PARTITION_NPS2}, diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 6e79c29b..305364b6 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -336,7 +336,7 @@ static const std::map kDevPerfLvlMap = { {RSMI_DEV_PERF_LEVEL_UNKNOWN, kDevPerfLevelUnknownStr}, }; -static std::map kDevInfoVarTypeToRSMIVariant = { +static const std::map kDevInfoVarTypeToRSMIVariant = { // rsmi_memory_type_t {kDevMemTotGTT, RSMI_MEM_TYPE_GTT}, {kDevMemTotVisVRAM, RSMI_MEM_TYPE_VIS_VRAM}, @@ -391,6 +391,89 @@ static std::map kDevInfoVarTypeToRSMIVariant = { {kDevDFCountersAvailable, RSMI_EVNT_GRP_XGMI} }; +const std::map +Device::devInfoTypesStrings = { + {kDevPerfLevel, "kDevPerfLevel"}, + {kDevOverDriveLevel, "kDevOverDriveLevel"}, + {kDevMemOverDriveLevel, "kDevMemOverDriveLevel"}, + {kDevDevID, "kDevDevID"}, + {kDevXGMIPhysicalID, "kDevXGMIPhysicalID"}, + {kDevDevRevID, "kDevDevRevID"}, + {kDevDevProdName, "kDevDevProdName"}, + {kDevBoardInfo, "kDevBoardInfo"}, + {kDevDevProdNum, "kDevDevProdNum"}, + {kDevVendorID, "kDevVendorID"}, + {kDevSubSysDevID, "kDevSubSysDevID"}, + {kDevSubSysVendorID, "kDevSubSysVendorID"}, + {kDevGPUMClk, "kDevGPUMClk"}, + {kDevGPUSClk, "kDevGPUSClk"}, + {kDevDCEFClk, "kDevDCEFClk"}, + {kDevFClk, "kDevFClk"}, + {kDevSOCClk, "kDevSOCClk"}, + {kDevPCIEClk, "kDevPCIEClk"}, + {kDevPowerProfileMode, "kDevPowerProfileMode"}, + {kDevUsage, "kDevUsage"}, + {kDevPowerODVoltage, "kDevPowerODVoltage"}, + {kDevVBiosVer, "kDevVBiosVer"}, + {kDevPCIEThruPut, "kDevPCIEThruPut"}, + {kDevErrCntSDMA, "kDevErrCntSDMA"}, + {kDevErrCntUMC, "kDevErrCntUMC"}, + {kDevErrCntGFX, "kDevErrCntGFX"}, + {kDevErrCntMMHUB, 
"kDevErrCntMMHUB"}, + {kDevErrCntPCIEBIF, "kDevErrCntPCIEBIF"}, + {kDevErrCntHDP, "kDevErrCntHDP"}, + {kDevErrCntXGMIWAFL, "kDevErrCntXGMIWAFL"}, + {kDevErrCntFeatures, "kDevErrCntFeatures"}, + {kDevErrRASSchema, "kDevErrRASSchema"}, + {kDevErrTableVersion, "kDevErrTableVersion"}, + {kDevMemTotGTT, "kDevMemTotGTT"}, + {kDevMemTotVisVRAM, "kDevMemTotVisVRAM"}, + {kDevMemTotVRAM, "kDevMemTotVRAM"}, + {kDevMemUsedGTT, "kDevMemUsedGTT"}, + {kDevMemUsedVisVRAM, "kDevMemUsedVisVRAM"}, + {kDevMemUsedVRAM, "kDevMemUsedVRAM"}, + {kDevVramVendor, "kDevVramVendor"}, + {kDevPCIEReplayCount, "kDevPCIEReplayCount"}, + {kDevUniqueId, "kDevUniqueId"}, + {kDevDFCountersAvailable, "kDevDFCountersAvailable"}, + {kDevMemBusyPercent, "kDevMemBusyPercent"}, + {kDevXGMIError, "kDevXGMIError"}, + {kDevFwVersionAsd, "kDevFwVersionAsd"}, + {kDevFwVersionCe, "kDevFwVersionCe"}, + {kDevFwVersionDmcu, "kDevFwVersionDmcu"}, + {kDevFwVersionMc, "kDevFwVersionMc"}, + {kDevFwVersionMe, "kDevFwVersionMe"}, + {kDevFwVersionMec, "kDevFwVersionMec"}, + {kDevFwVersionMec2, "kDevFwVersionMec2"}, + {kDevFwVersionMes, "kDevFwVersionMes"}, + {kDevFwVersionMesKiq, "kDevFwVersionMesKiq"}, + {kDevFwVersionPfp, "kDevFwVersionPfp"}, + {kDevFwVersionRlc, "kDevFwVersionRlc"}, + {kDevFwVersionRlcSrlc, "kDevFwVersionRlcSrlc"}, + {kDevFwVersionRlcSrlg, "kDevFwVersionRlcSrlg"}, + {kDevFwVersionRlcSrls, "kDevFwVersionRlcSrls"}, + {kDevFwVersionSdma, "kDevFwVersionSdma"}, + {kDevFwVersionSdma2, "kDevFwVersionSdma2"}, + {kDevFwVersionSmc, "kDevFwVersionSmc"}, + {kDevFwVersionSos, "kDevFwVersionSos"}, + {kDevFwVersionTaRas, "kDevFwVersionTaRas"}, + {kDevFwVersionTaXgmi, "kDevFwVersionTaXgmi"}, + {kDevFwVersionUvd, "kDevFwVersionUvd"}, + {kDevFwVersionVce, "kDevFwVersionVce"}, + {kDevFwVersionVcn, "kDevFwVersionVcn"}, + {kDevSerialNumber, "kDevSerialNumber"}, + {kDevMemPageBad, "kDevMemPageBad"}, + {kDevNumaNode, "kDevNumaNode"}, + {kDevGpuMetrics, "kDevGpuMetrics"}, + {kDevPmMetrics, "kDevPmMetrics"}, + {kDevRegMetrics, 
"kDevRegMetrics"}, + {kDevGpuReset, "kDevGpuReset"}, + {kDevAvailableComputePartition, "kDevAvailableComputePartition"}, + {kDevComputePartition, "kDevComputePartition"}, + {kDevMemoryPartition, "kDevMemoryPartition"}, + {kDevPCieVendorID, "kDevPCieVendorID"}, +}; + static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, @@ -644,7 +727,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (ret != 0) { ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -653,7 +736,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Issue: File is not a regular file - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")," + << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << ")," << " returning ENOENT (" << std::strerror(ENOENT) << ")"; LOG_ERROR(ss); return ENOENT; @@ -664,7 +747,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { if (!fs->is_open()) { ss << __PRETTY_FUNCTION__ << " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for " - << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), " + << "DevInfoInfoType (" << devInfoTypesStrings.at(type) << "), " << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; LOG_ERROR(ss); @@ -673,7 +756,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file (" << sysfs_path - << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoInfoType (" << 
devInfoTypesStrings.at(type) << ")"; LOG_INFO(ss); return 0; @@ -690,7 +773,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { ret = openDebugFileStream(type, &fs); if (ret != 0) { ss << "Could not read debugInfoStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type)<< "), returning " + << devInfoTypesStrings.at(type)<< "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -704,7 +787,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) { fs.close(); ss << "Successfully read debugInfoStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type)<< "), retString= " << *retStr; + << devInfoTypesStrings.at(type)<< "), retString= " << *retStr; LOG_INFO(ss); return 0; @@ -720,7 +803,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read device info string for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << "), returning " + << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -729,8 +812,8 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { fs >> *retStr; fs.close(); ss << __PRETTY_FUNCTION__ - << "Successfully read device info string for DevInfoType (" + - RocmSMI::devInfoTypesStrings.at(type) + "): " + *retStr + << "Successfully read device info string for DevInfoType (" << + devInfoTypesStrings.at(type) << "): " + *retStr << " | " << (fs.is_open() ? " File stream is opened" : " File stream is closed") << " | " << (fs.bad() ? 
"[ERROR] Bad read operation" : @@ -765,7 +848,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; @@ -776,7 +859,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.flush(); fs.close(); ss << "Successfully wrote device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning RSMI_STATUS_SUCCESS"; LOG_INFO(ss); ret = RSMI_STATUS_SUCCESS; @@ -790,7 +873,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, fs.close(); ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; " << "Could not write device info string (" << valStr - << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) + << ") for DevInfoType (" << devInfoTypesStrings.at(type) << "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret)); ss << " | " << (fs.is_open() ? 
"[ERROR] File stream open" : @@ -877,14 +960,14 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read DevInfoLine for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")"; + << devInfoTypesStrings.at(type) << ")"; LOG_ERROR(ss); return ret; } std::getline(fs, *line); ss << "Successfully read DevInfoLine for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << "), returning *line = " + << devInfoTypesStrings.at(type) << "), returning *line = " << *line; LOG_INFO(ss); @@ -903,7 +986,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { ss << "Could not read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << " - SYSFS (" << sysfs_path << ")" << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; @@ -915,7 +998,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, fclose(ptr); if ((num*b_size) != b_size) { ss << "Could not read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << devInfoTypesStrings.at(type) << ") - SYSFS (" << sysfs_path << "), binary size error; " << "[buff: " << p_binary_data @@ -929,7 +1012,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, return ENOENT; } ss << "Successfully read DevInfoBinary for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" + << devInfoTypesStrings.at(type) << ") - SYSFS (" << sysfs_path << "), returning binaryData = " << p_binary_data << "; byte_size = " << std::dec << static_cast(b_size); @@ -961,7 +1044,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << 
", but contained no string lines"; LOG_ERROR(ss); return ENXIO; @@ -979,12 +1062,12 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, if (!allLines.empty()) { ss << "Successfully read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ") " + << devInfoTypesStrings.at(type) << ") " << ", returning lines read = " << allLines; LOG_INFO(ss); } else { ss << "Read devInfoMultiLineStr for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type) << ")" + << devInfoTypesStrings.at(type) << ")" << ", but lines were empty"; LOG_INFO(ss); return ENXIO; diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index 37b65776..f6d7f80e 100755 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -368,7 +368,7 @@ GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_v << " |"; LOG_TRACE(ostrstream); - return (amd_gpu_metrics_factory_table[gpu_metric_version]); + return (amd_gpu_metrics_factory_table.at(gpu_metric_version)); } ostrstream << __PRETTY_FUNCTION__ diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 612482ae..c078712e 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -57,10 +57,8 @@ #include #include #include -#include #include #include -#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" @@ -79,91 +77,6 @@ static const char *kDeviceNamePrefix = "card"; static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""}; -static const std::string amdSMI = "amd::smi::"; -const std::map -amd::smi::RocmSMI::devInfoTypesStrings = { - {amd::smi::kDevPerfLevel, amdSMI + "kDevPerfLevel"}, - {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, - {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, - {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, - {amd::smi::kDevXGMIPhysicalID, amdSMI + "kDevXGMIPhysicalID"}, - {amd::smi::kDevDevRevID, 
amdSMI + "kDevDevRevID"}, - {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, - {amd::smi::kDevBoardInfo, amdSMI + "kDevBoardInfo"}, - {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, - {amd::smi::kDevVendorID, amdSMI + "kDevVendorID"}, - {amd::smi::kDevSubSysDevID, amdSMI + "kDevSubSysDevID"}, - {amd::smi::kDevSubSysVendorID, amdSMI + "kDevSubSysVendorID"}, - {amd::smi::kDevGPUMClk, amdSMI + "kDevGPUMClk"}, - {amd::smi::kDevGPUSClk, amdSMI + "kDevGPUSClk"}, - {amd::smi::kDevDCEFClk, amdSMI + "kDevDCEFClk"}, - {amd::smi::kDevFClk, amdSMI + "kDevFClk"}, - {amd::smi::kDevSOCClk, amdSMI + "kDevSOCClk"}, - {amd::smi::kDevPCIEClk, amdSMI + "kDevPCIEClk"}, - {amd::smi::kDevPowerProfileMode, amdSMI + "kDevPowerProfileMode"}, - {amd::smi::kDevUsage, amdSMI + "kDevUsage"}, - {amd::smi::kDevPowerODVoltage, amdSMI + "kDevPowerODVoltage"}, - {amd::smi::kDevVBiosVer, amdSMI + "kDevVBiosVer"}, - {amd::smi::kDevPCIEThruPut, amdSMI + "kDevPCIEThruPut"}, - {amd::smi::kDevErrCntSDMA, amdSMI + "kDevErrCntSDMA"}, - {amd::smi::kDevErrCntUMC, amdSMI + "kDevErrCntUMC"}, - {amd::smi::kDevErrCntGFX, amdSMI + "kDevErrCntGFX"}, - {amd::smi::kDevErrCntMMHUB, amdSMI + "kDevErrCntMMHUB"}, - {amd::smi::kDevErrCntPCIEBIF, amdSMI + "kDevErrCntPCIEBIF"}, - {amd::smi::kDevErrCntHDP, amdSMI + "kDevErrCntHDP"}, - {amd::smi::kDevErrCntXGMIWAFL, amdSMI + "kDevErrCntXGMIWAFL"}, - {amd::smi::kDevErrCntFeatures, amdSMI + "kDevErrCntFeatures"}, - {amd::smi::kDevErrRASSchema, amdSMI + "kDevErrRASSchema"}, - {amd::smi::kDevErrTableVersion, amdSMI + "kDevErrTableVersion"}, - {amd::smi::kDevMemTotGTT, amdSMI + "kDevMemTotGTT"}, - {amd::smi::kDevMemTotVisVRAM, amdSMI + "kDevMemTotVisVRAM"}, - {amd::smi::kDevMemTotVRAM, amdSMI + "kDevMemTotVRAM"}, - {amd::smi::kDevMemUsedGTT, amdSMI + "kDevMemUsedGTT"}, - {amd::smi::kDevMemUsedVisVRAM, amdSMI + "kDevMemUsedVisVRAM"}, - {amd::smi::kDevMemUsedVRAM, amdSMI + "kDevMemUsedVRAM"}, - {amd::smi::kDevVramVendor, amdSMI + "kDevVramVendor"}, - 
{amd::smi::kDevPCIEReplayCount, amdSMI + "kDevPCIEReplayCount"}, - {amd::smi::kDevUniqueId, amdSMI + "kDevUniqueId"}, - {amd::smi::kDevDFCountersAvailable, amdSMI + "kDevDFCountersAvailable"}, - {amd::smi::kDevMemBusyPercent, amdSMI + "kDevMemBusyPercent"}, - {amd::smi::kDevXGMIError, amdSMI + "kDevXGMIError"}, - {amd::smi::kDevFwVersionAsd, amdSMI + "kDevFwVersionAsd"}, - {amd::smi::kDevFwVersionCe, amdSMI + "kDevFwVersionCe"}, - {amd::smi::kDevFwVersionDmcu, amdSMI + "kDevFwVersionDmcu"}, - {amd::smi::kDevFwVersionMc, amdSMI + "kDevFwVersionMc"}, - {amd::smi::kDevFwVersionMe, amdSMI + "kDevFwVersionMe"}, - {amd::smi::kDevFwVersionMec, amdSMI + "kDevFwVersionMec"}, - {amd::smi::kDevFwVersionMec2, amdSMI + "kDevFwVersionMec2"}, - {amd::smi::kDevFwVersionMes, amdSMI + "kDevFwVersionMes"}, - {amd::smi::kDevFwVersionMesKiq, amdSMI + "kDevFwVersionMesKiq"}, - {amd::smi::kDevFwVersionPfp, amdSMI + "kDevFwVersionPfp"}, - {amd::smi::kDevFwVersionRlc, amdSMI + "kDevFwVersionRlc"}, - {amd::smi::kDevFwVersionRlcSrlc, amdSMI + "kDevFwVersionRlcSrlc"}, - {amd::smi::kDevFwVersionRlcSrlg, amdSMI + "kDevFwVersionRlcSrlg"}, - {amd::smi::kDevFwVersionRlcSrls, amdSMI + "kDevFwVersionRlcSrls"}, - {amd::smi::kDevFwVersionSdma, amdSMI + "kDevFwVersionSdma"}, - {amd::smi::kDevFwVersionSdma2, amdSMI + "kDevFwVersionSdma2"}, - {amd::smi::kDevFwVersionSmc, amdSMI + "kDevFwVersionSmc"}, - {amd::smi::kDevFwVersionSos, amdSMI + "kDevFwVersionSos"}, - {amd::smi::kDevFwVersionTaRas, amdSMI + "kDevFwVersionTaRas"}, - {amd::smi::kDevFwVersionTaXgmi, amdSMI + "kDevFwVersionTaXgmi"}, - {amd::smi::kDevFwVersionUvd, amdSMI + "kDevFwVersionUvd"}, - {amd::smi::kDevFwVersionVce, amdSMI + "kDevFwVersionVce"}, - {amd::smi::kDevFwVersionVcn, amdSMI + "kDevFwVersionVcn"}, - {amd::smi::kDevSerialNumber, amdSMI + "kDevSerialNumber"}, - {amd::smi::kDevMemPageBad, amdSMI + "kDevMemPageBad"}, - {amd::smi::kDevNumaNode, amdSMI + "kDevNumaNode"}, - {amd::smi::kDevGpuMetrics, amdSMI + "kDevGpuMetrics"}, - 
{amd::smi::kDevPmMetrics, amdSMI + "kDevPmMetrics"}, - {amd::smi::kDevRegMetrics, amdSMI + "kDevRegMetrics"}, - {amd::smi::kDevGpuReset, amdSMI + "kDevGpuReset"}, - {amd::smi::kDevAvailableComputePartition, amdSMI + - "kDevAvailableComputePartition"}, - {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}, - {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"}, - {amd::smi::kDevPCieVendorID, amdSMI + "kDevPCieVendorID"}, -}; - namespace amd { namespace smi { @@ -647,7 +560,7 @@ std::string RocmSMI::getRSMIEnvVarInfo(void) { for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); + ss << (std::to_string(*it) + " (" + Device::devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { ss << ", "; From ec56aba6c1ec402023aca959c218204d4b8076e6 Mon Sep 17 00:00:00 2001 From: Lisa Date: Tue, 5 Mar 2024 09:09:13 -0700 Subject: [PATCH 09/15] fix links Change-Id: I23520f7abf5e67453a928a07b46f126bcd5c1469 Reviewed-By: Galantsev, Dmitrii --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2dc3a07c..109f890e 100755 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ and [esmi_ib_library](https://github.com/amd/esmi_ib_library) At initial release, the AMD SMI library will support Linux bare metal and Linux virtual machine guest for AMD GPUs. In the future release, the library will be extended to support AMD EPYC™ CPUs. 
-AMD SMI library can run on AMD ROCm supported platforms, please refer to [List of Supported Operating Systems and GPUs](https://rocm.docs.amd.com/en/latest/release/gpu_os_support.html) +AMD SMI library can run on AMD ROCm supported platforms, refer to [System requirements (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) for more information. To run the AMD SMI library, the amdgpu driver and the hsmp driver needs to be installed. Optionally, the libdrm can be installed to query firmware information and hardware IPs. @@ -195,7 +195,7 @@ python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html The output will be in `docs/_build/html`. -For additional details, see the [ROCm Contributing Guide](https://rocm.docs.amd.com/en/latest/contributing.html#building-documentation) +For additional details, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). ## Building AMD SMI From dea4fac979142f4fd1eff31069927cc027ad6321 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Mon, 4 Mar 2024 21:42:44 -0600 Subject: [PATCH 10/15] Enabled ecc-blocks argument to linux VM Signed-off-by: Maisam Arif Change-Id: I310c227ffa3ef45688a49cdedb43844aafe86339 --- amdsmi_cli/amdsmi_commands.py | 12 ++++++------ amdsmi_cli/amdsmi_parser.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 5f28c4a1..536f3792 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -1124,14 +1124,14 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No args.temperature = temperature if ecc: args.ecc = ecc + if ecc_blocks: + args.ecc_blocks = ecc_blocks if pcie: args.pcie = pcie - current_platform_args += ["usage", "power", "clock", "temperature", "ecc", "pcie"] - current_platform_values += [args.usage, args.power, args.clock, args.temperature, 
args.ecc, args.pcie] + current_platform_args += ["usage", "power", "clock", "temperature", "ecc", "ecc_blocks", "pcie"] + current_platform_values += [args.usage, args.power, args.clock, args.temperature, args.ecc, args.ecc_blocks, args.pcie] if self.helpers.is_baremetal() and self.helpers.is_linux(): - if ecc_blocks: - args.ecc_blocks = ecc_blocks if fan: args.fan = fan if voltage_curve: @@ -1144,8 +1144,8 @@ def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=No args.xgmi_err = xgmi_err if energy: args.energy = energy - current_platform_args += ["ecc_blocks", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] - current_platform_values += [args.ecc_blocks, args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] + current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] + current_platform_values += [args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] if self.helpers.is_hypervisor(): if schedule: diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 3140cd52..ab283d9c 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -759,10 +759,10 @@ def _add_metric_parser(self, subparsers, func): metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): - metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) metric_parser.add_argument('-f', '--fan', action='store_true', 
required=False, help=fan_help) metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) From 2f8f34946e6a37acd73969b11b975a22de4a98c0 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 6 Mar 2024 08:39:42 -0600 Subject: [PATCH 11/15] SWDEV-443112 - Ensured dictionary output when static --bus is empty Signed-off-by: Maisam Arif Change-Id: Ibd61eeec417a9ff40cb868073b3e1eed2a87cc59 --- amdsmi_cli/amdsmi_commands.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 536f3792..a919b472 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -344,7 +344,13 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None static_dict['asic'] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) if args.bus: - bus_info = {} + bus_info = { + 'bdf': "N/A", + 'max_pcie_width': "N/A", + 'max_pcie_speed': "N/A", + 'pcie_interface_version': "N/A", + 'slot_type': "N/A" + } try: bus_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) @@ -356,7 +362,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu) bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width'] bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed'] - bus_info['pcie_slot_type'] = link_caps['pcie_static']['slot_type'] bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version'] if bus_info['max_pcie_speed'] % 1000 != 0: @@ -366,15 +371,13 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None bus_info['max_pcie_speed'] = pcie_speed_GTs_value - slot_type = bus_info.pop('pcie_slot_type') + slot_type = 
link_caps['pcie_static']['slot_type'] if isinstance(slot_type, int): slot_types = amdsmi_interface.amdsmi_wrapper.amdsmi_card_form_factor_t__enumvalues if slot_type in slot_types: bus_info['slot_type'] = slot_types[slot_type].replace("AMDSMI_CARD_FORM_FACTOR_", "") else: bus_info['slot_type'] = "Unknown" - else: - bus_info['slot_type'] = "N/A" if bus_info['pcie_interface_version'] > 0: bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}" @@ -389,7 +392,6 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None "unit" : pcie_speed_unit} except amdsmi_exception.AmdSmiLibraryException as e: - bus_info = "N/A" logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['bus'] = bus_info From 108e6d4ae6754ef1805ebc1fd0119754e4f982f8 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 22 Feb 2024 08:38:54 -0600 Subject: [PATCH 12/15] Set and get DPM policy for GPU device Add new APIs to set and get dpm policy for the GPU device. 
Change-Id: I26fa49cd17d0ce66bda3446c38945a6cf35717ff --- amdsmi_cli/README.md | 51 +++++++- amdsmi_cli/amdsmi_commands.py | 58 ++++++--- amdsmi_cli/amdsmi_helpers.py | 1 - amdsmi_cli/amdsmi_parser.py | 4 + example/amd_smi_nodrm_example.cc | 12 ++ include/amd_smi/amdsmi.h | 74 ++++++++++- py-interface/amdsmi_interface.py | 44 ++++++- py-interface/amdsmi_wrapper.py | 68 ++++++++--- rocm_smi/include/rocm_smi/rocm_smi.h | 69 +++++++++++ rocm_smi/include/rocm_smi/rocm_smi_device.h | 1 + rocm_smi/include/rocm_smi/rocm_smi_utils.h | 1 + rocm_smi/src/rocm_smi.cc | 128 ++++++++++++++++++++ rocm_smi/src/rocm_smi_device.cc | 7 ++ rocm_smi/src/rocm_smi_utils.cc | 10 ++ src/amd_smi/amd_smi.cc | 17 +++ 15 files changed, 506 insertions(+), 39 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index abded835..cf2b81df 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -474,7 +474,7 @@ Command Modifiers: ```bash usage: amd-smi set [-h] (-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]) [-f %] [-l LEVEL] [-P SETPROFILE] [-d SCLKMAX] [-C PARTITION] [-M PARTITION] - [-o WATTS] [--cpu-pwr-limit PWR_LIMIT] + [-o WATTS] [-p POLICY_ID] [--cpu-pwr-limit PWR_LIMIT] [--cpu-xgmi-link-width MIN_WIDTH MAX_WIDTH] [--cpu-lclk-dpm-level NBIOID MIN_DPM MAX_DPM] [--cpu-pwr-eff-mode MODE] [--cpu-gmi3-link-width MIN_LW MAX_LW] [--cpu-pcie-link-rate LINK_RATE] @@ -512,6 +512,7 @@ Set Arguments: -M, --memory-partition PARTITION Set one of the following the memory partition modes: NPS1, NPS2, NPS4, NPS8 -o, --power-cap WATTS Set power capacity limit + -p, --dpm-policy POLICY_ID Set the GPU DPM policy using policy id CPU Arguments: --cpu-pwr-limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
@@ -674,6 +675,18 @@ GPU: 0 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 0 AFFINITY: 0 @@ -770,6 +783,18 @@ GPU: 1 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 1 AFFINITY: 1 @@ -866,6 +891,18 @@ GPU: 2 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 2 AFFINITY: 2 @@ -962,6 +999,18 @@ GPU: 3 PARTITION: COMPUTE_PARTITION: SPX MEMORY_PARTITION: NPS1 + POLICY: + NUM_SUPPORTED: 4 + CURRENT_ID: 1 + POLICIES: + POLICY_ID: 0 + POLICY_DESCRIPTION: pstate_default + POLICY_ID: 1 + POLICY_DESCRIPTION: soc_pstate_0 + POLICY_ID: 2 + POLICY_DESCRIPTION: soc_pstate_1 + POLICY_ID: 3 + POLICY_DESCRIPTION: soc_pstate_2 NUMA: NODE: 3 AFFINITY: 3 diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index a919b472..697513f5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -244,7 +244,7 @@ def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None) def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, - cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None): + 
cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, policy=None): """Get Static information for target gpu Args: @@ -267,7 +267,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None dfc_ucode (bool, optional): Value override for args.dfc_ucode. Defaults to None. fb_info (bool, optional): Value override for args.fb_info. Defaults to None. num_vf (bool, optional): Value override for args.num_vf. Defaults to None. - + policy (bool, optional): Value override for args.policy. Defaults to None. Returns: None: Print output via AMDSMILogger to destination """ @@ -300,8 +300,10 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None args.partition = partition if limit: args.limit = limit - current_platform_args += ["ras", "limit", "partition"] - current_platform_values += [args.ras, args.limit, args.partition] + if policy: + args.policy = policy + current_platform_args += ["ras", "limit", "partition", "policy"] + current_platform_values += [args.ras, args.limit, args.partition, args.policy] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -486,6 +488,7 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None shutdown_temp_vram_limit = "N/A" logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + # Assign units power_unit = 'W' temp_unit_human_readable = '\N{DEGREE SIGN}C' @@ -626,6 +629,15 @@ def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None static_dict['partition'] = {"compute_partition": compute_partition, "memory_partition": memory_partition} + if 'policy' in current_platform_args: + if args.policy: + try: + policy_info = amdsmi_interface.amdsmi_get_dpm_policy(args.gpu) + except amdsmi_exception.AmdSmiLibraryException as e: + policy_info = "N/A" + logging.debug("Failed to get policy info for gpu %s | %s", gpu_id, e.get_error_info()) + + 
static_dict['dpm_policy'] = policy_info if 'numa' in current_platform_args: if args.numa: try: @@ -762,7 +774,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None, cpu=None, - interface_ver=None): + interface_ver=None, policy=None): """Get Static information for target gpu and cpu Args: @@ -785,7 +797,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, num_vf (bool, optional): Value override for args.num_vf. Defaults to None. cpu (cpu_handle, optional): cpu_handle for target device. Defaults to None. interface_ver (bool, optional): Value override for args.interface_ver. Defaults to None - + policy (bool, optional): Value override for args.policy. Defaults to None. Raises: IndexError: Index error if gpu list is empty @@ -811,7 +823,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, gpu_args_enabled = False gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", "board", "numa", "vram", "cache", "partition", - "dfc_ucode", "fb_info", "num_vf"] + "dfc_ucode", "fb_info", "num_vf", "policy"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -841,7 +853,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None: args.cpu = self.cpu_handles @@ -855,7 +867,7 @@ def static(self, args, multiple_devices=False, gpu=None, asic=None, self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, - dfc_ucode, fb_info, num_vf) + dfc_ucode, fb_info, num_vf, policy) def 
firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -3096,7 +3108,7 @@ def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None, def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, - memory_partition=None, power_cap=None): + memory_partition=None, power_cap=None, dpm_policy=None): """Issue reset commands to target gpu(s) Args: @@ -3110,6 +3122,7 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. power_cap (int, optional): Value override for args.power_cap. Defaults to None. + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -3135,7 +3148,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.memory_partition = memory_partition if power_cap: args.power_cap = power_cap - + if dpm_policy: + args.dpm_policy = dpm_policy # Handle No GPU passed if args.gpu == None: raise ValueError('No GPU provided, specific GPU target(s) are needed') @@ -3154,7 +3168,8 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N args.compute_partition, args.memory_partition, args.perf_determinism is not None, - args.power_cap]): + args.power_cap, + args.dpm_policy]): command = " ".join(sys.argv[1:]) raise AmdSmiRequiredCommandException(command, self.logger.format) @@ -3218,6 +3233,16 @@ def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=N raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set memory partition to {args.memory_partition} on 
{gpu_string}") from e self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + + if args.dpm_policy: + try: + amdsmi_interface.amdsmi_set_dpm_policy(args.gpu, args.dpm_policy) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm policy to {args.dpm_policy} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'dpmpolicy', f"Successfully set dpm policy to id {args.dpm_policy}") + if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) @@ -3257,7 +3282,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, - soc_boost_limit=None, core=None, core_boost_limit=None): + soc_boost_limit=None, core=None, core_boost_limit=None, dpm_policy=None): """Issue reset commands to target gpu(s) Args: @@ -3286,6 +3311,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level core (device_handle, optional): device_handle for target core. Defaults to None. core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None + dpm_policy (int, optional): Value override for args.dpm_policy. Defaults to None. 
Raises: ValueError: Value error if no gpu value is provided @@ -3306,7 +3332,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", - "memory_partition", "power_cap"] + "memory_partition", "power_cap", "dpm_policy"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr) is not None: @@ -3367,7 +3393,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: raise ValueError('No CPU or CORE provided, specific target(s) are needed') @@ -3386,7 +3412,7 @@ def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level self.logger.clear_multiple_devices_ouput() self.set_gpu(args, multiple_devices, gpu, fan, perf_level, profile, perf_determinism, compute_partition, - memory_partition, power_cap) + memory_partition, power_cap, dpm_policy) def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index a685a7e8..080cc353 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -632,7 +632,6 @@ def get_compute_partition_types(self): compute_partitions_str.remove('INVALID') return compute_partitions_str - def get_memory_partition_types(self): memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] if 'UNKNOWN' in memory_partitions_str: diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index ab283d9c..5341b274 100644 --- 
a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -543,6 +543,7 @@ def _add_static_parser(self, subparsers, func): vram_help = "All vram information" cache_help = "All cache information" board_help = "All board information" + dpm_policy_help = "The available DPM policy" # Options arguments help text for Hypervisors and Baremetal ras_help = "Displays RAS features information" @@ -582,6 +583,7 @@ def _add_static_parser(self, subparsers, func): static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) + static_parser.add_argument('-P', '--policy', action='store_true', required=False, help=dpm_policy_help) if self.helpers.is_linux() and not self.helpers.is_virtual_os(): static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) @@ -963,6 +965,7 @@ def _add_set_value_parser(self, subparsers, func): set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" set_power_cap_help = "Set power capacity limit" + set_dpm_policy_help = f"Set the GPU DPM policy using policy id\n" # Help text for CPU set options set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." 
@@ -998,6 +1001,7 @@ def _add_set_value_parser(self, subparsers, func): set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') + set_value_parser.add_argument('-p', '--dpm-policy', action='store', required=False, type=self._not_negative_int, help=set_dpm_policy_help, metavar='POLICY_ID') if self.helpers.is_amd_hsmp_initialized(): # Optional CPU Args diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc index 0f829375..e6d37ced 100644 --- a/example/amd_smi_nodrm_example.cc +++ b/example/amd_smi_nodrm_example.cc @@ -331,6 +331,18 @@ int main() { printf(" Output of amdsmi_get_power_cap_info:\n"); std::cout << "\t\t Power Cap: " << cap_info.power_cap / 1000000 << "W\n\n"; + + amdsmi_dpm_policy_t policy; + ret = amdsmi_get_dpm_policy(processor_handles[j], &policy); + if (ret != AMDSMI_STATUS_NOT_SUPPORTED) { + CHK_AMDSMI_RET(ret) + std::cout << "\t amdsmi_get_dpm_policy total:" << policy.num_supported + <<" current:" << policy.current << "\n"; + for (int x=0; x < policy.num_supported; x++) { + std::cout << x <<": (" << policy.policies[x].policy_id + <<"," << policy.policies[x].policy_description << ")\n"; + } + } } } diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 7e283bfb..ef58a6ce 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -151,7 +151,7 @@ typedef enum { #define AMDSMI_LIB_VERSION_YEAR 24 //! 
Major version should be changed for every header change (adding/deleting APIs, changing names, fields of structures, etc.) -#define AMDSMI_LIB_VERSION_MAJOR 4 +#define AMDSMI_LIB_VERSION_MAJOR 5 //! Minor version should be updated for each API change, but without changing headers #define AMDSMI_LIB_VERSION_MINOR 0 @@ -1151,6 +1151,37 @@ typedef struct { uint64_t frequency[AMDSMI_MAX_NUM_FREQUENCIES]; } amdsmi_frequencies_t; +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[AMDSMI_MAX_NAME]; +} amdsmi_dpm_policy_entry_t; + +#define AMDSMI_MAX_NUM_PM_POLICIES 32 + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + amdsmi_dpm_policy_entry_t policies[AMDSMI_MAX_NUM_PM_POLICIES]; +} amdsmi_dpm_policy_t; + /** * @brief This structure holds information about the possible PCIe * bandwidths. Specifically, the possible transfer rates and their @@ -3333,6 +3364,47 @@ amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle, this function will write + * current dpm policy settings to @p policy. All the processors at the same socket + * will have the same policy. + * + * @param[in] processor_handle a processor handle + * + * @param[in, out] policy the dpm policy for this processor. 
+ * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for the processor + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a dpm policy @p policy_id, + * this function will set the dpm policy for this processor. All the processors at + * the same socket will be set to the same policy. + * + * @note This function requires root access + * + * @param[in] processor_handle a processor handle + * + * @param[in] policy_id the dpm policy id to set. The id is the id in + * amdsmi_dpm_policy_entry_t, which can be obtained by calling + * amdsmi_get_dpm_policy() + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy_id); /** @} End PerfCont */ /*****************************************************************************/ diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 74a190da..7eb501bb 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -360,7 +360,6 @@ class AmdSmiProcessorType(IntEnum): NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU - class AmdSmiEventReader: def __init__( self, processor_handle: amdsmi_wrapper.amdsmi_processor_handle, @@ -2690,6 +2689,19 @@ def amdsmi_set_clk_freq( ) ) +def amdsmi_set_dpm_policy( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + policy_id: int, +): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + _check_res( + 
amdsmi_wrapper.amdsmi_set_dpm_policy( + processor_handle, policy_id + ) + ) def amdsmi_set_gpu_overdrive_level( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, overdrive_value: int @@ -3249,6 +3261,36 @@ def amdsmi_get_clk_freq( "frequency": list(freq.frequency)[: freq.num_supported - 1], } +def amdsmi_get_dpm_policy( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + policy = amdsmi_wrapper.amdsmi_dpm_policy_t() + _check_res( + amdsmi_wrapper.amdsmi_get_dpm_policy( + processor_handle, ctypes.byref(policy) + ) + ) + + polices = [] + for i in range(0, policy.num_supported): + id = policy.policies[i].policy_id + desc = policy.policies[i].policy_description + polices.append({ + 'policy_id' : id, + 'policy_description': desc.decode() + }) + current_id = policy.policies[policy.current].policy_id + + return { + "num_supported": policy.num_supported, + "current_id": current_id, + "policies": polices, + } def amdsmi_get_gpu_od_volt_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 91bdc8bd..8fcdb375 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -746,6 +746,19 @@ class struct_fields_(Structure): class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + class struct_pcie_metric_(Structure): pass @@ -764,19 +777,6 @@ class struct_pcie_metric_(Structure): 
('reserved', ctypes.c_uint64 * 13), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1480,6 +1480,27 @@ class struct_amdsmi_frequencies_t(Structure): ] amdsmi_frequencies_t = struct_amdsmi_frequencies_t +class struct_amdsmi_dpm_policy_entry_t(Structure): + pass + +struct_amdsmi_dpm_policy_entry_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_entry_t._fields_ = [ + ('policy_id', ctypes.c_uint32), + ('policy_description', ctypes.c_char * 32), +] + +amdsmi_dpm_policy_entry_t = struct_amdsmi_dpm_policy_entry_t +class struct_amdsmi_dpm_policy_t(Structure): + pass + +struct_amdsmi_dpm_policy_t._pack_ = 1 # source:False +struct_amdsmi_dpm_policy_t._fields_ = [ + ('num_supported', ctypes.c_uint32), + ('current', ctypes.c_uint32), + ('policies', struct_amdsmi_dpm_policy_entry_t * 32), +] + +amdsmi_dpm_policy_t = struct_amdsmi_dpm_policy_t class struct_amdsmi_pcie_bandwidth_t(Structure): pass @@ -2030,6 +2051,12 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): amdsmi_set_clk_freq = _libraries['libamd_smi.so'].amdsmi_set_clk_freq amdsmi_set_clk_freq.restype = amdsmi_status_t amdsmi_set_clk_freq.argtypes = [amdsmi_processor_handle, amdsmi_clk_type_t, uint64_t] +amdsmi_get_dpm_policy = _libraries['libamd_smi.so'].amdsmi_get_dpm_policy +amdsmi_get_dpm_policy.restype = amdsmi_status_t +amdsmi_get_dpm_policy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_dpm_policy_t)] +amdsmi_set_dpm_policy = _libraries['libamd_smi.so'].amdsmi_set_dpm_policy +amdsmi_set_dpm_policy.restype = amdsmi_status_t 
+amdsmi_set_dpm_policy.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_get_lib_version = _libraries['libamd_smi.so'].amdsmi_get_lib_version amdsmi_get_lib_version.restype = amdsmi_status_t amdsmi_get_lib_version.argtypes = [ctypes.POINTER(struct_amdsmi_version_t)] @@ -2486,7 +2513,8 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_driver_info_t', + 'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t', + 'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', 'amdsmi_event_group_t', 'amdsmi_event_handle_t', 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', @@ -2516,10 +2544,10 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_get_cpu_socket_power', 'amdsmi_get_cpu_socket_power_cap', 'amdsmi_get_cpu_socket_power_cap_max', 'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles', - 'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count', - 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', - 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', - 'amdsmi_get_gpu_available_counters', + 'amdsmi_get_cpusocket_handles', 'amdsmi_get_dpm_policy', + 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', + 'amdsmi_get_fw_info', 'amdsmi_get_gpu_activity', + 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', 'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', 'amdsmi_get_gpu_board_info', 'amdsmi_get_gpu_cache_info', 'amdsmi_get_gpu_compute_partition', @@ -2599,7 +2627,8 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'amdsmi_set_cpu_socket_boostlimit', 'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', - 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', + 'amdsmi_set_dpm_policy', 'amdsmi_set_gpu_clk_range', + 
'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', @@ -2625,6 +2654,7 @@ class struct_amdsmi_hsmp_metrics_table_t(Structure): 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', 'struct_amdsmi_dimm_thermal_t', 'struct_amdsmi_dpm_level_t', + 'struct_amdsmi_dpm_policy_entry_t', 'struct_amdsmi_dpm_policy_t', 'struct_amdsmi_driver_info_t', 'struct_amdsmi_engine_usage_t', 'struct_amdsmi_error_count_t', 'struct_amdsmi_evt_notification_data_t', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index d42ac466..12654213 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -192,6 +192,39 @@ typedef enum { RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 //!< Unknown performance level } rsmi_dev_perf_level_t; + + +#define RSMI_MAX_NUM_PM_POLICIES 32 +#define RSMI_MAX_POLICY_NAME 32 +/** + * @brief The dpm policy. + */ +typedef struct { + uint32_t policy_id; + char policy_description[RSMI_MAX_POLICY_NAME]; +} rsmi_dpm_policy_entry_t; + +/** + * @brief This structure holds information about dpm policies. + */ +typedef struct { + /** + * The number of supported policies + */ + uint32_t num_supported; + + /** + * The current policy index + */ + uint32_t current; + + /** + * List of policies. + * Only the first num_supported policies are valid. + */ + rsmi_dpm_policy_entry_t policies[RSMI_MAX_NUM_PM_POLICIES]; +} rsmi_dpm_policy_t; + /// \cond Ignore in docs. 
typedef rsmi_dev_perf_level_t rsmi_dev_perf_level; /// \endcond @@ -3295,6 +3328,42 @@ rsmi_status_t rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od); rsmi_status_t rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_clk_type_t clk_type, uint64_t freq_bitmask); +/** + * @brief Get the dpm policy for a device + * + * @details Given a device index @p dv_ind, this function will write + * current dpm policy settings to @p policy. All the devices at the same socket + * will have the same policy. + * + * @param[in] dv_ind a device index + * + * @param[in, out] policy the dpm policy for this device. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVAL + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy); + +/** + * @brief Set the dpm policy for a device + * + * @details Given a device index @p dv_ind and a dpm policy @p policy_id, + * this function will set the DPM policy for this device. All the devices at + * the same socket will be set to the same policy. 
+ * + * @note This function requires root access + * + * @param[in] dv_ind a device index + * + * @param[in] policy_id the id of the dpm policy to set + * + * @return ::RSMI_STATUS_SUCCESS is returned upon successful call, non-zero on fail + */ +rsmi_status_t rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id); + /** @} */ // end of PerfCont /*****************************************************************************/ diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index 5ca5193b..3df15f2e 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -173,6 +173,7 @@ enum DevInfoTypes { kDevNumaNode, kDevGpuMetrics, kDevPmMetrics, + kDevDPMPolicy, kDevRegMetrics, kDevGpuReset, kDevAvailableComputePartition, diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index a6c3e80c..67d9d8b8 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -78,6 +78,7 @@ int isRegularFile(std::string fname, bool *is_reg); int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); bool IsInteger(const std::string & n_str); +bool stringToInteger(const std::string & n_str, int& value); std::pair executeCommand(std::string command, bool stdOut = true); rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index db5dbcc7..91c8ddbb 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -145,6 +145,7 @@ static uint64_t get_multiplier_from_str(char units_char) { return multiplier; } + /** * Parse a string of the form: * ": <|*>" @@ -2014,6 +2015,133 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, CATCH } + + +rsmi_status_t +rsmi_dev_dpm_policy_set(uint32_t dv_ind, + uint32_t policy_id) { + rsmi_status_t ret; + + TRY + 
std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + DEVICE_MUTEX + GET_DEV_FROM_INDX + + std::string value("soc_pstate "); + value += std::to_string(policy_id); + int ret = dev->writeDevInfo(amd::smi::kDevDPMPolicy , value); + return amd::smi::ErrnoToRsmiStatus(ret); + + CATCH +} + +rsmi_status_t +rsmi_dev_dpm_policy_get(uint32_t dv_ind, + rsmi_dpm_policy_t* policy) { + rsmi_status_t ret; + std::vector val_vec; + + if (policy == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + *policy = {}; + + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + DEVICE_MUTEX + + ret = GetDevValueVec(amd::smi::kDevDPMPolicy, dv_ind, &val_vec); + if (ret == RSMI_STATUS_FILE_ERROR) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was RSMI_STATUS_FILE_ERROR " + << "-> reporting RSMI_STATUS_NOT_SUPPORTED"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" + << " -> reporting " << amd::smi::getRSMIStatusString(ret); + LOG_ERROR(ss); + return ret; + } + /* + It will rely on the number, not the string, as it may vary from soc to soc. 
+ The current pstate is marked with * + soc pstate + 0 : soc_pstate_default + 1 : soc_pstate_0 + 2 : soc_pstate_1* + 3 : soc_pstate_2 + */ + bool see_soc_pstate = false; + bool see_current = false; + policy->num_supported = 0; + for (uint32_t i = 0; i < val_vec.size(); ++i) { + auto current_line = amd::smi::trim(val_vec[i]); + if (current_line == "soc pstate") { + see_soc_pstate = true; + continue; + } + if (see_soc_pstate == false) continue; + + // Get tokens: id : description + std::vector tokens; + std::istringstream f(current_line); + std::string s; + while (getline(f, s, ':')) { + tokens.push_back(s); + } + + int value = 0; + // Stop at the first line that is not an "id : description" entry + if (tokens.size() < 2 || !amd::smi::stringToInteger(tokens[0], value)) { + break; + } + + if (value < 0 || policy->num_supported >= RSMI_MAX_NUM_PM_POLICIES) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: the id is negative or too many policies."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + + policy->policies[policy->num_supported].policy_id = value; + std::string description = amd::smi::trim(tokens[1]); + if (current_line.back() == '*') { // current policy + description.pop_back(); // remove last * + description = amd::smi::trim(description); + policy->current = policy->num_supported; + see_current = true; + } + strncpy(policy->policies[policy->num_supported].policy_description, + description.c_str(), + RSMI_MAX_POLICY_NAME-1); + policy->num_supported++; + } // end for + + if (!see_soc_pstate) { + return RSMI_STATUS_NOT_SUPPORTED; + } + + if (!see_current) { + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", Unexpeced pstat data: cannot find the current policy."; + LOG_ERROR(ss); + return RSMI_STATUS_UNEXPECTED_DATA; + } + // The soc pstate section and the current policy were both found + return RSMI_STATUS_SUCCESS; + + CATCH +} + static std::vector pci_name_files = { "/usr/share/misc/pci.ids", "/usr/share/hwdata/pci.ids", diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 
305364b6..3e63659c 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -136,6 +136,7 @@ static const char *kDevAvailableComputePartitionFName = "available_compute_partition"; static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; +static const char* kDevDPMPolicyFName = "pm_policy"; // The PM policy for pstat and XGMI // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -315,6 +316,7 @@ static const std::map kDevAttribNameMap = { {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, + {kDevDPMPolicy, kDevDPMPolicyFName}, {kDevRegMetrics, kDevRegMetricsFName}, {kDevGpuReset, kDevGpuResetFName}, {kDevAvailableComputePartition, kDevAvailableComputePartitionFName}, @@ -472,6 +474,7 @@ Device::devInfoTypesStrings = { {kDevComputePartition, "kDevComputePartition"}, {kDevMemoryPartition, "kDevMemoryPartition"}, {kDevPCieVendorID, "kDevPCieVendorID"}, + {kDevDPMPolicy, "kDevDPMPolicy"}, }; static const std::map kDevFuncDependsMap = { @@ -533,6 +536,8 @@ static const std::map kDevFuncDependsMap = { {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, {"rsmi_dev_pm_metrics_info_get", {{kDevPmMetricsFName}, {}}}, + {"rsmi_dev_dpm_policy_get", {{kDevDPMPolicyFName}, {}}}, + {"rsmi_dev_dpm_policy_set", {{kDevDPMPolicyFName}, {}}}, {"rsmi_dev_reg_table_info_get", {{kDevRegMetricsFName}, {}}}, {"rsmi_dev_gpu_reset", {{kDevGpuResetFName}, {}}}, {"rsmi_dev_compute_partition_get", {{kDevComputePartitionFName}, {}}}, @@ -938,6 +943,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { case kDevPCIEClk: case kDevPowerODVoltage: case kDevSOCClk: + case kDevDPMPolicy: return writeDevInfoStr(type, val); case kDevComputePartition: case kDevMemoryPartition: @@ -1219,6 +1225,7 
@@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntHDP: case kDevErrCntXGMIWAFL: case kDevMemPageBad: + case kDevDPMPolicy: return readDevInfoMultiLineStr(type, val); break; diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index ffa4ef70..61ec4243 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -257,6 +257,16 @@ bool IsInteger(const std::string & n_str) { return (*tmp == 0); } +bool stringToInteger(const std::string & n_str, int& value) { + try { + value = std::stoi(trim(n_str), nullptr); + return true; + } catch (...) { + return false; + } + return false; +} + rsmi_status_t handleException() { try { throw; diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index dc5f9509..392b6188 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1352,6 +1352,23 @@ amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, static_cast(clk_type), freq_bitmask); } + +amdsmi_status_t amdsmi_set_dpm_policy(amdsmi_processor_handle processor_handle, + uint32_t policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_set, processor_handle, + policy); +} + +amdsmi_status_t amdsmi_get_dpm_policy(amdsmi_processor_handle processor_handle, + amdsmi_dpm_policy_t* policy) { + AMDSMI_CHECK_INIT(); + + return rsmi_wrapper(rsmi_dev_dpm_policy_get, processor_handle, + reinterpret_cast(policy)); +} + amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, From b2690fdf1e70f0b7c813851db65ebb73a0ab0f64 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 11 Mar 2024 15:17:45 -0500 Subject: [PATCH 13/15] Unable to reset GPU from CLI The CLI helper compares the hex vendor id string with the number and never match it as AMD GPU. 
Change-Id: I1ababdce3a3694a5e26e5b0feef4d3d8cd40df7a --- amdsmi_cli/amdsmi_helpers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 080cc353..2083c155 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -617,8 +617,11 @@ def is_amd_device(self, device_handle): """ # Get card vendor id asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(device_handle) - return asic_info['vendor_id'] == AMD_VENDOR_ID - + try: + vendor_value = int(asic_info['vendor_id'], 16) + return vendor_value == AMD_VENDOR_ID + except: + return False def get_perf_levels(self): perf_levels_str = [clock.name for clock in amdsmi_interface.AmdSmiDevPerfLevel] From 25c8ff6c2af92e363723ab3a17923399c516d0fa Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 12 Mar 2024 15:35:31 -0500 Subject: [PATCH 14/15] SWDEV-449314 - Added pyyaml check before installing via pip Signed-off-by: Maisam Arif Change-Id: Ie6d0d664e74b47c1efce6e6fac19ee4a1bf0d5eb --- DEBIAN/postinst.in | 15 +++++++++++---- RPM/post.in | 15 +++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index d991cf6b..f0d58d52 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -143,13 +143,20 @@ do_install_amdsmi_python_lib() { local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') if [[ "$pip_version" -lt 19 ]]; then - echo "Detected ancient pip version ($pip_version)... Upgrading..." - python3 -m pip install --upgrade pip --quiet --disable-pip-version-check + echo "Detected ancient pip version ($pip_version)... Upgrading..." 
+ python3 -m pip install --upgrade pip --quiet --disable-pip-version-check fi unset pip_version - # install PyYAML dependency - python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + # Check PyYAML dependency + local pyyaml_version + pyyaml_version=$(pip show pyyaml | grep -Po '(?<=Version: )[0-9]') + if [[ "$pyyaml_version" -lt 5 ]]; then + echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." + python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + fi + unset pyyaml_version + # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check diff --git a/RPM/post.in b/RPM/post.in index a58a5f4e..5b5fc959 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -142,13 +142,20 @@ do_install_amdsmi_python_lib() { local pip_version pip_version=$(python3 -m pip --version | grep -Eo '^[^\ ]+ ([0-9]+)' | grep -Eo '[0-9]+$') if [[ "$pip_version" -lt 19 ]]; then - echo "Detected ancient pip version ($pip_version)... Upgrading..." - python3 -m pip install --upgrade pip --quiet --disable-pip-version-check + echo "Detected ancient pip version ($pip_version)... Upgrading..." + python3 -m pip install --upgrade pip --quiet --disable-pip-version-check fi unset pip_version - # install PyYAML dependency - python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + # Check PyYAML dependency + local pyyaml_version + pyyaml_version=$(pip show pyyaml | grep -Po '(?<=Version: )[0-9]') + if [[ "$pyyaml_version" -lt 5 ]]; then + echo "Detected ancient pyyaml version ($pyyaml_version)... Upgrading..." 
+ python3 -m pip install 'PyYAML>=5.1' --quiet --disable-pip-version-check --ignore-installed + fi + unset pyyaml_version + # install python library at @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi local python_lib_path=@CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@ python3 -m pip install "$python_lib_path" --quiet --disable-pip-version-check From eef4169a0f7f0d1f65cb593612e89f8f67c65287 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 14 Mar 2024 03:43:52 -0500 Subject: [PATCH 15/15] SWDEV-449212 - Fix static build Disable Python interface and CLI tool for static builds (when -DBUILD_SHARED_LIBS=OFF is passed to cmake) Change-Id: I32bbd94d70628a50029a748f7493b55c91d45e02 Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 20 ++++++++++++-------- DEBIAN/postinst.in | 5 +++++ RPM/post.in | 5 +++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e290230e..d59fe2f4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,13 +69,15 @@ set(CMAKE_CXX_EXTENSIONS OFF) include(GNUInstallDirs) option(BUILD_TESTS "Build test suite" OFF) -# TODO: Enable once virtualenv is installed on CI machines -option(BUILD_WRAPPER "Rebuild AMDSMI-wrapper" OFF) -option(BUILD_CLI "Build AMDSMI-CLI and install" ON) -option(ENABLE_LDCONFIG "Set library links and caches using ldconfig." ON) option(ENABLE_ASAN_PACKAGING "" OFF) option(ENABLE_ESMI_LIB "" ON) +include(CMakeDependentOption) +# these options don't work without BUILD_SHARED_LIBS +cmake_dependent_option(BUILD_WRAPPER "Rebuild AMDSMI-wrapper" OFF "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(BUILD_CLI "Build AMDSMI-CLI and install" ON "BUILD_SHARED_LIBS" OFF) +cmake_dependent_option(ENABLE_LDCONFIG "Set library links and caches using ldconfig." 
ON "BUILD_SHARED_LIBS" OFF) + # Set share path here because project name != amd_smi set(SHARE_INSTALL_PREFIX "share/${AMD_SMI}" CACHE STRING "Tests and Example install directory") @@ -185,10 +187,12 @@ if(BUILD_TESTS) add_subdirectory("tests/amd_smi_test") endif() -add_subdirectory("py-interface") - -if(BUILD_CLI) - add_subdirectory("amdsmi_cli") +# python interface and CLI depend on shared libraries +if(BUILD_SHARED_LIBS) + add_subdirectory("py-interface") + if(BUILD_CLI) + add_subdirectory("amdsmi_cli") + endif() endif() include(CMakePackageConfigHelpers) diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index f0d58d52..3d0535d8 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -138,6 +138,11 @@ do_install_amdsmi_python_lib() { echo "Removed old AMD-SMI python library (amdsmi)..." fi + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + # upgrade pip if it's an ancient version # otherwise the amdsmi install will fail local pip_version diff --git a/RPM/post.in b/RPM/post.in index 5b5fc959..653b365f 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -137,6 +137,11 @@ do_install_amdsmi_python_lib() { echo "Removed old AMD-SMI python library (amdsmi)..." fi + # static builds don't include python lib + if [ "@BUILD_SHARED_LIBS@" != "ON" ]; then + return + fi + # upgrade pip if it's an ancient version # otherwise the amdsmi install will fail local pip_version