From 21005edcb1899c274c074cd9682be16b62fb0c52 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 24 Sep 2024 15:43:17 -0400 Subject: [PATCH 01/17] Add a custom metrics CSV for the dcgm snap --- src/dcgm_metrics.csv | 156 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 src/dcgm_metrics.csv diff --git a/src/dcgm_metrics.csv b/src/dcgm_metrics.csv new file mode 100644 index 00000000..97cb12ca --- /dev/null +++ b/src/dcgm_metrics.csv @@ -0,0 +1,156 @@ +# Selected metrics for dcgm-exporter +# Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv + +# Format +# If line starts with a '#' it is considered a comment +# Boolean values decode to - 1 = enabled 0 = disabled +# DCGM FIELD, Prometheus metric type, help message + + + + +# DEFAULT METRICS +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + +# PCIE +DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. 
+ +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). + +# NVLink +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information and features +DCGM_FI_DRIVER_VERSION, label, Driver Version + + + + +# CUSTOM METRICS +# Clocks +DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz). + +# Temperature +DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%) + +# Power +DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W). + +# Errors and violations +DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask +DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + +# Memory usage +DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB). +DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved) + +# ECC +DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. 
+DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + +# VGPU +DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances +DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization + +# Bar +DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB) +DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB) + +# DCP metrics +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. 
+DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. + +# Static configuration information and features +DCGM_FI_NVML_VERSION, label, NVML Version +DCGM_FI_DEV_BRAND, label, Device Brand +DCGM_FI_DEV_SERIAL, label, Device Serial Number +DCGM_FI_DEV_NAME, label, Device Name +DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#) +DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits) +DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + +DCGM_FI_DEV_COMPUTE_MODE, label, Compute mode +DCGM_FI_DEV_PERSISTENCE_MODE, label, Persistence mode (1 or 0) +DCGM_FI_DEV_CC_MODE, label, ConfidentialCompute/AmpereProtectedMemory status (1 or 0) +DCGM_FI_DEV_ECC_CURRENT, label, Current ECC mode +DCGM_FI_DEV_VIRTUAL_MODE, label, Virtualization mode +DCGM_FI_DEV_AUTOBOOST, label, Auto-boost enabled + +DCGM_FI_DEV_BAR1_TOTAL, label, Total BAR1 (in MB) + +DCGM_FI_DEV_MAX_SM_CLOCK, label, Maximum supported SM clock +DCGM_FI_DEV_MAX_MEM_CLOCK, label, Maximum supported Memory clock + +DCGM_FI_DEV_GPU_MAX_OP_TEMP, label, Maximum operating temperature +DCGM_FI_DEV_SLOWDOWN_TEMP, label, Slowdown temperature +DCGM_FI_DEV_SHUTDOWN_TEMP, label, Shutdown temperature + +DCGM_FI_DEV_POWER_MGMT_LIMIT, label, Current Power limit +DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, label, Minimum Power limit +DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, label, Maximum Power limit 
+DCGM_FI_DEV_ENFORCED_POWER_LIMIT, label, Effective Power limit that the driver enforces after taking into account all limiters + +DCGM_FI_DEV_FB_TOTAL, label, Total Frame buffer (in MB) + +DCGM_FI_DEV_COUNT, label, Number of devices on the node From 49797e084b60ba17f9d7432ef9996ebc66690db7 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 24 Sep 2024 15:45:09 -0400 Subject: [PATCH 02/17] Move metrics to a separate folder --- src/{ => gpu_metrics}/dcgm_metrics.csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{ => gpu_metrics}/dcgm_metrics.csv (100%) diff --git a/src/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv similarity index 100% rename from src/dcgm_metrics.csv rename to src/gpu_metrics/dcgm_metrics.csv From 6555f3db67434438f0026260a8d0032c30493dde Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 24 Sep 2024 21:01:07 -0400 Subject: [PATCH 03/17] Configure dcgm exporter with custom metrics --- src/service.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/service.py b/src/service.py index 154ce865..69fffe3b 100644 --- a/src/service.py +++ b/src/service.py @@ -1,6 +1,7 @@ """Exporter service helper.""" import os +import shutil from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path @@ -459,6 +460,27 @@ def __init__(self, config: ConfigData): def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return {HWTool.DCGM} + + def configure(self) -> bool: + """Configure and enable custom metrics.""" + gpu_metrics_file: Path = Path("./src/gpu_metrics/dcgm_metrics.csv") + dcgm_metrics_location = Path("/var/snap/dcgm/common/") + metric_config = "dcgm-exporter-metrics-file" + metric_config_value = gpu_metrics_file.name + + if not dcgm_metrics_location.exists(): + logger.error("DCGM snap common location does not exist.") + return False + + if not self.snap_client.get(metric_config) != metric_config_value: + try: + shutil.copy(gpu_metrics_file, dcgm_metrics_location) + 
self.snap_client.set({metric_config: metric_config_value}) + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to copy dcgm metrics file: %s", err) + return False + + return True class HardwareExporter(RenderableExporter): From ab0b50e3340f4717ff8b6ea87e99943e093d7863 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 24 Sep 2024 21:08:23 -0400 Subject: [PATCH 04/17] Fix format --- src/service.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/service.py b/src/service.py index 69fffe3b..49db3fb7 100644 --- a/src/service.py +++ b/src/service.py @@ -460,18 +460,18 @@ def __init__(self, config: ConfigData): def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return {HWTool.DCGM} - + def configure(self) -> bool: """Configure and enable custom metrics.""" gpu_metrics_file: Path = Path("./src/gpu_metrics/dcgm_metrics.csv") dcgm_metrics_location = Path("/var/snap/dcgm/common/") metric_config = "dcgm-exporter-metrics-file" metric_config_value = gpu_metrics_file.name - + if not dcgm_metrics_location.exists(): logger.error("DCGM snap common location does not exist.") return False - + if not self.snap_client.get(metric_config) != metric_config_value: try: shutil.copy(gpu_metrics_file, dcgm_metrics_location) From 77acf7828dfa0b4ea097eaa59b53e88123d247c7 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 25 Sep 2024 13:29:05 -0400 Subject: [PATCH 05/17] Move the DCGM configuration to the install step --- src/charm.py | 2 +- src/service.py | 36 +++++++++++++++++++++--------------- tests/unit/test_service.py | 3 ++- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/charm.py b/src/charm.py index d720e594..5da8c7d1 100755 --- a/src/charm.py +++ b/src/charm.py @@ -84,7 +84,7 @@ def exporters(self) -> List[BaseExporter]: exporters.append(SmartCtlExporter(self.charm_dir, self.model.config)) if stored_tools & DCGMExporter.hw_tools(): - exporters.append(DCGMExporter(self.model.config)) + 
exporters.append(DCGMExporter(self.charm_dir, self.model.config)) return exporters diff --git a/src/service.py b/src/service.py index 49db3fb7..d51d25ba 100644 --- a/src/service.py +++ b/src/service.py @@ -451,37 +451,43 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 - def __init__(self, config: ConfigData): + def __init__(self, charm_dir: Path, config: ConfigData): """Init.""" self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) + self.charm_dir = charm_dir super().__init__(config) - @staticmethod - def hw_tools() -> Set[HWTool]: - """Return hardware tools to watch.""" - return {HWTool.DCGM} - - def configure(self) -> bool: - """Configure and enable custom metrics.""" - gpu_metrics_file: Path = Path("./src/gpu_metrics/dcgm_metrics.csv") - dcgm_metrics_location = Path("/var/snap/dcgm/common/") - metric_config = "dcgm-exporter-metrics-file" - metric_config_value = gpu_metrics_file.name + def install(self) -> bool: + """Install the DCGM exporter and configure custom metrics.""" + gpu_metrics_file: Path = self.charm_dir / Path("src/gpu_metrics/dcgm_metrics.csv") + dcgm_metrics_location: Path = Path("/var/snap/dcgm/common/") + metric_config: str = "dcgm-exporter-metrics-file" + metric_config_value: str = gpu_metrics_file.name + + if not super().install(): + logger.error("Failed to install DCGM snap.") + return False if not dcgm_metrics_location.exists(): - logger.error("DCGM snap common location does not exist.") + logger.error("DCGM SNAP_COMMON location does not exist after install.") return False - if not self.snap_client.get(metric_config) != metric_config_value: + if self.snap_client.get(metric_config) != metric_config_value: try: shutil.copy(gpu_metrics_file, dcgm_metrics_location) self.snap_client.set({metric_config: metric_config_value}) + self.snap_client.restart(reload=True) except Exception as err: # pylint: disable=broad-except - logger.error("Failed to copy dcgm metrics file: %s", err) + 
logger.error("Failed to copy custom dcgm metrics file: %s", err) return False return True + @staticmethod + def hw_tools() -> Set[HWTool]: + """Return hardware tools to watch.""" + return {HWTool.DCGM} + class HardwareExporter(RenderableExporter): """A class representing the hardware exporter and the metric endpoints.""" diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 6efd4577..c5071784 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -975,8 +975,9 @@ def test_dcgm_exporter(): mock_config = { "dcgm-snap-channel": "latest/stable", } + search_path = pathlib.Path(f"{__file__}/../../..").resolve() - exporter = service.DCGMExporter(mock_config) + exporter = service.DCGMExporter(search_path, mock_config) assert exporter.exporter_name == "dcgm" assert exporter.hw_tools() == {HWTool.DCGM} From 5b9bbb332c0b517d63f4f84eef94cb48b7e1b3f5 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 25 Sep 2024 15:30:12 -0400 Subject: [PATCH 06/17] Add unit tests for dcgm configuration --- src/service.py | 23 ++++------ tests/unit/test_service.py | 86 +++++++++++++++++++++++++++++++++----- 2 files changed, 83 insertions(+), 26 deletions(-) diff --git a/src/service.py b/src/service.py index d51d25ba..1ca3ad90 100644 --- a/src/service.py +++ b/src/service.py @@ -450,32 +450,25 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 + metrics_location: Path = Path("/var/snap/dcgm/common/") + metric_config: str = "dcgm-exporter-metrics-file" def __init__(self, charm_dir: Path, config: ConfigData): """Init.""" self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) self.charm_dir = charm_dir + self.metrics_file = self.charm_dir / Path("src/gpu_metrics/dcgm_metrics.csv") + self.metric_config_value = self.metrics_file.name super().__init__(config) def install(self) -> bool: """Install the DCGM exporter and configure custom metrics.""" - gpu_metrics_file: Path = self.charm_dir / 
Path("src/gpu_metrics/dcgm_metrics.csv") - dcgm_metrics_location: Path = Path("/var/snap/dcgm/common/") - metric_config: str = "dcgm-exporter-metrics-file" - metric_config_value: str = gpu_metrics_file.name + super().install() - if not super().install(): - logger.error("Failed to install DCGM snap.") - return False - - if not dcgm_metrics_location.exists(): - logger.error("DCGM SNAP_COMMON location does not exist after install.") - return False - - if self.snap_client.get(metric_config) != metric_config_value: + if self.snap_client.get(self.metric_config) != self.metric_config_value: try: - shutil.copy(gpu_metrics_file, dcgm_metrics_location) - self.snap_client.set({metric_config: metric_config_value}) + shutil.copy(self.metrics_file, self.metrics_location) + self.snap_client.set({self.metric_config: self.metric_config_value}) self.snap_client.restart(reload=True) except Exception as err: # pylint: disable=broad-except logger.error("Failed to copy custom dcgm metrics file: %s", err) diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index c5071784..b6877e5a 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -813,6 +813,81 @@ def test_resource_remove(self): self.exporter.strategy.remove.accept_called() +class TestDCGMSnapExporter(unittest.TestCase): + """Test DCGM Snap exporter's methods.""" + + def setUp(self) -> None: + snap_lib_patcher = mock.patch.object(service, "snap") + self.mock_snap = snap_lib_patcher.start() + self.addCleanup(snap_lib_patcher.stop) + + search_path = pathlib.Path(f"{__file__}/../../..").resolve() + self.mock_config = { + "dcgm-snap-channel": "latest/stable", + } + + self.exporter = service.DCGMExporter(search_path, self.mock_config) + + def test_exporter_name(self): + self.assertEqual(self.exporter.exporter_name, "dcgm") + + def test_hw_tools(self): + self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) + + @mock.patch("service.shutil", return_value=mock.MagicMock()) + def 
test_install_success(self, mock_shutil): + self.exporter.strategy = mock.MagicMock() + self.exporter.snap_client = mock.MagicMock() + + self.exporter.snap_client.get = mock.MagicMock() + self.exporter.snap_client.get.return_value = "" + + exporter_install_ok = self.exporter.install() + + self.exporter.strategy.install.accept_called() + self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config) + mock_shutil.copy.accept_called_with( + self.exporter.metrics_file, self.exporter.metrics_location + ) + self.exporter.snap_client.set.accept_called_with( + {self.exporter.metric_config: self.exporter.metric_config_value} + ) + self.exporter.snap_client.restart.accept_called_with(reload=True) + self.assertTrue(exporter_install_ok) + + @mock.patch("service.shutil", return_value=mock.MagicMock()) + def test_install_metrics_preset(self, mock_shutil): + self.exporter.strategy = mock.MagicMock() + self.exporter.snap_client = mock.MagicMock() + + self.exporter.snap_client.get = mock.MagicMock() + self.exporter.snap_client.get.return_value = self.exporter.metric_config_value + + exporter_install_ok = self.exporter.install() + + self.exporter.strategy.install.accept_called() + self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config_value) + mock_shutil.copy.accept_not_called() + self.assertTrue(exporter_install_ok) + + @mock.patch("service.shutil", return_value=mock.MagicMock()) + def test_install_metrics_copy_fail(self, mock_shutil): + self.exporter.strategy = mock.MagicMock() + self.exporter.snap_client = mock.MagicMock() + + self.exporter.snap_client.get = mock.MagicMock() + self.exporter.snap_client.get.return_value = "" + + mock_shutil.copy.side_effect = FileNotFoundError + + exporter_install_ok = self.exporter.install() + + self.exporter.strategy.install.accept_called() + self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config_value) + self.exporter.snap_client.restart.accept_not_called() + 
self.assertFalse(exporter_install_ok) + + class TestWriteToFile(unittest.TestCase): def setUp(self): self.temp_file = tempfile.NamedTemporaryFile(delete=False) @@ -971,16 +1046,5 @@ def test_snap_exporter_configure(mock_install, snap_exporter, install_result, ex mock_install.assert_called_once() -def test_dcgm_exporter(): - mock_config = { - "dcgm-snap-channel": "latest/stable", - } - search_path = pathlib.Path(f"{__file__}/../../..").resolve() - - exporter = service.DCGMExporter(search_path, mock_config) - assert exporter.exporter_name == "dcgm" - assert exporter.hw_tools() == {HWTool.DCGM} - - if __name__ == "__main__": unittest.main() From 76a1f4cfcf1bd69352b1c4d68d2a6247e98942e2 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 25 Sep 2024 16:05:04 -0400 Subject: [PATCH 07/17] Refinement --- src/service.py | 6 ++++-- tests/unit/test_service.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/service.py b/src/service.py index 1ca3ad90..4abb5c39 100644 --- a/src/service.py +++ b/src/service.py @@ -463,7 +463,8 @@ def __init__(self, charm_dir: Path, config: ConfigData): def install(self) -> bool: """Install the DCGM exporter and configure custom metrics.""" - super().install() + if not super().install(): + return False if self.snap_client.get(self.metric_config) != self.metric_config_value: try: @@ -471,7 +472,8 @@ def install(self) -> bool: self.snap_client.set({self.metric_config: self.metric_config_value}) self.snap_client.restart(reload=True) except Exception as err: # pylint: disable=broad-except - logger.error("Failed to copy custom dcgm metrics file: %s", err) + logger.error("Failed to configure custom DCGM metrics") + logger.error("Failed to copy the metrics file: %s", err) return False return True diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index b6877e5a..72e8518b 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -834,10 +834,23 @@ def test_exporter_name(self): 
def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) + def test_install_failed(self): + self.exporter.strategy = mock.MagicMock() + self.exporter.snap_client = mock.MagicMock() + self.exporter.snap_client.present = mock.MagicMock() + self.exporter.snap_client.present = False + + exporter_install_ok = self.exporter.install() + + self.exporter.snap_client.get.assert_not_called() + self.assertFalse(exporter_install_ok) + @mock.patch("service.shutil", return_value=mock.MagicMock()) def test_install_success(self, mock_shutil): self.exporter.strategy = mock.MagicMock() self.exporter.snap_client = mock.MagicMock() + self.exporter.snap_client.present = mock.MagicMock() + self.exporter.snap_client.present = True self.exporter.snap_client.get = mock.MagicMock() self.exporter.snap_client.get.return_value = "" @@ -859,6 +872,8 @@ def test_install_success(self, mock_shutil): def test_install_metrics_preset(self, mock_shutil): self.exporter.strategy = mock.MagicMock() self.exporter.snap_client = mock.MagicMock() + self.exporter.snap_client.present = mock.MagicMock() + self.exporter.snap_client.present = True self.exporter.snap_client.get = mock.MagicMock() self.exporter.snap_client.get.return_value = self.exporter.metric_config_value @@ -874,6 +889,8 @@ def test_install_metrics_preset(self, mock_shutil): def test_install_metrics_copy_fail(self, mock_shutil): self.exporter.strategy = mock.MagicMock() self.exporter.snap_client = mock.MagicMock() + self.exporter.snap_client.present = mock.MagicMock() + self.exporter.snap_client.present = True self.exporter.snap_client.get = mock.MagicMock() self.exporter.snap_client.get.return_value = "" From b20eb2304f62acecef8b2c81b6cd3816c3442efb Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 25 Sep 2024 18:42:47 -0400 Subject: [PATCH 08/17] Add logs and move duplicate code to one place --- src/service.py | 8 +++++--- tests/unit/test_service.py | 27 +++++++-------------------- 2 files changed, 12 insertions(+), 
23 deletions(-) diff --git a/src/service.py b/src/service.py index 4abb5c39..75dcbee6 100644 --- a/src/service.py +++ b/src/service.py @@ -457,7 +457,7 @@ def __init__(self, charm_dir: Path, config: ConfigData): """Init.""" self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) self.charm_dir = charm_dir - self.metrics_file = self.charm_dir / Path("src/gpu_metrics/dcgm_metrics.csv") + self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv" self.metric_config_value = self.metrics_file.name super().__init__(config) @@ -468,12 +468,14 @@ def install(self) -> bool: if self.snap_client.get(self.metric_config) != self.metric_config_value: try: + logger.info( + "Creating a custom metrics file and configuring the DCGM snap to use it." + ) shutil.copy(self.metrics_file, self.metrics_location) self.snap_client.set({self.metric_config: self.metric_config_value}) self.snap_client.restart(reload=True) except Exception as err: # pylint: disable=broad-except - logger.error("Failed to configure custom DCGM metrics") - logger.error("Failed to copy the metrics file: %s", err) + logger.error("Failed to configure custom DCGM metrics: %s", err) return False return True diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 72e8518b..db3b4a66 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -817,6 +817,7 @@ class TestDCGMSnapExporter(unittest.TestCase): """Test DCGM Snap exporter's methods.""" def setUp(self) -> None: + """Set up harness for each test case.""" snap_lib_patcher = mock.patch.object(service, "snap") self.mock_snap = snap_lib_patcher.start() self.addCleanup(snap_lib_patcher.stop) @@ -828,6 +829,12 @@ def setUp(self) -> None: self.exporter = service.DCGMExporter(search_path, self.mock_config) + # Set up mocks + self.exporter.strategy = mock.MagicMock() + self.exporter.snap_client = mock.MagicMock() + self.exporter.snap_client.present = mock.MagicMock() + self.exporter.snap_client.get = 
mock.MagicMock() + def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -835,9 +842,6 @@ def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) def test_install_failed(self): - self.exporter.strategy = mock.MagicMock() - self.exporter.snap_client = mock.MagicMock() - self.exporter.snap_client.present = mock.MagicMock() self.exporter.snap_client.present = False exporter_install_ok = self.exporter.install() @@ -847,12 +851,7 @@ def test_install_failed(self): @mock.patch("service.shutil", return_value=mock.MagicMock()) def test_install_success(self, mock_shutil): - self.exporter.strategy = mock.MagicMock() - self.exporter.snap_client = mock.MagicMock() - self.exporter.snap_client.present = mock.MagicMock() self.exporter.snap_client.present = True - - self.exporter.snap_client.get = mock.MagicMock() self.exporter.snap_client.get.return_value = "" exporter_install_ok = self.exporter.install() @@ -870,14 +869,8 @@ def test_install_success(self, mock_shutil): @mock.patch("service.shutil", return_value=mock.MagicMock()) def test_install_metrics_preset(self, mock_shutil): - self.exporter.strategy = mock.MagicMock() - self.exporter.snap_client = mock.MagicMock() - self.exporter.snap_client.present = mock.MagicMock() self.exporter.snap_client.present = True - - self.exporter.snap_client.get = mock.MagicMock() self.exporter.snap_client.get.return_value = self.exporter.metric_config_value - exporter_install_ok = self.exporter.install() self.exporter.strategy.install.accept_called() @@ -887,14 +880,8 @@ def test_install_metrics_preset(self, mock_shutil): @mock.patch("service.shutil", return_value=mock.MagicMock()) def test_install_metrics_copy_fail(self, mock_shutil): - self.exporter.strategy = mock.MagicMock() - self.exporter.snap_client = mock.MagicMock() - self.exporter.snap_client.present = mock.MagicMock() self.exporter.snap_client.present = True - - self.exporter.snap_client.get = mock.MagicMock() 
self.exporter.snap_client.get.return_value = "" - mock_shutil.copy.side_effect = FileNotFoundError exporter_install_ok = self.exporter.install() From 39067944c789a17997344f07431b64ff2ee73ad1 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 26 Sep 2024 13:13:50 -0400 Subject: [PATCH 09/17] Make config unconditional and refine unit tests --- src/service.py | 21 +++++++++------------ tests/unit/test_service.py | 38 ++++++++++---------------------------- 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/src/service.py b/src/service.py index 75dcbee6..81cb076b 100644 --- a/src/service.py +++ b/src/service.py @@ -450,7 +450,7 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 - metrics_location: Path = Path("/var/snap/dcgm/common/") + snap_common: Path = Path("/var/snap/dcgm/common/") metric_config: str = "dcgm-exporter-metrics-file" def __init__(self, charm_dir: Path, config: ConfigData): @@ -466,17 +466,14 @@ def install(self) -> bool: if not super().install(): return False - if self.snap_client.get(self.metric_config) != self.metric_config_value: - try: - logger.info( - "Creating a custom metrics file and configuring the DCGM snap to use it." 
- ) - shutil.copy(self.metrics_file, self.metrics_location) - self.snap_client.set({self.metric_config: self.metric_config_value}) - self.snap_client.restart(reload=True) - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to configure custom DCGM metrics: %s", err) - return False + try: + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it.") + shutil.copy(self.metrics_file, self.snap_common) + self.snap_client.set({self.metric_config: self.metric_config_value}) + self.snap_client.restart(reload=True) + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure custom DCGM metrics: %s", err) + return False return True diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index db3b4a66..70416f2a 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -819,8 +819,11 @@ class TestDCGMSnapExporter(unittest.TestCase): def setUp(self) -> None: """Set up harness for each test case.""" snap_lib_patcher = mock.patch.object(service, "snap") + shutil_lib_patcher = mock.patch.object(service, "shutil") self.mock_snap = snap_lib_patcher.start() + self.mock_shutil = shutil_lib_patcher.start() self.addCleanup(snap_lib_patcher.stop) + self.addCleanup(shutil_lib_patcher.stop) search_path = pathlib.Path(f"{__file__}/../../..").resolve() self.mock_config = { @@ -828,12 +831,7 @@ def setUp(self) -> None: } self.exporter = service.DCGMExporter(search_path, self.mock_config) - - # Set up mocks self.exporter.strategy = mock.MagicMock() - self.exporter.snap_client = mock.MagicMock() - self.exporter.snap_client.present = mock.MagicMock() - self.exporter.snap_client.get = mock.MagicMock() def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -846,20 +844,18 @@ def test_install_failed(self): exporter_install_ok = self.exporter.install() - self.exporter.snap_client.get.assert_not_called() + 
self.exporter.strategy.install.accept_called() + self.mock_shutil.copy.accept_not_called() self.assertFalse(exporter_install_ok) - @mock.patch("service.shutil", return_value=mock.MagicMock()) - def test_install_success(self, mock_shutil): + def test_install_success(self): self.exporter.snap_client.present = True - self.exporter.snap_client.get.return_value = "" exporter_install_ok = self.exporter.install() self.exporter.strategy.install.accept_called() - self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config) - mock_shutil.copy.accept_called_with( - self.exporter.metrics_file, self.exporter.metrics_location + self.mock_shutil.copy.accept_called_with( + self.exporter.metrics_file, self.exporter.snap_common ) self.exporter.snap_client.set.accept_called_with( {self.exporter.metric_config: self.exporter.metric_config_value} @@ -867,27 +863,13 @@ def test_install_success(self, mock_shutil): self.exporter.snap_client.restart.accept_called_with(reload=True) self.assertTrue(exporter_install_ok) - @mock.patch("service.shutil", return_value=mock.MagicMock()) - def test_install_metrics_preset(self, mock_shutil): - self.exporter.snap_client.present = True - self.exporter.snap_client.get.return_value = self.exporter.metric_config_value - exporter_install_ok = self.exporter.install() - - self.exporter.strategy.install.accept_called() - self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config_value) - mock_shutil.copy.accept_not_called() - self.assertTrue(exporter_install_ok) - - @mock.patch("service.shutil", return_value=mock.MagicMock()) - def test_install_metrics_copy_fail(self, mock_shutil): + def test_install_metrics_copy_fail(self): self.exporter.snap_client.present = True - self.exporter.snap_client.get.return_value = "" - mock_shutil.copy.side_effect = FileNotFoundError + self.mock_shutil.copy.side_effect = FileNotFoundError exporter_install_ok = self.exporter.install() self.exporter.strategy.install.accept_called() - 
self.exporter.snap_client.get.accept_called_with(self.exporter.metric_config_value) self.exporter.snap_client.restart.accept_not_called() self.assertFalse(exporter_install_ok) From 0ddafba77857c2219af9781b650d80a031c8ab01 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 26 Sep 2024 14:13:09 -0400 Subject: [PATCH 10/17] Fix format --- src/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/service.py b/src/service.py index 81cb076b..666fba9a 100644 --- a/src/service.py +++ b/src/service.py @@ -467,7 +467,7 @@ def install(self) -> bool: return False try: - logger.info("Creating a custom metrics file and configuring the DCGM snap to use it.") + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") shutil.copy(self.metrics_file, self.snap_common) self.snap_client.set({self.metric_config: self.metric_config_value}) self.snap_client.restart(reload=True) From d618e60114c0218431a22f22b799b98e8cf205e4 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 26 Sep 2024 14:33:29 -0400 Subject: [PATCH 11/17] Remove # type: ignore to fix linter --- src/charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 5da8c7d1..948e29b8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -291,4 +291,4 @@ def cos_agent_related(self) -> bool: if __name__ == "__main__": # pragma: nocover - ops.main(HardwareObserverCharm) # type: ignore + ops.main(HardwareObserverCharm) From c7c62f04974a617ae7ae0b9bbd70669e0ed10c6f Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 26 Sep 2024 14:50:54 -0400 Subject: [PATCH 12/17] Add warning header to the dcgm_metrics.csv --- src/gpu_metrics/dcgm_metrics.csv | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gpu_metrics/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv index 97cb12ca..546f9a20 100644 --- a/src/gpu_metrics/dcgm_metrics.csv +++ b/src/gpu_metrics/dcgm_metrics.csv @@ -1,3 +1,8 @@ 
+############################################################################### +# [ WARNING ] +# Configuration file maintained by Juju. Local changes may be overwritten. +############################################################################### + # Selected metrics for dcgm-exporter # Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv From 95216dd3ed72128cb6969857375477b7547c5e3f Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 26 Sep 2024 20:58:32 -0400 Subject: [PATCH 13/17] Fix unit test typos --- src/service.py | 2 +- tests/unit/test_service.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/service.py b/src/service.py index 666fba9a..f91bc4ab 100644 --- a/src/service.py +++ b/src/service.py @@ -466,8 +466,8 @@ def install(self) -> bool: if not super().install(): return False + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") try: - logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") shutil.copy(self.metrics_file, self.snap_common) self.snap_client.set({self.metric_config: self.metric_config_value}) self.snap_client.restart(reload=True) diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 70416f2a..95c320e6 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -844,8 +844,8 @@ def test_install_failed(self): exporter_install_ok = self.exporter.install() - self.exporter.strategy.install.accept_called() - self.mock_shutil.copy.accept_not_called() + self.exporter.strategy.install.assert_called() + self.mock_shutil.copy.assert_not_called() self.assertFalse(exporter_install_ok) def test_install_success(self): @@ -853,14 +853,14 @@ def test_install_success(self): exporter_install_ok = self.exporter.install() - self.exporter.strategy.install.accept_called() - self.mock_shutil.copy.accept_called_with( + self.exporter.strategy.install.assert_called() + 
self.mock_shutil.copy.assert_called_with( self.exporter.metrics_file, self.exporter.snap_common ) - self.exporter.snap_client.set.accept_called_with( + self.exporter.snap_client.set.assert_called_with( {self.exporter.metric_config: self.exporter.metric_config_value} ) - self.exporter.snap_client.restart.accept_called_with(reload=True) + self.exporter.snap_client.restart.assert_called_with(reload=True) self.assertTrue(exporter_install_ok) def test_install_metrics_copy_fail(self): @@ -869,8 +869,8 @@ def test_install_metrics_copy_fail(self): exporter_install_ok = self.exporter.install() - self.exporter.strategy.install.accept_called() - self.exporter.snap_client.restart.accept_not_called() + self.exporter.strategy.install.assert_called() + self.exporter.snap_client.restart.assert_not_called() self.assertFalse(exporter_install_ok) From 2637e8af66f3e5a5cf088dd40cf0a7a9afa4b583 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 27 Sep 2024 15:59:41 -0400 Subject: [PATCH 14/17] Comment out the faulty DCGM-exporter metric --- src/gpu_metrics/dcgm_metrics.csv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpu_metrics/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv index 546f9a20..29235451 100644 --- a/src/gpu_metrics/dcgm_metrics.csv +++ b/src/gpu_metrics/dcgm_metrics.csv @@ -103,7 +103,8 @@ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink r DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 
# VGPU -DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances +# The metric is disabled because the DCGM-exporter produces an invalid value for it, ie "ERROR - FAILED TO CONVERT TO STRING" +# DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization # Bar From 349e6408753dbade82154163591f2ee6e6aad34f Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 27 Sep 2024 16:21:51 -0400 Subject: [PATCH 15/17] Add issue link --- src/gpu_metrics/dcgm_metrics.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpu_metrics/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv index 29235451..94750387 100644 --- a/src/gpu_metrics/dcgm_metrics.csv +++ b/src/gpu_metrics/dcgm_metrics.csv @@ -104,6 +104,7 @@ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink r # VGPU # The metric is disabled because the DCGM-exporter produces an invalid value for it, ie "ERROR - FAILED TO CONVERT TO STRING" +# see https://github.com/NVIDIA/dcgm-exporter/issues/394 # DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization From ad6dd941a061e5340038d2ac04f3dd7c90cadc50 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Mon, 30 Sep 2024 10:15:42 -0400 Subject: [PATCH 16/17] Remove unsupported metrics --- src/gpu_metrics/dcgm_metrics.csv | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/gpu_metrics/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv index 94750387..5fb0a4d8 100644 --- a/src/gpu_metrics/dcgm_metrics.csv +++ b/src/gpu_metrics/dcgm_metrics.csv @@ -103,9 +103,6 @@ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink r DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 
# VGPU -# The metric is disabled because the DCGM-exporter produces an invalid value for it, ie "ERROR - FAILED TO CONVERT TO STRING" -# see https://github.com/NVIDIA/dcgm-exporter/issues/394 -# DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization # Bar From 41f5ea4418b2b11471de4346980186e17c213ced Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 2 Oct 2024 11:15:55 -0400 Subject: [PATCH 17/17] Remove redundant code --- tests/unit/test_service.py | 99 -------------------------------------- 1 file changed, 99 deletions(-) diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 0d45045a..a0fb8835 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -715,105 +715,6 @@ def test_hw_tools(self): ) -class TestSmartMetricExporter(unittest.TestCase): - """Test SmartCtlExporter's methods.""" - - def setUp(self) -> None: - """Set up harness for each test case.""" - systemd_lib_patcher = mock.patch.object(service, "systemd") - self.mock_systemd = systemd_lib_patcher.start() - self.addCleanup(systemd_lib_patcher.stop) - - search_path = pathlib.Path(f"{__file__}/../../..").resolve() - self.mock_config = { - "smartctl-exporter-port": 10201, - "collect-timeout": 10, - "exporter-log-level": "INFO", - } - self.exporter = service.SmartCtlExporter(search_path, self.mock_config) - - def test_render_service(self): - """Test render service.""" - self.exporter._render_service = mock.MagicMock() - self.exporter._render_service.return_value = "some result" - - result = self.exporter.render_service() - self.assertEqual(result, "some result") - - self.exporter._render_service.assert_called_with( - { - "PORT": str(self.exporter.port), - "LEVEL": self.exporter.log_level, - } - ) - - @parameterized.expand( - [ - (True,), - (False,), - ] - ) - def test_set_config(self, service_render_success): - """Test render config.""" - self.exporter.render_service = mock.MagicMock() - 
self.exporter.render_service.return_value = service_render_success - - result = self.exporter.configure() - self.assertEqual(result, service_render_success) - - def test_hw_tools(self): - self.assertEqual(self.exporter.hw_tools(), {HWTool.SMARTCTL}) - - @mock.patch("service.systemd", return_value=mock.MagicMock()) - def test_install_resource_restart(self, mock_systemd): - self.exporter.strategy = mock.MagicMock() - self.exporter.check_active = mock.MagicMock() - self.exporter.check_active.return_value = True - - self.exporter.install_resources() - - self.exporter.strategy.install.assert_called() - self.exporter.check_active.assert_called() - mock_systemd.service_stop.assert_called_with(self.exporter.exporter_name) - mock_systemd.service_restart.assert_called_with(self.exporter.exporter_name) - - @mock.patch("service.systemd", return_value=mock.MagicMock()) - def test_install_resource_no_restart(self, mock_systemd): - self.exporter.strategy = mock.MagicMock() - self.exporter.check_active = mock.MagicMock() - self.exporter.check_active.return_value = False - - self.exporter.install_resources() - - self.exporter.strategy.install.assert_called() - self.exporter.check_active.assert_called() - mock_systemd.service_stop.assert_not_called() - mock_systemd.service_restart.assert_not_called() - - def test_resource_exists(self): - self.exporter.strategy = mock.MagicMock() - - self.exporter.resources_exist() - self.exporter.strategy.check.assert_called() - - def test_resources_exist(self): - self.exporter.strategy = mock.MagicMock() - self.exporter.strategy.check.return_value = "some result" - - result = self.exporter.resources_exist() - - self.assertEqual(result, "some result") - self.exporter.strategy.check.assert_called() - - def test_resource_remove(self): - self.exporter.strategy = mock.MagicMock() - - result = self.exporter.remove_resources() - self.assertEqual(result, True) - - self.exporter.strategy.remove.assert_called() - - class 
TestDCGMSnapExporter(unittest.TestCase): """Test DCGM Snap exporter's methods."""