From 4469950d9c5840abbbff8b9aa4979fbf910377a3 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Thu, 26 Sep 2024 18:22:31 -0300 Subject: [PATCH 1/9] Try to install NVIDIA driver if not present in the machine detect if NVIDIA driver is present by the /proc/driver/nvidia/version file and: * if present, doesn't try to install * if not present, try to install using the ubuntu-drivers pkg * even if installed, the machine might require reboot. To detect this, we check if nvidia-smi command is working. --- src/hw_tools.py | 25 +++++++++++++++++++++++++ src/service.py | 13 +++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/hw_tools.py b/src/hw_tools.py index 76e883a8..eb1bc4d0 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -233,11 +233,36 @@ class DCGMExporterStrategy(SnapStrategy): """DCGM strategy class.""" _name = HWTool.DCGM + pkg = "ubuntu-drivers-common" def __init__(self, channel: str) -> None: """Init.""" self.channel = channel + def install(self) -> None: + """Install the snap from a channel and the necessary nvidia driver.""" + super().install() + self._install_nvidia_drivers() + + def _install_nvidia_drivers(self) -> None: + """Install the NVIDIA driver if not present.""" + if Path("/proc/driver/nvidia/version").exists(): + logger.info("Driver already installed in the machine") + return + + logger.info("Installing NVIDIA driver") + apt.add_package(self.pkg, update_cache=True) + cmd = "ubuntu-drivers install --gpgpu" + try: + result = subprocess.check_output(cmd.split(), text=True) + if "No drivers found for installation" in result: + raise ResourceInstallationError(self._name) + logger.info("NVIDIA driver installed") + return + except subprocess.CalledProcessError as err: + logger.error("Failed to install the NVIDIA driver: %s", err) + raise err + class StorCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" diff --git a/src/service.py b/src/service.py index 154ce865..b8859714 100644 --- a/src/service.py +++ b/src/service.py @@ -1,6 +1,7 @@ """Exporter service helper.""" import os +import subprocess from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path @@ -460,6 +461,18 @@ def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return {HWTool.DCGM} + def validate_exporter_configs(self) -> Tuple[bool, str]: + """Validate the if the DCGM exporter is able to run.""" + valid, msg = super().validate_exporter_configs() + if not valid: + return valid, msg + + try: + subprocess.check_call("nvidia-smi") + return valid, msg + except subprocess.CalledProcessError: + return False, "Failed to communicate with NVIDIA driver. Reboot might solve the issue." + class HardwareExporter(RenderableExporter): """A class representing the hardware exporter and the metric endpoints.""" From b8eee9e16daa2e5c0ccaa6f9464d77200a81f8f1 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Fri, 27 Sep 2024 19:52:10 -0300 Subject: [PATCH 2/9] - install nvidia_utils --- src/hw_tools.py | 23 ++++++++++++++++++++++- src/service.py | 8 ++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index eb1bc4d0..461c112e 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -233,6 +233,7 @@ class DCGMExporterStrategy(SnapStrategy): """DCGM strategy class.""" _name = HWTool.DCGM + snap_common: Path = Path("/var/snap/dcgm/common/") pkg = "ubuntu-drivers-common" def __init__(self, channel: str) -> None: @@ -243,6 +244,7 @@ def install(self) -> None: """Install the snap from a channel and the necessary nvidia driver.""" super().install() self._install_nvidia_drivers() + self._install_nvidia_utils() def _install_nvidia_drivers(self) -> None: """Install the NVIDIA driver if not present.""" @@ -252,7 +254,10 @@ def _install_nvidia_drivers(self) -> None: logger.info("Installing NVIDIA driver") apt.add_package(self.pkg, update_cache=True) - cmd = "ubuntu-drivers install --gpgpu" + cmd = ( + "ubuntu-drivers install --gpgpu --package-list " + f"{self.snap_common}/nvidia-installed-pkgs.txt" + ) try: result = subprocess.check_output(cmd.split(), text=True) if "No drivers found for installation" in result: @@ -263,6 +268,22 @@ def _install_nvidia_drivers(self) -> None: logger.error("Failed to install the NVIDIA driver: %s", err) raise err + def _install_nvidia_utils(self) -> None: + """Install the nvidia utils to be able to use nvidia-smi.""" + nvidia_pkg = Path(self.snap_common / "nvidia-installed-pkgs.txt") + if nvidia_pkg.exists(): + installed_pkg = nvidia_pkg.read_text(encoding="utf-8").splitlines()[0] + logger.debug("installed driver from hardware-observer: %s", installed_pkg) + nvidia_version = installed_pkg.split("-")[-2] + + if nvidia_version.isdigit(): + pkg = f"nvidia-utils-{nvidia_version}" + apt.add_package(pkg, update_cache=True) + logger.info("installed %s", pkg) + return + + logger.debug("nvidia-utils not installed") + class StorCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" diff --git a/src/service.py b/src/service.py index b8859714..20ef8c8a 100644 --- a/src/service.py +++ b/src/service.py @@ -382,7 +382,11 @@ class SnapExporter(BaseExporter): def __init__(self, config: ConfigData): """Init.""" self.config = config - self.snap_client = snap.SnapCache()[self.strategy.snap] + + @property + def snap_client(self) -> snap.Snap: + """Return the snap client.""" + return snap.SnapCache()[self.strategy.snap] @staticmethod def hw_tools() -> Set[HWTool]: @@ -470,7 +474,7 @@ def validate_exporter_configs(self) -> Tuple[bool, str]: try: subprocess.check_call("nvidia-smi") return valid, msg - except subprocess.CalledProcessError: + except (FileNotFoundError, subprocess.CalledProcessError): return False, "Failed to communicate with NVIDIA driver. Reboot might solve the issue." From 0b6f1efa674e534b4853c3d0fc24025286b8737a Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Tue, 1 Oct 2024 19:16:43 -0300 Subject: [PATCH 3/9] Add unit tests --- src/hw_tools.py | 35 ++++++---- src/service.py | 7 +- tests/unit/test_hw_tools.py | 126 ++++++++++++++++++++++++++++++++++++ tests/unit/test_service.py | 42 ++++++++++-- 4 files changed, 192 insertions(+), 18 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index 461c112e..9d15aa05 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -254,11 +254,15 @@ def _install_nvidia_drivers(self) -> None: logger.info("Installing NVIDIA driver") apt.add_package(self.pkg, update_cache=True) + + # output what driver was installed helps gets the version installed later cmd = ( "ubuntu-drivers install --gpgpu --package-list " f"{self.snap_common}/nvidia-installed-pkgs.txt" ) try: + # This can be changed to check_call and not rely in the output if this is fixed + # https://github.com/canonical/ubuntu-drivers-common/issues/106 result = subprocess.check_output(cmd.split(), text=True) if "No drivers found for installation" in result: raise ResourceInstallationError(self._name) @@ -271,18 +275,25 @@ def _install_nvidia_drivers(self) -> None: def _install_nvidia_utils(self) -> None: """Install the nvidia utils to be able to use nvidia-smi.""" nvidia_pkg = Path(self.snap_common / "nvidia-installed-pkgs.txt") - if nvidia_pkg.exists(): - installed_pkg = nvidia_pkg.read_text(encoding="utf-8").splitlines()[0] - logger.debug("installed driver from hardware-observer: %s", installed_pkg) - nvidia_version = installed_pkg.split("-")[-2] - - if nvidia_version.isdigit(): - pkg = f"nvidia-utils-{nvidia_version}" - apt.add_package(pkg, update_cache=True) - logger.info("installed %s", pkg) - return - - logger.debug("nvidia-utils not installed") + if not nvidia_pkg.exists(): + logger.debug("nvidia-utils not installed by the charm") + return + + installed_pkg = nvidia_pkg.read_text(encoding="utf-8").splitlines()[0] + logger.debug("installed driver from hardware-observer: %s", installed_pkg) + nvidia_version = installed_pkg.split("-")[-2] + + if not nvidia_version.isdigit(): + logger.warning( + "driver %s is an unexpected format and nvidia-utils was not installed", + installed_pkg, + ) + return + + pkg = f"nvidia-utils-{nvidia_version}" + apt.add_package(pkg, update_cache=True) + logger.info("installed %s", pkg) + return class StorCLIStrategy(TPRStrategyABC): diff --git a/src/service.py b/src/service.py index 20ef8c8a..852ed681 100644 --- a/src/service.py +++ b/src/service.py @@ -472,10 +472,13 @@ def validate_exporter_configs(self) -> Tuple[bool, str]: return valid, msg try: - subprocess.check_call("nvidia-smi") + subprocess.check_call("nvidia-smi", timeout=60) return valid, msg except (FileNotFoundError, subprocess.CalledProcessError): - return False, "Failed to communicate with NVIDIA driver. Reboot might solve the issue." + return ( + False, + "Failed to communicate with NVIDIA driver. Manual intervention is required.", + ) class HardwareExporter(RenderableExporter): diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 4e51fba8..c31024ec 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -24,6 +24,7 @@ from config import SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool, StorageVendor, SystemVendor from hw_tools import ( APTStrategyABC, + DCGMExporterStrategy, HWToolHelper, IPMIDCMIStrategy, IPMISELStrategy, @@ -1265,3 +1266,128 @@ def test_snap_strategy_check(snap_exporter, mock_snap_lib, services, expected): mock_snap_lib.SnapCache.return_value = {"my-snap": mock_snap_client} assert snap_exporter.check() is expected + + +@pytest.fixture +def dcgm_exporter_strategy(mock_snap_lib): + strategy = DCGMExporterStrategy("latest/stable") + yield strategy + + +@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_drivers") +@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_utils") +def test_dcgm_exporter_strategy_install( + mock_install_nvidia_drivers, mock_install_nvidia_utils, dcgm_exporter_strategy +): + dcgm_exporter_strategy.install() + mock_install_nvidia_drivers.assert_called_once() + mock_install_nvidia_utils.assert_called_once() + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.subprocess.check_output") +@mock.patch("hw_tools.Path") +def test_dcgm_install_nvidia_drivers_success( + mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + + dcgm_exporter_strategy._install_nvidia_drivers() + + mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + mock_subprocess.assert_called_once() + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.subprocess.check_output") +@mock.patch("hw_tools.Path") +def test_dcgm_install_nvidia_drivers_already_installed( + mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = True + mock_path.return_value = nvidia_version + + dcgm_exporter_strategy._install_nvidia_drivers() + + mock_add_package.assert_not_called() + mock_subprocess.assert_not_called() + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.subprocess.check_output") +@mock.patch("hw_tools.Path") +def test_dcgm_install_nvidia_drivers_subprocess_exception( + mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + mock_subprocess.side_effect = subprocess.CalledProcessError(1, []) + + with pytest.raises(subprocess.CalledProcessError): + dcgm_exporter_strategy._install_nvidia_drivers() + + mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.subprocess.check_output") +@mock.patch("hw_tools.Path") +def test_dcgm_install_nvidia_drivers_no_drivers_found( + mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + mock_subprocess.return_value = "No drivers found for installation" + + with pytest.raises(ResourceInstallationError): + dcgm_exporter_strategy._install_nvidia_drivers() + + mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.Path") +def test_install_nvidia_utils_driver_installed_from_charm( + mock_path, mock_add_package, dcgm_exporter_strategy +): + driver_version = mock.MagicMock() + driver_version.exists.return_value = True + driver_version.read_text.return_value = ( + "nvidia-headless-no-dkms-535-server\nlibnvidia-cfg1-535-server" + ) + mock_path.return_value = driver_version + + dcgm_exporter_strategy._install_nvidia_utils() + mock_add_package.assert_called_with("nvidia-utils-535", update_cache=True) + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.Path") +def test_install_nvidia_utils_driver_not_installed_from_charm( + mock_path, mock_add_package, dcgm_exporter_strategy +): + driver_version = mock.MagicMock() + driver_version.exists.return_value = False + mock_path.return_value = driver_version + + dcgm_exporter_strategy._install_nvidia_utils() + mock_add_package.assert_not_called() + + +@mock.patch("hw_tools.apt.add_package") +@mock.patch("hw_tools.Path") +def test_install_nvidia_utils_driver_unexpected_format( + mock_path, mock_add_package, dcgm_exporter_strategy +): + driver_version = mock.MagicMock() + driver_version.exists.return_value = True + driver_version.read_text.return_value = "nvidia-my-version-server" + mock_path.return_value = driver_version + + dcgm_exporter_strategy._install_nvidia_utils() + mock_add_package.assert_not_called() diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 6efd4577..5031eaf1 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -971,14 +971,48 @@ def test_snap_exporter_configure(mock_install, snap_exporter, install_result, ex mock_install.assert_called_once() -def test_dcgm_exporter(): +@pytest.fixture +def dcgm_exporter(): mock_config = { "dcgm-snap-channel": "latest/stable", } - exporter = service.DCGMExporter(mock_config) - assert exporter.exporter_name == "dcgm" - assert exporter.hw_tools() == {HWTool.DCGM} + strategy = mock.MagicMock() + exporter.strategy = strategy + yield exporter + strategy.reset_mock() + + +def test_dcgm_exporter(dcgm_exporter): + assert dcgm_exporter.exporter_name == "dcgm" + assert dcgm_exporter.hw_tools() == {HWTool.DCGM} + assert dcgm_exporter.port == 9400 + + +@mock.patch("service.subprocess.check_call") +def test_dcgm_exporter_validate_exporter_configs_success(_, dcgm_exporter): + valid, msg = dcgm_exporter.validate_exporter_configs() + assert valid is True + assert msg == "Exporter config is valid." + + +@pytest.mark.parametrize( + "exception", [FileNotFoundError, service.subprocess.CalledProcessError(1, [])] +) +@mock.patch("service.subprocess.check_call") +def test_dcgm_exporter_validate_exporter_configs_fails(mock_subprocess, dcgm_exporter, exception): + mock_subprocess.side_effect = exception + valid, msg = dcgm_exporter.validate_exporter_configs() + assert valid is False + assert msg == "Failed to communicate with NVIDIA driver. Manual intervention is required." + + +@mock.patch.object(service.BaseExporter, "validate_exporter_configs") +def test_dcgm_exporter_validate_exporter_configs_fails_parent(mock_parent_validate, dcgm_exporter): + mock_parent_validate.return_value = False, "Invalid config: exporter's port" + valid, msg = dcgm_exporter.validate_exporter_configs() + assert valid is False + assert msg == "Invalid config: exporter's port" if __name__ == "__main__": From d1c3141041145f1ff9e2c334f691fdbe2d8dbf00 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 2 Oct 2024 18:40:09 -0300 Subject: [PATCH 4/9] Add suggestions --- src/hw_tools.py | 14 ++++++++------ src/service.py | 5 +++++ tests/unit/test_hw_tools.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index 9d15aa05..5f08e2f3 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -234,7 +234,6 @@ class DCGMExporterStrategy(SnapStrategy): _name = HWTool.DCGM snap_common: Path = Path("/var/snap/dcgm/common/") - pkg = "ubuntu-drivers-common" def __init__(self, channel: str) -> None: """Init.""" @@ -249,11 +248,11 @@ def install(self) -> None: def _install_nvidia_drivers(self) -> None: """Install the NVIDIA driver if not present.""" if Path("/proc/driver/nvidia/version").exists(): - logger.info("Driver already installed in the machine") + logger.info("NVIDIA driver already installed in the machine") return logger.info("Installing NVIDIA driver") - apt.add_package(self.pkg, update_cache=True) + apt.add_package("ubuntu-drivers-common", update_cache=True) # output what driver was installed helps gets the version installed later cmd = ( @@ -265,13 +264,16 @@ def _install_nvidia_drivers(self) -> None: # https://github.com/canonical/ubuntu-drivers-common/issues/106 result = subprocess.check_output(cmd.split(), text=True) if "No drivers found for installation" in result: + logger.warning( + "No drivers for the NVIDIA GPU were found. Manual installation is necessary" + ) raise ResourceInstallationError(self._name) - logger.info("NVIDIA driver installed") - return except subprocess.CalledProcessError as err: logger.error("Failed to install the NVIDIA driver: %s", err) raise err + logger.info("NVIDIA driver installed") + def _install_nvidia_utils(self) -> None: """Install the nvidia utils to be able to use nvidia-smi.""" nvidia_pkg = Path(self.snap_common / "nvidia-installed-pkgs.txt") @@ -290,7 +292,7 @@ def _install_nvidia_utils(self) -> None: ) return - pkg = f"nvidia-utils-{nvidia_version}" + pkg = f"nvidia-utils-{nvidia_version}-server" apt.add_package(pkg, update_cache=True) logger.info("installed %s", pkg) return diff --git a/src/service.py b/src/service.py index 852ed681..04c8a9fd 100644 --- a/src/service.py +++ b/src/service.py @@ -475,6 +475,11 @@ def validate_exporter_configs(self) -> Tuple[bool, str]: subprocess.check_call("nvidia-smi", timeout=60) return valid, msg except (FileNotFoundError, subprocess.CalledProcessError): + logger.warning( + "nvidia-smi is not working. It's necessary to manually remove and install " + "a different NVIDIA driver until nvidia-smi is working. See the docs for more " + "details: https://ubuntu.com/server/docs/nvidia-drivers-installation" + ) return ( False, "Failed to communicate with NVIDIA driver. Manual intervention is required.", diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index c31024ec..bd23439d 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -1363,7 +1363,7 @@ def test_install_nvidia_utils_driver_installed_from_charm( mock_path.return_value = driver_version dcgm_exporter_strategy._install_nvidia_utils() - mock_add_package.assert_called_with("nvidia-utils-535", update_cache=True) + mock_add_package.assert_called_with("nvidia-utils-535-server", update_cache=True) @mock.patch("hw_tools.apt.add_package") From ba9ed08de646892f5274012034ca7d7a061d4cc9 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Thu, 3 Oct 2024 18:55:53 -0300 Subject: [PATCH 5/9] Add NVIDIA strategy --- src/config.py | 1 + src/hw_tools.py | 37 ++++++++--- src/service.py | 40 +++++++----- tests/unit/test_hw_tools.py | 126 ++++++++++++++++++++---------------- tests/unit/test_service.py | 69 ++++++++++++++------ 5 files changed, 174 insertions(+), 99 deletions(-) diff --git a/src/config.py b/src/config.py index a5fe935b..8af9b950 100644 --- a/src/config.py +++ b/src/config.py @@ -65,6 +65,7 @@ class HWTool(str, Enum): REDFISH = "redfish" SMARTCTL_EXPORTER = "smartctl-exporter" DCGM = "dcgm" + NVIDIA_DRIVER = "nvidia-driver" TPR_RESOURCES: t.Dict[HWTool, str] = { diff --git a/src/hw_tools.py b/src/hw_tools.py index bdfc7049..bdca6cc2 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -231,15 +231,20 @@ class DCGMExporterStrategy(SnapStrategy): """DCGM strategy class.""" _name = HWTool.DCGM - snap_common: Path = Path("/var/snap/dcgm/common/") def __init__(self, channel: str) -> None: """Init.""" self.channel = channel + +class NVIDIADriverStrategy(StrategyABC): + """NVIDIA driver strategy class.""" + + _name = HWTool.NVIDIA_DRIVER + snap_common: Path = Path("/var/snap/dcgm/common/") + def install(self) -> None: - """Install the snap from a channel and the necessary nvidia driver.""" - super().install() + """Install the driver and NVIDIA utils.""" self._install_nvidia_drivers() self._install_nvidia_utils() @@ -261,15 +266,17 @@ def _install_nvidia_drivers(self) -> None: # This can be changed to check_call and not rely in the output if this is fixed # https://github.com/canonical/ubuntu-drivers-common/issues/106 result = subprocess.check_output(cmd.split(), text=True) - if "No drivers found for installation" in result: - logger.warning( - "No drivers for the NVIDIA GPU were found. Manual installation is necessary" - ) - raise ResourceInstallationError(self._name) + except subprocess.CalledProcessError as err: logger.error("Failed to install the NVIDIA driver: %s", err) raise err + if "No drivers found for installation" in result: + logger.warning( + "No drivers for the NVIDIA GPU were found. Manual installation is necessary" + ) + raise ResourceInstallationError(self._name) + logger.info("NVIDIA driver installed") def _install_nvidia_utils(self) -> None: @@ -295,6 +302,20 @@ def _install_nvidia_utils(self) -> None: logger.info("installed %s", pkg) return + def check(self) -> bool: + """Check if nvidia-smi is working.""" + try: + subprocess.check_call("nvidia-smi", timeout=60) + return True + except (FileNotFoundError, subprocess.CalledProcessError) as e: + logger.error(e) + logger.warning( + "nvidia-smi is not working. It's necessary to manually remove and install " + "a different NVIDIA driver until nvidia-smi is working. See the docs for more " + "details: https://ubuntu.com/server/docs/nvidia-drivers-installation" + ) + return False + class SmartCtlExporterStrategy(SnapStrategy): """SmartCtl strategy class.""" diff --git a/src/service.py b/src/service.py index 7b5a0899..7abcacbe 100644 --- a/src/service.py +++ b/src/service.py @@ -2,11 +2,9 @@ import os import shutil -import subprocess from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path -from subprocess import CalledProcessError from time import sleep from typing import Any, Dict, Optional, Set, Tuple @@ -24,7 +22,12 @@ HWTool, ) from hardware import get_bmc_address -from hw_tools import DCGMExporterStrategy, SmartCtlExporterStrategy, SnapStrategy +from hw_tools import ( + DCGMExporterStrategy, + NVIDIADriverStrategy, + SmartCtlExporterStrategy, + SnapStrategy, +) logger = getLogger(__name__) @@ -411,6 +414,7 @@ class DCGMExporter(SnapExporter): def __init__(self, charm_dir: Path, config: ConfigData): """Init.""" self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) + self.nvidia_driver_strategy = NVIDIADriverStrategy() self.charm_dir = charm_dir self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv" self.metric_config_value = self.metrics_file.name @@ -418,9 +422,20 @@ def __init__(self, charm_dir: Path, config: ConfigData): def install(self) -> bool: """Install the DCGM exporter and configure custom metrics.""" - if not super().install(): + return all( + (super().install(), self._install_nvidia_drivers(), self._create_custom_metrics()) + ) + + def _install_nvidia_drivers(self) -> bool: + try: + self.nvidia_driver_strategy.install() + return True + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to install the NVIDIA drivers %s", err) return False + def _create_custom_metrics(self) -> bool: + """Create the custom metric files for the DCGM exporter.""" logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") try: shutil.copy(self.metrics_file, self.snap_common) @@ -441,22 +456,13 @@ def validate_exporter_configs(self) -> Tuple[bool, str]: """Validate the if the DCGM exporter is able to run.""" valid, msg = super().validate_exporter_configs() if not valid: - return valid, msg - - try: - subprocess.check_call("nvidia-smi", timeout=60) - return valid, msg - except (FileNotFoundError, CalledProcessError) as e: - logger.error(e) - logger.warning( - "nvidia-smi is not working. It's necessary to manually remove and install " - "a different NVIDIA driver until nvidia-smi is working. See the docs for more " - "details: https://ubuntu.com/server/docs/nvidia-drivers-installation" - ) + return False, msg + if not self.nvidia_driver_strategy.check(): return ( False, - "Failed to communicate with NVIDIA driver. Manual intervention is required.", + "Failed to communicate with NVIDIA driver. See more details in the logs", ) + return valid, msg class SmartCtlExporter(SnapExporter): diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 93999be5..7fe08d3d 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -22,11 +22,11 @@ from config import SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool, StorageVendor, SystemVendor from hw_tools import ( APTStrategyABC, - DCGMExporterStrategy, HWToolHelper, IPMIDCMIStrategy, IPMISELStrategy, IPMISENSORStrategy, + NVIDIADriverStrategy, PercCLIStrategy, ResourceChecksumError, ResourceFileSizeZeroError, @@ -1146,91 +1146,101 @@ def test_snap_strategy_check(snap_exporter, mock_snap_lib, services, expected): @pytest.fixture -def dcgm_exporter_strategy(mock_snap_lib): - strategy = DCGMExporterStrategy("latest/stable") +def mock_check_output(): + with mock.patch("hw_tools.subprocess.check_output") as mocked_check_output: + yield mocked_check_output + + +@pytest.fixture +def mock_check_call(): + with mock.patch("hw_tools.subprocess.check_call") as mocked_check_call: + yield mocked_check_call + + +@pytest.fixture +def mock_apt_lib(): + with mock.patch("hw_tools.apt") as mocked_apt_lib: + yield mocked_apt_lib + + +@pytest.fixture +def mock_path(): + with mock.patch("hw_tools.Path") as mocked_path: + yield mocked_path + + +@pytest.fixture +def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_check_call): + strategy = NVIDIADriverStrategy() yield strategy -@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_drivers") -@mock.patch("hw_tools.DCGMExporterStrategy._install_nvidia_utils") -def test_dcgm_exporter_strategy_install( - mock_install_nvidia_drivers, mock_install_nvidia_utils, dcgm_exporter_strategy +@mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_drivers") +@mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_utils") +def test_nvidia_driver_strategy_install( + mock_install_nvidia_drivers, mock_install_nvidia_utils, nvidia_driver_strategy ): - dcgm_exporter_strategy.install() + nvidia_driver_strategy.install() mock_install_nvidia_drivers.assert_called_once() mock_install_nvidia_utils.assert_called_once() -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.subprocess.check_output") -@mock.patch("hw_tools.Path") -def test_dcgm_install_nvidia_drivers_success( - mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +def test_install_nvidia_drivers_success( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy ): nvidia_version = mock.MagicMock() nvidia_version.exists.return_value = False mock_path.return_value = nvidia_version - dcgm_exporter_strategy._install_nvidia_drivers() + nvidia_driver_strategy._install_nvidia_drivers() - mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) - mock_subprocess.assert_called_once() + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + mock_check_output.assert_called_once() -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.subprocess.check_output") -@mock.patch("hw_tools.Path") -def test_dcgm_install_nvidia_drivers_already_installed( - mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +def test_install_nvidia_drivers_already_installed( + mock_path, mock_apt_lib, nvidia_driver_strategy, mock_check_output ): nvidia_version = mock.MagicMock() nvidia_version.exists.return_value = True mock_path.return_value = nvidia_version - dcgm_exporter_strategy._install_nvidia_drivers() + nvidia_driver_strategy._install_nvidia_drivers() - mock_add_package.assert_not_called() - mock_subprocess.assert_not_called() + mock_apt_lib.add_package.assert_not_called() + mock_check_output.assert_not_called() -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.subprocess.check_output") -@mock.patch("hw_tools.Path") -def test_dcgm_install_nvidia_drivers_subprocess_exception( - mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +def test_install_nvidia_drivers_subprocess_exception( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy ): nvidia_version = mock.MagicMock() nvidia_version.exists.return_value = False mock_path.return_value = nvidia_version - mock_subprocess.side_effect = subprocess.CalledProcessError(1, []) + mock_check_output.side_effect = subprocess.CalledProcessError(1, []) with pytest.raises(subprocess.CalledProcessError): - dcgm_exporter_strategy._install_nvidia_drivers() + nvidia_driver_strategy._install_nvidia_drivers() - mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.subprocess.check_output") -@mock.patch("hw_tools.Path") -def test_dcgm_install_nvidia_drivers_no_drivers_found( - mock_path, mock_subprocess, mock_add_package, dcgm_exporter_strategy +def test_install_nvidia_drivers_no_drivers_found( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy ): nvidia_version = mock.MagicMock() nvidia_version.exists.return_value = False mock_path.return_value = nvidia_version - mock_subprocess.return_value = "No drivers found for installation" + mock_check_output.return_value = "No drivers found for installation" with pytest.raises(ResourceInstallationError): - dcgm_exporter_strategy._install_nvidia_drivers() + nvidia_driver_strategy._install_nvidia_drivers() - mock_add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.Path") def test_install_nvidia_utils_driver_installed_from_charm( - mock_path, mock_add_package, dcgm_exporter_strategy + mock_path, mock_apt_lib, nvidia_driver_strategy ): driver_version = mock.MagicMock() driver_version.exists.return_value = True @@ -1239,35 +1249,41 @@ def test_install_nvidia_utils_driver_installed_from_charm( ) mock_path.return_value = driver_version - dcgm_exporter_strategy._install_nvidia_utils() - mock_add_package.assert_called_with("nvidia-utils-535-server", update_cache=True) + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_called_with("nvidia-utils-535-server", update_cache=True) -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.Path") def test_install_nvidia_utils_driver_not_installed_from_charm( - mock_path, mock_add_package, dcgm_exporter_strategy + mock_path, mock_apt_lib, nvidia_driver_strategy ): driver_version = mock.MagicMock() driver_version.exists.return_value = False mock_path.return_value = driver_version - dcgm_exporter_strategy._install_nvidia_utils() - mock_add_package.assert_not_called() + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_not_called() -@mock.patch("hw_tools.apt.add_package") -@mock.patch("hw_tools.Path") def test_install_nvidia_utils_driver_unexpected_format( - mock_path, mock_add_package, dcgm_exporter_strategy + mock_path, mock_apt_lib, nvidia_driver_strategy ): driver_version = mock.MagicMock() driver_version.exists.return_value = True driver_version.read_text.return_value = "nvidia-my-version-server" mock_path.return_value = driver_version - dcgm_exporter_strategy._install_nvidia_utils() - mock_add_package.assert_not_called() + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_not_called() + + +def test_nvidia_strategy_check(nvidia_driver_strategy): + assert nvidia_driver_strategy.check() is True + + +@pytest.mark.parametrize("exception", [FileNotFoundError, subprocess.CalledProcessError(1, [])]) +def test_nvidia_strategy_check_fails(nvidia_driver_strategy, mock_check_call, exception): + mock_check_call.side_effect = exception + assert nvidia_driver_strategy.check() is False @mock.patch("hw_tools.Path.unlink") diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index c4169228..7965233d 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -4,6 +4,7 @@ import pathlib import tempfile import unittest +from subprocess import CalledProcessError from unittest import mock import pytest @@ -722,14 +723,11 @@ def setUp(self) -> None: """Set up harness for each test case.""" snap_lib_patcher = mock.patch.object(service, "snap") shutil_lib_patcher = mock.patch.object(service, "shutil") - subprocess = mock.patch.object(service, "subprocess") self.mock_snap = snap_lib_patcher.start() self.mock_shutil = shutil_lib_patcher.start() - self.mock_subprocess = subprocess.start() self.addCleanup(snap_lib_patcher.stop) self.addCleanup(shutil_lib_patcher.stop) - self.addCleanup(subprocess.stop) search_path = pathlib.Path(f"{__file__}/../../..").resolve() self.mock_config = { @@ -738,6 +736,7 @@ def setUp(self) -> None: self.exporter = service.DCGMExporter(search_path, self.mock_config) self.exporter.strategy = mock.MagicMock() + self.exporter.nvidia_driver_strategy = mock.MagicMock() def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -745,21 +744,41 @@ def test_exporter_name(self): def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) - def test_install_failed(self): - self.exporter.snap_client.present = False + @mock.patch("service.DCGMExporter._create_custom_metrics") + @mock.patch("service.DCGMExporter._install_nvidia_drivers") + @mock.patch("service.SnapExporter.install") + def test_install_dcgm( + self, + mock_super_install, + mock_nvidia, + mock_custom_metrics, + ): + test_cases = [ + (True, True, True, True), + (False, True, True, False), + (True, False, True, False), + (True, True, False, False), + (False, False, False, False), + ] + for super_install, nvidia_drivers, custom_metrics, expected in test_cases: + with self.subTest( + super_install=super_install, + nvidia_drivers=nvidia_drivers, + custom_metrics=custom_metrics, + expected=expected, + ): + mock_super_install.return_value = super_install + mock_nvidia.return_value = nvidia_drivers + mock_custom_metrics.return_value = custom_metrics - exporter_install_ok = self.exporter.install() + result = self.exporter.install() - self.exporter.strategy.install.assert_called() - self.mock_shutil.copy.assert_not_called() - self.assertFalse(exporter_install_ok) + self.assertEqual(result, expected) - def test_install_success(self): - self.exporter.snap_client.present = True + def test_create_custom_metrics(self): - exporter_install_ok = self.exporter.install() + result = self.exporter._create_custom_metrics() - self.exporter.strategy.install.assert_called() self.mock_shutil.copy.assert_called_with( self.exporter.metrics_file, self.exporter.snap_common ) @@ -767,33 +786,45 @@ def test_install_success(self): {self.exporter.metric_config: self.exporter.metric_config_value} ) self.exporter.snap_client.restart.assert_called_with(reload=True) - self.assertTrue(exporter_install_ok) + self.assertTrue(result) def test_install_metrics_copy_fail(self): self.exporter.snap_client.present = True self.mock_shutil.copy.side_effect = FileNotFoundError - exporter_install_ok = self.exporter.install() + result = self.exporter.install() - self.exporter.strategy.install.assert_called() + self.exporter.snap_client.set.assert_not_called() self.exporter.snap_client.restart.assert_not_called() - self.assertFalse(exporter_install_ok) + self.assertFalse(result) + + def test_install_nvidia_drivers_success(self): + result = self.exporter._install_nvidia_drivers() + self.assertTrue(result) + self.exporter.nvidia_driver_strategy.install.assert_called_once() + + def test_install_nvidia_drivers_fails(self): + self.exporter.nvidia_driver_strategy.install.side_effect = CalledProcessError(1, []) + result = self.exporter._install_nvidia_drivers() + self.assertFalse(result) def test_validate_exporter_configs_success(self): + self.exporter.nvidia_driver_strategy.check.return_value = True valid, msg = self.exporter.validate_exporter_configs() self.assertTrue(valid) self.assertEqual(msg, "Exporter config is valid.") def test_validate_exporter_configs_fails(self): - self.mock_subprocess.check_call.side_effect = FileNotFoundError + self.exporter.nvidia_driver_strategy.check.return_value = False valid, msg = self.exporter.validate_exporter_configs() self.assertFalse(valid) self.assertEqual( - msg, "Failed to communicate with NVIDIA driver. Manual intervention is required." + msg, "Failed to communicate with NVIDIA driver. See more details in the logs" ) @mock.patch.object(service.BaseExporter, "validate_exporter_configs") def test_validate_exporter_configs_fails_parent(self, mock_parent_validate): + self.exporter.nvidia_driver_strategy.check.return_value = True mock_parent_validate.return_value = False, "Invalid config: exporter's port" valid, msg = self.exporter.validate_exporter_configs() self.assertFalse(valid) From 3fd769fcf599b5a78235d2e91d078304a3ac8931 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Fri, 4 Oct 2024 18:51:39 -0300 Subject: [PATCH 6/9] Apply changes --- src/charm.py | 2 +- src/hw_tools.py | 72 ++++++++++++++++++++++------------ src/service.py | 78 ++++++++++++++----------------------- tests/unit/test_hw_tools.py | 23 ++++++----- 4 files changed, 90 insertions(+), 85 deletions(-) diff --git a/src/charm.py b/src/charm.py index 11900a41..ec8a8534 100755 --- a/src/charm.py +++ b/src/charm.py @@ -84,7 +84,7 @@ def exporters(self) -> List[BaseExporter]: exporters.append(SmartCtlExporter(self.model.config)) if stored_tools & DCGMExporter.hw_tools(): - exporters.append(DCGMExporter(self.charm_dir, self.model.config)) + exporters.append(DCGMExporter(self.model.config)) return exporters diff --git a/src/hw_tools.py b/src/hw_tools.py index bdca6cc2..5e74729f 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -5,6 +5,7 @@ import logging import os +import re import shutil import stat import subprocess @@ -196,6 +197,11 @@ def snap(self) -> str: """Snap name.""" return self._name.value + @property + def snap_client(self) -> snap.Snap: + """Return the snap client.""" + return snap.SnapCache()[self.snap] + def install(self) -> None: """Install the snap from a channel.""" try: @@ -228,20 +234,38 @@ def check(self) -> bool: class DCGMExporterStrategy(SnapStrategy): - """DCGM strategy class.""" + """DCGM exporter strategy class.""" _name = HWTool.DCGM + metric_file = Path().parent / "/gpu_metrics/dcgm_metrics.csv" + snap_common: Path = Path("/var/snap/dcgm/common/") def __init__(self, channel: str) -> None: """Init.""" self.channel = channel + def install(self) -> None: + """Install the snap and the custom metrics.""" + super().install() + self._create_custom_metrics() + + def _create_custom_metrics(self) -> None: + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") + try: + shutil.copy(self.metric_file, self.snap_common) + self.snap_client.set({"dcgm-exporter-metrics-file": self.metric_file.name}) + self.snap_client.restart(reload=True) + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure custom DCGM metrics: %s", err) + raise err + class NVIDIADriverStrategy(StrategyABC): """NVIDIA driver strategy class.""" _name = HWTool.NVIDIA_DRIVER - snap_common: Path = Path("/var/snap/dcgm/common/") + installed_pkgs = Path("/tmp/nvidia-installed-pkgs.txt") + pkg_pattern = r"nvidia(?:-[a-zA-Z-]*)?-(\d+)(?:-[a-zA-Z]*)?" def install(self) -> None: """Install the driver and NVIDIA utils.""" @@ -258,10 +282,7 @@ def _install_nvidia_drivers(self) -> None: apt.add_package("ubuntu-drivers-common", update_cache=True) # output what driver was installed helps gets the version installed later - cmd = ( - "ubuntu-drivers install --gpgpu --package-list " - f"{self.snap_common}/nvidia-installed-pkgs.txt" - ) + cmd = f"ubuntu-drivers install --gpgpu --package-list {self.installed_pkgs}" try: # This can be changed to check_call and not rely in the output if this is fixed # https://github.com/canonical/ubuntu-drivers-common/issues/106 @@ -281,26 +302,29 @@ def _install_nvidia_drivers(self) -> None: def _install_nvidia_utils(self) -> None: """Install the nvidia utils to be able to use nvidia-smi.""" - nvidia_pkg = Path(self.snap_common / "nvidia-installed-pkgs.txt") - if not nvidia_pkg.exists(): - logger.debug("nvidia-utils not installed by the charm") + if not self.installed_pkgs.exists(): + logger.debug("Drivers not installed by the charm. Skipping nvidia-utils") return - installed_pkg = nvidia_pkg.read_text(encoding="utf-8").splitlines()[0] - logger.debug("installed driver from hardware-observer: %s", installed_pkg) - nvidia_version = installed_pkg.split("-")[-2] - - if not nvidia_version.isdigit(): + installed_pkgs = self.installed_pkgs.read_text(encoding="utf-8").splitlines() + for line in installed_pkgs: + if match := re.search(self.pkg_pattern, line): + nvidia_version = match.group(1) + logger.debug("installed driver from hardware-observer: %s", line) + pkg = f"nvidia-utils-{nvidia_version}-server" + apt.add_package(pkg, update_cache=True) + logger.info("installed %s", pkg) + break + else: logger.warning( - "driver %s is an unexpected format and nvidia-utils was not installed", - installed_pkg, + "packages installed at %s are in an unexpected format. " + "nvidia-utils was not installed", + self.installed_pkgs, ) - return - pkg = f"nvidia-utils-{nvidia_version}-server" - apt.add_package(pkg, update_cache=True) - logger.info("installed %s", pkg) - return + def remove(self) -> None: + """Drivers shouldn't be removed by the strategy.""" + return None def check(self) -> bool: """Check if nvidia-smi is working.""" @@ -310,9 +334,9 @@ def check(self) -> bool: except (FileNotFoundError, subprocess.CalledProcessError) as e: logger.error(e) logger.warning( - "nvidia-smi is not working. It's necessary to manually remove and install " - "a different NVIDIA driver until nvidia-smi is working. See the docs for more " - "details: https://ubuntu.com/server/docs/nvidia-drivers-installation" + "nvidia-smi is not working. Ensure the correct driver is installed. " + "See the docs for more details: " + "https://ubuntu.com/server/docs/nvidia-drivers-installation" ) return False diff --git a/src/service.py b/src/service.py index 7abcacbe..4744b795 100644 --- a/src/service.py +++ b/src/service.py @@ -1,12 +1,11 @@ """Exporter service helper.""" import os -import shutil from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path from time import sleep -from typing import Any, Dict, Optional, Set, Tuple +from typing import Any, Dict, Optional, Set, Tuple, List from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap @@ -27,6 +26,7 @@ NVIDIADriverStrategy, SmartCtlExporterStrategy, SnapStrategy, + StrategyABC, ) logger = getLogger(__name__) @@ -320,16 +320,17 @@ class SnapExporter(BaseExporter): exporter_name: str channel: str port: int - strategy: SnapStrategy + strategies: List[StrategyABC] def __init__(self, config: ConfigData): """Init.""" self.config = config + self.strategies = [] @property def snap_client(self) -> snap.Snap: """Return the snap client.""" - return snap.SnapCache()[self.strategy.snap] + return snap.SnapCache()[self.exporter_name] @staticmethod def hw_tools() -> Set[HWTool]: @@ -342,10 +343,12 @@ def install(self) -> bool: Returns true if the install is successful, false otherwise. """ try: - self.strategy.install() + for strategy in self.strategies: + strategy.install() self.enable_and_start() return self.snap_client.present is True - except Exception: # pylint: disable=broad-except + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to install %s: %s", strategy.name, err) return False def uninstall(self) -> bool: @@ -354,12 +357,13 @@ def uninstall(self) -> bool: Returns true if the uninstall is successful, false otherwise. """ try: - self.strategy.remove() + for strategy in self.strategies: + strategy.remove() # using the snap.SnapError will result into: # TypeError: catching classes that do not inherit from BaseException is not allowed except Exception as err: # pylint: disable=broad-except - logger.error("Failed to remove %s: %s", self.strategy.snap, err) + logger.error("Failed to remove %s: %s", strategy.name, err) return False return self.snap_client.present is False @@ -384,7 +388,7 @@ def set(self, snap_config: dict) -> bool: try: self.snap_client.set(snap_config, typed=True) except snap.SnapError as err: - logger.error("Failed to update snap configs %s: %s", self.strategy.snap, err) + logger.error("Failed to update snap configs %s: %s", self.exporter_name, err) return False return True @@ -393,14 +397,21 @@ def check_health(self) -> bool: Returns true if the service is healthy, false otherwise. """ - return self.strategy.check() + return all(strategy.check() for strategy in self.strategies) def configure(self) -> bool: """Set the necessary exporter configurations or change snap channel. Returns true if the configure is successful, false otherwise. """ - return self.install() + for strategy in self.strategies: + if isinstance(strategy, SnapStrategy): + try: + return self.install() + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure %s: %s", self.exporter_name, err) + return False + return True class DCGMExporter(SnapExporter): @@ -408,45 +419,15 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 - snap_common: Path = Path("/var/snap/dcgm/common/") - metric_config: str = "dcgm-exporter-metrics-file" - def __init__(self, charm_dir: Path, config: ConfigData): + def __init__(self, config: ConfigData): """Init.""" - self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) - self.nvidia_driver_strategy = NVIDIADriverStrategy() - self.charm_dir = charm_dir - self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv" - self.metric_config_value = self.metrics_file.name + self.strategies = [ + DCGMExporterStrategy(str(config["dcgm-snap-channel"])), + NVIDIADriverStrategy(), + ] super().__init__(config) - def install(self) -> bool: - """Install the DCGM exporter and configure custom metrics.""" - return all( - (super().install(), self._install_nvidia_drivers(), self._create_custom_metrics()) - ) - - def _install_nvidia_drivers(self) -> bool: - try: - self.nvidia_driver_strategy.install() - return True - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to install the NVIDIA drivers %s", err) - return False - - def _create_custom_metrics(self) -> bool: - """Create the custom metric files for the DCGM exporter.""" - logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") - try: - shutil.copy(self.metrics_file, self.snap_common) - self.snap_client.set({self.metric_config: self.metric_config_value}) - self.snap_client.restart(reload=True) - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to configure custom DCGM metrics: %s", err) - return False - - return True - @staticmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" @@ -457,7 +438,8 @@ def validate_exporter_configs(self) -> Tuple[bool, str]: valid, msg = super().validate_exporter_configs() if not valid: return False, msg - if not self.nvidia_driver_strategy.check(): + + if not NVIDIADriverStrategy().check(): return ( False, "Failed to communicate with NVIDIA driver. See more details in the logs", @@ -474,7 +456,7 @@ def __init__(self, config: ConfigData) -> None: """Initialize the SmartctlExporter class.""" self.port = int(config["smartctl-exporter-port"]) self.log_level = str(config["exporter-log-level"]) - self.strategy = SmartCtlExporterStrategy(str(config["smartctl-exporter-snap-channel"])) + self.strategies = [SmartCtlExporterStrategy(str(config["smartctl-exporter-snap-channel"]))] super().__init__(config) @staticmethod diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 7fe08d3d..f544e247 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -1172,6 +1172,7 @@ def mock_path(): @pytest.fixture def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_check_call): strategy = NVIDIADriverStrategy() + strategy.installed_pkgs = mock_path yield strategy @@ -1239,43 +1240,41 @@ def test_install_nvidia_drivers_no_drivers_found( mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) -def test_install_nvidia_utils_driver_installed_from_charm( - mock_path, mock_apt_lib, nvidia_driver_strategy -): +def test_install_nvidia_utils_driver_installed_from_charm(mock_apt_lib, nvidia_driver_strategy): driver_version = mock.MagicMock() driver_version.exists.return_value = True driver_version.read_text.return_value = ( "nvidia-headless-no-dkms-535-server\nlibnvidia-cfg1-535-server" ) - mock_path.return_value = driver_version + nvidia_driver_strategy.installed_pkgs = driver_version nvidia_driver_strategy._install_nvidia_utils() mock_apt_lib.add_package.assert_called_with("nvidia-utils-535-server", update_cache=True) def test_install_nvidia_utils_driver_not_installed_from_charm( - mock_path, mock_apt_lib, nvidia_driver_strategy + mock_apt_lib, nvidia_driver_strategy ): - driver_version = mock.MagicMock() - driver_version.exists.return_value = False - mock_path.return_value = driver_version + nvidia_driver_strategy.installed_pkgs.exists.return_value = False nvidia_driver_strategy._install_nvidia_utils() mock_apt_lib.add_package.assert_not_called() -def test_install_nvidia_utils_driver_unexpected_format( - mock_path, mock_apt_lib, nvidia_driver_strategy -): +def test_install_nvidia_utils_driver_unexpected_format(mock_apt_lib, nvidia_driver_strategy): driver_version = mock.MagicMock() driver_version.exists.return_value = True driver_version.read_text.return_value = "nvidia-my-version-server" - mock_path.return_value = driver_version + nvidia_driver_strategy.installed_pkgs = driver_version nvidia_driver_strategy._install_nvidia_utils() mock_apt_lib.add_package.assert_not_called() +def test_nvidia_strategy_remove(nvidia_driver_strategy): + assert nvidia_driver_strategy.remove() is None + + def test_nvidia_strategy_check(nvidia_driver_strategy): assert nvidia_driver_strategy.check() is True From 1272fbf40a283dc66c00b5a0aa6e9784f62d5a45 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Fri, 4 Oct 2024 19:32:04 -0300 Subject: [PATCH 7/9] fix lint --- src/hw_tools.py | 20 +++++++++++--------- src/service.py | 6 +++--- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index 5e74729f..b9db254c 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -193,43 +193,45 @@ class SnapStrategy(StrategyABC): channel: str @property - def snap(self) -> str: + def snap_name(self) -> str: """Snap name.""" return self._name.value @property def snap_client(self) -> snap.Snap: """Return the snap client.""" - return snap.SnapCache()[self.snap] + return snap.SnapCache()[self.snap_name] def install(self) -> None: """Install the snap from a channel.""" try: - snap.add(self.snap, channel=self.channel) - logger.info("Installed %s from channel: %s", self.snap, self.channel) + snap.add(self.snap_name, channel=self.channel) + logger.info("Installed %s from channel: %s", self.snap_name, self.channel) # using the snap.SnapError will result into: # TypeError: catching classes that do not inherit from BaseException is not allowed except Exception as err: # pylint: disable=broad-except - logger.error("Failed to install %s from channel: %s: %s", self.snap, self.channel, err) + logger.error( + "Failed to install %s from channel: %s: %s", self.snap_name, self.channel, err + ) raise err def remove(self) -> None: """Remove the snap.""" try: - snap.remove([self.snap]) + snap.remove([self.snap_name]) # using the snap.SnapError will result into: # TypeError: catching classes that do not inherit from BaseException is not allowed except Exception as err: # pylint: disable=broad-except - logger.error("Failed to remove %s: %s", self.snap, err) + logger.error("Failed to remove %s: %s", self.snap_name, err) raise err def check(self) -> bool: """Check if all services are active.""" return all( service.get("active", False) - for service in snap.SnapCache()[self.snap].services.values() + for service in snap.SnapCache()[self.snap_name].services.values() ) @@ -260,7 +262,7 @@ def _create_custom_metrics(self) -> None: raise err -class NVIDIADriverStrategy(StrategyABC): +class NVIDIADriverStrategy(APTStrategyABC): """NVIDIA driver strategy class.""" _name = HWTool.NVIDIA_DRIVER diff --git a/src/service.py b/src/service.py index 4744b795..e56d71d7 100644 --- a/src/service.py +++ b/src/service.py @@ -5,7 +5,7 @@ from logging import getLogger from pathlib import Path from time import sleep -from typing import Any, Dict, Optional, Set, Tuple, List +from typing import Any, Dict, List, Optional, Set, Tuple, Union from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap @@ -22,11 +22,11 @@ ) from hardware import get_bmc_address from hw_tools import ( + APTStrategyABC, DCGMExporterStrategy, NVIDIADriverStrategy, SmartCtlExporterStrategy, SnapStrategy, - StrategyABC, ) logger = getLogger(__name__) @@ -320,7 +320,7 @@ class SnapExporter(BaseExporter): exporter_name: str channel: str port: int - strategies: List[StrategyABC] + strategies: List[Union[SnapStrategy, APTStrategyABC]] def __init__(self, config: ConfigData): """Init.""" From c65725d3275faae5836758733e6e3b3c71618def Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Mon, 7 Oct 2024 16:57:56 -0300 Subject: [PATCH 8/9] add unit tests --- src/hw_tools.py | 8 +- src/service.py | 4 +- tests/unit/test_hw_tools.py | 48 +++++++++++- tests/unit/test_service.py | 148 +++++++++++------------------------- 4 files changed, 97 insertions(+), 111 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index b9db254c..b83cd565 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -197,6 +197,11 @@ def snap_name(self) -> str: """Snap name.""" return self._name.value + @property + def snap_common(self) -> Path: + """Snap common directory.""" + return Path(f"/var/snap/{self.snap_name}/common/") + @property def snap_client(self) -> snap.Snap: """Return the snap client.""" @@ -239,8 +244,7 @@ class DCGMExporterStrategy(SnapStrategy): """DCGM exporter strategy class.""" _name = HWTool.DCGM - metric_file = Path().parent / "/gpu_metrics/dcgm_metrics.csv" - snap_common: Path = Path("/var/snap/dcgm/common/") + metric_file = Path.cwd() / "src/gpu_metrics/dcgm_metrics.csv" def __init__(self, channel: str) -> None: """Init.""" diff --git a/src/service.py b/src/service.py index e56d71d7..38b30a0a 100644 --- a/src/service.py +++ b/src/service.py @@ -325,7 +325,6 @@ class SnapExporter(BaseExporter): def __init__(self, config: ConfigData): """Init.""" self.config = config - self.strategies = [] @property def snap_client(self) -> snap.Snap: @@ -407,7 +406,8 @@ def configure(self) -> bool: for strategy in self.strategies: if isinstance(strategy, SnapStrategy): try: - return self.install() + # refresh the snap for a new channel if necessary + strategy.install() except Exception as err: # pylint: disable=broad-except logger.error("Failed to configure %s: %s", self.exporter_name, err) return False diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index f544e247..37eac304 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -22,6 +22,7 @@ from config import SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool, StorageVendor, SystemVendor from hw_tools import ( APTStrategyABC, + DCGMExporterStrategy, HWToolHelper, IPMIDCMIStrategy, IPMISELStrategy, @@ -1039,7 +1040,7 @@ def mock_snap_lib(): def test_snap_strategy_name(snap_exporter): - assert snap_exporter.snap == "my-snap" + assert snap_exporter.snap_name == "my-snap" def test_snap_strategy_channel(snap_exporter): @@ -1048,7 +1049,9 @@ def test_snap_strategy_channel(snap_exporter): def test_snap_strategy_install_success(snap_exporter, mock_snap_lib): snap_exporter.install() - mock_snap_lib.add.assert_called_once_with(snap_exporter.snap, channel=snap_exporter.channel) + mock_snap_lib.add.assert_called_once_with( + snap_exporter.snap_name, channel=snap_exporter.channel + ) def test_snap_strategy_install_fail(snap_exporter, mock_snap_lib): @@ -1060,7 +1063,7 @@ def test_snap_strategy_install_fail(snap_exporter, mock_snap_lib): def test_snap_strategy_remove_success(snap_exporter, mock_snap_lib): snap_exporter.remove() - mock_snap_lib.remove.assert_called_once_with([snap_exporter.snap]) + mock_snap_lib.remove.assert_called_once_with([snap_exporter.snap_name]) def test_snap_strategy_remove_fail(snap_exporter, mock_snap_lib): @@ -1169,6 +1172,12 @@ def mock_path(): yield mocked_path +@pytest.fixture +def mock_shutil_copy(): + with mock.patch("hw_tools.shutil.copy") as mocked_copy: + yield mocked_copy + + @pytest.fixture def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_check_call): strategy = NVIDIADriverStrategy() @@ -1176,6 +1185,39 @@ def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_chec yield strategy +@pytest.fixture +def dcgm_exporter_strategy(mock_snap_lib, mock_shutil_copy): + yield DCGMExporterStrategy("latest/stable") + + +@mock.patch("hw_tools.DCGMExporterStrategy._create_custom_metrics") +def test_dcgm_exporter_install(mock_custom_metrics, dcgm_exporter_strategy): + assert dcgm_exporter_strategy.install() is None + mock_custom_metrics.assert_called_once() + + +def test_dcgm_create_custom_metrics(dcgm_exporter_strategy, mock_shutil_copy, mock_snap_lib): + assert dcgm_exporter_strategy._create_custom_metrics() is None + mock_shutil_copy.assert_called_once_with( + Path.cwd() / "src/gpu_metrics/dcgm_metrics.csv", Path("/var/snap/dcgm/common") + ) + dcgm_exporter_strategy.snap_client.set.assert_called_once_with( + {"dcgm-exporter-metrics-file": "dcgm_metrics.csv"} + ) + dcgm_exporter_strategy.snap_client.restart.assert_called_once_with(reload=True) + + +def test_dcgm_create_custom_metrics_copy_fail( + dcgm_exporter_strategy, mock_shutil_copy, mock_snap_lib +): + mock_shutil_copy.side_effect = FileNotFoundError + with pytest.raises(FileNotFoundError): + dcgm_exporter_strategy._create_custom_metrics() + + dcgm_exporter_strategy.snap_client.set.assert_not_called() + dcgm_exporter_strategy.snap_client.restart.assert_not_called() + + @mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_drivers") @mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_utils") def test_nvidia_driver_strategy_install( diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 7965233d..3c75577b 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -4,7 +4,6 @@ import pathlib import tempfile import unittest -from subprocess import CalledProcessError from unittest import mock import pytest @@ -722,21 +721,18 @@ class TestDCGMSnapExporter(unittest.TestCase): def setUp(self) -> None: """Set up harness for each test case.""" snap_lib_patcher = mock.patch.object(service, "snap") - shutil_lib_patcher = mock.patch.object(service, "shutil") self.mock_snap = snap_lib_patcher.start() - self.mock_shutil = shutil_lib_patcher.start() self.addCleanup(snap_lib_patcher.stop) - self.addCleanup(shutil_lib_patcher.stop) - search_path = pathlib.Path(f"{__file__}/../../..").resolve() - self.mock_config = { - "dcgm-snap-channel": "latest/stable", - } - - self.exporter = service.DCGMExporter(search_path, self.mock_config) - self.exporter.strategy = mock.MagicMock() - self.exporter.nvidia_driver_strategy = mock.MagicMock() + self.exporter = service.DCGMExporter( + { + "dcgm-snap-channel": "latest/stable", + } + ) + self.snap_strategy = mock.MagicMock(spec=service.DCGMExporterStrategy) + self.nvidia_strategy = mock.MagicMock(spec=service.NVIDIADriverStrategy) + self.exporter.strategies = [self.snap_strategy, self.nvidia_strategy] def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -744,78 +740,14 @@ def test_exporter_name(self): def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) - @mock.patch("service.DCGMExporter._create_custom_metrics") - @mock.patch("service.DCGMExporter._install_nvidia_drivers") - @mock.patch("service.SnapExporter.install") - def test_install_dcgm( - self, - mock_super_install, - mock_nvidia, - mock_custom_metrics, - ): - test_cases = [ - (True, True, True, True), - (False, True, True, False), - (True, False, True, False), - (True, True, False, False), - (False, False, False, False), - ] - for super_install, nvidia_drivers, custom_metrics, expected in test_cases: - with self.subTest( - super_install=super_install, - nvidia_drivers=nvidia_drivers, - custom_metrics=custom_metrics, - expected=expected, - ): - mock_super_install.return_value = super_install - mock_nvidia.return_value = nvidia_drivers - mock_custom_metrics.return_value = custom_metrics - - result = self.exporter.install() - - self.assertEqual(result, expected) - - def test_create_custom_metrics(self): - - result = self.exporter._create_custom_metrics() - - self.mock_shutil.copy.assert_called_with( - self.exporter.metrics_file, self.exporter.snap_common - ) - self.exporter.snap_client.set.assert_called_with( - {self.exporter.metric_config: self.exporter.metric_config_value} - ) - self.exporter.snap_client.restart.assert_called_with(reload=True) - self.assertTrue(result) - - def test_install_metrics_copy_fail(self): - self.exporter.snap_client.present = True - self.mock_shutil.copy.side_effect = FileNotFoundError - - result = self.exporter.install() - - self.exporter.snap_client.set.assert_not_called() - self.exporter.snap_client.restart.assert_not_called() - self.assertFalse(result) - - def test_install_nvidia_drivers_success(self): - result = self.exporter._install_nvidia_drivers() - self.assertTrue(result) - self.exporter.nvidia_driver_strategy.install.assert_called_once() - - def test_install_nvidia_drivers_fails(self): - self.exporter.nvidia_driver_strategy.install.side_effect = CalledProcessError(1, []) - result = self.exporter._install_nvidia_drivers() - self.assertFalse(result) - - def test_validate_exporter_configs_success(self): - self.exporter.nvidia_driver_strategy.check.return_value = True + @mock.patch("service.NVIDIADriverStrategy.check", return_value=True) + def test_validate_exporter_configs_success(self, _): valid, msg = self.exporter.validate_exporter_configs() self.assertTrue(valid) self.assertEqual(msg, "Exporter config is valid.") - def test_validate_exporter_configs_fails(self): - self.exporter.nvidia_driver_strategy.check.return_value = False + @mock.patch("service.NVIDIADriverStrategy.check", return_value=False) + def test_validate_exporter_configs_fails(self, _): valid, msg = self.exporter.validate_exporter_configs() self.assertFalse(valid) self.assertEqual( @@ -824,7 +756,6 @@ def test_validate_exporter_configs_fails(self): @mock.patch.object(service.BaseExporter, "validate_exporter_configs") def test_validate_exporter_configs_fails_parent(self, mock_parent_validate): - self.exporter.nvidia_driver_strategy.check.return_value = True mock_parent_validate.return_value = False, "Invalid config: exporter's port" valid, msg = self.exporter.validate_exporter_configs() self.assertFalse(valid) @@ -900,25 +831,27 @@ def test_write_to_file_not_a_directory_error(self, mock_open): @pytest.fixture def snap_exporter(): - my_strategy = mock.MagicMock(spec=service.SnapStrategy) + my_snap_strategy = mock.MagicMock(spec=service.SnapStrategy) + my_apt_strategy = mock.MagicMock(spec=service.APTStrategyABC) class MySnapExporter(service.SnapExporter): exporter_name = "my-exporter" channel = "my-channel" - strategy = my_strategy - - mock_config = { - "dcgm-snap-channel": "latest/stable", - } + strategies = [my_snap_strategy, my_apt_strategy] with mock.patch("service.snap.SnapCache"): - exporter = MySnapExporter(mock_config) + exporter = MySnapExporter( + { + "dcgm-snap-channel": "latest/stable", + } + ) exporter.snap_client.services = {"service1": {}, "service2": {}} yield exporter - my_strategy.reset_mock() + my_snap_strategy.reset_mock() + my_apt_strategy.reset_mock() def test_snap_exporter_hw_tools(snap_exporter): @@ -926,16 +859,16 @@ def test_snap_exporter_hw_tools(snap_exporter): assert snap_exporter.hw_tools() == set() -def test_snap_exporter_install(snap_exporter): - snap_exporter.strategy.install.return_value = True +def test_snap_exporter_install_success(snap_exporter): snap_exporter.snap_client.present = True assert snap_exporter.install() is True - snap_exporter.strategy.install.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.install.assert_called_once() def test_snap_exporter_install_fail(snap_exporter): - snap_exporter.strategy.install.side_effect = ValueError + snap_exporter.strategies[0].install.side_effect = ValueError assert snap_exporter.install() is False @@ -944,11 +877,12 @@ def test_snap_exporter_uninstall(snap_exporter): snap_exporter.snap_client.present = False assert snap_exporter.uninstall() is True - snap_exporter.strategy.remove.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.remove.assert_called_once() def test_snap_exporter_uninstall_fail(snap_exporter): - snap_exporter.strategy.remove.side_effect = ValueError + snap_exporter.strategies[0].remove.side_effect = ValueError assert snap_exporter.uninstall() is False @@ -957,7 +891,8 @@ def test_snap_exporter_uninstall_present(snap_exporter): snap_exporter.snap_client.present = True assert snap_exporter.uninstall() is False - snap_exporter.strategy.remove.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.remove.assert_called_once() def test_snap_exporter_enable_and_start(snap_exporter): @@ -990,20 +925,25 @@ def test_snap_exporter_set_failed(snap_exporter): def test_snap_exporter_check_health(snap_exporter): snap_exporter.check_health() - snap_exporter.strategy.check.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.check.assert_called_once() + +@mock.patch("service.isinstance", return_value=True) +def test_snap_exporter_configure(_, snap_exporter): + assert snap_exporter.configure() is True + for strategy in snap_exporter.strategies: + strategy.install.assert_called_once() -@pytest.mark.parametrize("install_result, expected_result", [(True, True), (False, False)]) -@mock.patch("service.SnapExporter.install") -def test_snap_exporter_configure(mock_install, snap_exporter, install_result, expected_result): - mock_install.return_value = install_result - assert snap_exporter.configure() is expected_result - mock_install.assert_called_once() +@mock.patch("service.isinstance", return_value=True) +def test_snap_exporter_configure_exception(_, snap_exporter): + snap_exporter.strategies[0].install.side_effect = snap.SnapError + assert snap_exporter.configure() is False @pytest.mark.parametrize("result, expected_result", [(True, True), (False, False)]) -@mock.patch("service.SnapExporter.install") +@mock.patch("service.SmartCtlExporterStrategy.install") @mock.patch("service.SnapExporter.set") def test_smartctl_exporter_configure(mock_set, mock_install, result, expected_result): mock_config = { From 3fee50a1c4a244f9b5c6f2ba851cc5235df80e00 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Tue, 8 Oct 2024 16:16:05 +0800 Subject: [PATCH 9/9] refactor: Remove HwToolHelper - Restructure StrategyABC to have consistence interface - Remove HWToolHelper and move the logic into service layer - Change the `install` function return on BaseExporter for blocked message required - New ResourceMixin embedded into BaseExporter partial fix: #272 --- src/charm.py | 36 +++------- src/hw_tools.py | 184 +++++++----------------------------------------- src/service.py | 184 ++++++++++++++++++++++++++++++------------------ 3 files changed, 149 insertions(+), 255 deletions(-) diff --git a/src/charm.py b/src/charm.py index ec8a8534..3ca62622 100755 --- a/src/charm.py +++ b/src/charm.py @@ -12,7 +12,7 @@ from ops.framework import EventBase, StoredState from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus -from hw_tools import HWTool, HWToolHelper, detect_available_tools, remove_legacy_smartctl_exporter +from hw_tools import HWTool, detect_available_tools, remove_legacy_smartctl_exporter from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter logger = logging.getLogger(__name__) @@ -26,7 +26,6 @@ class HardwareObserverCharm(ops.CharmBase): def __init__(self, *args: Any) -> None: """Init.""" super().__init__(*args) - self.hw_tool_helper = HWToolHelper() # Add refresh_events to COSAgentProvider to update relation data when # config changed (default behavior) and upgrade charm. This is useful @@ -75,6 +74,7 @@ def exporters(self) -> List[BaseExporter]: exporters.append( HardwareExporter( self.charm_dir, + self.model.resources, self.model.config, stored_tools, ) @@ -134,47 +134,30 @@ def _on_install_or_upgrade(self, event: EventBase) -> None: remove_legacy_smartctl_exporter() - stored_tools = self.get_stored_tools() - msg: str resource_installed: bool - # Install hw tools - resource_installed, msg = self.hw_tool_helper.install(self.model.resources, stored_tools) - - self._stored.resource_installed = resource_installed - if not resource_installed: - logger.warning(msg) - self.model.unit.status = BlockedStatus(msg) - return - # Install exporter services and resources for exporter in self.exporters: - exporter_install_ok = exporter.install() + exporter_install_ok, msg = exporter.install() if not exporter_install_ok: resource_installed = False self._stored.resource_installed = resource_installed - msg = f"Exporter {exporter.exporter_name} install failed" logger.warning(msg) self.model.unit.status = BlockedStatus(msg) return + self._stored.resource_installed = True self._on_update_status(event) def _on_remove(self, _: EventBase) -> None: """Remove everything when charm is being removed.""" logger.info("Start to remove.") - # Remove binary tool - self.hw_tool_helper.remove( - self.model.resources, - self.get_stored_tools(), - ) - self._stored.resource_installed = False - # Remove exporters for exporter in self.exporters: self.model.unit.status = MaintenanceStatus(f"Removing {exporter.exporter_name}...") exporter.uninstall() + self._stored.resource_installed = False def _on_update_status(self, _: EventBase) -> None: # noqa: C901 """Update the charm's status.""" @@ -187,16 +170,15 @@ def _on_update_status(self, _: EventBase) -> None: # noqa: C901 return for exporter in self.exporters: + resource_ready, resource_ready_message = exporter.check_resources() + if not resource_ready: + self.model.unit.status = BlockedStatus(resource_ready_message) + return config_valid, config_valid_message = exporter.validate_exporter_configs() if not config_valid: self.model.unit.status = BlockedStatus(config_valid_message) return - hw_tool_ok, error_msg = self.hw_tool_helper.check_installed(self.get_stored_tools()) - if not hw_tool_ok: - self.model.unit.status = BlockedStatus(error_msg) - return - # Check health of all exporters exporters_health = [self._check_exporter_health(exporter) for exporter in self.exporters] diff --git a/src/hw_tools.py b/src/hw_tools.py index b83cd565..38545fbd 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -11,14 +11,14 @@ import subprocess from abc import ABCMeta, abstractmethod from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import List, Set import requests import urllib3 from charms.operator_libs_linux.v0 import apt from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap -from ops.model import ModelError, Resources +from ops.model import Resources import apt_helpers from checksum import ( @@ -159,32 +159,28 @@ def name(self) -> HWTool: def check(self) -> bool: """Check installation status of the tool.""" - -class APTStrategyABC(StrategyABC, metaclass=ABCMeta): - """Strategy for apt install tool.""" - - @abstractmethod def install(self) -> None: """Installation details.""" - @abstractmethod def remove(self) -> None: """Remove details.""" - # Note: The repo and keys should be remove when removing - # hook is triggered. But currently the apt lib don't have - # the remove option. class TPRStrategyABC(StrategyABC, metaclass=ABCMeta): """Third party resource strategy class.""" - @abstractmethod - def install(self, path: Path) -> None: - """Installation details.""" + resources: Resources - @abstractmethod - def remove(self) -> None: - """Remove details.""" + def __init__(self, resources: Resources) -> None: + """Inject the Resource object for fetching resource.""" + self.resources = resources + + def _fetch_tool(self) -> Path: + path = self.resources.fetch(TPR_RESOURCES[self._name]) + if path is None or file_is_empty(path): + logger.info("Skipping %s resource install since empty file was detected.", self.name) + raise ResourceFileSizeZeroError(tool=self._name, path=path) + return path class SnapStrategy(StrategyABC): @@ -266,7 +262,7 @@ def _create_custom_metrics(self) -> None: raise err -class NVIDIADriverStrategy(APTStrategyABC): +class NVIDIADriverStrategy(StrategyABC): """NVIDIA driver strategy class.""" _name = HWTool.NVIDIA_DRIVER @@ -364,11 +360,10 @@ class StorCLIStrategy(TPRStrategyABC): origin_path = Path("/opt/MegaRAID/storcli/storcli64") symlink_bin = TOOLS_DIR / HWTool.STORCLI.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install storcli.""" - if file_is_empty(path): - logger.info("Skipping StorCLI resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() + if not validate_checksum(STORCLI_VERSION_INFOS, path): raise ResourceChecksumError install_deb(self.name, path) @@ -392,11 +387,9 @@ class PercCLIStrategy(TPRStrategyABC): origin_path = Path("/opt/MegaRAID/perccli/perccli64") symlink_bin = TOOLS_DIR / HWTool.PERCCLI.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install perccli.""" - if file_is_empty(path): - logger.info("Skipping PERCCLI resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(PERCCLI_VERSION_INFOS, path): raise ResourceChecksumError install_deb(self.name, path) @@ -419,11 +412,9 @@ class SAS2IRCUStrategy(TPRStrategyABC): _name = HWTool.SAS2IRCU symlink_bin = TOOLS_DIR / HWTool.SAS2IRCU.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install sas2ircu.""" - if file_is_empty(path): - logger.info("Skipping SAS2IRCU resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(SAS2IRCU_VERSION_INFOS, path): raise ResourceChecksumError make_executable(path) @@ -445,18 +436,16 @@ class SAS3IRCUStrategy(SAS2IRCUStrategy): _name = HWTool.SAS3IRCU symlink_bin = TOOLS_DIR / HWTool.SAS3IRCU.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install sas3ircu.""" - if file_is_empty(path): - logger.info("Skipping SAS3IRCU resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(SAS3IRCU_VERSION_INFOS, path): raise ResourceChecksumError make_executable(path) symlink(src=path, dst=self.symlink_bin) -class SSACLIStrategy(APTStrategyABC): +class SSACLIStrategy(StrategyABC): """Strategy for install ssacli.""" _name = HWTool.SSACLI @@ -489,7 +478,7 @@ def check(self) -> bool: return check_deb_pkg_installed(self.pkg) -class IPMIStrategy(APTStrategyABC): +class IPMIStrategy(StrategyABC): """Strategy for installing ipmi.""" freeipmi_pkg = "freeipmi-tools" @@ -730,126 +719,3 @@ def remove_legacy_smartctl_exporter() -> None: smartctl_exporter_config_path.unlink() if smartctl_exporter.exists(): shutil.rmtree("/opt/SmartCtlExporter/") - - -class HWToolHelper: - """Helper to install vendor's or hardware related tools.""" - - @property - def strategies(self) -> List[StrategyABC]: - """Define strategies for every tools.""" - return [ - StorCLIStrategy(), - PercCLIStrategy(), - SAS2IRCUStrategy(), - SAS3IRCUStrategy(), - SSACLIStrategy(), - IPMISELStrategy(), - IPMIDCMIStrategy(), - IPMISENSORStrategy(), - RedFishStrategy(), - ] - - def fetch_tools( # pylint: disable=W0102 - self, - resources: Resources, - hw_available: Set[HWTool] = set(), - ) -> Dict[HWTool, Path]: - """Fetch resource from juju if it's VENDOR_TOOLS.""" - fetch_tools: Dict[HWTool, Path] = {} - # Fetch all tools from juju resources - for tool, resource in TPR_RESOURCES.items(): - if tool not in hw_available: - logger.info("Skip fetch tool: %s", tool) - continue - try: - path = resources.fetch(resource) - fetch_tools[tool] = path - except ModelError: - logger.warning("Fail to fetch tool: %s", resource) - - return fetch_tools - - def check_missing_resources( - self, hw_available: Set[HWTool], fetch_tools: Dict[HWTool, Path] - ) -> Tuple[bool, str]: - """Check if required resources are not been uploaded.""" - missing_resources = [] - for tool in hw_available: - if tool in TPR_RESOURCES: - # Resource hasn't been uploaded - if tool not in fetch_tools: - missing_resources.append(TPR_RESOURCES[tool]) - # Uploaded but file size is zero - path = fetch_tools.get(tool) - if path and file_is_empty(path): - logger.warning( - "Empty resource file detected for tool %s at path %s", tool, path - ) - missing_resources.append(TPR_RESOURCES[tool]) - if missing_resources: - return False, f"Missing resources: {missing_resources}" - return True, "" - - def install(self, resources: Resources, hw_available: Set[HWTool]) -> Tuple[bool, str]: - """Install tools.""" - logger.info("hw_available: %s", hw_available) - - fetch_tools: Dict[HWTool, Path] = self.fetch_tools(resources, hw_available) - - ok, msg = self.check_missing_resources(hw_available, fetch_tools) - if not ok: - return ok, msg - - fail_strategies = [] - - # Iterate over each strategy and execute. - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - try: - if isinstance(strategy, TPRStrategyABC): - path = fetch_tools.get(strategy.name) # pylint: disable=W0212 - if path: - strategy.install(path) - - elif isinstance(strategy, (APTStrategyABC, SnapStrategy)): - strategy.install() # pylint: disable=E1120 - logger.info("Strategy %s install success", strategy) - except ( - ResourceFileSizeZeroError, - OSError, - apt.PackageError, - ResourceChecksumError, - ) as e: - logger.warning("Strategy %s install fail: %s", strategy, e) - fail_strategies.append(strategy.name) - - if fail_strategies: - return False, f"Fail strategies: {fail_strategies}" - return True, "" - - # pylint: disable=W0613 - def remove(self, resources: Resources, hw_available: Set[HWTool]) -> None: - """Execute all remove strategies.""" - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - if isinstance(strategy, (TPRStrategyABC, APTStrategyABC, SnapStrategy)): - strategy.remove() - logger.info("Strategy %s remove success", strategy) - - def check_installed(self, hw_available: Set[HWTool]) -> Tuple[bool, str]: - """Check tool status.""" - failed_checks: Set[HWTool] = set() - - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - ok = strategy.check() - if not ok: - failed_checks.add(strategy.name) - - if failed_checks: - return False, f"Fail strategy checks: {failed_checks}" - return True, "" diff --git a/src/service.py b/src/service.py index 38b30a0a..19ad2968 100644 --- a/src/service.py +++ b/src/service.py @@ -5,28 +5,42 @@ from logging import getLogger from pathlib import Path from time import sleep -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple +from charms.operator_libs_linux.v0 import apt from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap from jinja2 import Environment, FileSystemLoader -from ops.model import ConfigData +from ops.model import ConfigData, Resources from redfish import redfish_client from redfish.rest.v1 import InvalidCredentialsError +from checksum import ( + ResourceChecksumError, +) from config import ( HARDWARE_EXPORTER_COLLECTOR_MAPPING, HARDWARE_EXPORTER_SETTINGS, ExporterSettings, HWTool, + TPR_RESOURCES, ) from hardware import get_bmc_address from hw_tools import ( - APTStrategyABC, DCGMExporterStrategy, NVIDIADriverStrategy, SmartCtlExporterStrategy, SnapStrategy, + StrategyABC, + StorCLIStrategy, + PercCLIStrategy, + SAS2IRCUStrategy, + SAS3IRCUStrategy, + IPMISELStrategy, + IPMIDCMIStrategy, + IPMISENSORStrategy, + RedFishStrategy, + ResourceFileSizeZeroError, ) logger = getLogger(__name__) @@ -39,21 +53,72 @@ class ExporterError(Exception): """ -class BaseExporter(ABC): - """A class representing the exporter and the metric endpoints.""" +class ResourceMixin(): + """Mixin to handle multiple resource strategy install/remove/check.""" - config: ConfigData - exporter_name: str - port: int - log_level: str + strategies: List[StrategyABC] + available_tools: Set[HWTool] @staticmethod @abstractmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" + def install_resources(self) -> Tuple[bool, str]: + """Run install for all strategies.""" + failed_strategies = [] + missing_resources = [] + for strategy in self.strategies: + if strategy.name in self.available_tools: + try: + strategy.install() + except ResourceFileSizeZeroError as e: + missing_resources.append(TPR_RESOURCES[strategy.name]) + except ( + OSError, + apt.PackageError, + ResourceChecksumError, + ) as e: + logger.warning("Strategy %s install fail: %s", strategy, e) + failed_strategies.append(strategy.name) + if missing_resources: + return False, f"Missing resources: {missing_resources}" + if failed_strategies: + return False, f"Fail strategies: {failed_strategies}" + return True, "" + + def remove_resources(self) -> bool: + """Run remove for strategies.""" + for strategy in self.strategies: + if strategy.name in self.available_tools: + strategy.remove() + return True + + def check_resources(self) -> Tuple[bool, str]: + """Run check for strategies.""" + failed_checks: Set[HWTool] = set() + for strategy in self.strategies: + if strategy.name in self.available_tools: + continue + strategy.check() + ok = strategy.check() + if not ok: + failed_checks.add(strategy.name) + if failed_checks: + return False, f"Fail strategy checks: {failed_checks}" + return True, "" + + +class BaseExporter(ABC, ResourceMixin): + """A class representing the exporter and the metric endpoints.""" + + config: ConfigData + exporter_name: str + port: int + log_level: str + @abstractmethod - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the exporter.""" @abstractmethod @@ -99,7 +164,6 @@ class RenderableExporter(BaseExporter): def __init__(self, charm_dir: Path, config: ConfigData, settings: ExporterSettings) -> None: """Initialize the Exporter class.""" self.charm_dir = charm_dir - self.port: int self.settings = settings @@ -110,28 +174,6 @@ def __init__(self, charm_dir: Path, config: ConfigData, settings: ExporterSettin self.log_level = str(config["exporter-log-level"]) - def resources_exist(self) -> bool: - """Return true if required resources exist. - - Overwrite this method if there are resources need to be installed. - """ - return True - - def install_resources(self) -> bool: - """Install the necessary resources for the exporter service. - - Overwrite this method if there are resources need to be installed. - """ - logger.debug("No required resources for %s", self.__class__.__name__) - return True - - def remove_resources(self) -> bool: - """Remove exporter resources. - - Overwrite this method if there are resources need to be removed. - """ - return True - def remove_config(self) -> bool: """Remove exporter configuration file.""" if self.exporter_config_path is not None and self.exporter_config_path.exists(): @@ -194,40 +236,38 @@ def verify_render_files_exist(self) -> bool: service_file_exists = self.exporter_service_path.exists() return service_file_exists and config_file_exists - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the exporter.""" logger.info("Installing %s.", self.exporter_name) # Install resources - install_resource_success = self.install_resources() + install_resource_success, msg = self.install_resources() if not install_resource_success: logger.error("Failed to install %s resources.", self.exporter_name) - return False - if not self.resources_exist(): - logger.error("%s resources are not installed properly.", self.exporter_name) - # pylint: disable=too-many-instance-attributes - return False + return False, msg # Render config configure_success = self.configure() if not configure_success: - logger.error("Failed to render config files for %s.", self.exporter_name) - return False + msg = f"Failed to render config files for {self.exporter_name}." + logger.error(msg) + return False, msg # Install service render_service_success = self.render_service() if not render_service_success: - logger.error("Failed to install %s.", self.exporter_name) - return False + msg = f"Failed to install {self.exporter_name}." + logger.error(msg) + return False, msg if not self.verify_render_files_exist(): - logger.error("%s is not installed properly.", self.exporter_name) - return False + msg = f"{self.exporter_name} is not installed properly." + return False, msg systemd.daemon_reload() logger.info("%s installed.", self.exporter_name) - return True + return True, "" def uninstall(self) -> bool: """Uninstall the exporter.""" @@ -320,7 +360,6 @@ class SnapExporter(BaseExporter): exporter_name: str channel: str port: int - strategies: List[Union[SnapStrategy, APTStrategyABC]] def __init__(self, config: ConfigData): """Init.""" @@ -336,35 +375,25 @@ def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return set() - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the snap from a channel. Returns true if the install is successful, false otherwise. """ - try: - for strategy in self.strategies: - strategy.install() - self.enable_and_start() - return self.snap_client.present is True - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to install %s: %s", strategy.name, err) - return False + ok, msg = self.install_resources() + if not ok: + return False, msg + self.enable_and_start() + if self.snap_client.present is False: + return False, "{self.exporter_name} service is not present" + return True, "" def uninstall(self) -> bool: """Uninstall the snap. Returns true if the uninstall is successful, false otherwise. """ - try: - for strategy in self.strategies: - strategy.remove() - - # using the snap.SnapError will result into: - # TypeError: catching classes that do not inherit from BaseException is not allowed - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to remove %s: %s", strategy.name, err) - return False - + self.remove_resources() return self.snap_client.present is False def enable_and_start(self) -> None: @@ -477,9 +506,15 @@ def configure(self) -> bool: class HardwareExporter(RenderableExporter): """A class representing the hardware exporter and the metric endpoints.""" - required_config: bool = True + resources: Resources - def __init__(self, charm_dir: Path, config: ConfigData, available_tools: Set[HWTool]) -> None: + def __init__( + self, + charm_dir: Path, + resources: Resources, + config: ConfigData, + available_tools: Set[HWTool], + ) -> None: """Initialize the Hardware Exporter class.""" super().__init__(charm_dir, config, HARDWARE_EXPORTER_SETTINGS) @@ -490,6 +525,17 @@ def __init__(self, charm_dir: Path, config: ConfigData, available_tools: Set[HWT self.available_tools = available_tools self.collect_timeout = int(config["collect-timeout"]) self.bmc_address = get_bmc_address() + self.resources = resources + self.strategies = [ + StorCLIStrategy(resources=self.resources), + PercCLIStrategy(resources=self.resources), + SAS2IRCUStrategy(resources=self.resources), + SAS3IRCUStrategy(resources=self.resources), + IPMISELStrategy(), + IPMIDCMIStrategy(), + IPMISENSORStrategy(), + RedFishStrategy(), + ] def _render_config_content(self) -> str: """Render and install exporter config file."""