diff --git a/src/charm.py b/src/charm.py index 11900a41..3ca62622 100755 --- a/src/charm.py +++ b/src/charm.py @@ -12,7 +12,7 @@ from ops.framework import EventBase, StoredState from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus -from hw_tools import HWTool, HWToolHelper, detect_available_tools, remove_legacy_smartctl_exporter +from hw_tools import HWTool, detect_available_tools, remove_legacy_smartctl_exporter from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter logger = logging.getLogger(__name__) @@ -26,7 +26,6 @@ class HardwareObserverCharm(ops.CharmBase): def __init__(self, *args: Any) -> None: """Init.""" super().__init__(*args) - self.hw_tool_helper = HWToolHelper() # Add refresh_events to COSAgentProvider to update relation data when # config changed (default behavior) and upgrade charm. This is useful @@ -75,6 +74,7 @@ def exporters(self) -> List[BaseExporter]: exporters.append( HardwareExporter( self.charm_dir, + self.model.resources, self.model.config, stored_tools, ) @@ -84,7 +84,7 @@ def exporters(self) -> List[BaseExporter]: exporters.append(SmartCtlExporter(self.model.config)) if stored_tools & DCGMExporter.hw_tools(): - exporters.append(DCGMExporter(self.charm_dir, self.model.config)) + exporters.append(DCGMExporter(self.model.config)) return exporters @@ -134,47 +134,30 @@ def _on_install_or_upgrade(self, event: EventBase) -> None: remove_legacy_smartctl_exporter() - stored_tools = self.get_stored_tools() - msg: str resource_installed: bool - # Install hw tools - resource_installed, msg = self.hw_tool_helper.install(self.model.resources, stored_tools) - - self._stored.resource_installed = resource_installed - if not resource_installed: - logger.warning(msg) - self.model.unit.status = BlockedStatus(msg) - return - # Install exporter services and resources for exporter in self.exporters: - exporter_install_ok = exporter.install() + exporter_install_ok, msg = exporter.install() if not exporter_install_ok: resource_installed = False self._stored.resource_installed = resource_installed - msg = f"Exporter {exporter.exporter_name} install failed" logger.warning(msg) self.model.unit.status = BlockedStatus(msg) return + self._stored.resource_installed = True self._on_update_status(event) def _on_remove(self, _: EventBase) -> None: """Remove everything when charm is being removed.""" logger.info("Start to remove.") - # Remove binary tool - self.hw_tool_helper.remove( - self.model.resources, - self.get_stored_tools(), - ) - self._stored.resource_installed = False - # Remove exporters for exporter in self.exporters: self.model.unit.status = MaintenanceStatus(f"Removing {exporter.exporter_name}...") exporter.uninstall() + self._stored.resource_installed = False def _on_update_status(self, _: EventBase) -> None: # noqa: C901 """Update the charm's status.""" @@ -187,16 +170,15 @@ def _on_update_status(self, _: EventBase) -> None: # noqa: C901 return for exporter in self.exporters: + resource_ready, resource_ready_message = exporter.check_resources() + if not resource_ready: + self.model.unit.status = BlockedStatus(resource_ready_message) + return config_valid, config_valid_message = exporter.validate_exporter_configs() if not config_valid: self.model.unit.status = BlockedStatus(config_valid_message) return - hw_tool_ok, error_msg = self.hw_tool_helper.check_installed(self.get_stored_tools()) - if not hw_tool_ok: - self.model.unit.status = BlockedStatus(error_msg) - return - # Check health of all exporters exporters_health = [self._check_exporter_health(exporter) for exporter in self.exporters] diff --git a/src/config.py b/src/config.py index a5fe935b..8af9b950 100644 --- a/src/config.py +++ b/src/config.py @@ -65,6 +65,7 @@ class HWTool(str, Enum): REDFISH = "redfish" SMARTCTL_EXPORTER = "smartctl-exporter" DCGM = "dcgm" + NVIDIA_DRIVER = "nvidia-driver" TPR_RESOURCES: t.Dict[HWTool, str] = { diff --git a/src/hw_tools.py b/src/hw_tools.py index a3ec7bee..38545fbd 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -5,19 +5,20 @@ import logging import os +import re import shutil import stat import subprocess from abc import ABCMeta, abstractmethod from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import List, Set import requests import urllib3 from charms.operator_libs_linux.v0 import apt from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap -from ops.model import ModelError, Resources +from ops.model import Resources import apt_helpers from checksum import ( @@ -61,6 +62,14 @@ def __init__(self, tool: HWTool, path: Path): self.message = f"Tool: {tool} path: {path} size is zero" +class ResourceInstallationError(Exception): + """Exception raised when a hardware tool installation fails.""" + + def __init__(self, tool: HWTool): + """Init.""" + super().__init__(f"Installation failed for tool: {tool}") + + def copy_to_snap_common_bin(source: Path, filename: str) -> None: """Copy file to $SNAP_COMMON/bin folder.""" Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True) @@ -150,32 +159,28 @@ def name(self) -> HWTool: def check(self) -> bool: """Check installation status of the tool.""" - -class APTStrategyABC(StrategyABC, metaclass=ABCMeta): - """Strategy for apt install tool.""" - - @abstractmethod def install(self) -> None: """Installation details.""" - @abstractmethod def remove(self) -> None: """Remove details.""" - # Note: The repo and keys should be remove when removing - # hook is triggered. But currently the apt lib don't have - # the remove option. class TPRStrategyABC(StrategyABC, metaclass=ABCMeta): """Third party resource strategy class.""" - @abstractmethod - def install(self, path: Path) -> None: - """Installation details.""" + resources: Resources - @abstractmethod - def remove(self) -> None: - """Remove details.""" + def __init__(self, resources: Resources) -> None: + """Inject the Resource object for fetching resource.""" + self.resources = resources + + def _fetch_tool(self) -> Path: + path = self.resources.fetch(TPR_RESOURCES[self._name]) + if path is None or file_is_empty(path): + logger.info("Skipping %s resource install since empty file was detected.", self.name) + raise ResourceFileSizeZeroError(tool=self._name, path=path) + return path class SnapStrategy(StrategyABC): @@ -184,50 +189,159 @@ class SnapStrategy(StrategyABC): channel: str @property - def snap(self) -> str: + def snap_name(self) -> str: """Snap name.""" return self._name.value + @property + def snap_common(self) -> Path: + """Snap common directory.""" + return Path(f"/var/snap/{self.snap_name}/common/") + + @property + def snap_client(self) -> snap.Snap: + """Return the snap client.""" + return snap.SnapCache()[self.snap_name] + def install(self) -> None: """Install the snap from a channel.""" try: - snap.add(self.snap, channel=self.channel) - logger.info("Installed %s from channel: %s", self.snap, self.channel) + snap.add(self.snap_name, channel=self.channel) + logger.info("Installed %s from channel: %s", self.snap_name, self.channel) # using the snap.SnapError will result into: # TypeError: catching classes that do not inherit from BaseException is not allowed except Exception as err: # pylint: disable=broad-except - logger.error("Failed to install %s from channel: %s: %s", self.snap, self.channel, err) + logger.error( + "Failed to install %s from channel: %s: %s", self.snap_name, self.channel, err + ) raise err def remove(self) -> None: """Remove the snap.""" try: - snap.remove([self.snap]) + snap.remove([self.snap_name]) # using the snap.SnapError will result into: # TypeError: catching classes that do not inherit from BaseException is not allowed except Exception as err: # pylint: disable=broad-except - logger.error("Failed to remove %s: %s", self.snap, err) + logger.error("Failed to remove %s: %s", self.snap_name, err) raise err def check(self) -> bool: """Check if all services are active.""" return all( service.get("active", False) - for service in snap.SnapCache()[self.snap].services.values() + for service in snap.SnapCache()[self.snap_name].services.values() ) class DCGMExporterStrategy(SnapStrategy): - """DCGM strategy class.""" + """DCGM exporter strategy class.""" _name = HWTool.DCGM + metric_file = Path.cwd() / "src/gpu_metrics/dcgm_metrics.csv" def __init__(self, channel: str) -> None: """Init.""" self.channel = channel + def install(self) -> None: + """Install the snap and the custom metrics.""" + super().install() + self._create_custom_metrics() + + def _create_custom_metrics(self) -> None: + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") + try: + shutil.copy(self.metric_file, self.snap_common) + self.snap_client.set({"dcgm-exporter-metrics-file": self.metric_file.name}) + self.snap_client.restart(reload=True) + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure custom DCGM metrics: %s", err) + raise err + + +class NVIDIADriverStrategy(StrategyABC): + """NVIDIA driver strategy class.""" + + _name = HWTool.NVIDIA_DRIVER + installed_pkgs = Path("/tmp/nvidia-installed-pkgs.txt") + pkg_pattern = r"nvidia(?:-[a-zA-Z-]*)?-(\d+)(?:-[a-zA-Z]*)?" + + def install(self) -> None: + """Install the driver and NVIDIA utils.""" + self._install_nvidia_drivers() + self._install_nvidia_utils() + + def _install_nvidia_drivers(self) -> None: + """Install the NVIDIA driver if not present.""" + if Path("/proc/driver/nvidia/version").exists(): + logger.info("NVIDIA driver already installed in the machine") + return + + logger.info("Installing NVIDIA driver") + apt.add_package("ubuntu-drivers-common", update_cache=True) + + # output what driver was installed helps gets the version installed later + cmd = f"ubuntu-drivers install --gpgpu --package-list {self.installed_pkgs}" + try: + # This can be changed to check_call and not rely in the output if this is fixed + # https://github.com/canonical/ubuntu-drivers-common/issues/106 + result = subprocess.check_output(cmd.split(), text=True) + + except subprocess.CalledProcessError as err: + logger.error("Failed to install the NVIDIA driver: %s", err) + raise err + + if "No drivers found for installation" in result: + logger.warning( + "No drivers for the NVIDIA GPU were found. Manual installation is necessary" + ) + raise ResourceInstallationError(self._name) + + logger.info("NVIDIA driver installed") + + def _install_nvidia_utils(self) -> None: + """Install the nvidia utils to be able to use nvidia-smi.""" + if not self.installed_pkgs.exists(): + logger.debug("Drivers not installed by the charm. Skipping nvidia-utils") + return + + installed_pkgs = self.installed_pkgs.read_text(encoding="utf-8").splitlines() + for line in installed_pkgs: + if match := re.search(self.pkg_pattern, line): + nvidia_version = match.group(1) + logger.debug("installed driver from hardware-observer: %s", line) + pkg = f"nvidia-utils-{nvidia_version}-server" + apt.add_package(pkg, update_cache=True) + logger.info("installed %s", pkg) + break + else: + logger.warning( + "packages installed at %s are in an unexpected format. " + "nvidia-utils was not installed", + self.installed_pkgs, + ) + + def remove(self) -> None: + """Drivers shouldn't be removed by the strategy.""" + return None + + def check(self) -> bool: + """Check if nvidia-smi is working.""" + try: + subprocess.check_call("nvidia-smi", timeout=60) + return True + except (FileNotFoundError, subprocess.CalledProcessError) as e: + logger.error(e) + logger.warning( + "nvidia-smi is not working. Ensure the correct driver is installed. " + "See the docs for more details: " + "https://ubuntu.com/server/docs/nvidia-drivers-installation" + ) + return False + class SmartCtlExporterStrategy(SnapStrategy): """SmartCtl strategy class.""" @@ -246,11 +360,10 @@ class StorCLIStrategy(TPRStrategyABC): origin_path = Path("/opt/MegaRAID/storcli/storcli64") symlink_bin = TOOLS_DIR / HWTool.STORCLI.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install storcli.""" - if file_is_empty(path): - logger.info("Skipping StorCLI resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() + if not validate_checksum(STORCLI_VERSION_INFOS, path): raise ResourceChecksumError install_deb(self.name, path) @@ -274,11 +387,9 @@ class PercCLIStrategy(TPRStrategyABC): origin_path = Path("/opt/MegaRAID/perccli/perccli64") symlink_bin = TOOLS_DIR / HWTool.PERCCLI.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install perccli.""" - if file_is_empty(path): - logger.info("Skipping PERCCLI resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(PERCCLI_VERSION_INFOS, path): raise ResourceChecksumError install_deb(self.name, path) @@ -301,11 +412,9 @@ class SAS2IRCUStrategy(TPRStrategyABC): _name = HWTool.SAS2IRCU symlink_bin = TOOLS_DIR / HWTool.SAS2IRCU.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install sas2ircu.""" - if file_is_empty(path): - logger.info("Skipping SAS2IRCU resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(SAS2IRCU_VERSION_INFOS, path): raise ResourceChecksumError make_executable(path) @@ -327,18 +436,16 @@ class SAS3IRCUStrategy(SAS2IRCUStrategy): _name = HWTool.SAS3IRCU symlink_bin = TOOLS_DIR / HWTool.SAS3IRCU.value - def install(self, path: Path) -> None: + def install(self) -> None: """Install sas3ircu.""" - if file_is_empty(path): - logger.info("Skipping SAS3IRCU resource install since empty file was detected.") - raise ResourceFileSizeZeroError(tool=self._name, path=path) + path = self._fetch_tool() if not validate_checksum(SAS3IRCU_VERSION_INFOS, path): raise ResourceChecksumError make_executable(path) symlink(src=path, dst=self.symlink_bin) -class SSACLIStrategy(APTStrategyABC): +class SSACLIStrategy(StrategyABC): """Strategy for install ssacli.""" _name = HWTool.SSACLI @@ -371,7 +478,7 @@ def check(self) -> bool: return check_deb_pkg_installed(self.pkg) -class IPMIStrategy(APTStrategyABC): +class IPMIStrategy(StrategyABC): """Strategy for installing ipmi.""" freeipmi_pkg = "freeipmi-tools" @@ -612,126 +719,3 @@ def remove_legacy_smartctl_exporter() -> None: smartctl_exporter_config_path.unlink() if smartctl_exporter.exists(): shutil.rmtree("/opt/SmartCtlExporter/") - - -class HWToolHelper: - """Helper to install vendor's or hardware related tools.""" - - @property - def strategies(self) -> List[StrategyABC]: - """Define strategies for every tools.""" - return [ - StorCLIStrategy(), - PercCLIStrategy(), - SAS2IRCUStrategy(), - SAS3IRCUStrategy(), - SSACLIStrategy(), - IPMISELStrategy(), - IPMIDCMIStrategy(), - IPMISENSORStrategy(), - RedFishStrategy(), - ] - - def fetch_tools( # pylint: disable=W0102 - self, - resources: Resources, - hw_available: Set[HWTool] = set(), - ) -> Dict[HWTool, Path]: - """Fetch resource from juju if it's VENDOR_TOOLS.""" - fetch_tools: Dict[HWTool, Path] = {} - # Fetch all tools from juju resources - for tool, resource in TPR_RESOURCES.items(): - if tool not in hw_available: - logger.info("Skip fetch tool: %s", tool) - continue - try: - path = resources.fetch(resource) - fetch_tools[tool] = path - except ModelError: - logger.warning("Fail to fetch tool: %s", resource) - - return fetch_tools - - def check_missing_resources( - self, hw_available: Set[HWTool], fetch_tools: Dict[HWTool, Path] - ) -> Tuple[bool, str]: - """Check if required resources are not been uploaded.""" - missing_resources = [] - for tool in hw_available: - if tool in TPR_RESOURCES: - # Resource hasn't been uploaded - if tool not in fetch_tools: - missing_resources.append(TPR_RESOURCES[tool]) - # Uploaded but file size is zero - path = fetch_tools.get(tool) - if path and file_is_empty(path): - logger.warning( - "Empty resource file detected for tool %s at path %s", tool, path - ) - missing_resources.append(TPR_RESOURCES[tool]) - if missing_resources: - return False, f"Missing resources: {missing_resources}" - return True, "" - - def install(self, resources: Resources, hw_available: Set[HWTool]) -> Tuple[bool, str]: - """Install tools.""" - logger.info("hw_available: %s", hw_available) - - fetch_tools: Dict[HWTool, Path] = self.fetch_tools(resources, hw_available) - - ok, msg = self.check_missing_resources(hw_available, fetch_tools) - if not ok: - return ok, msg - - fail_strategies = [] - - # Iterate over each strategy and execute. - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - try: - if isinstance(strategy, TPRStrategyABC): - path = fetch_tools.get(strategy.name) # pylint: disable=W0212 - if path: - strategy.install(path) - - elif isinstance(strategy, (APTStrategyABC, SnapStrategy)): - strategy.install() # pylint: disable=E1120 - logger.info("Strategy %s install success", strategy) - except ( - ResourceFileSizeZeroError, - OSError, - apt.PackageError, - ResourceChecksumError, - ) as e: - logger.warning("Strategy %s install fail: %s", strategy, e) - fail_strategies.append(strategy.name) - - if fail_strategies: - return False, f"Fail strategies: {fail_strategies}" - return True, "" - - # pylint: disable=W0613 - def remove(self, resources: Resources, hw_available: Set[HWTool]) -> None: - """Execute all remove strategies.""" - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - if isinstance(strategy, (TPRStrategyABC, APTStrategyABC, SnapStrategy)): - strategy.remove() - logger.info("Strategy %s remove success", strategy) - - def check_installed(self, hw_available: Set[HWTool]) -> Tuple[bool, str]: - """Check tool status.""" - failed_checks: Set[HWTool] = set() - - for strategy in self.strategies: - if strategy.name not in hw_available: - continue - ok = strategy.check() - if not ok: - failed_checks.add(strategy.name) - - if failed_checks: - return False, f"Fail strategy checks: {failed_checks}" - return True, "" diff --git a/src/service.py b/src/service.py index d71f4e9e..19ad2968 100644 --- a/src/service.py +++ b/src/service.py @@ -1,28 +1,47 @@ """Exporter service helper.""" import os -import shutil from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path from time import sleep -from typing import Any, Dict, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple +from charms.operator_libs_linux.v0 import apt from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap from jinja2 import Environment, FileSystemLoader -from ops.model import ConfigData +from ops.model import ConfigData, Resources from redfish import redfish_client from redfish.rest.v1 import InvalidCredentialsError +from checksum import ( + ResourceChecksumError, +) from config import ( HARDWARE_EXPORTER_COLLECTOR_MAPPING, HARDWARE_EXPORTER_SETTINGS, ExporterSettings, HWTool, + TPR_RESOURCES, ) from hardware import get_bmc_address -from hw_tools import DCGMExporterStrategy, SmartCtlExporterStrategy, SnapStrategy +from hw_tools import ( + DCGMExporterStrategy, + NVIDIADriverStrategy, + SmartCtlExporterStrategy, + SnapStrategy, + StrategyABC, + StorCLIStrategy, + PercCLIStrategy, + SAS2IRCUStrategy, + SAS3IRCUStrategy, + IPMISELStrategy, + IPMIDCMIStrategy, + IPMISENSORStrategy, + RedFishStrategy, + ResourceFileSizeZeroError, +) logger = getLogger(__name__) @@ -34,21 +53,72 @@ class ExporterError(Exception): """ -class BaseExporter(ABC): - """A class representing the exporter and the metric endpoints.""" +class ResourceMixin(): + """Mixin to handle multiple resource strategy install/remove/check.""" - config: ConfigData - exporter_name: str - port: int - log_level: str + strategies: List[StrategyABC] + available_tools: Set[HWTool] @staticmethod @abstractmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" + def install_resources(self) -> Tuple[bool, str]: + """Run install for all strategies.""" + failed_strategies = [] + missing_resources = [] + for strategy in self.strategies: + if strategy.name in self.available_tools: + try: + strategy.install() + except ResourceFileSizeZeroError as e: + missing_resources.append(TPR_RESOURCES[strategy.name]) + except ( + OSError, + apt.PackageError, + ResourceChecksumError, + ) as e: + logger.warning("Strategy %s install fail: %s", strategy, e) + failed_strategies.append(strategy.name) + if missing_resources: + return False, f"Missing resources: {missing_resources}" + if failed_strategies: + return False, f"Fail strategies: {failed_strategies}" + return True, "" + + def remove_resources(self) -> bool: + """Run remove for strategies.""" + for strategy in self.strategies: + if strategy.name in self.available_tools: + strategy.remove() + return True + + def check_resources(self) -> Tuple[bool, str]: + """Run check for strategies.""" + failed_checks: Set[HWTool] = set() + for strategy in self.strategies: + if strategy.name in self.available_tools: + continue + strategy.check() + ok = strategy.check() + if not ok: + failed_checks.add(strategy.name) + if failed_checks: + return False, f"Fail strategy checks: {failed_checks}" + return True, "" + + +class BaseExporter(ABC, ResourceMixin): + """A class representing the exporter and the metric endpoints.""" + + config: ConfigData + exporter_name: str + port: int + log_level: str + @abstractmethod - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the exporter.""" @abstractmethod @@ -94,7 +164,6 @@ class RenderableExporter(BaseExporter): def __init__(self, charm_dir: Path, config: ConfigData, settings: ExporterSettings) -> None: """Initialize the Exporter class.""" self.charm_dir = charm_dir - self.port: int self.settings = settings @@ -105,28 +174,6 @@ def __init__(self, charm_dir: Path, config: ConfigData, settings: ExporterSettin self.log_level = str(config["exporter-log-level"]) - def resources_exist(self) -> bool: - """Return true if required resources exist. - - Overwrite this method if there are resources need to be installed. - """ - return True - - def install_resources(self) -> bool: - """Install the necessary resources for the exporter service. - - Overwrite this method if there are resources need to be installed. - """ - logger.debug("No required resources for %s", self.__class__.__name__) - return True - - def remove_resources(self) -> bool: - """Remove exporter resources. - - Overwrite this method if there are resources need to be removed. - """ - return True - def remove_config(self) -> bool: """Remove exporter configuration file.""" if self.exporter_config_path is not None and self.exporter_config_path.exists(): @@ -189,40 +236,38 @@ def verify_render_files_exist(self) -> bool: service_file_exists = self.exporter_service_path.exists() return service_file_exists and config_file_exists - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the exporter.""" logger.info("Installing %s.", self.exporter_name) # Install resources - install_resource_success = self.install_resources() + install_resource_success, msg = self.install_resources() if not install_resource_success: logger.error("Failed to install %s resources.", self.exporter_name) - return False - if not self.resources_exist(): - logger.error("%s resources are not installed properly.", self.exporter_name) - # pylint: disable=too-many-instance-attributes - return False + return False, msg # Render config configure_success = self.configure() if not configure_success: - logger.error("Failed to render config files for %s.", self.exporter_name) - return False + msg = f"Failed to render config files for {self.exporter_name}." + logger.error(msg) + return False, msg # Install service render_service_success = self.render_service() if not render_service_success: - logger.error("Failed to install %s.", self.exporter_name) - return False + msg = f"Failed to install {self.exporter_name}." + logger.error(msg) + return False, msg if not self.verify_render_files_exist(): - logger.error("%s is not installed properly.", self.exporter_name) - return False + msg = f"{self.exporter_name} is not installed properly." + return False, msg systemd.daemon_reload() logger.info("%s installed.", self.exporter_name) - return True + return True, "" def uninstall(self) -> bool: """Uninstall the exporter.""" @@ -315,7 +360,6 @@ class SnapExporter(BaseExporter): exporter_name: str channel: str port: int - strategy: SnapStrategy def __init__(self, config: ConfigData): """Init.""" @@ -324,39 +368,32 @@ def __init__(self, config: ConfigData): @property def snap_client(self) -> snap.Snap: """Return the snap client.""" - return snap.SnapCache()[self.strategy.snap] + return snap.SnapCache()[self.exporter_name] @staticmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return set() - def install(self) -> bool: + def install(self) -> Tuple[bool, str]: """Install the snap from a channel. Returns true if the install is successful, false otherwise. """ - try: - self.strategy.install() - self.enable_and_start() - return self.snap_client.present is True - except Exception: # pylint: disable=broad-except - return False + ok, msg = self.install_resources() + if not ok: + return False, msg + self.enable_and_start() + if self.snap_client.present is False: + return False, "{self.exporter_name} service is not present" + return True, "" def uninstall(self) -> bool: """Uninstall the snap. Returns true if the uninstall is successful, false otherwise. """ - try: - self.strategy.remove() - - # using the snap.SnapError will result into: - # TypeError: catching classes that do not inherit from BaseException is not allowed - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to remove %s: %s", self.strategy.snap, err) - return False - + self.remove_resources() return self.snap_client.present is False def enable_and_start(self) -> None: @@ -379,7 +416,7 @@ def set(self, snap_config: dict) -> bool: try: self.snap_client.set(snap_config, typed=True) except snap.SnapError as err: - logger.error("Failed to update snap configs %s: %s", self.strategy.snap, err) + logger.error("Failed to update snap configs %s: %s", self.exporter_name, err) return False return True @@ -388,14 +425,22 @@ def check_health(self) -> bool: Returns true if the service is healthy, false otherwise. """ - return self.strategy.check() + return all(strategy.check() for strategy in self.strategies) def configure(self) -> bool: """Set the necessary exporter configurations or change snap channel. Returns true if the configure is successful, false otherwise. """ - return self.install() + for strategy in self.strategies: + if isinstance(strategy, SnapStrategy): + try: + # refresh the snap for a new channel if necessary + strategy.install() + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure %s: %s", self.exporter_name, err) + return False + return True class DCGMExporter(SnapExporter): @@ -403,38 +448,33 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 - snap_common: Path = Path("/var/snap/dcgm/common/") - metric_config: str = "dcgm-exporter-metrics-file" - def __init__(self, charm_dir: Path, config: ConfigData): + def __init__(self, config: ConfigData): """Init.""" - self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) - self.charm_dir = charm_dir - self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv" - self.metric_config_value = self.metrics_file.name + self.strategies = [ + DCGMExporterStrategy(str(config["dcgm-snap-channel"])), + NVIDIADriverStrategy(), + ] super().__init__(config) - def install(self) -> bool: - """Install the DCGM exporter and configure custom metrics.""" - if not super().install(): - return False - - logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") - try: - shutil.copy(self.metrics_file, self.snap_common) - self.snap_client.set({self.metric_config: self.metric_config_value}) - self.snap_client.restart(reload=True) - except Exception as err: # pylint: disable=broad-except - logger.error("Failed to configure custom DCGM metrics: %s", err) - return False - - return True - @staticmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return {HWTool.DCGM} + def validate_exporter_configs(self) -> Tuple[bool, str]: + """Validate the if the DCGM exporter is able to run.""" + valid, msg = super().validate_exporter_configs() + if not valid: + return False, msg + + if not NVIDIADriverStrategy().check(): + return ( + False, + "Failed to communicate with NVIDIA driver. See more details in the logs", + ) + return valid, msg + class SmartCtlExporter(SnapExporter): """A class representing the smartctl exporter and the metric endpoints.""" @@ -445,7 +485,7 @@ def __init__(self, config: ConfigData) -> None: """Initialize the SmartctlExporter class.""" self.port = int(config["smartctl-exporter-port"]) self.log_level = str(config["exporter-log-level"]) - self.strategy = SmartCtlExporterStrategy(str(config["smartctl-exporter-snap-channel"])) + self.strategies = [SmartCtlExporterStrategy(str(config["smartctl-exporter-snap-channel"]))] super().__init__(config) @staticmethod @@ -466,9 +506,15 @@ def configure(self) -> bool: class HardwareExporter(RenderableExporter): """A class representing the hardware exporter and the metric endpoints.""" - required_config: bool = True + resources: Resources - def __init__(self, charm_dir: Path, config: ConfigData, available_tools: Set[HWTool]) -> None: + def __init__( + self, + charm_dir: Path, + resources: Resources, + config: ConfigData, + available_tools: Set[HWTool], + ) -> None: """Initialize the Hardware Exporter class.""" super().__init__(charm_dir, config, HARDWARE_EXPORTER_SETTINGS) @@ -479,6 +525,17 @@ def __init__(self, charm_dir: Path, config: ConfigData, available_tools: Set[HWT self.available_tools = available_tools self.collect_timeout = int(config["collect-timeout"]) self.bmc_address = get_bmc_address() + self.resources = resources + self.strategies = [ + StorCLIStrategy(resources=self.resources), + PercCLIStrategy(resources=self.resources), + SAS2IRCUStrategy(resources=self.resources), + SAS3IRCUStrategy(resources=self.resources), + IPMISELStrategy(), + IPMIDCMIStrategy(), + IPMISENSORStrategy(), + RedFishStrategy(), + ] def _render_config_content(self) -> str: """Render and install exporter config file.""" diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 0def3264..37eac304 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -22,13 +22,16 @@ from config import SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool, StorageVendor, SystemVendor from hw_tools import ( APTStrategyABC, + DCGMExporterStrategy, HWToolHelper, IPMIDCMIStrategy, IPMISELStrategy, IPMISENSORStrategy, + NVIDIADriverStrategy, PercCLIStrategy, ResourceChecksumError, ResourceFileSizeZeroError, + ResourceInstallationError, SAS2IRCUStrategy, SAS3IRCUStrategy, SnapStrategy, @@ -1037,7 +1040,7 @@ def mock_snap_lib(): def test_snap_strategy_name(snap_exporter): - assert snap_exporter.snap == "my-snap" + assert snap_exporter.snap_name == "my-snap" def test_snap_strategy_channel(snap_exporter): @@ -1046,7 +1049,9 @@ def test_snap_strategy_channel(snap_exporter): def test_snap_strategy_install_success(snap_exporter, mock_snap_lib): snap_exporter.install() - mock_snap_lib.add.assert_called_once_with(snap_exporter.snap, channel=snap_exporter.channel) + mock_snap_lib.add.assert_called_once_with( + snap_exporter.snap_name, channel=snap_exporter.channel + ) def test_snap_strategy_install_fail(snap_exporter, mock_snap_lib): @@ -1058,7 +1063,7 @@ def test_snap_strategy_install_fail(snap_exporter, mock_snap_lib): def test_snap_strategy_remove_success(snap_exporter, mock_snap_lib): snap_exporter.remove() - mock_snap_lib.remove.assert_called_once_with([snap_exporter.snap]) + mock_snap_lib.remove.assert_called_once_with([snap_exporter.snap_name]) def test_snap_strategy_remove_fail(snap_exporter, mock_snap_lib): @@ -1143,6 +1148,185 @@ def test_snap_strategy_check(snap_exporter, mock_snap_lib, services, expected): assert snap_exporter.check() is expected +@pytest.fixture +def mock_check_output(): + with mock.patch("hw_tools.subprocess.check_output") as mocked_check_output: + yield mocked_check_output + + +@pytest.fixture +def mock_check_call(): + with mock.patch("hw_tools.subprocess.check_call") as mocked_check_call: + yield mocked_check_call + + +@pytest.fixture +def mock_apt_lib(): + with mock.patch("hw_tools.apt") as mocked_apt_lib: + yield mocked_apt_lib + + +@pytest.fixture +def mock_path(): + with mock.patch("hw_tools.Path") as mocked_path: + yield mocked_path + + +@pytest.fixture +def mock_shutil_copy(): + with mock.patch("hw_tools.shutil.copy") as mocked_copy: + yield mocked_copy + + +@pytest.fixture +def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_check_call): + strategy = NVIDIADriverStrategy() + strategy.installed_pkgs = mock_path + yield strategy + + +@pytest.fixture +def dcgm_exporter_strategy(mock_snap_lib, mock_shutil_copy): + yield DCGMExporterStrategy("latest/stable") + + +@mock.patch("hw_tools.DCGMExporterStrategy._create_custom_metrics") +def test_dcgm_exporter_install(mock_custom_metrics, dcgm_exporter_strategy): + assert dcgm_exporter_strategy.install() is None + mock_custom_metrics.assert_called_once() + + +def test_dcgm_create_custom_metrics(dcgm_exporter_strategy, mock_shutil_copy, mock_snap_lib): + assert dcgm_exporter_strategy._create_custom_metrics() is None + mock_shutil_copy.assert_called_once_with( + Path.cwd() / "src/gpu_metrics/dcgm_metrics.csv", Path("/var/snap/dcgm/common") + ) + dcgm_exporter_strategy.snap_client.set.assert_called_once_with( + {"dcgm-exporter-metrics-file": "dcgm_metrics.csv"} + ) + dcgm_exporter_strategy.snap_client.restart.assert_called_once_with(reload=True) + + +def test_dcgm_create_custom_metrics_copy_fail( + dcgm_exporter_strategy, mock_shutil_copy, mock_snap_lib +): + mock_shutil_copy.side_effect = FileNotFoundError + with pytest.raises(FileNotFoundError): + dcgm_exporter_strategy._create_custom_metrics() + + dcgm_exporter_strategy.snap_client.set.assert_not_called() + dcgm_exporter_strategy.snap_client.restart.assert_not_called() + + +@mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_drivers") +@mock.patch("hw_tools.NVIDIADriverStrategy._install_nvidia_utils") +def test_nvidia_driver_strategy_install( + mock_install_nvidia_drivers, mock_install_nvidia_utils, nvidia_driver_strategy +): + nvidia_driver_strategy.install() + mock_install_nvidia_drivers.assert_called_once() + mock_install_nvidia_utils.assert_called_once() + + +def test_install_nvidia_drivers_success( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + + nvidia_driver_strategy._install_nvidia_drivers() + + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + mock_check_output.assert_called_once() + + +def test_install_nvidia_drivers_already_installed( + mock_path, mock_apt_lib, nvidia_driver_strategy, mock_check_output +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = True + mock_path.return_value = nvidia_version + + nvidia_driver_strategy._install_nvidia_drivers() + + mock_apt_lib.add_package.assert_not_called() + mock_check_output.assert_not_called() + + +def test_install_nvidia_drivers_subprocess_exception( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + mock_check_output.side_effect = subprocess.CalledProcessError(1, []) + + with pytest.raises(subprocess.CalledProcessError): + nvidia_driver_strategy._install_nvidia_drivers() + + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + + +def test_install_nvidia_drivers_no_drivers_found( + mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy +): + nvidia_version = mock.MagicMock() + nvidia_version.exists.return_value = False + mock_path.return_value = nvidia_version + mock_check_output.return_value = "No drivers found for installation" + + with pytest.raises(ResourceInstallationError): + nvidia_driver_strategy._install_nvidia_drivers() + + mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) + + +def test_install_nvidia_utils_driver_installed_from_charm(mock_apt_lib, nvidia_driver_strategy): + driver_version = mock.MagicMock() + driver_version.exists.return_value = True + driver_version.read_text.return_value = ( + "nvidia-headless-no-dkms-535-server\nlibnvidia-cfg1-535-server" + ) + nvidia_driver_strategy.installed_pkgs = driver_version + + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_called_with("nvidia-utils-535-server", update_cache=True) + + +def test_install_nvidia_utils_driver_not_installed_from_charm( + mock_apt_lib, nvidia_driver_strategy +): + nvidia_driver_strategy.installed_pkgs.exists.return_value = False + + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_not_called() + + +def test_install_nvidia_utils_driver_unexpected_format(mock_apt_lib, nvidia_driver_strategy): + driver_version = mock.MagicMock() + driver_version.exists.return_value = True + driver_version.read_text.return_value = "nvidia-my-version-server" + nvidia_driver_strategy.installed_pkgs = driver_version + + nvidia_driver_strategy._install_nvidia_utils() + mock_apt_lib.add_package.assert_not_called() + + +def test_nvidia_strategy_remove(nvidia_driver_strategy): + assert nvidia_driver_strategy.remove() is None + + +def test_nvidia_strategy_check(nvidia_driver_strategy): + assert nvidia_driver_strategy.check() is True + + +@pytest.mark.parametrize("exception", [FileNotFoundError, subprocess.CalledProcessError(1, [])]) +def test_nvidia_strategy_check_fails(nvidia_driver_strategy, mock_check_call, exception): + mock_check_call.side_effect = exception + assert nvidia_driver_strategy.check() is False + + @mock.patch("hw_tools.Path.unlink") @mock.patch("hw_tools.Path.exists") @mock.patch("hw_tools.shutil") diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index a0fb8835..3c75577b 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -721,19 +721,18 @@ class TestDCGMSnapExporter(unittest.TestCase): def setUp(self) -> None: """Set up harness for each test case.""" snap_lib_patcher = mock.patch.object(service, "snap") - shutil_lib_patcher = mock.patch.object(service, "shutil") + self.mock_snap = snap_lib_patcher.start() - self.mock_shutil = shutil_lib_patcher.start() self.addCleanup(snap_lib_patcher.stop) - self.addCleanup(shutil_lib_patcher.stop) - - search_path = pathlib.Path(f"{__file__}/../../..").resolve() - self.mock_config = { - "dcgm-snap-channel": "latest/stable", - } - self.exporter = service.DCGMExporter(search_path, self.mock_config) - self.exporter.strategy = mock.MagicMock() + self.exporter = service.DCGMExporter( + { + "dcgm-snap-channel": "latest/stable", + } + ) + self.snap_strategy = mock.MagicMock(spec=service.DCGMExporterStrategy) + self.nvidia_strategy = mock.MagicMock(spec=service.NVIDIADriverStrategy) + self.exporter.strategies = [self.snap_strategy, self.nvidia_strategy] def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -741,39 +740,26 @@ def test_exporter_name(self): def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) - def test_install_failed(self): - self.exporter.snap_client.present = False - - exporter_install_ok = self.exporter.install() - - self.exporter.strategy.install.assert_called() - self.mock_shutil.copy.assert_not_called() - self.assertFalse(exporter_install_ok) - - def test_install_success(self): - self.exporter.snap_client.present = True - - exporter_install_ok = self.exporter.install() + @mock.patch("service.NVIDIADriverStrategy.check", return_value=True) + def test_validate_exporter_configs_success(self, _): + valid, msg = self.exporter.validate_exporter_configs() + self.assertTrue(valid) + self.assertEqual(msg, "Exporter config is valid.") - self.exporter.strategy.install.assert_called() - self.mock_shutil.copy.assert_called_with( - self.exporter.metrics_file, self.exporter.snap_common - ) - self.exporter.snap_client.set.assert_called_with( - {self.exporter.metric_config: self.exporter.metric_config_value} + @mock.patch("service.NVIDIADriverStrategy.check", return_value=False) + def test_validate_exporter_configs_fails(self, _): + valid, msg = self.exporter.validate_exporter_configs() + self.assertFalse(valid) + self.assertEqual( + msg, "Failed to communicate with NVIDIA driver. See more details in the logs" ) - self.exporter.snap_client.restart.assert_called_with(reload=True) - self.assertTrue(exporter_install_ok) - - def test_install_metrics_copy_fail(self): - self.exporter.snap_client.present = True - self.mock_shutil.copy.side_effect = FileNotFoundError - exporter_install_ok = self.exporter.install() - - self.exporter.strategy.install.assert_called() - self.exporter.snap_client.restart.assert_not_called() - self.assertFalse(exporter_install_ok) + @mock.patch.object(service.BaseExporter, "validate_exporter_configs") + def test_validate_exporter_configs_fails_parent(self, mock_parent_validate): + mock_parent_validate.return_value = False, "Invalid config: exporter's port" + valid, msg = self.exporter.validate_exporter_configs() + self.assertFalse(valid) + self.assertEqual(msg, "Invalid config: exporter's port") class TestWriteToFile(unittest.TestCase): @@ -845,25 +831,27 @@ def test_write_to_file_not_a_directory_error(self, mock_open): @pytest.fixture def snap_exporter(): - my_strategy = mock.MagicMock(spec=service.SnapStrategy) + my_snap_strategy = mock.MagicMock(spec=service.SnapStrategy) + my_apt_strategy = mock.MagicMock(spec=service.APTStrategyABC) class MySnapExporter(service.SnapExporter): exporter_name = "my-exporter" channel = "my-channel" - strategy = my_strategy - - mock_config = { - "dcgm-snap-channel": "latest/stable", - } + strategies = [my_snap_strategy, my_apt_strategy] with mock.patch("service.snap.SnapCache"): - exporter = MySnapExporter(mock_config) + exporter = MySnapExporter( + { + "dcgm-snap-channel": "latest/stable", + } + ) exporter.snap_client.services = {"service1": {}, "service2": {}} yield exporter - my_strategy.reset_mock() + my_snap_strategy.reset_mock() + my_apt_strategy.reset_mock() def test_snap_exporter_hw_tools(snap_exporter): @@ -871,16 +859,16 @@ def test_snap_exporter_hw_tools(snap_exporter): assert snap_exporter.hw_tools() == set() -def test_snap_exporter_install(snap_exporter): - snap_exporter.strategy.install.return_value = True +def test_snap_exporter_install_success(snap_exporter): snap_exporter.snap_client.present = True assert snap_exporter.install() is True - snap_exporter.strategy.install.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.install.assert_called_once() def test_snap_exporter_install_fail(snap_exporter): - snap_exporter.strategy.install.side_effect = ValueError + snap_exporter.strategies[0].install.side_effect = ValueError assert snap_exporter.install() is False @@ -889,11 +877,12 @@ def test_snap_exporter_uninstall(snap_exporter): snap_exporter.snap_client.present = False assert snap_exporter.uninstall() is True - snap_exporter.strategy.remove.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.remove.assert_called_once() def test_snap_exporter_uninstall_fail(snap_exporter): - snap_exporter.strategy.remove.side_effect = ValueError + snap_exporter.strategies[0].remove.side_effect = ValueError assert snap_exporter.uninstall() is False @@ -902,7 +891,8 @@ def test_snap_exporter_uninstall_present(snap_exporter): snap_exporter.snap_client.present = True assert snap_exporter.uninstall() is False - snap_exporter.strategy.remove.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.remove.assert_called_once() def test_snap_exporter_enable_and_start(snap_exporter): @@ -935,20 +925,25 @@ def test_snap_exporter_set_failed(snap_exporter): def test_snap_exporter_check_health(snap_exporter): snap_exporter.check_health() - snap_exporter.strategy.check.assert_called_once() + for strategy in snap_exporter.strategies: + strategy.check.assert_called_once() + +@mock.patch("service.isinstance", return_value=True) +def test_snap_exporter_configure(_, snap_exporter): + assert snap_exporter.configure() is True + for strategy in snap_exporter.strategies: + strategy.install.assert_called_once() -@pytest.mark.parametrize("install_result, expected_result", [(True, True), (False, False)]) -@mock.patch("service.SnapExporter.install") -def test_snap_exporter_configure(mock_install, snap_exporter, install_result, expected_result): - mock_install.return_value = install_result - assert snap_exporter.configure() is expected_result - mock_install.assert_called_once() +@mock.patch("service.isinstance", return_value=True) +def test_snap_exporter_configure_exception(_, snap_exporter): + snap_exporter.strategies[0].install.side_effect = snap.SnapError + assert snap_exporter.configure() is False @pytest.mark.parametrize("result, expected_result", [(True, True), (False, False)]) -@mock.patch("service.SnapExporter.install") +@mock.patch("service.SmartCtlExporterStrategy.install") @mock.patch("service.SnapExporter.set") def test_smartctl_exporter_configure(mock_set, mock_install, result, expected_result): mock_config = { @@ -956,7 +951,6 @@ def test_smartctl_exporter_configure(mock_set, mock_install, result, expected_re "exporter-log-level": "info", "smartctl-exporter-snap-channel": "latest/stable", } - mock_set.return_value = result mock_install.return_value = result exporter = service.SmartCtlExporter(mock_config)