diff --git a/config.yaml b/config.yaml index 33e9d88c..7f505bca 100644 --- a/config.yaml +++ b/config.yaml @@ -8,7 +8,7 @@ options: default: 10000 description: | Start the prometheus exporter at "exporter-port". By default, it will - start at port 10000. + start at port 10000. Allowed values are between 1 and 65535 inclusively. exporter-log-level: type: string default: "INFO" diff --git a/src/charm.py b/src/charm.py index 70a95eb6..75e43090 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,14 +8,14 @@ from typing import Any, Dict, Optional import ops -from charms.grafana_agent.v0.cos_agent import COSAgentProvider from ops.framework import EventBase, StoredState -from ops.model import ActiveStatus, BlockedStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus -from config import HWTool +import cos_agent +import service +from config import EXPORTER_RELATION_NAME, HWTool from hardware import get_bmc_address -from hw_tools import HWToolHelper, bmc_hw_verifier -from service import Exporter +from hw_tools import HWToolHelper logger = logging.getLogger(__name__) @@ -29,80 +29,123 @@ def __init__(self, *args: Any) -> None: """Init.""" super().__init__(*args) self.hw_tool_helper = HWToolHelper() - - self.cos_agent_provider = COSAgentProvider( + self.exporter = service.Exporter(self.charm_dir) + self.cos_agent_relation_handler = cos_agent.Handler( self, + exporter=self.exporter, + relation_name=EXPORTER_RELATION_NAME, metrics_endpoints=[ - {"path": "/metrics", "port": int(self.model.config["exporter-port"])} + { + "path": "/metrics", + "port": int( + self.model.config["exporter-port"], + ), + } ], ) - self.exporter = Exporter(self.charm_dir) self.framework.observe(self.on.config_changed, self._on_config_changed) self.framework.observe(self.on.install, self._on_install_or_upgrade) self.framework.observe(self.on.remove, self._on_remove) self.framework.observe(self.on.update_status, self._on_update_status) 
self.framework.observe(self.on.upgrade_charm, self._on_install_or_upgrade) - self.framework.observe( - self.on.cos_agent_relation_joined, self._on_cos_agent_relation_joined - ) - self.framework.observe( - self.on.cos_agent_relation_departed, self._on_cos_agent_relation_departed - ) - - self._stored.set_default(installed=False, config={}, blocked_msg="") - - def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: - """Install and upgrade.""" - port = self.model.config.get("exporter-port", "10000") - level = self.model.config.get("exporter-log-level", "INFO") - redfish_creds = self._get_redfish_creds() - - self.exporter.install(port, level, redfish_creds) - installed, msg = self.hw_tool_helper.install(self.model.resources) - self._stored.installed = installed + self._stored.set_default( + config={}, + exporter_installed=False, + resource_install_result={}, + ) - if not installed: - logger.info(msg) - self._stored.blocked_msg = msg - self._on_update_status(event) + def _on_install_or_upgrade(self, _: ops.InstallEvent) -> None: + """Install or upgrade charm.""" + self.model.unit.status = MaintenanceStatus("Installing resources...") + resource_white_list = self.hw_tool_helper.get_resource_white_list(self.model.resources) + resource_install_result = self.hw_tool_helper.install( + resource_white_list, + resource_black_list=self._stored.resource_install_result, # type: ignore[arg-type] + ) + self._stored.resource_install_result = resource_install_result + if not all(resource_install_result.values()): + failed_resources = [r for r, s in resource_install_result.items() if not s] + msg = f"Failed to install resources: {', '.join(failed_resources)}" + logger.error(msg) + self.model.unit.status = ErrorStatus(msg) return - self._stored.installed = True - self._stored.blocked_msg = "" - self.model.unit.status = ActiveStatus("Install complete") - logger.info("Install complete") + if self._stored.exporter_installed is not True: + self.model.unit.status = 
MaintenanceStatus("Installing exporter...") + success = self.cos_agent_relation_handler.install_exporter() + self._stored.exporter_installed = success + if not success: + msg = "Failed to install exporter, please refer to `juju debug-log`" + logger.error(msg) + self.model.unit.status = ErrorStatus(msg) + return + + self.update_status() def _on_remove(self, _: EventBase) -> None: """Remove everything when charm is being removed.""" logger.info("Start to remove.") - # Remove binary tool self.hw_tool_helper.remove(self.model.resources) - self.exporter.uninstall() + self._stored.resource_install_result = {} + success = self.cos_agent_relation_handler.uninstall_exporter() + if not success: + msg = "Failed to uninstall exporter, please refer to `juju debug-log`" + logger.warning(msg) + self._stored.exporter_installed = not success logger.info("Remove complete") def _on_update_status(self, _: EventBase) -> None: """Update the charm's status.""" - if self._stored.installed is not True and self._stored.blocked_msg != "": - self.model.unit.status = BlockedStatus(self._stored.blocked_msg) # type: ignore + self.update_status() + + def _on_config_changed(self, _: EventBase) -> None: + """Reconfigure charm.""" + change_set = self.update_config_store() + + if self.cos_agent_relation_handler.exporter_enabled: + options = self.get_exporter_configs() + success, message = self.cos_agent_relation_handler.validate_exporter_configs(options) + if not success: + self.model.unit.status = BlockedStatus(message) + return + + success = self.cos_agent_relation_handler.configure_exporter(options, change_set) + if not success: + message = "Failed to configure exporter, please check if the server is healthy." 
+ self.model.unit.status = BlockedStatus(message) + return + + self.update_status() + + def update_status(self) -> None: + """Update the charm's status.""" + if not self.cos_agent_relation_handler.exporter_enabled: + self.model.unit.status = BlockedStatus(f"Missing relation: [{EXPORTER_RELATION_NAME}]") return - if not self.model.get_relation("cos-agent"): - self.model.unit.status = BlockedStatus("Missing relation: [cos-agent]") + + if self.cos_agent_relation_handler.too_many_relations: + self.model.unit.status = BlockedStatus("Cannot relate to more than one grafana-agent") return - if not self.exporter.check_health(): - self.model.unit.status = BlockedStatus("Exporter is unhealthy") + + if ( + self.cos_agent_relation_handler.exporter_enabled + and not self.cos_agent_relation_handler.exporter_online + ): + error_msg = "Exporter crashed unexpectedly, please refer to systemd logs..." + self.model.unit.status = ErrorStatus(error_msg) return - if not self.exporter.check_active(): - self.model.unit.status = BlockedStatus("Exporter is not running") + + hw_tool_ok, error_msg = self.hw_tool_helper.check_status() + if not hw_tool_ok: + self.model.unit.status = ErrorStatus(error_msg) return + self.model.unit.status = ActiveStatus("Unit is ready") - def _on_config_changed(self, event: EventBase) -> None: - """Reconfigure charm.""" - # Keep track of what model config options + some extra config related - # information are changed. This can be helpful when we want to respond - # to the change of a specific config option. 
+ def update_config_store(self) -> set: + """Update the config store, and return a set of config options that are changed.""" change_set = set() model_config: Dict[str, Optional[str]] = dict(self.model.config.items()) for key, value in model_config.items(): @@ -110,63 +153,38 @@ def _on_config_changed(self, event: EventBase) -> None: logger.info("Setting %s to: %s", key, value) self._stored.config[key] = value # type: ignore change_set.add(key) + return change_set + + def get_redfish_options(self) -> Dict[str, Any]: + """Get redfish config options.""" + redfish_options = { + "enable": False, + "host": "", + "username": self.model.config.get("redfish-username", ""), + "password": self.model.config.get("redfish-password", ""), + } - if not self._stored.installed: # type: ignore - logging.info( # type: ignore - "Config changed called before install complete, deferring event: %s", - event.handle, - ) - event.defer() - return + bmc_address = get_bmc_address() + if bmc_address: + redfish_options["enable"] = True + redfish_options["host"] = f"https://{bmc_address}" - exporter_configs = { - "exporter-port", - "exporter-log-level", - "redfish-host", - "redfish-username", - "redfish-password", + return redfish_options + + def get_exporter_configs(self) -> Dict[str, Any]: + """Get the exporter related config options.""" + port = self.model.config.get("exporter-port", "10000") + level = self.model.config.get("exporter-log-level", "INFO") + collectors = self.hw_tool_helper.hw_collector_white_list + redfish_options = {"enable": False} + if HWTool.REDFISH in collectors: + redfish_options = self.get_redfish_options() + return { + "port": port, + "level": level, + "collectors": collectors, + "redfish_options": redfish_options, } - if exporter_configs.intersection(change_set): - logger.info("Detected changes in exporter config.") - port = self.model.config.get("exporter-port", "10000") - level = self.model.config.get("exporter-log-level", "INFO") - - redfish_creds = 
self._get_redfish_creds() - success = self.exporter.template.render_config( - port=port, level=level, redfish_creds=redfish_creds - ) - # First condition prevent the exporter from starting at when the - # charm just installed; the second condition tries to recover the - # exporter from failed status. - if success and self.exporter.check_active() or not self.exporter.check_health(): - self.exporter.restart() - - self._on_update_status(event) - - def _on_cos_agent_relation_joined(self, event: EventBase) -> None: - """Start the exporter when relation joined.""" - self.exporter.start() - self._on_update_status(event) - - def _on_cos_agent_relation_departed(self, event: EventBase) -> None: - """Remove the exporter when relation departed.""" - self.exporter.stop() - self._on_update_status(event) - - def _get_redfish_creds(self) -> Dict[str, str]: - """Provide redfish config if redfish is available, else empty dict.""" - bmc_tools = bmc_hw_verifier() - if HWTool.REDFISH in bmc_tools: - bmc_address = get_bmc_address() - redfish_creds = { - # Force to use https as default protocol - "host": f"https://{bmc_address}", - "username": self.model.config.get("redfish-username", ""), - "password": self.model.config.get("redfish-password", ""), - } - else: - redfish_creds = {} - return redfish_creds if __name__ == "__main__": # pragma: nocover diff --git a/src/checksum.py b/src/checksum.py index 6459d349..583a4052 100644 --- a/src/checksum.py +++ b/src/checksum.py @@ -14,21 +14,15 @@ # # For further info, check https://github.com/canonical/charmcraft """Checksum definition, check functions and related utils.""" -import hashlib import logging import typing as t from dataclasses import dataclass, field -from pathlib import Path -from os_platform import Architecture, UbuntuSeries, get_os_platform +from os_platform import Architecture, UbuntuSeries logger = logging.getLogger(__name__) -class ResourceChecksumError(Exception): - """Raise if checksum does not match.""" - - @dataclass class 
ToolVersionInfo: """Tool version information for checksum comparison.""" @@ -209,27 +203,3 @@ class ToolVersionInfo: sha256_checksum="458d51b030468901fc8a207088070e6ce82db34b181d9190c8f849605f1b9b6d", ), ] - - -def validate_checksum(support_version_infos: t.List[ToolVersionInfo], path: Path) -> bool: - """Validate checksum of resource file by checking with supported versions. - - Returns True if resource is supported by the charm, architecture, and - checksum validation is successful. - """ - os_platform = get_os_platform() - - supported_checksums = [] - for info in support_version_infos: - if os_platform.machine in info.supported_architectures and ( - info.support_all_series or os_platform.series in info.supported_series - ): - supported_checksums.append(info.sha256_checksum) - - with open(path, "rb") as f: - sha256_hash = hashlib.sha256(f.read()).hexdigest() - - if sha256_hash in supported_checksums: - return True - logger.warning("Checksum validation fail, path: %s hash: %s", path, sha256_hash) - return False diff --git a/src/config.py b/src/config.py index a6ab420a..9fabaccd 100644 --- a/src/config.py +++ b/src/config.py @@ -9,6 +9,7 @@ EXPORTER_SERVICE_PATH = Path(f"/etc/systemd/system/{EXPORTER_NAME}.service") EXPORTER_CONFIG_TEMPLATE = f"{EXPORTER_NAME}-config.yaml.j2" EXPORTER_SERVICE_TEMPLATE = f"{EXPORTER_NAME}.service.j2" +EXPORTER_RELATION_NAME = "cos-agent" # Redfish REDFISH_TIMEOUT = 3 diff --git a/src/cos_agent.py b/src/cos_agent.py new file mode 100644 index 00000000..16ac9cdc --- /dev/null +++ b/src/cos_agent.py @@ -0,0 +1,119 @@ +"""COS relation handler.""" +from logging import getLogger +from typing import Any, Dict, List, Tuple + +import ops +from charms.grafana_agent.v0.cos_agent import COSAgentProvider + +from config import EXPORTER_RELATION_NAME +from hardware import validate_redfish_credential +from service import Exporter + +logger = getLogger(__name__) + + +class Handler(ops.Object): + """A class representing the cos-agent relation 
handler.""" + + def __init__( + self, + charm: ops.CharmBase, + exporter: Exporter, + metrics_endpoints: List[Dict[str, object]], + relation_name: str = EXPORTER_RELATION_NAME, + ) -> None: + """Initialize the class.""" + super().__init__(charm, relation_name) + self.charm = charm + self.exporter = exporter + self.num_relations = self.get_num_relations(relation_name) + self.cos_exporter_provider = COSAgentProvider( + self.charm, + relation_name=relation_name, + metrics_endpoints=metrics_endpoints, + ) + + self.charm.framework.observe( + self.charm.on[relation_name].relation_joined, + self._on_exporter_relation_joined, + ) + self.charm.framework.observe( + self.charm.on[relation_name].relation_departed, + self._on_exporter_relation_departed, + ) + + def _on_exporter_relation_joined(self, _: ops.EventBase) -> None: + """Start the exporter when relation joined.""" + self.exporter.start() + + def _on_exporter_relation_departed(self, _: ops.EventBase) -> None: + """Stop the exporter when relation departed.""" + self.exporter.stop() + + @property + def exporter_enabled(self) -> bool: + """Return True if cos-agent relation is present.""" + return self.num_relations != 0 + + @property + def exporter_online(self) -> bool: + """Return True if the exporter is online.""" + return self.exporter.check_health() + + @property + def too_many_relations(self) -> bool: + """Return True if there is more than one cos-agent relation.""" + return self.num_relations > 1 + + def get_num_relations(self, relation_name: str) -> int: + """Get the number of relations for the given relation_name.""" + relations = self.charm.model.relations.get(relation_name, []) + return len(relations) + + def install_exporter(self) -> bool: + """Install the exporter.""" + return self.exporter.install() + + def uninstall_exporter(self) -> bool: + """Uninstall the exporter.""" + return self.exporter.uninstall() + + def configure_exporter(self, options: Dict[str, Any], change_set: set) -> bool: + """Configure the 
exporter.""" + if not self.exporter.config_options.intersection(change_set): + logger.info("No changes in exporter config.") + return True + + logger.info("Detected changes in exporter config.") + return self.exporter.configure(**options) + + def validate_exporter_configs(self, options: Dict[str, Any]) -> Tuple[bool, str]: + """Validate the static and runtime config options for the exporter.""" + port = int(options.get("port", 0)) + if not 1 <= port <= 65535: + logger.error("Invalid exporter-port: port must be in [1, 65535].") + return False, "Invalid config: 'exporter-port'" + + level = options.get("level", "") + allowed_choices = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} + if level.upper() not in allowed_choices: + logger.error( + "Invalid exporter-log-level: level must be in %s (case-insensitive).", + allowed_choices, + ) + return False, "Invalid config: 'exporter-log-level'" + + valid = True + redfish_options = options.get("redfish_options") + if redfish_options and redfish_options.get("enable"): + valid = validate_redfish_credential( + redfish_options.get("host", ""), + username=redfish_options.get("username", ""), + password=redfish_options.get("password", ""), + ) + if not valid: + logger.error("Invalid redfish-username or redfish-password") + logger.error("Please also check if redfish is available on the server.") + return False, "Invalid redfish credential or redfish is not available." + + return True, "Exporter config is valid." 
diff --git a/src/hardware.py b/src/hardware.py index 586b23f5..ee240697 100644 --- a/src/hardware.py +++ b/src/hardware.py @@ -5,8 +5,10 @@ import typing as t from charms.operator_libs_linux.v0 import apt +from redfish import redfish_client +from redfish.rest.v1 import InvalidCredentialsError -from config import HWTool +from config import REDFISH_MAX_RETRY, REDFISH_TIMEOUT, HWTool, StorageVendor, SystemVendor logger = logging.getLogger(__name__) @@ -52,9 +54,19 @@ def lshw(class_filter: t.Optional[str] = None) -> t.Any: raise err +def install_apt_package(pkg_name: str) -> None: + """Install APT package if it's not installed.""" + try: + apt.DebianPackage.from_installed_package(pkg_name) + except apt.PackageNotFoundError: + logger.info("installing %s", pkg_name) + apt.add_package(pkg_name, update_cache=False) + else: + logger.info("%s already installed", pkg_name) + + def get_bmc_address() -> t.Optional[str]: """Get BMC IP address by ipmitool.""" - apt.add_package("ipmitool", update_cache=False) cmd = "ipmitool lan print" try: output = subprocess.check_output(cmd.split(), text=True) @@ -65,3 +77,106 @@ def get_bmc_address() -> t.Optional[str]: except subprocess.CalledProcessError: logger.debug("IPMI is not available") return None + + +def validate_redfish_credential(ip_address: str, username: str = "", password: str = "") -> bool: + """Validate redfish credentials by attempting to login with them.""" + try: + result = True + redfish_obj = redfish_client( + base_url=f"https://{ip_address}", + username=username, + password=password, + timeout=REDFISH_TIMEOUT, + max_retry=REDFISH_MAX_RETRY, + ) + redfish_obj.login() + except InvalidCredentialsError: + result = False + logger.error("invalid redfish credential") + except Exception as err: # pylint: disable=W0718 + result = False + logger.error("redfish not available") + logger.error("cannot connect to redfish: %s", str(err)) + else: + redfish_obj.logout() + + return result + + +def raid_hw_verifier() -> t.List[HWTool]: + 
"""Verify if the HWTool support RAID card exists on machine.""" + hw_info = lshw() + system_vendor = hw_info.get("vendor") + storage_info = lshw(class_filter="storage") + + tools = set() + + for info in storage_info: + _id = info.get("id") + product = info.get("product") + vendor = info.get("vendor") + driver = info.get("configuration", {}).get("driver") + if _id == "sas": + # sas3ircu + if ( + any( + _product + for _product in SUPPORTED_STORAGES[HWTool.SAS3IRCU] + if _product in product + ) + and vendor == StorageVendor.BROADCOM + ): + tools.add(HWTool.SAS3IRCU) + # sas2ircu + if ( + any( + _product + for _product in SUPPORTED_STORAGES[HWTool.SAS2IRCU] + if _product in product + ) + and vendor == StorageVendor.BROADCOM + ): + tools.add(HWTool.SAS2IRCU) + + if _id == "raid": + # ssacli + if system_vendor == SystemVendor.HP and any( + _product for _product in SUPPORTED_STORAGES[HWTool.SSACLI] if _product in product + ): + tools.add(HWTool.SSACLI) + # perccli + elif system_vendor == SystemVendor.DELL: + tools.add(HWTool.PERCCLI) + # storcli + elif driver == "megaraid_sas" and vendor == StorageVendor.BROADCOM: + tools.add(HWTool.STORCLI) + return list(tools) + + +def bmc_hw_verifier() -> t.List[HWTool]: + """Verify if the ipmi is available on the machine. + + Using ipmitool to verify, the package will be removed in removing stage. 
+ """ + bmc_address = get_bmc_address() + if not bmc_address: + logger.info("BMC is not available.") + return [] + + tools = [] + if bmc_address is not None: + tools.append(HWTool.IPMI) + if validate_redfish_credential(bmc_address): + tools.append(HWTool.REDFISH) + + return tools + + +def get_hw_tool_white_list() -> t.List[HWTool]: + """Return HWTool white list.""" + # bmc_hw_verifier requires `ipmitool` + install_apt_package("ipmitool") + bmc_white_list = bmc_hw_verifier() + raid_white_list = raid_hw_verifier() + return raid_white_list + bmc_white_list diff --git a/src/hw_resources.py b/src/hw_resources.py new file mode 100644 index 00000000..dc0338b9 --- /dev/null +++ b/src/hw_resources.py @@ -0,0 +1,101 @@ +"""Wrapper for `model.Resources`.""" + +import hashlib +import logging +import os +import typing as t +from pathlib import Path + +from checksum import ToolVersionInfo +from config import HWTool +from os_platform import get_os_platform + +logger = logging.getLogger(__name__) + + +class ResourceFileSizeZeroError(Exception): + """Empty resource error.""" + + def __init__(self, tool: HWTool, path: Path): + """Init.""" + self.message = f"{tool}: {path} has zero size" + + +class ResourceChecksumError(Exception): + """Raise if checksum does not match.""" + + def __init__(self, tool: HWTool, path: Path): + """Init.""" + self.message = f"{tool}: {path} has incorrect checksum" + + +class ResourceNotFoundError(Exception): + """Raise if resource not found.""" + + def __init__(self, tool: HWTool, path: Path): + """Init.""" + self.message = f"{tool}: {path} does not exist" + + +class ResourceNotExecutableError(Exception): + """Raise if resource is not an executable.""" + + def __init__(self, tool: HWTool, path: Path): + """Init.""" + self.message = f"{tool}: {path} is not an executable" + + +class ResourceIsDirectoryError(Exception): + """Raise if the resource is a directory.""" + + +def check_file_exists(src: Path) -> bool: + """Check if file exists or not.""" + if 
src.is_dir(): + raise ResourceIsDirectoryError(f"{src} is not a file.") + return src.exists() + + +def check_file_executable(src: Path) -> bool: + """Check if file is executable or not.""" + if src.is_dir(): + raise ResourceIsDirectoryError(f"{src} is not a file.") + return os.access(src, os.X_OK) + + +def validate_size(path: Path) -> bool: + """Verify if the file size > 0. + + Because charm forces us to publish the resources on charmhub, + but most of the hardware-related tools have a no-republish + policy, our current solution is to publish an empty file whose + size is 0. + """ + if path.stat().st_size == 0: + logger.info("%s size is 0, skip install", path) + return False + return True + + +def validate_checksum(support_version_infos: t.List[ToolVersionInfo], path: Path) -> bool: + """Validate checksum of resource file by checking with supported versions. + + Returns True if resource is supported by the charm, architecture, and + checksum validation is successful. + """ + os_platform = get_os_platform() + + supported_checksums = [] + for info in support_version_infos: + if os_platform.machine in info.supported_architectures and ( + info.support_all_series or os_platform.series in info.supported_series + ): + supported_checksums.append(info.sha256_checksum) + + with open(path, "rb") as f: + sha256_hash = hashlib.sha256(f.read()).hexdigest() + + if sha256_hash in supported_checksums: + return True + logger.warning("Checksum validation fail, path: %s hash: %s", path, sha256_hash) + return False diff --git a/src/hw_tools.py b/src/hw_tools.py index 2c7967f9..5cf47de3 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -9,50 +9,26 @@ import subprocess import typing as t from abc import ABCMeta, abstractmethod +from functools import cached_property from pathlib import Path from charms.operator_libs_linux.v0 import apt from ops.model import ModelError, Resources -from redfish import redfish_client -from redfish.rest.v1 import ( - InvalidCredentialsError, - 
RetriesExhaustedError, - ServerDownOrUnreachableError, - SessionCreationError, -) +import hw_resources from checksum import ( PERCCLI_VERSION_INFOS, SAS2IRCU_VERSION_INFOS, SAS3IRCU_VERSION_INFOS, STORCLI_VERSION_INFOS, - ResourceChecksumError, - validate_checksum, -) -from config import ( - REDFISH_MAX_RETRY, - REDFISH_TIMEOUT, - SNAP_COMMON, - TOOLS_DIR, - TPR_RESOURCES, - HWTool, - StorageVendor, - SystemVendor, ) -from hardware import SUPPORTED_STORAGES, get_bmc_address, lshw +from config import EXPORTER_COLLECTOR_MAPPING, SNAP_COMMON, TOOLS_DIR, TPR_RESOURCES, HWTool +from hardware import get_hw_tool_white_list from keys import HP_KEYS logger = logging.getLogger(__name__) -class ResourceFileSizeZeroError(Exception): - """Empty resource error.""" - - def __init__(self, tool: HWTool, path: Path): - """Init.""" - self.message = f"Tool: {tool} path: {path} size is zero" - - def copy_to_snap_common_bin(source: Path, filename: str) -> None: """Copy file to $SNAP_COMMON/bin folder.""" Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True) @@ -69,20 +45,6 @@ def symlink(src: Path, dst: Path) -> None: raise -def check_file_size(path: Path) -> bool: - """Verify if the file size > 0. - - Because charm focus us to publish the resources on charmhub, - but most of the hardware related tools have the un-republish - policy. Currently our solution is publish a empty file which - size is 0. 
- """ - if path.stat().st_size == 0: - logger.info("% size is 0, skip install", path) - return False - return True - - def install_deb(name: str, path: Path) -> None: """Install local deb package.""" _cmd: t.List[str] = ["dpkg", "-i", str(path)] @@ -117,6 +79,16 @@ def make_executable(src: Path) -> None: raise err +def check_deb_pkg_installed(pkg: str) -> bool: + """Check if debian package is installed.""" + try: + apt.DebianPackage.from_installed_package(pkg) + return True + except apt.PackageNotFoundError: + logger.warning("package %s not found in installed package", pkg) + return False + + class StrategyABC(metaclass=ABCMeta): # pylint: disable=R0903 """Basic strategy.""" @@ -127,6 +99,10 @@ def name(self) -> HWTool: """Name.""" return self._name + @abstractmethod + def check_status(self) -> t.Dict[str, bool]: + """Check installation status of the tool.""" + class APTStrategyABC(StrategyABC, metaclass=ABCMeta): """Strategy for apt install tool.""" @@ -142,6 +118,10 @@ def remove(self) -> None: # hook is triggered. But currently the apt lib don't have # the remove option. 
+ @abstractmethod + def validate_tool(self) -> bool: + """Check if the tool is valid or not.""" + class TPRStrategyABC(StrategyABC, metaclass=ABCMeta): """Third party resource strategy class.""" @@ -154,6 +134,10 @@ def install(self, path: Path) -> None: def remove(self) -> None: """Remove details.""" + @abstractmethod + def validate_tool(self, path: Path) -> bool: + """Check if the tool is valid or not.""" + class StorCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -164,10 +148,10 @@ class StorCLIStrategy(TPRStrategyABC): def install(self, path: Path) -> None: """Install storcli.""" - if not check_file_size(path): - raise ResourceFileSizeZeroError(tool=self._name, path=path) - if not validate_checksum(STORCLI_VERSION_INFOS, path): - raise ResourceChecksumError + if not self.validate_tool(path): + logger.error("%s is not valid.", self.name) + return + install_deb(self.name, path) symlink(src=self.origin_path, dst=self.symlink_bin) @@ -177,6 +161,29 @@ def remove(self) -> None: logger.debug("Remove file %s", self.symlink_bin) remove_deb(pkg=self.name) + def check_status(self) -> t.Dict[str, bool]: + """Check installation status of third party tool.""" + try: + path = self.symlink_bin + exists = hw_resources.check_file_exists(path) + executable = hw_resources.check_file_executable(path) + except hw_resources.ResourceIsDirectoryError as err: + raise err + + if not exists: + raise hw_resources.ResourceNotFoundError(tool=self.name, path=path) + if not executable: + raise hw_resources.ResourceNotExecutableError(tool=self.name, path=path) + return {str(self.name): True} + + def validate_tool(self, path: Path) -> bool: + """Check if third party tool is valid.""" + if not hw_resources.validate_size(path): + raise hw_resources.ResourceFileSizeZeroError(tool=self.name, path=path) + if not hw_resources.validate_checksum(STORCLI_VERSION_INFOS, path): + raise hw_resources.ResourceChecksumError(tool=self.name, path=path) + return True + class 
PercCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -187,10 +194,10 @@ class PercCLIStrategy(TPRStrategyABC): def install(self, path: Path) -> None: """Install perccli.""" - if not check_file_size(path): - raise ResourceFileSizeZeroError(tool=self._name, path=path) - if not validate_checksum(PERCCLI_VERSION_INFOS, path): - raise ResourceChecksumError + if not self.validate_tool(path): + logger.error("%s is not valid.", self.name) + return + install_deb(self.name, path) symlink(src=self.origin_path, dst=self.symlink_bin) @@ -200,6 +207,29 @@ def remove(self) -> None: logger.debug("Remove file %s", self.symlink_bin) remove_deb(pkg=self.name) + def check_status(self) -> t.Dict[str, bool]: + """Check installation status of third party tool.""" + try: + path = self.symlink_bin + exists = hw_resources.check_file_exists(path) + executable = hw_resources.check_file_executable(path) + except hw_resources.ResourceIsDirectoryError as err: + raise err + + if not exists: + raise hw_resources.ResourceNotFoundError(tool=self.name, path=path) + if not executable: + raise hw_resources.ResourceNotExecutableError(tool=self.name, path=path) + return {str(self.name): True} + + def validate_tool(self, path: Path) -> bool: + """Check if third party tool is valid.""" + if not hw_resources.validate_size(path): + raise hw_resources.ResourceFileSizeZeroError(tool=self.name, path=path) + if not hw_resources.validate_checksum(PERCCLI_VERSION_INFOS, path): + raise hw_resources.ResourceChecksumError(tool=self.name, path=path) + return True + class SAS2IRCUStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -209,10 +239,10 @@ class SAS2IRCUStrategy(TPRStrategyABC): def install(self, path: Path) -> None: """Install sas2ircu.""" - if not check_file_size(path): - raise ResourceFileSizeZeroError(tool=self._name, path=path) - if not validate_checksum(SAS2IRCU_VERSION_INFOS, path): - raise ResourceChecksumError + if not self.validate_tool(path): + logger.error("%s is not 
valid.", self.name) + return + make_executable(path) symlink(src=path, dst=self.symlink_bin) @@ -221,6 +251,29 @@ def remove(self) -> None: self.symlink_bin.unlink(missing_ok=True) logger.debug("Remove file %s", self.symlink_bin) + def check_status(self) -> t.Dict[str, bool]: + """Check installation status of third party tool.""" + try: + path = self.symlink_bin + exists = hw_resources.check_file_exists(path) + executable = hw_resources.check_file_executable(path) + except hw_resources.ResourceIsDirectoryError as err: + raise err + + if not exists: + raise hw_resources.ResourceNotFoundError(tool=self.name, path=path) + if not executable: + raise hw_resources.ResourceNotExecutableError(tool=self.name, path=path) + return {str(self.name): True} + + def validate_tool(self, path: Path) -> bool: + """Check if third party tool is valid.""" + if not hw_resources.validate_size(path): + raise hw_resources.ResourceFileSizeZeroError(tool=self.name, path=path) + if not hw_resources.validate_checksum(SAS2IRCU_VERSION_INFOS, path): + raise hw_resources.ResourceChecksumError(tool=self.name, path=path) + return True + class SAS3IRCUStrategy(SAS2IRCUStrategy): """Strategy to install storcli.""" @@ -230,13 +283,21 @@ class SAS3IRCUStrategy(SAS2IRCUStrategy): def install(self, path: Path) -> None: """Install sas3ircu.""" - if not check_file_size(path): - raise ResourceFileSizeZeroError(tool=self._name, path=path) - if not validate_checksum(SAS3IRCU_VERSION_INFOS, path): - raise ResourceChecksumError + if not self.validate_tool(path): + logger.error("%s is not valid.", self.name) + return + make_executable(path) symlink(src=path, dst=self.symlink_bin) + def validate_tool(self, path: Path) -> bool: + """Check if third party tool is valid.""" + if not hw_resources.validate_size(path): + raise hw_resources.ResourceFileSizeZeroError(tool=self.name, path=path) + if not hw_resources.validate_checksum(SAS3IRCU_VERSION_INFOS, path): + raise hw_resources.ResourceChecksumError(tool=self.name, 
path=path) + return True + class SSACLIStrategy(APTStrategyABC): """Strategy for install ssacli.""" @@ -261,6 +322,10 @@ def disable_repo(self) -> None: repositories.disable(self.repo) def install(self) -> None: + if not self.validate_tool(): + logger.error("%s is not valid.", self.name) + return + for key in HP_KEYS: apt.import_key(key) self.add_repo() @@ -270,6 +335,15 @@ def remove(self) -> None: apt.remove_package(self.pkg) self.disable_repo() + def check_status(self) -> t.Dict[str, bool]: + """Check package status.""" + return {self.pkg: check_deb_pkg_installed(self.pkg)} + + def validate_tool(self) -> bool: + """Check if the tool is valid or not (not implemented yet, always True).""" + # Needs implementation + return True + class IPMIStrategy(APTStrategyABC): """Strategy for install ipmi.""" @@ -278,6 +352,10 @@ class IPMIStrategy(APTStrategyABC): pkgs = ["freeipmi-tools"] def install(self) -> None: + if not self.validate_tool(): + logger.error("%s is not valid.", self.name) + return + for pkg in self.pkgs: apt.add_package(pkg) @@ -285,6 +363,18 @@ def remove(self) -> None: for pkg in self.pkgs: apt.remove_package(pkg) + def check_status(self) -> t.Dict[str, bool]: + """Check package status.""" + result = {} + for pkg in self.pkgs: + result[pkg] = check_deb_pkg_installed(pkg) + return result + + def validate_tool(self) -> bool: + """Check if the tool is valid or not (not implemented yet, always True).""" + # Needs implementation + return True + class RedFishStrategy(StrategyABC): # pylint: disable=R0903 """Install strategy for redfish. 
@@ -294,115 +384,13 @@ class RedFishStrategy(StrategyABC): # pylint: disable=R0903 _name = HWTool.REDFISH + def check_status(self) -> t.Dict[str, bool]: + """Check package status.""" + return {"redfish": True} -def raid_hw_verifier() -> t.List[HWTool]: - """Verify if the HWTool support RAID card exists on machine.""" - hw_info = lshw() - system_vendor = hw_info.get("vendor") - storage_info = lshw(class_filter="storage") - - tools = set() - - for info in storage_info: - _id = info.get("id") - product = info.get("product") - vendor = info.get("vendor") - driver = info.get("configuration", {}).get("driver") - if _id == "sas": - # sas3ircu - if ( - any( - _product - for _product in SUPPORTED_STORAGES[HWTool.SAS3IRCU] - if _product in product - ) - and vendor == StorageVendor.BROADCOM - ): - tools.add(HWTool.SAS3IRCU) - # sas2ircu - if ( - any( - _product - for _product in SUPPORTED_STORAGES[HWTool.SAS2IRCU] - if _product in product - ) - and vendor == StorageVendor.BROADCOM - ): - tools.add(HWTool.SAS2IRCU) - - if _id == "raid": - # ssacli - if system_vendor == SystemVendor.HP and any( - _product for _product in SUPPORTED_STORAGES[HWTool.SSACLI] if _product in product - ): - tools.add(HWTool.SSACLI) - # perccli - elif system_vendor == SystemVendor.DELL: - tools.add(HWTool.PERCCLI) - # storcli - elif driver == "megaraid_sas" and vendor == StorageVendor.BROADCOM: - tools.add(HWTool.STORCLI) - return list(tools) - - -def redfish_available() -> bool: - """Check if redfish service is available.""" - bmc_address = get_bmc_address() - host = f"https://{bmc_address}" - try: - # credentials can be empty because we're only checking if redfish service is accessible - redfish_obj = redfish_client( - base_url=host, - username="", - password="", - timeout=REDFISH_TIMEOUT, - max_retry=REDFISH_MAX_RETRY, - ) - redfish_obj.login(auth="session") - except (RetriesExhaustedError, ServerDownOrUnreachableError): - # redfish not available - result = False - except (SessionCreationError, 
InvalidCredentialsError): - # redfish available, wrong credentials or not able to create a session - result = True - except Exception as e: # pylint: disable=W0718 - # mark redfish unavailable for any generic exception - result = False - logger.error("cannot connect to redfish: %s", str(e)) - else: # login succeeded with empty credentials - result = True - redfish_obj.logout() - - return result - - -def bmc_hw_verifier() -> t.List[HWTool]: - """Verify if the ipmi is available on the machine. - - Using ipmitool to verify, the package will be removed in removing stage. - """ - tools = [] - # Check IPMI available - apt.add_package("ipmitool", update_cache=False) - try: - subprocess.check_output("ipmitool lan print".split()) - tools.append(HWTool.IPMI) - except subprocess.CalledProcessError: - logger.info("IPMI is not available") - - # Check RedFish available - if redfish_available(): - tools.append(HWTool.REDFISH) - else: - logger.info("Redfish is not available") - return tools - - -def get_hw_tool_white_list() -> t.List[HWTool]: - """Return HWTool white list.""" - raid_white_list = raid_hw_verifier() - bmc_white_list = bmc_hw_verifier() - return raid_white_list + bmc_white_list + def validate_tool(self) -> bool: + """Validate if redfish tool is valid or not.""" + return True class HWToolHelper: @@ -421,93 +409,121 @@ def strategies(self) -> t.List[StrategyABC]: RedFishStrategy(), ] - def fetch_tools( # pylint: disable=W0102 - self, - resources: Resources, - hw_white_list: t.List[HWTool] = [], - ) -> t.Dict[HWTool, Path]: - """Fetch resource from juju if it's VENDOR_TOOLS.""" - fetch_tools: t.Dict[HWTool, Path] = {} - # Fetch all tools from juju resources + @cached_property + def hw_tool_white_list(self) -> t.List[HWTool]: + """Define hardware tool white list.""" + return get_hw_tool_white_list() + + @property + def hw_collector_white_list(self) -> t.List[str]: + """Define hardware colletor white list.""" + collectors = [] + for tool in self.hw_tool_white_list: + 
collector = EXPORTER_COLLECTOR_MAPPING.get(tool) + if collector is not None: + collectors += collector + return collectors + + @property + def strategy_white_list(self) -> t.List[StrategyABC]: + """Define strategy white list.""" + return [s for s in self.strategies if s.name in self.hw_tool_white_list] + + def get_resource_white_list(self, resources: Resources) -> t.Dict[HWTool, t.Optional[Path]]: + """Fetch white listed tool and path pair from juju if it's VENDOR_TOOLSa.""" + resource_white_list: t.Dict[HWTool, t.Optional[Path]] = {} + # Note: we need to loop over TRP_RESOURCES rather than hw tool + # whitelist because some hw tools don't need to install any resources. for tool, resource in TPR_RESOURCES.items(): - if tool not in hw_white_list: - logger.info("Skip fetch tool: %s", tool) + if tool not in self.hw_collector_white_list: + logger.warning("Skip fetching resource for tool: %s (not in white list)", tool) continue + try: path = resources.fetch(resource) - fetch_tools[tool] = path + logger.info("Fetched resource for tool: %s", tool) except ModelError: - logger.warning("Fail to fetch tool: %s", resource) - - return fetch_tools - - def check_missing_resources( - self, hw_white_list: t.List[HWTool], fetch_tools: t.Dict[HWTool, Path] - ) -> t.Tuple[bool, str]: - """Check if required resources are not been uploaded.""" - missing_resources = [] - for tool in hw_white_list: - if tool in TPR_RESOURCES: - # Resource hasn't been uploaded - if tool not in fetch_tools: - missing_resources.append(TPR_RESOURCES[tool]) - # Uploaded but file size is zero - path = fetch_tools.get(tool) - if path and not check_file_size(path): - logger.warning("Tool: %s path: %s size is zero", tool, path) - missing_resources.append(TPR_RESOURCES[tool]) - if len(missing_resources) > 0: - return False, f"Missing resources: {missing_resources}" - return True, "" + # If path is None, this means the resource cannot be installed or it's missing. 
+ path = None + logger.warning("Failed to fetch resource for tool: %s", tool) + else: + resource_white_list[tool] = path - def install(self, resources: Resources) -> t.Tuple[bool, str]: - """Install tools.""" - hw_white_list = get_hw_tool_white_list() - logger.info("hw_tool_white_list: %s", hw_white_list) - - fetch_tools = self.fetch_tools(resources, hw_white_list) - - ok, msg = self.check_missing_resources(hw_white_list, fetch_tools) - if not ok: - return ok, msg - - fail_strategies = [] - strategy_errors = [] + logger.info("resource_white_list: %s", resource_white_list) + return resource_white_list - # Iterate over each strategy and execute. - for strategy in self.strategies: - if strategy.name not in hw_white_list: + def install( + self, + resource_white_list: t.Dict[HWTool, t.Optional[Path]], + resource_black_list: t.Dict[HWTool, bool], + ) -> t.Dict[HWTool, bool]: + """Install tools.""" + logger.info("hw_tool_white_list: %s", self.hw_tool_white_list) + logger.info("hw_collector_white_list: %s", self.hw_collector_white_list) + + resource_install_status = {} + # Iterate over each white listed strategy and execute. 
+ for strategy in self.strategy_white_list: + if resource_black_list.get(strategy.name, False): + logger.info("Strategy %s already installed, skipping", strategy) continue - # TPRStrategy + try: + # TPRStrategy if isinstance(strategy, TPRStrategyABC): - path = fetch_tools.get(strategy.name) # pylint: disable=W0212 + path = resource_white_list.get(strategy.name) if path: strategy.install(path) # APTStrategy elif isinstance(strategy, APTStrategyABC): - strategy.install() # pylint: disable=E1120 + strategy.install() logger.info("Strategy %s install success", strategy) except ( - ResourceFileSizeZeroError, OSError, apt.PackageError, - ResourceChecksumError, + hw_resources.ResourceChecksumError, + hw_resources.ResourceFileSizeZeroError, ) as e: + resource_install_status[strategy.name] = False logger.warning("Strategy %s install fail: %s", strategy, e) - fail_strategies.append(strategy.name) - strategy_errors.append(e) + else: + resource_install_status[strategy.name] = True - if len(strategy_errors) > 0: - return False, f"Fail strategies: {fail_strategies}" - return True, "" + return resource_install_status def remove(self, resources: Resources) -> None: # pylint: disable=W0613 """Execute all remove strategies.""" - hw_white_list = get_hw_tool_white_list() - for strategy in self.strategies: - if strategy.name not in hw_white_list: - continue + for strategy in self.strategy_white_list: if isinstance(strategy, (TPRStrategyABC, APTStrategyABC)): strategy.remove() logger.info("Strategy %s remove success", strategy) + + def check_status(self) -> t.Tuple[bool, str]: + """Check tool status.""" + failed_checks = [] + + for strategy in self.strategy_white_list: + try: + result = strategy.check_status() + except ( + hw_resources.ResourceNotFoundError, + hw_resources.ResourceIsDirectoryError, + hw_resources.ResourceNotExecutableError, + ) as e: + failed_checks.append(strategy.name) + logger.error("Strategy %s check status failed: %s", strategy.name, e) + else: + for name, status in 
result.items(): + if not status: + logger.error( + "Strategy %s check status failed: %s not installed", strategy, name + ) + if not all(result.values()): + failed_checks.append(strategy.name) + + if len(failed_checks) > 0: + return ( + False, + f"Fail strategy checks: {failed_checks}, please refer to juju debug-log.", + ) + return True, "" diff --git a/src/service.py b/src/service.py index 42833c67..b7db4b29 100644 --- a/src/service.py +++ b/src/service.py @@ -2,25 +2,27 @@ from functools import wraps from logging import getLogger from pathlib import Path -from typing import Any, Callable, Dict, Tuple +from time import sleep +from typing import Any, Callable, Dict, List, Tuple from charms.operator_libs_linux.v1 import systemd from jinja2 import Environment, FileSystemLoader from config import ( - EXPORTER_COLLECTOR_MAPPING, EXPORTER_CONFIG_PATH, EXPORTER_CONFIG_TEMPLATE, EXPORTER_NAME, EXPORTER_SERVICE_PATH, EXPORTER_SERVICE_TEMPLATE, - HWTool, ) -from hw_tools import get_hw_tool_white_list logger = getLogger(__name__) +EXPORTER_HEALTH_RETRY_COUNT = 3 +EXPORTER_HEALTH_RETRY_TIMEOUT = 3 + + def check_installed(func: Callable) -> Callable: """Ensure exporter service and exporter config is installed before running operations.""" @@ -78,22 +80,15 @@ def _uninstall(self, path: Path) -> bool: logger.info("Removing file '%s' - Done.", path) return success - def render_config(self, port: str, level: str, redfish_creds: dict) -> bool: + def render_config( + self, port: str, level: str, collectors: List[str], redfish_options: dict + ) -> bool: """Render and install exporter config file.""" - hw_tools = get_hw_tool_white_list() - collectors = [] - for tool in hw_tools: - collector = EXPORTER_COLLECTOR_MAPPING.get(tool) - if collector is not None: - collectors += collector content = self.config_template.render( PORT=port, LEVEL=level, COLLECTORS=collectors, - REDFISH_ENABLE=(HWTool.REDFISH in hw_tools), - REDFISH_HOST=redfish_creds.get("host", ""), - 
REDFISH_USERNAME=redfish_creds.get("username", ""), - REDFISH_PASSWORD=redfish_creds.get("password", ""), + REDFISH_OPTIONS=redfish_options, ) return self._install(EXPORTER_CONFIG_PATH, content) @@ -114,15 +109,22 @@ def remove_service(self) -> bool: class Exporter: """A class representing the exporter and the metric endpoints.""" + config_options = { + "exporter-port", + "exporter-log-level", + "redfish-host", + "redfish-username", + "redfish-password", + } + def __init__(self, charm_dir: Path) -> None: """Initialize the class.""" self.charm_dir = charm_dir self.template = ExporterTemplate(charm_dir) - def install(self, port: str, level: str, redfish_creds: dict) -> bool: + def install(self) -> bool: """Install the exporter.""" logger.info("Installing %s.", EXPORTER_NAME) - success = self.template.render_config(port=port, level=level, redfish_creds=redfish_creds) success = self.template.render_service(str(self.charm_dir), str(EXPORTER_CONFIG_PATH)) if not success: logger.error("Failed to install %s.", EXPORTER_NAME) @@ -143,6 +145,27 @@ def uninstall(self) -> bool: logger.info("%s uninstalled.", EXPORTER_NAME) return success + @check_installed + def configure( + self, port: str, level: str, collectors: List[str], redfish_options: dict + ) -> bool: + """Configure the exporter.""" + logger.info("Configuring %s.", EXPORTER_NAME) + success = self.template.render_config( + port=port, + level=level, + collectors=collectors, + redfish_options=redfish_options, + ) + if not success: + logger.error("Failed to configure %s.", EXPORTER_NAME) + return success + systemd.daemon_reload() + logger.info("%s configured.", EXPORTER_NAME) + + self.restart() + return success + @check_installed def stop(self) -> None: """Stop the exporter daemon.""" @@ -165,5 +188,26 @@ def check_active(self) -> bool: @check_installed def check_health(self) -> bool: - """Check if the exporter daemon is healthy or not.""" - return not systemd.service_failed(EXPORTER_NAME) + """Check the health of the 
exporter daemon and try to recover it if needed. + + This function perform health check on exporter daemon if the exporter + is already installed. If it is somehow stopped, we should try to + restart it, if not possible we will set the charm to BlockedStatus to + alert the users. + """ + try: + if self.check_active(): + logger.info("Exporter health check - healthy.") + return True + logger.warning("Exporter health check - unhealthy.") + for i in range(1, EXPORTER_HEALTH_RETRY_COUNT + 1): + logger.warning("Restarting exporter - %d retry", i) + self.restart() + sleep(EXPORTER_HEALTH_RETRY_TIMEOUT) + if self.check_active(): + logger.info("Exporter restarted.") + return True + logger.error("Failed to restart the exporter.") + except Exception as e: # pylint: disable=W0718 + logger.error("Unknown error when trying to check exporter health: %s", str(e)) + return False diff --git a/templates/hardware-exporter-config.yaml.j2 b/templates/hardware-exporter-config.yaml.j2 index d448f5ed..b1a04b97 100644 --- a/templates/hardware-exporter-config.yaml.j2 +++ b/templates/hardware-exporter-config.yaml.j2 @@ -1,14 +1,14 @@ port: {{ PORT }} level: {{ LEVEL }} -{% if COLLECTORS | length > 0 %} +{% if COLLECTORS | length > 0 -%} enable_collectors: - {% for collector in COLLECTORS %} + {% for collector in COLLECTORS -%} - {{collector}} - {% endfor %} -{% endif %} + {% endfor -%} +{% endif -%} -{% if REDFISH_ENABLE %} -redfish_host: "{{ REDFISH_HOST }}" -redfish_username: "{{ REDFISH_USERNAME }}" -redfish_password: "{{ REDFISH_PASSWORD }}" +{% if REDFISH_OPTIONS.get('enable', False) %} +redfish_host: "{{ REDFISH_OPTIONS.get('host', '') }}" +redfish_username: "{{ REDFISH_OPTIONS.get('username', '') }}" +redfish_password: "{{ REDFISH_OPTIONS.get('password', '') }}" {% endif %}