From 5f734cf348fc129c8d52097d554637758edf63a5 Mon Sep 17 00:00:00 2001 From: Ray Chan <72372217+chanchiwai-ray@users.noreply.github.com> Date: Mon, 4 Dec 2023 11:28:21 +0800 Subject: [PATCH 1/9] Enhance charm's life cycle. (#119) * Enhance charm's life cycle. * Remove unused property. * Cherry-pick from 7513f18549ded7a23e91a9a04aaeee6c50433e56, and fix linting. Co-authored-by: jneo8 Continue to remove refactoring code from the previous branch, and fix status handling. * Move "exporter health check failed and restart" logic into charm.py * add resource check in update_status event, and change Error -> Blocked * Fix typo * Add unittests. * Update unit test docstring and name. * - Used `parameterized` - Fixed some warning message in unittest - Patched exporter retry constants --- requirements-dev.txt | 1 + src/charm.py | 181 +++++++++++++++++++++++++++--------- src/config.py | 3 + src/hw_tools.py | 56 ++++++++++- tests/unit/test_charm.py | 73 ++++++++++++++- tests/unit/test_exporter.py | 180 +++++++++++++++++++++++++++-------- tests/unit/test_hw_tools.py | 57 ++++++++++++ 7 files changed, 465 insertions(+), 86 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index f43aa290..da06b13b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,3 +2,4 @@ coverage flake8 < 5 pre-commit +parameterized diff --git a/src/charm.py b/src/charm.py index 70a95eb6..fe56115c 100755 --- a/src/charm.py +++ b/src/charm.py @@ -5,14 +5,15 @@ """Charm the application.""" import logging -from typing import Any, Dict, Optional +from time import sleep +from typing import Any, Dict, Optional, Tuple import ops from charms.grafana_agent.v0.cos_agent import COSAgentProvider from ops.framework import EventBase, StoredState -from ops.model import ActiveStatus, BlockedStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus -from config import HWTool +from config import EXPORTER_HEALTH_RETRY_COUNT, EXPORTER_HEALTH_RETRY_TIMEOUT, HWTool from hardware import get_bmc_address from hw_tools import HWToolHelper, bmc_hw_verifier from service import Exporter @@ -50,52 +51,100 @@ def __init__(self, *args: Any) -> None: self.on.cos_agent_relation_departed, self._on_cos_agent_relation_departed ) - self._stored.set_default(installed=False, config={}, blocked_msg="") + self._stored.set_default(config={}, exporter_installed=False, installed=False) + self.num_cos_agent_relations = self.get_num_cos_agent_relations("cos-agent") def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: - """Install and upgrade.""" - port = self.model.config.get("exporter-port", "10000") - level = self.model.config.get("exporter-log-level", "INFO") + """Install or upgrade charm.""" + self.model.unit.status = MaintenanceStatus("Installing resources...") - redfish_creds = self._get_redfish_creds() - - self.exporter.install(port, level, redfish_creds) installed, msg = self.hw_tool_helper.install(self.model.resources) self._stored.installed = installed if not installed: - logger.info(msg) - self._stored.blocked_msg = msg - self._on_update_status(event) + logger.error(msg) + self.model.unit.status = BlockedStatus(msg) + event.defer() return - self._stored.installed = True - self._stored.blocked_msg = "" - self.model.unit.status = ActiveStatus("Install complete") - logger.info("Install complete") + if self._stored.exporter_installed is not True: + self.model.unit.status = MaintenanceStatus("Installing exporter...") + success, err_msg = self.validate_exporter_configs() + if not success: + self.model.unit.status = BlockedStatus(err_msg) + event.defer() + return + + port = self.model.config.get("exporter-port", "10000") + level = self.model.config.get("exporter-log-level", "INFO") + redfish_creds = self._get_redfish_creds() + success = self.exporter.install(port, level, redfish_creds) + self._stored.exporter_installed = success + if not success: + msg = "Failed to install exporter, please refer to `juju debug-log`" + logger.error(msg) + self.model.unit.status = BlockedStatus(msg) + return + + self._on_update_status(event) def _on_remove(self, _: EventBase) -> None: """Remove everything when charm is being removed.""" logger.info("Start to remove.") # Remove binary tool self.hw_tool_helper.remove(self.model.resources) - self.exporter.uninstall() + self._stored.installed = False + success = self.exporter.uninstall() + if not success: + msg = "Failed to uninstall exporter, please refer to `juju debug-log`" + # we probably don't need to set any status here because the charm + # will go away soon, so only logging is enough + logger.warning(msg) + self._stored.exporter_installed = not success logger.info("Remove complete") - def _on_update_status(self, _: EventBase) -> None: + def _on_update_status(self, _: EventBase) -> None: # noqa: C901 """Update the charm's status.""" - if self._stored.installed is not True and self._stored.blocked_msg != "": - self.model.unit.status = BlockedStatus(self._stored.blocked_msg) # type: ignore + if not self._stored.installed: # type: ignore + self.model.unit.status = BlockedStatus("Resoures are not installed") # type: ignore return - if not self.model.get_relation("cos-agent"): + + if not self.exporter_enabled: self.model.unit.status = BlockedStatus("Missing relation: [cos-agent]") return - if not self.exporter.check_health(): - self.model.unit.status = BlockedStatus("Exporter is unhealthy") + + if self.too_many_cos_agent_relation: + self.model.unit.status = BlockedStatus("Cannot relate to more than one grafana-agent") return - if not self.exporter.check_active(): - self.model.unit.status = BlockedStatus("Exporter is not running") + + hw_tool_ok, error_msg = self.hw_tool_helper.check_installed() + if not hw_tool_ok: + self.model.unit.status = BlockedStatus(error_msg) return + + if not self.exporter.check_health(): + logger.warning("Exporter health check - failed.") + try: + for i in range(1, EXPORTER_HEALTH_RETRY_COUNT + 1): + logger.warning("Restarting exporter - %d retry", i) + self.exporter.restart() + sleep(EXPORTER_HEALTH_RETRY_TIMEOUT) + if self.exporter.check_active(): + logger.info("Exporter restarted.") + break + if not self.exporter.check_active(): + logger.error("Failed to restart the exporter.") + self.model.unit.status = ErrorStatus( + "Exporter crashed unexpectedly, please refer to systemd logs..." + ) + return + except Exception as err: # pylint: disable=W0718 + logger.error("Exporter crashed unexpectedly: %s", err) + self.model.unit.status = ErrorStatus( + "Exporter crashed unexpectedly, please refer to systemd logs..." + ) + return + self.model.unit.status = ActiveStatus("Unit is ready") def _on_config_changed(self, event: EventBase) -> None: @@ -117,28 +166,35 @@ def _on_config_changed(self, event: EventBase) -> None: event.handle, ) event.defer() - return - exporter_configs = { - "exporter-port", - "exporter-log-level", - "redfish-host", - "redfish-username", - "redfish-password", - } - if exporter_configs.intersection(change_set): - logger.info("Detected changes in exporter config.") - port = self.model.config.get("exporter-port", "10000") - level = self.model.config.get("exporter-log-level", "INFO") + if self.exporter_enabled: + success, message = self.validate_exporter_configs() + if not success: + self.model.unit.status = BlockedStatus(message) + return - redfish_creds = self._get_redfish_creds() - success = self.exporter.template.render_config( - port=port, level=level, redfish_creds=redfish_creds - ) - # First condition prevent the exporter from starting at when the - # charm just installed; the second condition tries to recover the - # exporter from failed status. - if success and self.exporter.check_active() or not self.exporter.check_health(): + exporter_configs = { + "exporter-port", + "exporter-log-level", + "redfish-host", + "redfish-username", + "redfish-password", + } + if exporter_configs.intersection(change_set): + logger.info("Detected changes in exporter config.") + port = self.model.config.get("exporter-port", "10000") + level = self.model.config.get("exporter-log-level", "INFO") + + redfish_creds = self._get_redfish_creds() + success = self.exporter.template.render_config( + port=port, level=level, redfish_creds=redfish_creds + ) + if not success: + message = ( + "Failed to configure exporter, please check if the server is healthy." + ) + self.model.unit.status = BlockedStatus(message) + return self.exporter.restart() self._on_update_status(event) @@ -168,6 +224,39 @@ def _get_redfish_creds(self) -> Dict[str, str]: redfish_creds = {} return redfish_creds + def validate_exporter_configs(self) -> Tuple[bool, str]: + """Validate the static and runtime config options for the exporter.""" + port = int(self.model.config.get("exporter-port", "10000")) + if not 1 <= port <= 65535: + logger.error("Invalid exporter-port: port must be in [1, 65535].") + return False, "Invalid config: 'exporter-port'" + + level = self.model.config.get("exporter-log-level", "") + allowed_choices = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"} + if level.upper() not in allowed_choices: + logger.error( + "Invalid exporter-log-level: level must be in %s (case-insensitive).", + allowed_choices, + ) + return False, "Invalid config: 'exporter-log-level'" + + return True, "Exporter config is valid." + + def get_num_cos_agent_relations(self, relation_name: str) -> int: + """Get the number of relation given a relation_name.""" + relations = self.model.relations.get(relation_name, []) + return len(relations) + + @property + def exporter_enabled(self) -> bool: + """Return True if cos-agent relation is present.""" + return self.num_cos_agent_relations != 0 + + @property + def too_many_cos_agent_relation(self) -> bool: + """Return True if there're more than one cos-agent relation.""" + return self.num_cos_agent_relations > 1 + if __name__ == "__main__": # pragma: nocover ops.main(HardwareObserverCharm) # type: ignore diff --git a/src/config.py b/src/config.py index a6ab420a..7e697c09 100644 --- a/src/config.py +++ b/src/config.py @@ -9,6 +9,9 @@ EXPORTER_SERVICE_PATH = Path(f"/etc/systemd/system/{EXPORTER_NAME}.service") EXPORTER_CONFIG_TEMPLATE = f"{EXPORTER_NAME}-config.yaml.j2" EXPORTER_SERVICE_TEMPLATE = f"{EXPORTER_NAME}.service.j2" +EXPORTER_HEALTH_RETRY_COUNT = 3 +EXPORTER_HEALTH_RETRY_TIMEOUT = 3 + # Redfish REDFISH_TIMEOUT = 3 diff --git a/src/hw_tools.py b/src/hw_tools.py index 2c7967f9..20a42de4 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -78,7 +78,7 @@ def check_file_size(path: Path) -> bool: size is 0. """ if path.stat().st_size == 0: - logger.info("% size is 0, skip install", path) + logger.info("%s size is 0, skip install", path) return False return True @@ -117,6 +117,16 @@ def make_executable(src: Path) -> None: raise err +def check_deb_pkg_installed(pkg: str) -> bool: + """Check if debian package is installed.""" + try: + apt.DebianPackage.from_installed_package(pkg) + return True + except apt.PackageNotFoundError: + logger.warning("package %s not found in installed package", pkg) + return False + + class StrategyABC(metaclass=ABCMeta): # pylint: disable=R0903 """Basic strategy.""" @@ -127,6 +137,10 @@ def name(self) -> HWTool: """Name.""" return self._name + @abstractmethod + def check(self) -> bool: + """Check installation status of the tool.""" + class APTStrategyABC(StrategyABC, metaclass=ABCMeta): """Strategy for apt install tool.""" @@ -177,6 +191,10 @@ def remove(self) -> None: logger.debug("Remove file %s", self.symlink_bin) remove_deb(pkg=self.name) + def check(self) -> bool: + """Check resource status.""" + return self.symlink_bin.exists() and os.access(self.symlink_bin, os.X_OK) + class PercCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -200,6 +218,10 @@ def remove(self) -> None: logger.debug("Remove file %s", self.symlink_bin) remove_deb(pkg=self.name) + def check(self) -> bool: + """Check resource status.""" + return self.symlink_bin.exists() and os.access(self.symlink_bin, os.X_OK) + class SAS2IRCUStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -221,6 +243,10 @@ def remove(self) -> None: self.symlink_bin.unlink(missing_ok=True) logger.debug("Remove file %s", self.symlink_bin) + def check(self) -> bool: + """Check resource status.""" + return self.symlink_bin.exists() and os.access(self.symlink_bin, os.X_OK) + class SAS3IRCUStrategy(SAS2IRCUStrategy): """Strategy to install storcli.""" @@ -270,6 +296,10 @@ def remove(self) -> None: apt.remove_package(self.pkg) self.disable_repo() + def check(self) -> bool: + """Check package status.""" + return check_deb_pkg_installed(self.pkg) + class IPMIStrategy(APTStrategyABC): """Strategy for install ipmi.""" @@ -285,6 +315,10 @@ def remove(self) -> None: for pkg in self.pkgs: apt.remove_package(pkg) + def check(self) -> bool: + """Check package status.""" + return check_deb_pkg_installed(self.pkgs[0]) + class RedFishStrategy(StrategyABC): # pylint: disable=R0903 """Install strategy for redfish. @@ -294,6 +328,10 @@ class RedFishStrategy(StrategyABC): # pylint: disable=R0903 _name = HWTool.REDFISH + def check(self) -> bool: + """Check package status.""" + return True + def raid_hw_verifier() -> t.List[HWTool]: """Verify if the HWTool support RAID card exists on machine.""" @@ -511,3 +549,19 @@ def remove(self, resources: Resources) -> None: # pylint: disable=W0613 if isinstance(strategy, (TPRStrategyABC, APTStrategyABC)): strategy.remove() logger.info("Strategy %s remove success", strategy) + + def check_installed(self) -> t.Tuple[bool, str]: + """Check tool status.""" + hw_white_list = get_hw_tool_white_list() + failed_checks = [] + + for strategy in self.strategies: + if strategy.name not in hw_white_list: + continue + ok = strategy.check() + if not ok: + failed_checks.append(strategy.name) + + if len(failed_checks) > 0: + return False, f"Fail strategy checks: {failed_checks}" + return True, "" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index d4d86b56..1f81b3e8 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -8,7 +8,7 @@ import ops import ops.testing -from ops.model import ActiveStatus, BlockedStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus import charm from charm import HardwareObserverCharm @@ -54,6 +54,7 @@ def test_01_harness(self) -> None: def test_02_install(self, mock_hw_tool_helper, mock_exporter) -> None: """Test event install.""" mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_exporter.return_value.install.return_value = True self.harness.begin() self.harness.charm.on.install.emit() @@ -69,6 +70,7 @@ def test_02_install(self, mock_hw_tool_helper, mock_exporter) -> None: def test_03_upgrade_charm(self, mock_hw_tool_helper, mock_exporter) -> None: """Test event upgrade_charm.""" mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_exporter.return_value.install.return_value = True self.harness.begin() self.harness.charm.on.install.emit() @@ -102,6 +104,7 @@ def test_05_install_redfish_unavailable(self, mock_hw_tool_helper, mock_exporter """Test event install.""" self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_exporter.return_value.install.return_value = True self.harness.begin() self.harness.charm.on.install.emit() @@ -109,6 +112,74 @@ def test_05_install_redfish_unavailable(self, mock_hw_tool_helper, mock_exporter self.harness.charm.exporter.install.assert_called_with(10000, "INFO", {}) + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_06_install_failed(self, mock_hw_tool_helper, mock_exporter) -> None: + """Test event install.""" + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_exporter.return_value.install.return_value = False + self.harness.begin() + self.harness.charm.validate_exporter_configs = mock.Mock() + self.harness.charm.validate_exporter_configs.return_value = (False, "error") + self.harness.charm.on.install.emit() + + self.assertTrue(self.harness.charm._stored.installed) + + self.assertEqual(self.harness.charm.unit.status, BlockedStatus("error")) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_07_update_status_all_green(self, mock_hw_tool_helper, mock_exporter): + """Test update_status when everything is okay.""" + self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_hw_tool_helper.return_value.check_installed.return_value = (True, "") + mock_exporter.return_value.install.return_value = True + rid = self.harness.add_relation("cos-agent", "grafana-agent") + self.harness.begin() + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") + + self.harness.charm.on.update_status.emit() + self.assertEqual(self.harness.charm.unit.status, ActiveStatus("Unit is ready")) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_08_update_status_check_installed_false(self, mock_hw_tool_helper, mock_exporter): + """Test update_status when hw tool checks failed.""" + self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_hw_tool_helper.return_value.check_installed.return_value = (False, "error") + mock_exporter.return_value.install.return_value = True + rid = self.harness.add_relation("cos-agent", "grafana-agent") + self.harness.begin() + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") + + self.harness.charm.on.update_status.emit() + self.assertEqual(self.harness.charm.unit.status, BlockedStatus("error")) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_09_update_status_exporter_crashed(self, mock_hw_tool_helper, mock_exporter): + """Test update_status.""" + self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_hw_tool_helper.return_value.check_installed.return_value = (True, "") + mock_exporter.return_value.install.return_value = True + mock_exporter.return_value.check_health.return_value = False + mock_exporter.return_value.restart.side_effect = Exception() + rid = self.harness.add_relation("cos-agent", "grafana-agent") + self.harness.begin() + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") + + self.harness.charm.on.update_status.emit() + self.assertEqual( + self.harness.charm.unit.status, + ErrorStatus("Exporter crashed unexpectedly, please refer to systemd logs..."), + ) + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) def test_10_config_changed(self, mock_exporter): """Test config change event updates the charm's internal store.""" diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py index d90f940c..f3440080 100644 --- a/tests/unit/test_exporter.py +++ b/tests/unit/test_exporter.py @@ -6,8 +6,9 @@ from unittest import mock import ops -from ops.model import ActiveStatus, BlockedStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus from ops.testing import Harness +from parameterized import parameterized import charm import service @@ -34,12 +35,25 @@ def setUp(self): hw_tool_lib_patcher = mock.patch.object(charm, "HWToolHelper") mock_hw_tool_helper = hw_tool_lib_patcher.start() mock_hw_tool_helper.return_value.install.return_value = [True, ""] + mock_hw_tool_helper.return_value.check_installed.return_value = [True, ""] self.addCleanup(hw_tool_lib_patcher.stop) get_hw_tool_white_list_patcher = mock.patch.object(service, "get_hw_tool_white_list") get_hw_tool_white_list_patcher.start() self.addCleanup(get_hw_tool_white_list_patcher.stop) + @classmethod + def setUpClass(cls): + exporter_health_retry_count_patcher = mock.patch("charm.EXPORTER_HEALTH_RETRY_COUNT", 1) + exporter_health_retry_count_patcher.start() + cls.addClassCleanup(exporter_health_retry_count_patcher.stop) + + exporter_health_retry_timeout_patcher = mock.patch( + "charm.EXPORTER_HEALTH_RETRY_TIMEOUT", 0 + ) + exporter_health_retry_timeout_patcher.start() + cls.addClassCleanup(exporter_health_retry_timeout_patcher.stop) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) def test_00_install_okay(self, mock_bmc_hw_verifier, mock_get_bmc_address): @@ -93,7 +107,7 @@ def test_11_uninstall_failed(self, mock_service_exists): @mock.patch.object(pathlib.Path, "exists", return_value=True) def test_20_start_okay(self, mock_service_installed): """Test exporter service started when relation is joined.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() self.harness.add_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_start.assert_called_once() @@ -101,7 +115,7 @@ def test_20_start_okay(self, mock_service_installed): @mock.patch.object(pathlib.Path, "exists", return_value=False) def test_21_start_failed(self, mock_service_not_installed): """Test exporter service failed to started when relation is joined.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() self.harness.add_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_start.assert_not_called() @@ -109,7 +123,7 @@ def test_21_start_failed(self, mock_service_not_installed): @mock.patch.object(pathlib.Path, "exists", return_value=True) def test_30_stop_okay(self, mock_service_installed): """Test exporter service is stopped when service is installed and relation is departed.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() self.harness.add_relation_unit(rid, "grafana-agent/0") self.harness.remove_relation_unit(rid, "grafana-agent/0") @@ -118,67 +132,101 @@ def test_30_stop_okay(self, mock_service_installed): @mock.patch.object(pathlib.Path, "exists", return_value=False) def test_31_stop_failed(self, mock_service_not_installed): """Test exporter service failed to stop when service is not installed.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() self.harness.add_relation_unit(rid, "grafana-agent/0") self.harness.remove_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_stop.assert_not_called() - @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_40_check_health(self, mock_service_installed): - """Test check_health function when service is installed.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") - self.harness.begin() - self.harness.add_relation_unit(rid, "grafana-agent/0") - - test_cases = [ - (False, ActiveStatus("Unit is ready")), - (True, BlockedStatus("Exporter is unhealthy")), + @parameterized.expand( + [ + (False, ActiveStatus("Unit is ready"), True), + (True, ActiveStatus("Unit is ready"), True), + (False, ActiveStatus("Unit is ready"), False), + ( + True, + ErrorStatus("Exporter crashed unexpectedly, please refer to systemd logs..."), + False, + ), ] - for failed, expected_status in test_cases: - with self.subTest(service_failed=failed): - self.mock_systemd.service_failed.return_value = failed - self.harness.charm.on.update_status.emit() - self.assertEqual(self.harness.charm.unit.status, expected_status) - + ) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_41_check_active(self, mock_service_installed): + def test_40_check_health( + self, + failed, + expected_status, + restart_okay, + mock_service_installed, + mock_bmc_hw_verifier, + mock_get_bmc_address, + ): """Test check_health function when service is installed.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() - self.harness.add_relation_unit(rid, "grafana-agent/0") + with mock.patch("builtins.open", new_callable=mock.mock_open) as _: + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") - test_cases = [ - (True, ActiveStatus("Unit is ready")), - (False, BlockedStatus("Exporter is not running")), - ] - self.mock_systemd.service_failed.return_value = False - for running, expected_status in test_cases: - with self.subTest(service_running=running): - self.mock_systemd.service_running.return_value = running - self.harness.charm.on.update_status.emit() - self.assertEqual(self.harness.charm.unit.status, expected_status) + self.mock_systemd.service_running.return_value = restart_okay + self.mock_systemd.service_failed.return_value = failed + self.harness.charm.on.update_status.emit() + self.assertEqual(self.harness.charm.unit.status, expected_status) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_50_check_relation_exists(self, mock_service_installed): + def test_50_check_relation_exists( + self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address + ): """Test check_relation function when relation exists.""" - rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "cos-agent") + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() - self.harness.add_relation_unit(rid, "grafana-agent/0") + with mock.patch("builtins.open", new_callable=mock.mock_open) as _: + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_failed.return_value = False self.harness.charm.on.update_status.emit() self.assertEqual(self.harness.charm.unit.status, ActiveStatus("Unit is ready")) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_51_check_relation_not_exists(self, mock_service_installed): + def test_51_check_relation_not_exists( + self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address + ): """Test check_relation function when relation does not exists.""" self.harness.begin() + with mock.patch("builtins.open", new_callable=mock.mock_open) as _: + self.harness.charm.on.install.emit() self.mock_systemd.service_failed.return_value = False self.harness.charm.on.update_status.emit() self.assertEqual( self.harness.charm.unit.status, BlockedStatus("Missing relation: [cos-agent]") ) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) + @mock.patch.object(pathlib.Path, "exists", return_value=True) + def test_52_too_many_relations( + self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address + ): + """Test there too many relations.""" + rid_1 = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + rid_2 = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + self.harness.begin() + with mock.patch("builtins.open", new_callable=mock.mock_open) as _: + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid_1, "grafana-agent/0") + self.harness.add_relation_unit(rid_2, "grafana-agent/1") + self.mock_systemd.service_failed.return_value = False + self.harness.charm.on.update_status.emit() + self.assertEqual( + self.harness.charm.unit.status, + BlockedStatus("Cannot relate to more than one grafana-agent"), + ) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) @@ -186,16 +234,72 @@ def test_60_config_changed_log_level_okay( self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address ): """Test on_config_change function when exporter-log-level is changed.""" + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() with mock.patch("builtins.open", new_callable=mock.mock_open): self.mock_systemd.service_failed.return_value = False self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") self.harness.update_config({"exporter-log-level": "DEBUG"}) self.harness.charm.on.config_changed.emit() self.assertEqual(self.harness.charm._stored.config.get("exporter-log-level"), "DEBUG") self.mock_systemd.service_restart.assert_called_once() + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) + @mock.patch.object(pathlib.Path, "exists", return_value=True) + def test_61_config_changed_not_okay( + self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address + ): + """Test on_config_change function when exporter-log-level is changed.""" + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + self.harness.begin() + + with mock.patch("builtins.open", new_callable=mock.mock_open): + self.mock_systemd.service_failed.return_value = False + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") + # self.harness.charm.validate_exporter_configs = mock.Mock() + # self.harness.charm.validate_exporter_configs.return_value = (False, "error") + self.harness.update_config({"exporter-port": 100000, "exporter-log-level": "DEBUG"}) + self.harness.charm.on.config_changed.emit() + self.assertEqual( + self.harness.charm.unit.status, BlockedStatus("Invalid config: 'exporter-port'") + ) + self.harness.update_config({"exporter-port": 8080, "exporter-log-level": "xxx"}) + self.harness.charm.on.config_changed.emit() + self.assertEqual( + self.harness.charm.unit.status, + BlockedStatus("Invalid config: 'exporter-log-level'"), + ) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) + @mock.patch.object(pathlib.Path, "exists", return_value=True) + def test_62_config_changed_not_okay( + self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address, mock_exporter + ): + """Test on_config_change function when exporter-log-level is changed.""" + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + self.harness.begin() + + with mock.patch("builtins.open", new_callable=mock.mock_open): + self.mock_systemd.service_failed.return_value = False + mock_exporter.return_value.install.return_value = True + self.harness.charm.on.install.emit() + mock_exporter.return_value.template.render_config.return_value = False + self.harness.add_relation_unit(rid, "grafana-agent/0") + self.harness.charm.on.config_changed.emit() + self.mock_systemd.service_restart.assert_not_called() + self.assertEqual( + self.harness.charm.unit.status, + BlockedStatus( + "Failed to configure exporter, please check if the server is healthy." + ), + ) + class TestExporterTemplate(unittest.TestCase): def setUp(self): diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index b3319d84..6a2e3eec 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -35,6 +35,7 @@ StrategyABC, TPRStrategyABC, bmc_hw_verifier, + check_deb_pkg_installed, check_file_size, copy_to_snap_common_bin, get_hw_tool_white_list, @@ -69,6 +70,13 @@ def test_copy_to_snap_common_bin(mock_path, mock_shutil): mock_path_obj.mkdir.assert_called() +@mock.patch("hw_tools.apt") +def test_check_deb_pkg_installed_okay(mock_apt): + mock_pkg = "ipmitool" + result = check_deb_pkg_installed(mock_pkg) + assert result is True + + class TestSymlink(unittest.TestCase): def test_symlink(self): mock_src = mock.Mock() @@ -323,6 +331,55 @@ def test_11_check_missing_resources_zero_size_resources(self, check_file_size): self.assertFalse(ok) self.assertEqual("Missing resources: ['storcli-deb']", msg) + @mock.patch("hw_tools.get_hw_tool_white_list", return_value=[HWTool.STORCLI]) + @mock.patch( + "hw_tools.HWToolHelper.strategies", + return_value=[ + mock.PropertyMock(spec=TPRStrategyABC), + ], + new_callable=mock.PropertyMock, + ) + def test_12_check_installed_okay(self, mock_strategies, _): + self.harness.begin() + mock_strategies.return_value[0].name = HWTool.STORCLI + self.hw_tool_helper.check_installed() + for strategy in mock_strategies.return_value: + strategy.check.assert_called() + + @mock.patch("hw_tools.get_hw_tool_white_list", return_value=[HWTool.STORCLI]) + @mock.patch( + "hw_tools.HWToolHelper.strategies", + return_value=[ + mock.PropertyMock(spec=TPRStrategyABC), + ], + new_callable=mock.PropertyMock, + ) + def test_13_check_installed_okay(self, mock_strategies, _): + self.harness.begin() + mock_strategies.return_value[0].name = HWTool.SSACLI + success, msg = self.hw_tool_helper.check_installed() + self.assertTrue(success) + self.assertEqual(msg, "") + + @mock.patch("hw_tools.os") + @mock.patch("hw_tools.Path") + @mock.patch( + "hw_tools.get_hw_tool_white_list", + return_value=[ + HWTool.STORCLI, + HWTool.PERCCLI, + HWTool.SAS2IRCU, + HWTool.SAS3IRCU, + HWTool.SSACLI, + HWTool.IPMI, + HWTool.REDFISH, + ], + ) + def test_14_check_installed_not_okay(self, _, mock_os, mock_path): + self.harness.begin() + success, msg = self.hw_tool_helper.check_installed() + self.assertFalse(success) + class TestStorCLIStrategy(unittest.TestCase): @mock.patch("hw_tools.validate_checksum", return_value=True) From 4dfc192c604e3d3d557e3d780e3114d456a525d0 Mon Sep 17 00:00:00 2001 From: Sudeep Bhandari Date: Wed, 6 Dec 2023 13:50:51 +0545 Subject: [PATCH 2/9] Separate enabling logic of individual ipmi services monitoring (#123) * Separate enabling logic of individual ipmi services monitoring * Add and refactor unit tests --- src/config.py | 8 +++-- src/hw_tools.py | 39 +++++++++++++++++------- tests/unit/test_charm.py | 31 +++++++++++++++---- tests/unit/test_exporter.py | 59 ++++++++++++------------------------- tests/unit/test_hw_tools.py | 46 ++++++++++++++++++++++------- 5 files changed, 116 insertions(+), 67 deletions(-) diff --git a/src/config.py b/src/config.py index 7e697c09..d8f97813 100644 --- a/src/config.py +++ b/src/config.py @@ -41,7 +41,9 @@ class HWTool(str, Enum): SAS2IRCU = "sas2ircu" SAS3IRCU = "sas3ircu" PERCCLI = "perccli" - IPMI = "ipmi" + IPMI_DCMI = "ipmi_dcmi" + IPMI_SEL = "ipmi_sel" + IPMI_SENSOR = "ipmi_sensor" REDFISH = "redfish" @@ -58,7 +60,9 @@ class HWTool(str, Enum): HWTool.SAS2IRCU: ["collector.lsi_sas_2"], HWTool.SAS3IRCU: ["collector.lsi_sas_3"], HWTool.SSACLI: ["collector.hpe_ssa"], - HWTool.IPMI: ["collector.ipmi_dcmi", "collector.ipmi_sel", "collector.ipmi_sensor"], + HWTool.IPMI_DCMI: ["collector.ipmi_dcmi"], + HWTool.IPMI_SEL: ["collector.ipmi_sel"], + HWTool.IPMI_SENSOR: ["collector.ipmi_sensor"], HWTool.REDFISH: ["collector.redfish"], } diff --git a/src/hw_tools.py b/src/hw_tools.py index 20a42de4..d0693782 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -302,9 +302,14 @@ def check(self) -> bool: class IPMIStrategy(APTStrategyABC): - """Strategy for install ipmi.""" - - _name = HWTool.IPMI + """Strategy for installing ipmi.""" + + # Because IPMISTrategy now encompasses all of + # HWTool.IPMI_SENSOR, HWTool.IPMI_SEL and HWTool.IPMI_DCMI, + # we will need some refactoring here to avoid misleading log + # messages. The installation should be good since all of these + # tools require the same `freeipmi-tools` to be installed. + _name = HWTool.IPMI_SENSOR pkgs = ["freeipmi-tools"] def install(self) -> None: @@ -417,18 +422,32 @@ def redfish_available() -> bool: def bmc_hw_verifier() -> t.List[HWTool]: """Verify if the ipmi is available on the machine. - Using ipmitool to verify, the package will be removed in removing stage. + Using freeipmi-tools to verify, the package will be removed in removing stage. """ tools = [] - # Check IPMI available - apt.add_package("ipmitool", update_cache=False) + + # Check if ipmi services are available + apt.add_package("freeipmi-tools", update_cache=False) + + try: + subprocess.check_output("ipmimonitoring".split()) + tools.append(HWTool.IPMI_SENSOR) + except subprocess.CalledProcessError: + logger.info("IPMI sensors monitoring is not available") + + try: + subprocess.check_output("ipmi-sel".split()) + tools.append(HWTool.IPMI_SEL) + except subprocess.CalledProcessError: + logger.info("IPMI SEL monitoring is not available") + try: - subprocess.check_output("ipmitool lan print".split()) - tools.append(HWTool.IPMI) + subprocess.check_output("ipmi-dcmi --get-system-power-statistics".split()) + tools.append(HWTool.IPMI_DCMI) except subprocess.CalledProcessError: - logger.info("IPMI is not available") + logger.info("IPMI DCMI monitoring is not available") - # Check RedFish available + # Check if RedFish is available if redfish_available(): tools.append(HWTool.REDFISH) else: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 1f81b3e8..42251019 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -27,7 +27,12 @@ def setUp(self): bmc_hw_verifier_patcher = mock.patch.object(charm, "bmc_hw_verifier") self.mock_bmc_hw_verifier = bmc_hw_verifier_patcher.start() - self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI, HWTool.REDFISH] + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + HWTool.REDFISH, + ] self.addCleanup(bmc_hw_verifier_patcher.stop) @classmethod @@ -102,7 +107,11 @@ def test_04_install_missing_resources(self, mock_hw_tool_helper, mock_exporter) @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) def test_05_install_redfish_unavailable(self, mock_hw_tool_helper, mock_exporter) -> None: """Test event install.""" - self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + ] mock_hw_tool_helper.return_value.install.return_value = (True, "") mock_exporter.return_value.install.return_value = True self.harness.begin() @@ -131,7 +140,11 @@ def test_06_install_failed(self, mock_hw_tool_helper, mock_exporter) -> None: @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) def test_07_update_status_all_green(self, mock_hw_tool_helper, mock_exporter): """Test update_status when everything is okay.""" - self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + ] mock_hw_tool_helper.return_value.install.return_value = (True, "") mock_hw_tool_helper.return_value.check_installed.return_value = (True, "") mock_exporter.return_value.install.return_value = True @@ -147,7 +160,11 @@ def test_07_update_status_all_green(self, mock_hw_tool_helper, mock_exporter): @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) def test_08_update_status_check_installed_false(self, mock_hw_tool_helper, mock_exporter): """Test update_status when hw tool checks failed.""" - self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + ] mock_hw_tool_helper.return_value.install.return_value = (True, "") mock_hw_tool_helper.return_value.check_installed.return_value = (False, "error") mock_exporter.return_value.install.return_value = True @@ -163,7 +180,11 @@ def test_08_update_status_check_installed_false(self, mock_hw_tool_helper, mock_ @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) def test_09_update_status_exporter_crashed(self, mock_hw_tool_helper, mock_exporter): """Test update_status.""" - self.mock_bmc_hw_verifier.return_value = [HWTool.IPMI] + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + ] mock_hw_tool_helper.return_value.install.return_value = (True, "") mock_hw_tool_helper.return_value.check_installed.return_value = (True, "") mock_exporter.return_value.install.return_value = True diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py index f3440080..34705794 100644 --- a/tests/unit/test_exporter.py +++ b/tests/unit/test_exporter.py @@ -42,6 +42,17 @@ def setUp(self): get_hw_tool_white_list_patcher.start() self.addCleanup(get_hw_tool_white_list_patcher.stop) + get_bmc_address_patcher = mock.patch("charm.get_bmc_address", return_value="127.0.0.1") + get_bmc_address_patcher.start() + self.addCleanup(get_bmc_address_patcher.stop) + + bmc_hw_verifier_patcher = mock.patch( + "charm.bmc_hw_verifier", + return_value=[HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.IPMI_DCMI, HWTool.REDFISH], + ) + bmc_hw_verifier_patcher.start() + self.addCleanup(bmc_hw_verifier_patcher.stop) + @classmethod def setUpClass(cls): exporter_health_retry_count_patcher = mock.patch("charm.EXPORTER_HEALTH_RETRY_COUNT", 1) @@ -54,9 +65,7 @@ def setUpClass(cls): exporter_health_retry_timeout_patcher.start() cls.addClassCleanup(exporter_health_retry_timeout_patcher.stop) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) - def test_00_install_okay(self, mock_bmc_hw_verifier, mock_get_bmc_address): + def test_00_install_okay(self): """Test exporter service is installed when charm is installed - okay.""" self.harness.begin() @@ -65,9 +74,7 @@ def test_00_install_okay(self, mock_bmc_hw_verifier, mock_get_bmc_address): mock_open.assert_called() self.mock_systemd.daemon_reload.assert_called_once() - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) - def test_01_install_failed_rendering(self, mock_bmc_hw_verifier, mock_get_bmc_address): + def test_01_install_failed_rendering(self): """Test exporter service is failed to installed - failed to render.""" self.harness.begin() @@ -150,8 +157,6 @@ def test_31_stop_failed(self, mock_service_not_installed): ), ] ) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) def test_40_check_health( self, @@ -159,8 +164,6 @@ def test_40_check_health( expected_status, restart_okay, mock_service_installed, - mock_bmc_hw_verifier, - mock_get_bmc_address, ): """Test check_health function when service is installed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") @@ -174,12 +177,8 @@ def test_40_check_health( self.harness.charm.on.update_status.emit() self.assertEqual(self.harness.charm.unit.status, expected_status) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_50_check_relation_exists( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address - ): + def test_50_check_relation_exists(self, mock_service_installed): """Test check_relation function when relation exists.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() @@ -190,12 +189,8 @@ def test_50_check_relation_exists( self.harness.charm.on.update_status.emit() self.assertEqual(self.harness.charm.unit.status, ActiveStatus("Unit is ready")) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_51_check_relation_not_exists( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address - ): + def test_51_check_relation_not_exists(self, mock_service_installed): """Test check_relation function when relation does not exists.""" self.harness.begin() with mock.patch("builtins.open", new_callable=mock.mock_open) as _: @@ -206,12 +201,8 @@ def test_51_check_relation_not_exists( self.harness.charm.unit.status, BlockedStatus("Missing relation: [cos-agent]") ) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_52_too_many_relations( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address - ): + def test_52_too_many_relations(self, mock_service_installed): """Test there too many relations.""" rid_1 = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") rid_2 = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") @@ -227,12 +218,8 @@ def test_52_too_many_relations( BlockedStatus("Cannot relate to more than one grafana-agent"), ) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_60_config_changed_log_level_okay( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address - ): + def test_60_config_changed_log_level_okay(self, mock_service_installed): """Test on_config_change function when exporter-log-level is changed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() @@ -246,12 +233,8 @@ def test_60_config_changed_log_level_okay( self.assertEqual(self.harness.charm._stored.config.get("exporter-log-level"), "DEBUG") self.mock_systemd.service_restart.assert_called_once() - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_61_config_changed_not_okay( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address - ): + def test_61_config_changed_not_okay(self, mock_service_installed): """Test on_config_change function when exporter-log-level is changed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() @@ -275,12 +258,8 @@ def test_61_config_changed_not_okay( ) @mock.patch("charm.Exporter", return_value=mock.MagicMock()) - @mock.patch("charm.get_bmc_address", return_value="127.0.0.1") - @mock.patch("charm.bmc_hw_verifier", return_value=[HWTool.IPMI, HWTool.REDFISH]) @mock.patch.object(pathlib.Path, "exists", return_value=True) - def test_62_config_changed_not_okay( - self, mock_service_installed, mock_bmc_hw_verifier, mock_get_bmc_address, mock_exporter - ): + def test_62_config_changed_not_okay(self, mock_service_installed, mock_exporter): """Test on_config_change function when exporter-log-level is changed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 6a2e3eec..8e40694d 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -288,7 +288,11 @@ def test_09_install_required_resource_not_uploaded(self, _, mock_hw_white_list): @mock.patch( "hw_tools.get_hw_tool_white_list", - return_value=[HWTool.STORCLI, HWTool.IPMI, HWTool.REDFISH], + return_value=[ + HWTool.STORCLI, + HWTool.IPMI_SENSOR, + HWTool.REDFISH, + ], ) @mock.patch( "hw_tools.HWToolHelper.strategies", @@ -305,7 +309,7 @@ def test_10_install_strategy_errors(self, mock_strategies, mock_hw_white_list): self.harness.begin() mock_resources = self.harness.charm.model.resources mock_strategies.return_value[0].name = HWTool.STORCLI - mock_strategies.return_value[1].name = HWTool.IPMI + mock_strategies.return_value[1].name = HWTool.IPMI_SENSOR mock_strategies.return_value[2].name = HWTool.REDFISH mock_strategies.return_value[0].install.side_effect = ResourceFileSizeZeroError( @@ -319,7 +323,10 @@ def test_10_install_strategy_errors(self, mock_strategies, mock_hw_white_list): ok, msg = self.hw_tool_helper.install(mock_resources) self.assertFalse(ok) - self.assertEqual(f"Fail strategies: {[HWTool.STORCLI, HWTool.IPMI, HWTool.REDFISH]}", msg) + self.assertEqual( + f"Fail strategies: {[HWTool.STORCLI, HWTool.IPMI_SENSOR, HWTool.REDFISH]}", + msg, + ) @mock.patch("hw_tools.check_file_size", return_value=False) def test_11_check_missing_resources_zero_size_resources(self, check_file_size): @@ -371,7 +378,9 @@ def test_13_check_installed_okay(self, mock_strategies, _): HWTool.SAS2IRCU, HWTool.SAS3IRCU, HWTool.SSACLI, - HWTool.IPMI, + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, HWTool.REDFISH, ], ) @@ -822,10 +831,10 @@ def test_redfish_available_and_login_success(self, mock_bmc_address, mock_redfis @mock.patch("hw_tools.apt") def test_bmc_hw_verifier(self, mock_apt, mock_subprocess, mock_redfish_available): output = bmc_hw_verifier() - mock_apt.add_package.assert_called_with("ipmitool", update_cache=False) - mock_subprocess.check_output.assert_called_with("ipmitool lan print".split()) - self.assertCountEqual(output, [HWTool.IPMI, HWTool.REDFISH]) - mock_redfish_available.assert_called() + mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) + self.assertCountEqual( + output, [HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.IPMI_DCMI, HWTool.REDFISH] + ) @mock.patch("hw_tools.redfish_available", return_value=False) @mock.patch( @@ -837,6 +846,23 @@ def test_bmc_hw_verifier_error_handling( self, mock_apt, mock_check_output, mock_redfish_available ): output = bmc_hw_verifier() - mock_apt.add_package.assert_called_with("ipmitool", update_cache=False) - mock_check_output.assert_called_with("ipmitool lan print".split()) + mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) self.assertEqual(output, []) + + @mock.patch("hw_tools.redfish_available", return_value=False) + @mock.patch("hw_tools.apt") + def test_bmc_hw_verifier_mixed(self, mock_apt, mock_redfish_available): + """Test a mixture of failures and successes for ipmi.""" + + def mock_get_response_ipmi(ipmi_call): + if ipmi_call == "ipmimonitoring".split(): + pass + elif ipmi_call == "ipmi-sel".split(): + pass + elif ipmi_call == "ipmi-dcmi --get-system-power-statistics".split(): + raise subprocess.CalledProcessError(-1, "cmd") + + with mock.patch("hw_tools.subprocess.check_output", side_effect=mock_get_response_ipmi): + output = bmc_hw_verifier() + mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) + self.assertCountEqual(output, [HWTool.IPMI_SENSOR, HWTool.IPMI_SEL]) From 9f081d90841b22502a72decb7fa4258ff92445e1 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 13:24:28 +0800 Subject: [PATCH 3/9] Fix/resoure install failed (#135) * fix: lifecycle bug fix - Remove install retry - Add cache to hw_white_list to avoid repeat calls - Fix missing resource failed msg in update status hook * fix: Remove resource_failed_msg * fix: Fix type checking comments and defer relation_join hook if checking not pass * fix: Missing return after grafana-agent relation join event defer --- src/charm.py | 48 ++++++++++++++++++++++++------------- src/hw_tools.py | 13 ++++++++++ tests/unit/test_charm.py | 14 +++++------ tests/unit/test_exporter.py | 26 ++++++++++++++++++++ tests/unit/test_hw_tools.py | 9 ++++++- 5 files changed, 86 insertions(+), 24 deletions(-) diff --git a/src/charm.py b/src/charm.py index fe56115c..962369f8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -51,20 +51,23 @@ def __init__(self, *args: Any) -> None: self.on.cos_agent_relation_departed, self._on_cos_agent_relation_departed ) - self._stored.set_default(config={}, exporter_installed=False, installed=False) + self._stored.set_default( + config={}, + exporter_installed=False, + resource_installed=False, + ) self.num_cos_agent_relations = self.get_num_cos_agent_relations("cos-agent") def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: """Install or upgrade charm.""" self.model.unit.status = MaintenanceStatus("Installing resources...") - installed, msg = self.hw_tool_helper.install(self.model.resources) - self._stored.installed = installed + resource_installed, msg = self.hw_tool_helper.install(self.model.resources) + self._stored.resource_installed = resource_installed - if not installed: - logger.error(msg) + if not resource_installed: + logger.warning(msg) self.model.unit.status = BlockedStatus(msg) - event.defer() return if self._stored.exporter_installed is not True: @@ -72,7 +75,6 @@ def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: success, err_msg = self.validate_exporter_configs() if not success: self.model.unit.status = BlockedStatus(err_msg) - event.defer() return port = self.model.config.get("exporter-port", "10000") @@ -85,7 +87,6 @@ def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: logger.error(msg) self.model.unit.status = BlockedStatus(msg) return - self._on_update_status(event) def _on_remove(self, _: EventBase) -> None: @@ -93,7 +94,7 @@ def _on_remove(self, _: EventBase) -> None: logger.info("Start to remove.") # Remove binary tool self.hw_tool_helper.remove(self.model.resources) - self._stored.installed = False + self._stored.resource_installed = False success = self.exporter.uninstall() if not success: msg = "Failed to uninstall exporter, please refer to `juju debug-log`" @@ -105,9 +106,9 @@ def _on_remove(self, _: EventBase) -> None: def _on_update_status(self, _: EventBase) -> None: # noqa: C901 """Update the charm's status.""" - if not self._stored.installed: # type: ignore - self.model.unit.status = BlockedStatus("Resoures are not installed") # type: ignore - return + if not self._stored.resource_installed: # type: ignore[truthy-function] + # The charm should be in BlockedStatus with install failed msg + return # type: ignore[unreachable] if not self.exporter_enabled: self.model.unit.status = BlockedStatus("Missing relation: [cos-agent]") @@ -155,13 +156,16 @@ def _on_config_changed(self, event: EventBase) -> None: change_set = set() model_config: Dict[str, Optional[str]] = dict(self.model.config.items()) for key, value in model_config.items(): - if key not in self._stored.config or self._stored.config[key] != value: # type: ignore + if ( + key not in self._stored.config # type: ignore[operator] + or self._stored.config[key] != value # type: ignore[index] + ): logger.info("Setting %s to: %s", key, value) self._stored.config[key] = value # type: ignore change_set.add(key) - if not self._stored.installed: # type: ignore - logging.info( # type: ignore + if not self._stored.resource_installed: # type: ignore[truthy-function] + logging.info( # type: ignore[unreachable] "Config changed called before install complete, deferring event: %s", event.handle, ) @@ -201,12 +205,24 @@ def _on_config_changed(self, event: EventBase) -> None: def _on_cos_agent_relation_joined(self, event: EventBase) -> None: """Start the exporter when relation joined.""" + if ( + not self._stored.resource_installed # type: ignore[truthy-function] + or not self._stored.exporter_installed # type: ignore[truthy-function] + ): + logger.info( # type: ignore[unreachable] + "Defer cos-agent relation join because exporter or resources is not ready yet." + ) + event.defer() + return self.exporter.start() + logger.info("Start exporter service") self._on_update_status(event) def _on_cos_agent_relation_departed(self, event: EventBase) -> None: """Remove the exporter when relation departed.""" - self.exporter.stop() + if self._stored.exporter_installed: # type: ignore[truthy-function] + self.exporter.stop() + logger.info("Stop exporter service") self._on_update_status(event) def _get_redfish_creds(self) -> Dict[str, str]: diff --git a/src/hw_tools.py b/src/hw_tools.py index d0693782..338f9540 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -9,6 +9,7 @@ import subprocess import typing as t from abc import ABCMeta, abstractmethod +from functools import lru_cache from pathlib import Path from charms.operator_libs_linux.v0 import apt @@ -338,6 +339,9 @@ def check(self) -> bool: return True +# Using cache here to avoid repeat call. +# The lru_cache should be clean everytime the hook been triggered. +@lru_cache def raid_hw_verifier() -> t.List[HWTool]: """Verify if the HWTool support RAID card exists on machine.""" hw_info = lshw() @@ -388,6 +392,9 @@ def raid_hw_verifier() -> t.List[HWTool]: return list(tools) +# Using cache here to avoid repeat call. +# The lru_cache should be clean everytime the hook been triggered. +@lru_cache def redfish_available() -> bool: """Check if redfish service is available.""" bmc_address = get_bmc_address() @@ -419,6 +426,9 @@ def redfish_available() -> bool: return result +# Using cache here to avoid repeat call. +# The lru_cache should be clean everytime the hook been triggered. +@lru_cache def bmc_hw_verifier() -> t.List[HWTool]: """Verify if the ipmi is available on the machine. @@ -455,6 +465,9 @@ def bmc_hw_verifier() -> t.List[HWTool]: return tools +# Using cache here to avoid repeat call. +# The lru_cache should be clean everytime the hook been triggered. +@lru_cache def get_hw_tool_white_list() -> t.List[HWTool]: """Return HWTool white list.""" raid_white_list = raid_hw_verifier() diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 42251019..fcb379c7 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -51,7 +51,7 @@ def _get_notice_count(self, hook): def test_01_harness(self) -> None: """Test charm initialise.""" self.harness.begin() - self.assertFalse(self.harness.charm._stored.installed) + self.assertFalse(self.harness.charm._stored.resource_installed) self.assertTrue(isinstance(self.harness.charm._stored.config, ops.framework.StoredDict)) @mock.patch("charm.Exporter", return_value=mock.MagicMock()) @@ -63,7 +63,7 @@ def test_02_install(self, mock_hw_tool_helper, mock_exporter) -> None: self.harness.begin() self.harness.charm.on.install.emit() - self.assertTrue(self.harness.charm._stored.installed) + self.assertTrue(self.harness.charm._stored.resource_installed) self.harness.charm.exporter.install.assert_called_once() self.harness.charm.hw_tool_helper.install.assert_called_with( @@ -79,7 +79,7 @@ def test_03_upgrade_charm(self, mock_hw_tool_helper, mock_exporter) -> None: self.harness.begin() self.harness.charm.on.install.emit() - self.assertTrue(self.harness.charm._stored.installed) + self.assertTrue(self.harness.charm._stored.resource_installed) self.harness.charm.exporter.install.assert_called_once() self.harness.charm.hw_tool_helper.install.assert_called_with( @@ -117,7 +117,7 @@ def test_05_install_redfish_unavailable(self, mock_hw_tool_helper, mock_exporter self.harness.begin() self.harness.charm.on.install.emit() - self.assertTrue(self.harness.charm._stored.installed) + self.assertTrue(self.harness.charm._stored.resource_installed) self.harness.charm.exporter.install.assert_called_with(10000, "INFO", {}) @@ -132,7 +132,7 @@ def test_06_install_failed(self, mock_hw_tool_helper, mock_exporter) -> None: self.harness.charm.validate_exporter_configs.return_value = (False, "error") self.harness.charm.on.install.emit() - self.assertTrue(self.harness.charm._stored.installed) + self.assertTrue(self.harness.charm._stored.resource_installed) self.assertEqual(self.harness.charm.unit.status, BlockedStatus("error")) @@ -205,7 +205,7 @@ def test_09_update_status_exporter_crashed(self, mock_hw_tool_helper, mock_expor def test_10_config_changed(self, mock_exporter): """Test config change event updates the charm's internal store.""" self.harness.begin() - self.harness.charm._stored.installed = True + self.harness.charm._stored.resource_installed = True new_config = {"exporter-port": 80, "exporter-log-level": "DEBUG"} self.harness.update_config(new_config) @@ -222,7 +222,7 @@ def test_10_config_changed(self, mock_exporter): def test_11_config_changed_before_install_complete(self, mock_exporter): """Test: config change event is deferred if charm not installed.""" self.harness.begin() - self.harness.charm._stored.installed = False + self.harness.charm._stored.resource_installed = False self.harness.charm.on.config_changed.emit() self.assertEqual(self._get_notice_count("config_changed"), 1) diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py index 34705794..857e70fe 100644 --- a/tests/unit/test_exporter.py +++ b/tests/unit/test_exporter.py @@ -116,6 +116,8 @@ def test_20_start_okay(self, mock_service_installed): """Test exporter service started when relation is joined.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() + self.harness.charm._stored.resource_installed = True + self.harness.charm._stored.exporter_installed = True self.harness.add_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_start.assert_called_once() @@ -127,11 +129,33 @@ def test_21_start_failed(self, mock_service_not_installed): self.harness.add_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_start.assert_not_called() + @mock.patch.object(pathlib.Path, "exists", return_value=True) + def test_22_start_defer_resource_not_ready(self, mock_service_installed): + """Test exporter service started when relation is joined.""" + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + self.harness.begin() + self.harness.charm._stored.resource_installed = False + self.harness.charm._stored.exporter_installed = True + self.harness.add_relation_unit(rid, "grafana-agent/0") + self.mock_systemd.service_start.assert_not_called() + + @mock.patch.object(pathlib.Path, "exists", return_value=True) + def test_23_start_defer_exporter_not_ready(self, mock_service_installed): + """Test exporter service started when relation is joined.""" + rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") + self.harness.begin() + self.harness.charm._stored.resource_installed = True + self.harness.charm._stored.exporter_installed = False + self.harness.add_relation_unit(rid, "grafana-agent/0") + self.mock_systemd.service_start.assert_not_called() + @mock.patch.object(pathlib.Path, "exists", return_value=True) def test_30_stop_okay(self, mock_service_installed): """Test exporter service is stopped when service is installed and relation is departed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() + self.harness.charm._stored.resource_installed = True + self.harness.charm._stored.exporter_installed = True self.harness.add_relation_unit(rid, "grafana-agent/0") self.harness.remove_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_stop.assert_called_once() @@ -141,6 +165,8 @@ def test_31_stop_failed(self, mock_service_not_installed): """Test exporter service failed to stop when service is not installed.""" rid = self.harness.add_relation(EXPORTER_RELATION_NAME, "grafana-agent") self.harness.begin() + self.harness.charm._stored.resource_installed = True + self.harness.charm._stored.exporter_installed = True self.harness.add_relation_unit(rid, "grafana-agent/0") self.harness.remove_relation_unit(rid, "grafana-agent/0") self.mock_systemd.service_stop.assert_not_called() diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 8e40694d..8254f5fc 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -284,7 +284,7 @@ def test_09_install_required_resource_not_uploaded(self, _, mock_hw_white_list): ok, msg = self.hw_tool_helper.install(mock_resources) self.assertFalse(ok) self.assertEqual(msg, "Missing resources: ['storcli-deb', 'perccli-deb']") - self.assertFalse(self.harness.charm._stored.installed) + self.assertFalse(self.harness.charm._stored.resource_installed) @mock.patch( "hw_tools.get_hw_tool_white_list", @@ -762,6 +762,7 @@ def test_get_hw_tool_white_list(mock_raid_verifier, mock_bmc_hw_verifier): @mock.patch("hw_tools.lshw") def test_raid_hw_verifier(mock_lshw, lshw_output, lshw_storage_output, expect): mock_lshw.side_effect = [lshw_output, lshw_storage_output] + raid_hw_verifier.cache_clear() output = raid_hw_verifier() case = unittest.TestCase() case.assertCountEqual(output, expect) @@ -775,6 +776,7 @@ def test_redfish_not_available(self, mock_bmc_address, mock_redfish_client): mock_redfish_client.return_value = mock_redfish_obj mock_redfish_obj.login.side_effect = RetriesExhaustedError() + redfish_available.cache_clear() result = redfish_available() self.assertEqual(result, False) @@ -789,6 +791,7 @@ def test_redfish_not_available_generic(self, mock_bmc_address, mock_redfish_clie mock_redfish_client.return_value = mock_redfish_obj mock_redfish_obj.login.side_effect = Exception() + redfish_available.cache_clear() result = redfish_available() self.assertEqual(result, False) @@ -817,6 +820,7 @@ def test_redfish_available_and_login_success(self, mock_bmc_address, mock_redfis mock_redfish_obj = mock.Mock() mock_redfish_client.return_value = mock_redfish_obj + redfish_available.cache_clear() result = redfish_available() self.assertEqual(result, True) @@ -830,6 +834,7 @@ def test_redfish_available_and_login_success(self, mock_bmc_address, mock_redfis @mock.patch("hw_tools.subprocess") @mock.patch("hw_tools.apt") def test_bmc_hw_verifier(self, mock_apt, mock_subprocess, mock_redfish_available): + bmc_hw_verifier.cache_clear() output = bmc_hw_verifier() mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) self.assertCountEqual( @@ -845,6 +850,7 @@ def test_bmc_hw_verifier(self, mock_apt, mock_subprocess, mock_redfish_available def test_bmc_hw_verifier_error_handling( self, mock_apt, mock_check_output, mock_redfish_available ): + bmc_hw_verifier.cache_clear() output = bmc_hw_verifier() mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) self.assertEqual(output, []) @@ -862,6 +868,7 @@ def mock_get_response_ipmi(ipmi_call): elif ipmi_call == "ipmi-dcmi --get-system-power-statistics".split(): raise subprocess.CalledProcessError(-1, "cmd") + bmc_hw_verifier.cache_clear() with mock.patch("hw_tools.subprocess.check_output", side_effect=mock_get_response_ipmi): output = bmc_hw_verifier() mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) From 314c28757f3ffdac0b010a1a82e6de613ca585b8 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 17:04:38 +0800 Subject: [PATCH 4/9] fix: Force re-config exporter when upgrade (#140) --- src/charm.py | 32 ++++++++++++++++---------------- tests/unit/test_charm.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/charm.py b/src/charm.py index 962369f8..981813b9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -70,23 +70,23 @@ def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None: self.model.unit.status = BlockedStatus(msg) return - if self._stored.exporter_installed is not True: - self.model.unit.status = MaintenanceStatus("Installing exporter...") - success, err_msg = self.validate_exporter_configs() - if not success: - self.model.unit.status = BlockedStatus(err_msg) - return + # Install exporter + self.model.unit.status = MaintenanceStatus("Installing exporter...") + success, err_msg = self.validate_exporter_configs() + if not success: + self.model.unit.status = BlockedStatus(err_msg) + return - port = self.model.config.get("exporter-port", "10000") - level = self.model.config.get("exporter-log-level", "INFO") - redfish_creds = self._get_redfish_creds() - success = self.exporter.install(port, level, redfish_creds) - self._stored.exporter_installed = success - if not success: - msg = "Failed to install exporter, please refer to `juju debug-log`" - logger.error(msg) - self.model.unit.status = BlockedStatus(msg) - return + port = self.model.config.get("exporter-port", "10000") + level = self.model.config.get("exporter-log-level", "INFO") + redfish_creds = self._get_redfish_creds() + success = self.exporter.install(port, level, redfish_creds) + self._stored.exporter_installed = success + if not success: + msg = "Failed to install exporter, please refer to `juju debug-log`" + logger.error(msg) + self.model.unit.status = BlockedStatus(msg) + return self._on_update_status(event) def _on_remove(self, _: EventBase) -> None: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index fcb379c7..72c5cc56 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -226,3 +226,22 @@ def test_11_config_changed_before_install_complete(self, mock_exporter): self.harness.charm.on.config_changed.emit() self.assertEqual(self._get_notice_count("config_changed"), 1) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_12_upgrade_force_reconfig_exporter(self, mock_hw_tool_helper, mock_exporter) -> None: + """Test event install.""" + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_exporter.return_value.install.return_value = True + self.harness.begin() + self.harness.charm._stored.exporter_installed = True + print(dir(self.harness.charm.on)) + self.harness.charm.on.upgrade_charm.emit() + + self.assertTrue(self.harness.charm._stored.resource_installed) + self.assertTrue(self.harness.charm._stored.exporter_installed) + + self.harness.charm.exporter.install.assert_called_once() + self.harness.charm.hw_tool_helper.install.assert_called_with( + self.harness.charm.model.resources + ) From ca92b287d4494f1d4b2067335d3cfdf5876862f2 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 17:07:43 +0800 Subject: [PATCH 5/9] refactor: Move restart exporter logic as single function (#139) --- src/charm.py | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/charm.py b/src/charm.py index 981813b9..72992ad9 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,7 +11,7 @@ import ops from charms.grafana_agent.v0.cos_agent import COSAgentProvider from ops.framework import EventBase, StoredState -from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus, StatusBase from config import EXPORTER_HEALTH_RETRY_COUNT, EXPORTER_HEALTH_RETRY_TIMEOUT, HWTool from hardware import get_bmc_address @@ -125,29 +125,35 @@ def _on_update_status(self, _: EventBase) -> None: # noqa: C901 if not self.exporter.check_health(): logger.warning("Exporter health check - failed.") - try: - for i in range(1, EXPORTER_HEALTH_RETRY_COUNT + 1): - logger.warning("Restarting exporter - %d retry", i) - self.exporter.restart() - sleep(EXPORTER_HEALTH_RETRY_TIMEOUT) - if self.exporter.check_active(): - logger.info("Exporter restarted.") - break - if not self.exporter.check_active(): - logger.error("Failed to restart the exporter.") - self.model.unit.status = ErrorStatus( - "Exporter crashed unexpectedly, please refer to systemd logs..." - ) - return - except Exception as err: # pylint: disable=W0718 - logger.error("Exporter crashed unexpectedly: %s", err) - self.model.unit.status = ErrorStatus( - "Exporter crashed unexpectedly, please refer to systemd logs..." - ) + restart_ok, restart_status = self.restart_exporter() + if not restart_ok and restart_status is not None: + self.model.unit.status = restart_status return self.model.unit.status = ActiveStatus("Unit is ready") + def restart_exporter(self) -> Tuple[bool, Optional[StatusBase]]: + """Restart exporter service with retry.""" + try: + for i in range(1, EXPORTER_HEALTH_RETRY_COUNT + 1): + logger.warning("Restarting exporter - %d retry", i) + self.exporter.restart() + sleep(EXPORTER_HEALTH_RETRY_TIMEOUT) + if self.exporter.check_active(): + logger.info("Exporter restarted.") + break + if not self.exporter.check_active(): + logger.error("Failed to restart the exporter.") + return False, ErrorStatus( + "Exporter crashed unexpectedly, please refer to systemd logs..." + ) + except Exception as err: # pylint: disable=W0718 + logger.error("Exporter crashed unexpectedly: %s", err) + return False, ErrorStatus( + "Exporter crashed unexpectedly, please refer to systemd logs..." + ) + return True, None + def _on_config_changed(self, event: EventBase) -> None: """Reconfigure charm.""" # Keep track of what model config options + some extra config related From 44fdb853b9dcbbe149059940d40b261533372865 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 17:24:21 +0800 Subject: [PATCH 6/9] Fix/skip remove freeipmi tools (#138) * fix: Skip removing pkg on IPMI strategy Skip removing package because we afraid this cause dependency error for any other services on the same machine. * fix: Skip remove ssacli pkg and repo --- src/hw_tools.py | 15 ++++++--------- tests/unit/test_hw_tools.py | 6 +++--- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/hw_tools.py b/src/hw_tools.py index 338f9540..27df31e1 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -282,11 +282,6 @@ def add_repo(self) -> None: repositories = apt.RepositoryMapping() repositories.add(self.repo) - def disable_repo(self) -> None: - """Disable the repository.""" - repositories = apt.RepositoryMapping() - repositories.disable(self.repo) - def install(self) -> None: for key in HP_KEYS: apt.import_key(key) @@ -294,8 +289,9 @@ def install(self) -> None: apt.add_package(self.pkg, update_cache=True) def remove(self) -> None: - apt.remove_package(self.pkg) - self.disable_repo() + # Skip removing because we afriad this cause dependency error + # for other services on the same machine. + logger.info("SSACLIStrategy skip removing %s", self.pkg) def check(self) -> bool: """Check package status.""" @@ -318,8 +314,9 @@ def install(self) -> None: apt.add_package(pkg) def remove(self) -> None: - for pkg in self.pkgs: - apt.remove_package(pkg) + # Skip removing because we afriad this cause dependency error + # for other services on the same machine. + logger.info("IPMIStrategy skip removing %s", self.pkgs) def check(self) -> bool: """Check package status.""" diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 8254f5fc..9a08d3c1 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -669,8 +669,8 @@ def test_remove(self, mock_apt): mock_apt.RepositoryMapping.return_value = mock_repos strategy.remove() - mock_apt.remove_package.assert_called_with(strategy.pkg) - mock_repos.disable.assert_called_with(strategy.repo) + mock_apt.remove_package.assert_not_called() + mock_repos.disable.assert_not_called() class TestIPMIStrategy(unittest.TestCase): @@ -686,7 +686,7 @@ def test_remove(self, mock_apt): strategy = IPMIStrategy() strategy.remove() - mock_apt.remove_package.assert_called_with("freeipmi-tools") + mock_apt.remove_package.assert_not_called() @mock.patch("hw_tools.bmc_hw_verifier", return_value=[1, 2, 3]) From 5fc0f7f9e1cd7eaca9e7691efcd0569e20954122 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 19:07:24 +0800 Subject: [PATCH 7/9] test: Fix failed functional test (#141) Fix failure functional test caused because of life-cycle changing. --- src/charm.py | 5 +++++ tests/functional/test_charm.py | 15 +++------------ tests/unit/test_charm.py | 24 ++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/charm.py b/src/charm.py index 72992ad9..7a9ed508 100755 --- a/src/charm.py +++ b/src/charm.py @@ -118,6 +118,11 @@ def _on_update_status(self, _: EventBase) -> None: # noqa: C901 self.model.unit.status = BlockedStatus("Cannot relate to more than one grafana-agent") return + config_valied, confg_valid_message = self.validate_exporter_configs() + if not config_valied: + self.model.unit.status = BlockedStatus(confg_valid_message) + return + hw_tool_ok, error_msg = self.hw_tool_helper.check_installed() if not hw_tool_ok: self.model.unit.status = BlockedStatus(error_msg) diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index fc7cfb02..daafd38a 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -35,9 +35,9 @@ class AppStatus(str, Enum): INSTALL = "Install complete" READY = "Unit is ready" MISSING_RELATION = "Missing relation: [cos-agent]" - UNHEALTHY = "Exporter is unhealthy" NOT_RUNNING = "Exporter is not running" MISSING_RESOURCES = "Missing resources:" + INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'" @pytest.mark.abort_on_fail @@ -162,20 +162,11 @@ async def test_01_config_changed_log_level(self, app, unit, sync_helper, ops_tes async def test_10_start_and_stop_exporter(self, app, unit, sync_helper, ops_test): """Test starting and stopping the exporter results in correct charm status.""" - # Stop the exporter + # Stop the exporter, and the exporter should auto-restart after update status fire. stop_cmd = "systemctl stop hardware-exporter" async with ops_test.fast_forward(): await asyncio.gather( unit.run(stop_cmd), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.NOT_RUNNING - - # Start the exporter - start_cmd = "systemctl start hardware-exporter" - async with ops_test.fast_forward(): - await asyncio.gather( - unit.run(start_cmd), ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), ) assert unit.workload_status_message == AppStatus.READY @@ -188,7 +179,7 @@ async def test_11_exporter_failed(self, app, unit, sync_helper, ops_test): app.set_config({"exporter-log-level": "RANDOM_LEVEL"}), ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), ) - assert unit.workload_status_message == AppStatus.UNHEALTHY + assert unit.workload_status_message == AppStatus.INVALID_CONFIG_EXPORTER_LOG_LEVEL async with ops_test.fast_forward(): await asyncio.gather( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 72c5cc56..af43217f 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -245,3 +245,27 @@ def test_12_upgrade_force_reconfig_exporter(self, mock_hw_tool_helper, mock_expo self.harness.charm.hw_tool_helper.install.assert_called_with( self.harness.charm.model.resources ) + + @mock.patch("charm.Exporter", return_value=mock.MagicMock()) + @mock.patch("charm.HWToolHelper", return_value=mock.MagicMock()) + def test_13_update_status_config_invalid(self, mock_hw_tool_helper, mock_exporter): + """Test update_status when everything is okay.""" + self.mock_bmc_hw_verifier.return_value = [ + HWTool.IPMI_SENSOR, + HWTool.IPMI_SEL, + HWTool.IPMI_DCMI, + ] + mock_hw_tool_helper.return_value.install.return_value = (True, "") + mock_hw_tool_helper.return_value.check_installed.return_value = (True, "") + mock_exporter.return_value.install.return_value = True + rid = self.harness.add_relation("cos-agent", "grafana-agent") + self.harness.begin() + self.harness.charm.on.install.emit() + self.harness.add_relation_unit(rid, "grafana-agent/0") + + self.harness.charm.validate_exporter_configs = mock.MagicMock() + self.harness.charm.validate_exporter_configs.return_value = (False, "config fail message") + + self.harness.charm.on.update_status.emit() + self.assertEqual(self.harness.charm.unit.status, BlockedStatus("config fail message")) + From 02f89e0536ebc91059fe8abaee44e7787c790f51 Mon Sep 17 00:00:00 2001 From: james_lin Date: Thu, 14 Dec 2023 19:12:41 +0800 Subject: [PATCH 8/9] style: Linter fix (#142) --- tests/unit/test_charm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index af43217f..f7c70576 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -268,4 +268,3 @@ def test_13_update_status_config_invalid(self, mock_hw_tool_helper, mock_exporte self.harness.charm.on.update_status.emit() self.assertEqual(self.harness.charm.unit.status, BlockedStatus("config fail message")) - From 50a93bf89983a5853e51cfc456d4b203393fa2da Mon Sep 17 00:00:00 2001 From: james_lin Date: Fri, 15 Dec 2023 14:58:31 +0800 Subject: [PATCH 9/9] fix: apt install freeipmi-tools failed on focal (#143) * fix: apt install freeipmi-tools failed on focal Fix: #128 --- src/apt_helpers.py | 36 ++++++++++++++++++++++++++++ src/hw_tools.py | 12 +++++----- tests/unit/test_apt_helpers.py | 43 ++++++++++++++++++++++++++++++++++ tests/unit/test_hw_tools.py | 28 ++++++++++++---------- 4 files changed, 101 insertions(+), 18 deletions(-) create mode 100644 src/apt_helpers.py create mode 100644 tests/unit/test_apt_helpers.py diff --git a/src/apt_helpers.py b/src/apt_helpers.py new file mode 100644 index 00000000..c89ca81b --- /dev/null +++ b/src/apt_helpers.py @@ -0,0 +1,36 @@ +"""Apt helper module for missing features in operator_libs_linux.""" +import re +from subprocess import PIPE, CalledProcessError, check_output +from typing import Optional + +from charms.operator_libs_linux.v0 import apt + + +def get_candidate_version(package: str) -> Optional[str]: + """Get candiate version of package from apt-cache. + + Related issue: https://github.com/canonical/operator-libs-linux/issues/113 + """ + try: + output = check_output( + ["apt-cache", "policy", package], stderr=PIPE, universal_newlines=True + ) + except CalledProcessError as e: + raise apt.PackageError(f"Could not list packages in apt-cache: {e.output}") from None + + lines = [line.strip() for line in output.strip().split("\n")] + for line in lines: + candidate_matcher = re.compile(r"^Candidate:\s(?P(.*))") + matches = candidate_matcher.search(line) + if matches: + return matches.groupdict().get("version") + raise apt.PackageError(f"Could not find candidate version package in apt-cache: {output}") + + +def add_pkg_with_candidate_version(pkg: str) -> None: + """Install package with apt-cache candidate version. + + Related issue: https://github.com/canonical/operator-libs-linux/issues/113 + """ + version = get_candidate_version(pkg) + apt.add_package(pkg, version=version, update_cache=False) diff --git a/src/hw_tools.py b/src/hw_tools.py index 27df31e1..b4c557cf 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -22,6 +22,7 @@ SessionCreationError, ) +import apt_helpers from checksum import ( PERCCLI_VERSION_INFOS, SAS2IRCU_VERSION_INFOS, @@ -307,20 +308,19 @@ class IPMIStrategy(APTStrategyABC): # messages. The installation should be good since all of these # tools require the same `freeipmi-tools` to be installed. _name = HWTool.IPMI_SENSOR - pkgs = ["freeipmi-tools"] + pkg = "freeipmi-tools" def install(self) -> None: - for pkg in self.pkgs: - apt.add_package(pkg) + apt_helpers.add_pkg_with_candidate_version(self.pkg) def remove(self) -> None: # Skip removing because we afriad this cause dependency error # for other services on the same machine. - logger.info("IPMIStrategy skip removing %s", self.pkgs) + logger.info("IPMIStrategy skip removing %s", self.pkg) def check(self) -> bool: """Check package status.""" - return check_deb_pkg_installed(self.pkgs[0]) + return check_deb_pkg_installed(self.pkg) class RedFishStrategy(StrategyABC): # pylint: disable=R0903 @@ -434,7 +434,7 @@ def bmc_hw_verifier() -> t.List[HWTool]: tools = [] # Check if ipmi services are available - apt.add_package("freeipmi-tools", update_cache=False) + apt_helpers.add_pkg_with_candidate_version("freeipmi-tools") try: subprocess.check_output("ipmimonitoring".split()) diff --git a/tests/unit/test_apt_helpers.py b/tests/unit/test_apt_helpers.py new file mode 100644 index 00000000..7e708861 --- /dev/null +++ b/tests/unit/test_apt_helpers.py @@ -0,0 +1,43 @@ +import unittest +from subprocess import CalledProcessError +from unittest import mock + +from charms.operator_libs_linux.v0 import apt + +import apt_helpers + +APT_CACHE_POLICY_FREEIPMI_TOOLS_OUTPUT = """freeipmi-tools: + Installed: (none) + Candidate: 1.6.4-3ubuntu1.1 + Version table: + 1.6.9-2~bpo20.04.1 100 + 100 http://tw.archive.ubuntu.com/ubuntu focal-backports/main amd64 Packages + 1.6.4-3ubuntu1.1 500 + 500 http://tw.archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages + 100 /var/lib/dpkg/status + 1.6.4-3ubuntu1 500 + 500 http://tw.archive.ubuntu.com/ubuntu focal/main amd64 Packages +""" + + +class TestGetCandidateVersion(unittest.TestCase): + @mock.patch("apt_helpers.check_output") + def test_install_freeipmi_tools_on_focal(self, mock_check_output): + mock_check_output.return_value = APT_CACHE_POLICY_FREEIPMI_TOOLS_OUTPUT + version = apt_helpers.get_candidate_version("freeipmi-tools") + self.assertEqual(version, "1.6.4-3ubuntu1.1") + + @mock.patch("apt_helpers.check_output") + def test_checkoutput_failed(self, mock_check_output): + mock_check_output.side_effect = CalledProcessError(-1, "cmd") + + with self.assertRaises(apt.PackageError): + apt_helpers.get_candidate_version("freeipmi-tools") + + @mock.patch("apt_helpers.check_output") + def test_checkoutput_version_not_found_error(self, mock_check_output): + fake_output = APT_CACHE_POLICY_FREEIPMI_TOOLS_OUTPUT.replace("Candidate", "NotCandidate") + mock_check_output.return_value = fake_output + + with self.assertRaises(apt.PackageError): + apt_helpers.get_candidate_version("freeipmi-tools") diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 9a08d3c1..71b19bf8 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -674,12 +674,16 @@ def test_remove(self, mock_apt): class TestIPMIStrategy(unittest.TestCase): - @mock.patch("hw_tools.apt") - def test_install(self, mock_apt): + @mock.patch("apt_helpers.get_candidate_version") + @mock.patch("apt_helpers.apt") + def test_install(self, mock_apt, mock_candidate_version): strategy = IPMIStrategy() + mock_candidate_version.return_value = "some-candidate-version" strategy.install() - mock_apt.add_package.assert_called_with("freeipmi-tools") + mock_apt.add_package.assert_called_with( + "freeipmi-tools", version="some-candidate-version", update_cache=False + ) @mock.patch("hw_tools.apt") def test_remove(self, mock_apt): @@ -832,11 +836,11 @@ def test_redfish_available_and_login_success(self, mock_bmc_address, mock_redfis @mock.patch("hw_tools.redfish_available", return_value=True) @mock.patch("hw_tools.subprocess") - @mock.patch("hw_tools.apt") - def test_bmc_hw_verifier(self, mock_apt, mock_subprocess, mock_redfish_available): + @mock.patch("hw_tools.apt_helpers") + def test_bmc_hw_verifier(self, mock_apt_helpers, mock_subprocess, mock_redfish_available): bmc_hw_verifier.cache_clear() output = bmc_hw_verifier() - mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) + mock_apt_helpers.add_pkg_with_candidate_version.assert_called_with("freeipmi-tools") self.assertCountEqual( output, [HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.IPMI_DCMI, HWTool.REDFISH] ) @@ -846,18 +850,18 @@ def test_bmc_hw_verifier(self, mock_apt, mock_subprocess, mock_redfish_available "hw_tools.subprocess.check_output", side_effect=subprocess.CalledProcessError(-1, "cmd"), ) - @mock.patch("hw_tools.apt") + @mock.patch("hw_tools.apt_helpers") def test_bmc_hw_verifier_error_handling( - self, mock_apt, mock_check_output, mock_redfish_available + self, mock_apt_helpers, mock_check_output, mock_redfish_available ): bmc_hw_verifier.cache_clear() output = bmc_hw_verifier() - mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) + mock_apt_helpers.add_pkg_with_candidate_version.assert_called_with("freeipmi-tools") self.assertEqual(output, []) @mock.patch("hw_tools.redfish_available", return_value=False) - @mock.patch("hw_tools.apt") - def test_bmc_hw_verifier_mixed(self, mock_apt, mock_redfish_available): + @mock.patch("hw_tools.apt_helpers") + def test_bmc_hw_verifier_mixed(self, mock_apt_helpers, mock_redfish_available): """Test a mixture of failures and successes for ipmi.""" def mock_get_response_ipmi(ipmi_call): @@ -871,5 +875,5 @@ def mock_get_response_ipmi(ipmi_call): bmc_hw_verifier.cache_clear() with mock.patch("hw_tools.subprocess.check_output", side_effect=mock_get_response_ipmi): output = bmc_hw_verifier() - mock_apt.add_package.assert_called_with("freeipmi-tools", update_cache=False) + mock_apt_helpers.add_pkg_with_candidate_version.assert_called_with("freeipmi-tools") self.assertCountEqual(output, [HWTool.IPMI_SENSOR, HWTool.IPMI_SEL])