Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/life cycle #137

Merged
merged 9 commits into from
Dec 25, 2023
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
coverage
flake8 < 5
pre-commit
parameterized
36 changes: 36 additions & 0 deletions src/apt_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Apt helper module for missing features in operator_libs_linux."""
import re
from subprocess import PIPE, CalledProcessError, check_output
from typing import Optional

from charms.operator_libs_linux.v0 import apt


def get_candidate_version(package: str) -> Optional[str]:
    """Get candidate version of package from apt-cache.

    Runs ``apt-cache policy <package>`` and returns the text that follows the
    ``Candidate:`` label on the first output line carrying one.

    Related issue: https://github.com/canonical/operator-libs-linux/issues/113

    Args:
        package: name of the package to look up.

    Returns:
        The candidate version string as printed by apt-cache.

    Raises:
        apt.PackageError: if apt-cache exits with a non-zero status, or if its
            output contains no ``Candidate:`` line.
    """
    try:
        output = check_output(
            ["apt-cache", "policy", package], stderr=PIPE, universal_newlines=True
        )
    except CalledProcessError as e:
        # stderr is captured separately (stderr=PIPE), so prefer it for the
        # diagnostic and fall back to stdout only when it is empty.
        details = e.stderr or e.output
        raise apt.PackageError(f"Could not list packages in apt-cache: {details}") from None

    # The pattern is loop-invariant, so build it once before scanning.
    candidate_matcher = re.compile(r"^Candidate:\s(?P<version>.*)")
    for line in output.strip().split("\n"):
        # Lines are stripped first so leading indentation cannot defeat "^".
        matches = candidate_matcher.search(line.strip())
        if matches:
            # NOTE(review): the captured text is returned verbatim, including a
            # placeholder such as "(none)" if apt-cache prints one - confirm
            # callers cope with that.
            return matches.group("version")
    raise apt.PackageError(f"Could not find candidate version package in apt-cache: {output}")


def add_pkg_with_candidate_version(pkg: str) -> None:
    """Install ``pkg`` at the candidate version reported by apt-cache.

    The version is looked up with ``get_candidate_version`` and handed to
    ``apt.add_package`` together with ``update_cache=False``.

    Related issue: https://github.com/canonical/operator-libs-linux/issues/113
    """
    apt.add_package(pkg, version=get_candidate_version(pkg), update_cache=False)
224 changes: 170 additions & 54 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
"""Charm the application."""

import logging
from typing import Any, Dict, Optional
from time import sleep
from typing import Any, Dict, Optional, Tuple

import ops
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from ops.framework import EventBase, StoredState
from ops.model import ActiveStatus, BlockedStatus
from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus, StatusBase

from config import HWTool
from config import EXPORTER_HEALTH_RETRY_COUNT, EXPORTER_HEALTH_RETRY_TIMEOUT, HWTool
from hardware import get_bmc_address
from hw_tools import HWToolHelper, bmc_hw_verifier
from service import Exporter
Expand Down Expand Up @@ -50,54 +51,114 @@ def __init__(self, *args: Any) -> None:
self.on.cos_agent_relation_departed, self._on_cos_agent_relation_departed
)

self._stored.set_default(installed=False, config={}, blocked_msg="")
self._stored.set_default(
config={},
exporter_installed=False,
resource_installed=False,
)
self.num_cos_agent_relations = self.get_num_cos_agent_relations("cos-agent")

def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None:
"""Install and upgrade."""
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")
"""Install or upgrade charm."""
self.model.unit.status = MaintenanceStatus("Installing resources...")

redfish_creds = self._get_redfish_creds()
resource_installed, msg = self.hw_tool_helper.install(self.model.resources)
self._stored.resource_installed = resource_installed

self.exporter.install(port, level, redfish_creds)
installed, msg = self.hw_tool_helper.install(self.model.resources)
self._stored.installed = installed
if not resource_installed:
logger.warning(msg)
self.model.unit.status = BlockedStatus(msg)
return

if not installed:
logger.info(msg)
self._stored.blocked_msg = msg
self._on_update_status(event)
# Install exporter
self.model.unit.status = MaintenanceStatus("Installing exporter...")
success, err_msg = self.validate_exporter_configs()
if not success:
self.model.unit.status = BlockedStatus(err_msg)
return

self._stored.installed = True
self._stored.blocked_msg = ""
self.model.unit.status = ActiveStatus("Install complete")
logger.info("Install complete")
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")
redfish_creds = self._get_redfish_creds()
success = self.exporter.install(port, level, redfish_creds)
self._stored.exporter_installed = success
if not success:
msg = "Failed to install exporter, please refer to `juju debug-log`"
logger.error(msg)
self.model.unit.status = BlockedStatus(msg)
return
self._on_update_status(event)

def _on_remove(self, _: EventBase) -> None:
"""Remove everything when charm is being removed."""
logger.info("Start to remove.")
# Remove binary tool
self.hw_tool_helper.remove(self.model.resources)
self.exporter.uninstall()
self._stored.resource_installed = False
success = self.exporter.uninstall()
if not success:
msg = "Failed to uninstall exporter, please refer to `juju debug-log`"
# we probably don't need to set any status here because the charm
# will go away soon, so only logging is enough
logger.warning(msg)
self._stored.exporter_installed = not success
logger.info("Remove complete")

def _on_update_status(self, _: EventBase) -> None:
def _on_update_status(self, _: EventBase) -> None: # noqa: C901
"""Update the charm's status."""
if self._stored.installed is not True and self._stored.blocked_msg != "":
self.model.unit.status = BlockedStatus(self._stored.blocked_msg) # type: ignore
return
if not self.model.get_relation("cos-agent"):
if not self._stored.resource_installed: # type: ignore[truthy-function]
# The charm should be in BlockedStatus with install failed msg
return # type: ignore[unreachable]

if not self.exporter_enabled:
self.model.unit.status = BlockedStatus("Missing relation: [cos-agent]")
return
if not self.exporter.check_health():
self.model.unit.status = BlockedStatus("Exporter is unhealthy")

if self.too_many_cos_agent_relation:
self.model.unit.status = BlockedStatus("Cannot relate to more than one grafana-agent")
return
if not self.exporter.check_active():
self.model.unit.status = BlockedStatus("Exporter is not running")

config_valied, confg_valid_message = self.validate_exporter_configs()
if not config_valied:
self.model.unit.status = BlockedStatus(confg_valid_message)
return

hw_tool_ok, error_msg = self.hw_tool_helper.check_installed()
if not hw_tool_ok:
self.model.unit.status = BlockedStatus(error_msg)
return

if not self.exporter.check_health():
logger.warning("Exporter health check - failed.")
restart_ok, restart_status = self.restart_exporter()
if not restart_ok and restart_status is not None:
self.model.unit.status = restart_status
return

self.model.unit.status = ActiveStatus("Unit is ready")

def restart_exporter(self) -> Tuple[bool, Optional[StatusBase]]:
    """Restart the exporter service, retrying a bounded number of times.

    Each attempt restarts the service, sleeps for
    ``EXPORTER_HEALTH_RETRY_TIMEOUT`` and then asks the exporter whether it is
    active. At most ``EXPORTER_HEALTH_RETRY_COUNT`` attempts are made.

    Returns:
        ``(True, None)`` when the exporter reports active after the attempts;
        ``(False, ErrorStatus(...))`` when it does not, or when any exception
        is raised while restarting.
    """
    # NOTE(review): the ops documentation describes ErrorStatus as read-only
    # for charms - confirm that the caller assigning it to unit.status is
    # intended.
    crash_notice = "Exporter crashed unexpectedly, please refer to systemd logs..."
    try:
        for attempt in range(1, EXPORTER_HEALTH_RETRY_COUNT + 1):
            logger.warning("Restarting exporter - %d retry", attempt)
            self.exporter.restart()
            sleep(EXPORTER_HEALTH_RETRY_TIMEOUT)
            if self.exporter.check_active():
                logger.info("Exporter restarted.")
                break
        # Probe once more after the loop: it may have run out of attempts.
        exporter_is_down = not self.exporter.check_active()
        if exporter_is_down:
            logger.error("Failed to restart the exporter.")
            return False, ErrorStatus(crash_notice)
    except Exception as err:  # pylint: disable=W0718
        logger.error("Exporter crashed unexpectedly: %s", err)
        return False, ErrorStatus(crash_notice)
    else:
        return True, None

def _on_config_changed(self, event: EventBase) -> None:
"""Reconfigure charm."""
# Keep track of what model config options + some extra config related
Expand All @@ -106,51 +167,73 @@ def _on_config_changed(self, event: EventBase) -> None:
change_set = set()
model_config: Dict[str, Optional[str]] = dict(self.model.config.items())
for key, value in model_config.items():
if key not in self._stored.config or self._stored.config[key] != value: # type: ignore
if (
key not in self._stored.config # type: ignore[operator]
or self._stored.config[key] != value # type: ignore[index]
):
logger.info("Setting %s to: %s", key, value)
self._stored.config[key] = value # type: ignore
change_set.add(key)

if not self._stored.installed: # type: ignore
logging.info( # type: ignore
if not self._stored.resource_installed: # type: ignore[truthy-function]
logging.info( # type: ignore[unreachable]
"Config changed called before install complete, deferring event: %s",
event.handle,
)
event.defer()
return

exporter_configs = {
"exporter-port",
"exporter-log-level",
"redfish-host",
"redfish-username",
"redfish-password",
}
if exporter_configs.intersection(change_set):
logger.info("Detected changes in exporter config.")
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")

redfish_creds = self._get_redfish_creds()
success = self.exporter.template.render_config(
port=port, level=level, redfish_creds=redfish_creds
)
# First condition prevent the exporter from starting at when the
# charm just installed; the second condition tries to recover the
# exporter from failed status.
if success and self.exporter.check_active() or not self.exporter.check_health():
if self.exporter_enabled:
success, message = self.validate_exporter_configs()
if not success:
self.model.unit.status = BlockedStatus(message)
return

exporter_configs = {
"exporter-port",
"exporter-log-level",
"redfish-host",
"redfish-username",
"redfish-password",
}
if exporter_configs.intersection(change_set):
logger.info("Detected changes in exporter config.")
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")

redfish_creds = self._get_redfish_creds()
success = self.exporter.template.render_config(
port=port, level=level, redfish_creds=redfish_creds
)
if not success:
message = (
"Failed to configure exporter, please check if the server is healthy."
)
self.model.unit.status = BlockedStatus(message)
return
self.exporter.restart()

self._on_update_status(event)

def _on_cos_agent_relation_joined(self, event: EventBase) -> None:
    """Start the exporter service once a cos-agent relation has joined.

    The event is deferred while stored state does not yet mark both the
    resources and the exporter as installed; otherwise the exporter is
    started and the unit status is refreshed.
    """
    prerequisites_ready = (
        self._stored.resource_installed  # type: ignore[truthy-function]
        and self._stored.exporter_installed  # type: ignore[truthy-function]
    )
    if not prerequisites_ready:
        logger.info(  # type: ignore[unreachable]
            "Defer cos-agent relation join because exporter or resources is not ready yet."
        )
        event.defer()
        return

    self.exporter.start()
    logger.info("Start exporter service")
    self._on_update_status(event)

def _on_cos_agent_relation_departed(self, event: EventBase) -> None:
"""Remove the exporter when relation departed."""
self.exporter.stop()
if self._stored.exporter_installed: # type: ignore[truthy-function]
self.exporter.stop()
logger.info("Stop exporter service")
self._on_update_status(event)

def _get_redfish_creds(self) -> Dict[str, str]:
Expand All @@ -168,6 +251,39 @@ def _get_redfish_creds(self) -> Dict[str, str]:
redfish_creds = {}
return redfish_creds

def validate_exporter_configs(self) -> Tuple[bool, str]:
    """Validate the static and runtime config options for the exporter.

    Checks that ``exporter-port`` is an integer in [1, 65535] and that
    ``exporter-log-level`` is one of DEBUG, INFO, WARNING, ERROR or CRITICAL
    (case-insensitive).

    Returns:
        ``(True, message)`` when both options are acceptable, otherwise
        ``(False, message)`` where the message names the offending option.
    """
    raw_port = self.model.config.get("exporter-port", "10000")
    try:
        port = int(raw_port)
    except (TypeError, ValueError):
        # A value that is not an integer is a config problem to report back,
        # not a reason to raise out of the calling hook.
        logger.error("Invalid exporter-port: %r is not an integer.", raw_port)
        return False, "Invalid config: 'exporter-port'"
    if not 1 <= port <= 65535:
        logger.error("Invalid exporter-port: port must be in [1, 65535].")
        return False, "Invalid config: 'exporter-port'"

    # Fall back to "INFO", the same default used when the exporter config is
    # rendered, so an unset option is not reported as invalid.
    level = self.model.config.get("exporter-log-level", "INFO")
    allowed_choices = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
    if str(level).upper() not in allowed_choices:
        logger.error(
            "Invalid exporter-log-level: level must be in %s (case-insensitive).",
            allowed_choices,
        )
        return False, "Invalid config: 'exporter-log-level'"

    return True, "Exporter config is valid."

def get_num_cos_agent_relations(self, relation_name: str) -> int:
    """Count the relations the model holds under ``relation_name``.

    The lookup falls back to an empty list, so a name with no entry yields 0.
    """
    return len(self.model.relations.get(relation_name, []))

@property
def exporter_enabled(self) -> bool:
    """Tell whether at least one cos-agent relation was counted."""
    relation_count = self.num_cos_agent_relations
    return relation_count != 0

@property
def too_many_cos_agent_relation(self) -> bool:
    """Tell whether more than one cos-agent relation was counted."""
    relation_count = self.num_cos_agent_relations
    return relation_count > 1


# Script entry point: pass the charm class to ops.main when run directly.
if __name__ == "__main__": # pragma: nocover
ops.main(HardwareObserverCharm) # type: ignore
11 changes: 9 additions & 2 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
EXPORTER_SERVICE_PATH = Path(f"/etc/systemd/system/{EXPORTER_NAME}.service")
EXPORTER_CONFIG_TEMPLATE = f"{EXPORTER_NAME}-config.yaml.j2"
EXPORTER_SERVICE_TEMPLATE = f"{EXPORTER_NAME}.service.j2"
EXPORTER_HEALTH_RETRY_COUNT = 3
EXPORTER_HEALTH_RETRY_TIMEOUT = 3


# Redfish
REDFISH_TIMEOUT = 3
Expand Down Expand Up @@ -38,7 +41,9 @@ class HWTool(str, Enum):
SAS2IRCU = "sas2ircu"
SAS3IRCU = "sas3ircu"
PERCCLI = "perccli"
IPMI = "ipmi"
IPMI_DCMI = "ipmi_dcmi"
IPMI_SEL = "ipmi_sel"
IPMI_SENSOR = "ipmi_sensor"
REDFISH = "redfish"


Expand All @@ -55,7 +60,9 @@ class HWTool(str, Enum):
HWTool.SAS2IRCU: ["collector.lsi_sas_2"],
HWTool.SAS3IRCU: ["collector.lsi_sas_3"],
HWTool.SSACLI: ["collector.hpe_ssa"],
HWTool.IPMI: ["collector.ipmi_dcmi", "collector.ipmi_sel", "collector.ipmi_sensor"],
HWTool.IPMI_DCMI: ["collector.ipmi_dcmi"],
HWTool.IPMI_SEL: ["collector.ipmi_sel"],
HWTool.IPMI_SENSOR: ["collector.ipmi_sensor"],
HWTool.REDFISH: ["collector.redfish"],
}

Expand Down
Loading