Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance charm's life-cycle #118

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ options:
default: 10000
description: |
Start the prometheus exporter at "exporter-port". By default, it will
start at port 10000.
start at port 10000. Allowed values are between 1 and 65535 inclusively.
exporter-log-level:
type: string
default: "INFO"
Expand Down
213 changes: 107 additions & 106 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@
from typing import Any, Dict, Optional

import ops
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from ops.framework import EventBase, StoredState
from ops.model import ActiveStatus, BlockedStatus
from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, MaintenanceStatus

import exporter
from config import HWTool
from hardware import get_bmc_address
from hw_tools import HWToolHelper, bmc_hw_verifier
from service import Exporter
from hw_tools import HWToolHelper

logger = logging.getLogger(__name__)

Expand All @@ -29,144 +28,146 @@ def __init__(self, *args: Any) -> None:
"""Init."""
super().__init__(*args)
self.hw_tool_helper = HWToolHelper()

self.cos_agent_provider = COSAgentProvider(
self,
metrics_endpoints=[
{"path": "/metrics", "port": int(self.model.config["exporter-port"])}
],
)
self.exporter = Exporter(self.charm_dir)
self.exporter_observer = exporter.Observer(self)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.exporter_observer sounds not precise. Just self.exporter?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I want to think about what we're "observing" here. exporter.Observer feels like I'm initializing a class that observes the exporter but it looks more like the class is observing the cos-agent relation?


self.framework.observe(self.on.config_changed, self._on_config_changed)
self.framework.observe(self.on.install, self._on_install_or_upgrade)
self.framework.observe(self.on.remove, self._on_remove)
self.framework.observe(self.on.update_status, self._on_update_status)
self.framework.observe(self.on.upgrade_charm, self._on_install_or_upgrade)
self.framework.observe(
self.on.cos_agent_relation_joined, self._on_cos_agent_relation_joined
)
self.framework.observe(
self.on.cos_agent_relation_departed, self._on_cos_agent_relation_departed
)

self._stored.set_default(installed=False, config={}, blocked_msg="")

def _on_install_or_upgrade(self, event: ops.InstallEvent) -> None:
"""Install and upgrade."""
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")

redfish_creds = self._get_redfish_creds()

self.exporter.install(port, level, redfish_creds)
installed, msg = self.hw_tool_helper.install(self.model.resources)
self._stored.installed = installed
self._stored.set_default(
agent_installed=False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this might sound better like exporter_installed and we interface with one single module (with multiple classes) for all exporter functionality.

So it could be exporter.Exporter, exporter.ExporterTemplate, exporter.Observer

resource_install_status={},
config={},
blocked_msg="",
)

if not installed:
logger.info(msg)
def _on_install_or_upgrade(self, _: ops.InstallEvent) -> None:
"""Install or upgrade charm."""
self.model.unit.status = MaintenanceStatus("Installing resources...")
resource_white_list = self.hw_tool_helper.get_resource_white_list(self.model.resources)
resource_install_status = self.hw_tool_helper.install(
resource_white_list,
resource_black_list=self._stored.resource_install_status, # type: ignore[arg-type]
)
self._stored.resource_install_status = resource_install_status
Copy link
Contributor

@jneo8 jneo8 Nov 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

status make my confuse with MaintenanceStatus

One is boolean, another is juju's unit status.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe result might be a better choice -- resource_install_result

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem, I personally will never think of status with *Status (in libjuju) 😆

if not all(resource_install_status.values()):
failed_resources = [r for r, s in resource_install_status.items() if not s]
msg = f"Failed to install resources: {', '.join(failed_resources)}"
logger.error(msg)
self._stored.blocked_msg = msg
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blocked_msg seems not useful anymore.

self._on_update_status(event)
self.model.unit.status = ErrorStatus(msg)
return

self._stored.installed = True
self._stored.blocked_msg = ""
self.model.unit.status = ActiveStatus("Install complete")
logger.info("Install complete")
if self._stored.agent_installed is not True:
self.model.unit.status = MaintenanceStatus("Installing exporter...")
success = self.exporter_observer.install_agent()
self._stored.agent_installed = success
if not success:
msg = "Failed to installed exporter, please refer to `juju debug-log`"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: "Failed to install exporter, please refer the logs using juju debug-log for more information"

logger.error(msg)
self._stored.blocked_msg = msg
self.model.unit.status = ErrorStatus(msg)
return

self.update_status()

def _on_remove(self, _: EventBase) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happen if remove fail?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, there's not remove fail logic in the code

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it just get into error state?

"""Remove everything when charm is being removed."""
logger.info("Start to remove.")
# Remove binary tool
self.exporter_observer.uninstall_agent()
self.hw_tool_helper.remove(self.model.resources)
self.exporter.uninstall()
self._stored.agent_installed = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having manual config here. Maybe a functional to set default value?

self._stored.resource_install_status = {}
logger.info("Remove complete")

def _on_update_status(self, _: EventBase) -> None:
"""Update the charm's status."""
if self._stored.installed is not True and self._stored.blocked_msg != "":
self.model.unit.status = BlockedStatus(self._stored.blocked_msg) # type: ignore
return
if not self.model.get_relation("cos-agent"):
self.update_status()

def _on_config_changed(self, event: EventBase) -> None:
"""Reconfigure charm."""
change_set = self.update_config_store()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might look better like this:

change_set = self.config_store_changed 

Even though the method updates the config store, I feel like the change_set feature is more important and we could name it accordingly.


if self.exporter_observer.agent_enabled:
options = self.get_exporter_configs()
success, message = self.exporter_observer.validate_agent_configs(options)
if not success:
self.model.unit.status = BlockedStatus(message)
event.defer()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the config is not valid. User need to re-config it, right?
Then why we need this event.defer()

return

success = self.exporter_observer.configure_agent(options, change_set)
if not success:
self.model.unit.status = BlockedStatus(
"Failed to configure exporter, please check if the server is healthy..."
)
event.defer()
return

self.update_status()

def update_status(self) -> None:
"""Update the charm's status."""
if not self.exporter_observer.agent_enabled:
self.model.unit.status = BlockedStatus("Missing relation: [cos-agent]")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're having a new constant EXPORTER_RELATION_NAME in src/config.py, maybe we could use that here as well?

return
if not self.exporter.check_health():
self.model.unit.status = BlockedStatus("Exporter is unhealthy")

if self.exporter_observer.too_many_relations:
self.model.unit.status = BlockedStatus("Cannot relate to more than one grafan-agent")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: "Cannot relate to more than one grafana-agent application"

return
if not self.exporter.check_active():
self.model.unit.status = BlockedStatus("Exporter is not running")

if self.exporter_observer.agent_enabled and not self.exporter_observer.agent_online:
self.model.unit.status = ErrorStatus(
"Exporter crashes unexpectedly, please refer to systemd logs..."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: "crashes" --> "crashed"
"please refer to systemd logs" --> "please refer to the systemd logs for the exporter service on the unit"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think keeping it short maybe better since you can't display a long text in juju status

)
return

self._stored.blocked_msg = ""
self.model.unit.status = ActiveStatus("Unit is ready")

def _on_config_changed(self, event: EventBase) -> None:
"""Reconfigure charm."""
# Keep track of what model config options + some extra config related
# information are changed. This can be helpful when we want to respond
# to the change of a specific config option.
def update_config_store(self) -> set:
"""Update the config store, and return a set of config options that are changed."""
change_set = set()
model_config: Dict[str, Optional[str]] = dict(self.model.config.items())
for key, value in model_config.items():
if key not in self._stored.config or self._stored.config[key] != value: # type: ignore
logger.info("Setting %s to: %s", key, value)
self._stored.config[key] = value # type: ignore
change_set.add(key)
return change_set

def get_redfish_options(self) -> Dict[str, Any]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to call this get_redfish_configs similar to get_exporter_configs ? I wouldn't mind if both are using options as well, but it would be nice if both are consistent :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's consistent in a way that redfish_options = get_redfish_options(), but not in get_redfish_configs and get_exporter_configs. 😢

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, maybe we could have redfish_configs, get_redfish_configs along with get_exporter_configs? What do you think?

"""Get redfish config options."""
redfish_options = {
"enable": False,
"host": "",
"username": self.model.config.get("redfish-username", ""),
"password": self.model.config.get("redfish-password", ""),
}

if not self._stored.installed: # type: ignore
logging.info( # type: ignore
"Config changed called before install complete, deferring event: %s",
event.handle,
)
event.defer()
return
bmc_address = get_bmc_address()
if bmc_address:
redfish_options["enable"] = True
redfish_options["host"] = f"https://{bmc_address}"

exporter_configs = {
"exporter-port",
"exporter-log-level",
"redfish-host",
"redfish-username",
"redfish-password",
return redfish_options

def get_exporter_configs(self) -> Dict[str, Any]:
"""Get the exporter related config options."""
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")
collectors = self.hw_tool_helper.hw_collector_white_list
redfish_options = {"enable": False}
if HWTool.REDFISH in collectors:
redfish_options = self.get_redfish_options()
return {
"port": port,
"level": level,
"collectors": collectors,
"redfish_options": redfish_options,
}
if exporter_configs.intersection(change_set):
logger.info("Detected changes in exporter config.")
port = self.model.config.get("exporter-port", "10000")
level = self.model.config.get("exporter-log-level", "INFO")

redfish_creds = self._get_redfish_creds()
success = self.exporter.template.render_config(
port=port, level=level, redfish_creds=redfish_creds
)
# First condition prevent the exporter from starting at when the
# charm just installed; the second condition tries to recover the
# exporter from failed status.
if success and self.exporter.check_active() or not self.exporter.check_health():
self.exporter.restart()

self._on_update_status(event)

def _on_cos_agent_relation_joined(self, event: EventBase) -> None:
"""Start the exporter when relation joined."""
self.exporter.start()
self._on_update_status(event)

def _on_cos_agent_relation_departed(self, event: EventBase) -> None:
"""Remove the exporter when relation departed."""
self.exporter.stop()
self._on_update_status(event)

def _get_redfish_creds(self) -> Dict[str, str]:
"""Provide redfish config if redfish is available, else empty dict."""
bmc_tools = bmc_hw_verifier()
if HWTool.REDFISH in bmc_tools:
bmc_address = get_bmc_address()
redfish_creds = {
# Force to use https as default protocol
"host": f"https://{bmc_address}",
"username": self.model.config.get("redfish-username", ""),
"password": self.model.config.get("redfish-password", ""),
}
else:
redfish_creds = {}
return redfish_creds


if __name__ == "__main__": # pragma: nocover
Expand Down
34 changes: 2 additions & 32 deletions src/checksum.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,16 @@
# limitations under the License.
#
# For further info, check https://github.com/canonical/charmcraft
"""Checksum definition, check functions and related utils."""
import hashlib
"""Checksum definition."""
import logging
import typing as t
from dataclasses import dataclass, field
from pathlib import Path

from os_platform import Architecture, UbuntuSeries, get_os_platform
from os_platform import Architecture, UbuntuSeries

logger = logging.getLogger(__name__)


class ResourceChecksumError(Exception):
"""Raise if checksum does not match."""


@dataclass
class ToolVersionInfo:
"""Tool version information for checksum comparison."""
Expand Down Expand Up @@ -209,27 +203,3 @@ class ToolVersionInfo:
sha256_checksum="458d51b030468901fc8a207088070e6ce82db34b181d9190c8f849605f1b9b6d",
),
]


def validate_checksum(support_version_infos: t.List[ToolVersionInfo], path: Path) -> bool:
"""Validate checksum of resource file by checking with supported versions.

Returns True if resource is supported by the charm, architecture, and
checksum validation is successful.
"""
os_platform = get_os_platform()

supported_checksums = []
for info in support_version_infos:
if os_platform.machine in info.supported_architectures and (
info.support_all_series or os_platform.series in info.supported_series
):
supported_checksums.append(info.sha256_checksum)

with open(path, "rb") as f:
sha256_hash = hashlib.sha256(f.read()).hexdigest()

if sha256_hash in supported_checksums:
return True
logger.warning("Checksum validation fail, path: %s hash: %s", path, sha256_hash)
return False
1 change: 1 addition & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
EXPORTER_SERVICE_PATH = Path(f"/etc/systemd/system/{EXPORTER_NAME}.service")
EXPORTER_CONFIG_TEMPLATE = f"{EXPORTER_NAME}-config.yaml.j2"
EXPORTER_SERVICE_TEMPLATE = f"{EXPORTER_NAME}.service.j2"
EXPORTER_RELATION_NAME = "cos-agent"

# Redfish
REDFISH_TIMEOUT = 3
Expand Down
Loading