Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SnapExporter class and DCGM #319

Merged
merged 18 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ options:
description: |
Start the prometheus smartctl exporter at "smartctl-exporter-port". By default,
it will start at port 10201.
dcgm-snap-channel:
type: string
default: "latest/stable"
description: |
Channel to install the DCGM snap from if the hardware has an NVIDIA GPU. By default, it will
install from latest/stable.
exporter-log-level:
type: string
default: "INFO"
Expand Down
8 changes: 6 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus

from hw_tools import HWTool, HWToolHelper, detect_available_tools
from service import BaseExporter, ExporterError, HardwareExporter, SmartCtlExporter
from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter

logger = logging.getLogger(__name__)

Expand All @@ -37,6 +37,7 @@ def __init__(self, *args: Any) -> None:
metrics_endpoints=[
{"path": "/metrics", "port": int(self.model.config["hardware-exporter-port"])},
{"path": "/metrics", "port": int(self.model.config["smartctl-exporter-port"])},
{"path": "/metrics", "port": 9400},
],
# Setting scrape_timeout as collect_timeout in the `duration` format specified in
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration
Expand Down Expand Up @@ -82,6 +83,9 @@ def exporters(self) -> List[BaseExporter]:
if stored_tools & SmartCtlExporter.hw_tools():
exporters.append(SmartCtlExporter(self.charm_dir, self.model.config))

if stored_tools & DCGMExporter.hw_tools():
exporters.append(DCGMExporter(self.model.config))

return exporters

def get_stored_tools(self) -> Set[HWTool]:
Expand Down Expand Up @@ -226,7 +230,7 @@ def _on_config_changed(self, event: EventBase) -> None:
self.model.unit.status = BlockedStatus(message)
return
for exporter in self.exporters:
success = exporter.render_config()
success = exporter.configure()
if success:
exporter.restart()
else:
Expand Down
59 changes: 55 additions & 4 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import requests
import urllib3
from charms.operator_libs_linux.v0 import apt
from charms.operator_libs_linux.v2 import snap
from ops.model import ModelError, Resources

import apt_helpers
Expand Down Expand Up @@ -187,6 +188,57 @@ def remove(self) -> None:
"""Remove details."""


class SnapStrategy(StrategyABC):
    """Strategy that installs, removes and health-checks a tool shipped as a snap.

    The snap name is read from ``self._name.value`` and the channel from ``self.channel``;
    both are expected to be supplied by the concrete subclass.
    """

    channel: str

    @property
    def snap(self) -> str:
        """Snap name."""
        return self._name.value

    def install(self) -> None:
        """Install the snap from the configured channel.

        Raises:
            Exception: any error raised by ``snap.add`` is logged and re-raised unchanged.
        """
        try:
            snap.add(self.snap, channel=self.channel)
            logger.info("Installed %s from channel: %s", self.snap, self.channel)

        # Catching snap.SnapError here results in:
        # TypeError: catching classes that do not inherit from BaseException is not allowed
        except Exception as err:  # pylint: disable=broad-except
            logger.error("Failed to install %s from channel: %s: %s", self.snap, self.channel, err)
            # Bare ``raise`` re-raises the active exception as-is.
            raise

    def remove(self) -> None:
        """Remove the snap.

        Raises:
            Exception: any error raised by ``snap.remove`` is logged and re-raised unchanged.
        """
        try:
            snap.remove([self.snap])

        # Catching snap.SnapError here results in:
        # TypeError: catching classes that do not inherit from BaseException is not allowed
        except Exception as err:  # pylint: disable=broad-except
            logger.error("Failed to remove %s: %s", self.snap, err)
            # Bare ``raise`` re-raises the active exception as-is.
            raise

    def check(self) -> bool:
        """Check that the snap is installed and all of its services are active.

        Returns:
            True only when the snap is present and every service it exposes reports
            ``active``; False otherwise.
        """
        snap_info = snap.SnapCache()[self.snap]

        # ``all()`` is vacuously True for an empty mapping, so a snap that is known to the
        # cache but not installed (and therefore exposes no services) must be rejected
        # explicitly instead of being reported as healthy.
        if not snap_info.present:
            return False

        return all(service.get("active", False) for service in snap_info.services.values())
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved


class DCGMExporterStrategy(SnapStrategy):
    """Snap strategy bound to the DCGM hardware tool."""

    # Tool identifier for this strategy.
    _name = HWTool.DCGM

    def __init__(self, channel: str) -> None:
        """Store the snap channel to use for this strategy.

        Args:
            channel: snap channel, kept on the instance as ``self.channel``.
        """
        self.channel = channel


class StorCLIStrategy(TPRStrategyABC):
"""Strategy to install storcli."""

Expand Down Expand Up @@ -689,13 +741,12 @@ def install(self, resources: Resources, hw_available: Set[HWTool]) -> Tuple[bool
if strategy.name not in hw_available:
continue
try:
# TPRStrategy
if isinstance(strategy, TPRStrategyABC):
path = fetch_tools.get(strategy.name) # pylint: disable=W0212
if path:
strategy.install(path)
# APTStrategy
elif isinstance(strategy, APTStrategyABC):

elif isinstance(strategy, (APTStrategyABC, SnapStrategy)):
strategy.install() # pylint: disable=E1120
logger.info("Strategy %s install success", strategy)
except (
Expand All @@ -717,7 +768,7 @@ def remove(self, resources: Resources, hw_available: Set[HWTool]) -> None:
for strategy in self.strategies:
if strategy.name not in hw_available:
continue
if isinstance(strategy, (TPRStrategyABC, APTStrategyABC)):
if isinstance(strategy, (TPRStrategyABC, APTStrategyABC, SnapStrategy)):
strategy.remove()
logger.info("Strategy %s remove success", strategy)

Expand Down
Loading
Loading