Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Snap Strategy #306

Closed
wants to merge 10 commits into from
12 changes: 12 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ options:
description: |
Start the prometheus smartctl exporter at "smartctl-exporter-port". By default,
it will start at port 10201.
dcgm-exporter-port:
type: int
default: 9400
description: |
Start the prometheus dcgm exporter at "dcgm-exporter-port". By default,
it will start at port 9400.
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
dcgm-snap-channel:
type: string
default: "latest/stable"
description: |
Channel to install the DCGM snap if the hardware has NVIDIA GPU. By default, it will install
from latest/stable
exporter-log-level:
type: string
default: "INFO"
Expand Down
8 changes: 6 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus

from hw_tools import HWTool, HWToolHelper, detect_available_tools
from service import BaseExporter, ExporterError, HardwareExporter, SmartCtlExporter
from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter

logger = logging.getLogger(__name__)

Expand All @@ -26,7 +26,7 @@ class HardwareObserverCharm(ops.CharmBase):
def __init__(self, *args: Any) -> None:
"""Init."""
super().__init__(*args)
self.hw_tool_helper = HWToolHelper()
self.hw_tool_helper = HWToolHelper(self.model.config)

# Add refresh_events to COSAgentProvider to update relation data when
# config changed (default behavior) and upgrade charm. This is useful
Expand All @@ -37,6 +37,7 @@ def __init__(self, *args: Any) -> None:
metrics_endpoints=[
{"path": "/metrics", "port": int(self.model.config["hardware-exporter-port"])},
{"path": "/metrics", "port": int(self.model.config["smartctl-exporter-port"])},
{"path": "/metrics", "port": int(self.model.config["dcgm-exporter-port"])},
],
# Setting scrape_timeout as collect_timeout in the `duration` format specified in
# https://prometheus.io/docs/prometheus/latest/configuration/configuration/#duration
Expand Down Expand Up @@ -82,6 +83,9 @@ def exporters(self) -> List[BaseExporter]:
if stored_tools & SmartCtlExporter.hw_tools():
exporters.append(SmartCtlExporter(self.charm_dir, self.model.config))

# hw_tools() returns a set, so test for overlap with the stored tools; a plain `in`
# would look for the set itself as an element of stored_tools and never match.
if stored_tools & DCGMExporter.hw_tools():
    exporters.append(DCGMExporter(self.charm_dir, self.model.config))

return exporters

def get_stored_tools(self) -> Set[HWTool]:
Expand Down
16 changes: 13 additions & 3 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,21 @@ class ExporterSettings(BaseModel): # pylint: disable = too-few-public-methods

health_retry_count: int = 3
health_retry_timeout: int = 3
service_template: str
service_template: t.Optional[str] = None
service_path: Path
name: str
config_template: str
config_path: Path
config_template: t.Optional[str] = None
config_path: t.Optional[Path] = None


class DCGMExporterSettings(ExporterSettings): # pylint: disable = too-few-public-methods
    """Constant settings for DCGM Exporter.

    Only ``name`` and ``service_path`` are overridden here; every other field
    keeps the default declared on ExporterSettings.
    """

    # Exporter name; also interpolated into service_path below.
    name: str = "snap.dcgm.dcgm-exporter"
    # Built from ``name`` while the class body is evaluated.
    service_path: Path = Path(f"/etc/systemd/system/{name}.service")
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved


DCGM_EXPORTER_SETTINGS = DCGMExporterSettings()


class HardwareExporterSettings(ExporterSettings): # pylint: disable = too-few-public-methods
Expand Down
50 changes: 47 additions & 3 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
import requests
import urllib3
from charms.operator_libs_linux.v0 import apt
from ops.model import ModelError, Resources
from charms.operator_libs_linux.v2 import snap
from ops.model import ConfigData, ModelError, Resources

import apt_helpers
from checksum import (
Expand Down Expand Up @@ -175,6 +176,43 @@ def remove(self) -> None:
# the remove option.


class SnapStrategy(StrategyABC):
Copy link
Contributor

@jneo8 jneo8 Sep 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion:

I kind of prefer code like this:

  • No init function
  • hard code HWTool
  • Pass channel on install function
class SnapStrategyABC(StrategyABC, metaclass=ABCMeta):
    """Snap strategy class."""

    def install(self, channel: str = "latest/stable") -> None:
        """Install the snap from a channel."""
        # define install details here.
        ...
        
class SmartCtlStrategy(SnapStrategyABC):
    _name = HWTool.SMARTCTL_EXPORTER
                     
class HWToolHelper:
    """Helper to install vendor's or hardware related tools."""

   def strategies(self) -> List[StrategyABC]:
        """Define strategies for every tools."""
        return [
            StorCLIStrategy(),
            RedFishStrategy(),
            ...
            # It's more easy to define strategy here
            SmartCtlStrategy(),
        ]
    # The channel information should only be given here. This make sure the object dependency is one-direction.
    def install(self, resources: Resources, channel: (some-type), hw_available: Set[HWTool]) -> Tuple[bool, str]:
        """Install tools."""
        
        ...
           
        for strategy in self.strategies:
            if strategy.name not in hw_available:
                continue
            try:
                if isinstance(strategy, TPRStrategyABC):
                    path = fetch_tools.get(strategy.name)  # pylint: disable=W0212
                    if path:
                        strategy.install(path)
                if isinstance(strategy, SnapStrategy):
                    # The channel should be an argument for install function
                    strategy.install(channel)
                elif isinstance(strategy, APTStrategyABC):
                    strategy.install()  # pylint: disable=E1120
                logger.info("Strategy %s install success", strategy)
            except (
                ResourceFileSizeZeroError,
                OSError,
                apt.PackageError,
                ResourceChecksumError,
                snap.SnapError,
            ) as e:
                logger.warning("Strategy %s install fail: %s", strategy, e)
                fail_strategies.append(strategy.name)

        if fail_strategies:
            return False, f"Fail strategies: {fail_strategies}"
        return True, ""

So the (dis)advantages are:

  • The strategies stay hard-coded, which makes them easier to trace while keeping flexibility. (Imagine needing different operations for DCGM and Smart.)
  • Chance to use snap resource in the future
  • The only breaking change happens in the HWToolHelper.install function; the rest of the design is not broken.
    • Get the channel information in the Charm class object and pass it to the install function. This makes sure the object dependency is one-directional.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Different snaps will have different channels to install from. I don't see how passing a single channel on HWToolHelper can solve this. I think it would be very strange to have one charm config for channel to rule on all snaps channels.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while I agree that we should remove the constructor for consistency with the other strategies, I don't quite get why it would be preferable to move complexity to the install function, as that requires the caller to have deeper knowledge about how each strategy works and adds extra conditionals to HWToolHelper.install

Copy link
Contributor

@jneo8 jneo8 Sep 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a protocol to pass all the required material (resources, configuration, and hardware detection results) to the strategies.
So the install function is the protocol right now. If we break this rule, the ISP (interface segregation principle) is broken, and you then have to find a tricky/hacky way to get that material into the strategies.

(I am open to any way to refactor the protocol we have now, but it's a refactor so it shouldn't happen on this PR)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Different snaps will have different channels to install from. I don't see how passing a single channel on ...

It's pseudocode, so pass whatever configuration you want.

"""Snap strategy class."""

def __init__(self, tool: HWTool, channel: str):
    """Bind the strategy to one snap and the channel it is installed from.

    Args:
        tool: Hardware tool whose enum value is used as the snap name.
        channel: Snap channel handed to ``snap.add`` when installing.
    """
    snap_name = tool.value
    self._name = tool
    self.channel = channel
    self.snap_name = snap_name
    # Cache entry for this snap; the service helpers below read it.
    self.snap_client = snap.SnapCache()[snap_name]

def install(self) -> None:
"""Install the snap."""
try:
snap.add(self.snap_name, channel=self.channel)
except snap.SnapError as err:
logger.error(
"Failed to install %s on channel: %s: %s", self.snap_name, self.channel, err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit

Suggested change
"Failed to install %s on channel: %s: %s", self.snap_name, self.channel, err
"Failed to install %s from channel: %s: %s", self.snap_name, self.channel, err

)
else:
logger.info("Installed %s on channel: %s", self.snap_name, self.channel)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
logger.info("Installed %s on channel: %s", self.snap_name, self.channel)
logger.info("Installed %s from channel: %s", self.snap_name, self.channel)

# some services might be disabled by default. E.g: dcgm-exporter
self.enable_services()

def remove(self) -> None:
    """Uninstall the snap managed by this strategy."""
    targets = [self.snap_name]
    snap.remove(targets)

def enable_services(self) -> None:
"""Enable the snap services."""
# breakpoint()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I imagine this is a leftover from manual tests

self.snap_client.start(list(self.snap_client.services.keys()), enable=True)

def check(self) -> bool:
    """Report whether every service of the snap is active.

    A service entry without an "active" key counts as inactive.
    """
    for service in self.snap_client.services.values():
        if not service.get("active", False):
            return False
    return True


class TPRStrategyABC(StrategyABC, metaclass=ABCMeta):
"""Third party resource strategy class."""

Expand Down Expand Up @@ -615,6 +653,10 @@ def detect_available_tools() -> Set[HWTool]:
class HWToolHelper:
"""Helper to install vendor's or hardware related tools."""

def __init__(self, config: ConfigData) -> None:
    """Keep a reference to the charm config.

    Args:
        config: Charm configuration; the ``strategies`` property reads
            "dcgm-snap-channel" from it.
    """
    self.config = config

@property
def strategies(self) -> List[StrategyABC]:
"""Define strategies for every tools."""
Expand All @@ -629,6 +671,7 @@ def strategies(self) -> List[StrategyABC]:
IPMISENSORStrategy(),
RedFishStrategy(),
SmartCtlStrategy(),
SnapStrategy(HWTool.DCGM, self.config["dcgm-snap-channel"]),
]

def fetch_tools( # pylint: disable=W0102
Expand Down Expand Up @@ -695,14 +738,15 @@ def install(self, resources: Resources, hw_available: Set[HWTool]) -> Tuple[bool
if path:
strategy.install(path)
# APTStrategy or SnapStrategy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: needs updating

elif isinstance(strategy, APTStrategyABC):
elif isinstance(strategy, (APTStrategyABC, SnapStrategy)):
chanchiwai-ray marked this conversation as resolved.
Show resolved Hide resolved
strategy.install() # pylint: disable=E1120
logger.info("Strategy %s install success", strategy)
except (
ResourceFileSizeZeroError,
OSError,
apt.PackageError,
ResourceChecksumError,
snap.SnapError,
) as e:
logger.warning("Strategy %s install fail: %s", strategy, e)
fail_strategies.append(strategy.name)
Expand All @@ -717,7 +761,7 @@ def remove(self, resources: Resources, hw_available: Set[HWTool]) -> None:
for strategy in self.strategies:
if strategy.name not in hw_available:
continue
if isinstance(strategy, (TPRStrategyABC, APTStrategyABC)):
if isinstance(strategy, (TPRStrategyABC, APTStrategyABC, SnapStrategy)):
strategy.remove()
logger.info("Strategy %s remove success", strategy)

Expand Down
57 changes: 44 additions & 13 deletions src/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@
from redfish.rest.v1 import InvalidCredentialsError

from config import (
DCGM_EXPORTER_SETTINGS,
HARDWARE_EXPORTER_COLLECTOR_MAPPING,
HARDWARE_EXPORTER_SETTINGS,
SMARTCTL_EXPORTER_SETTINGS,
ExporterSettings,
HWTool,
)
from hardware import get_bmc_address
from hw_tools import SmartCtlExporterStrategy
from hw_tools import SmartCtlExporterStrategy, SnapStrategy

logger = getLogger(__name__)

Expand All @@ -48,7 +49,11 @@ def __init__(self, charm_dir: Path, config: ConfigData, settings: ExporterSettin

self.settings = settings
self.environment = Environment(loader=FileSystemLoader(charm_dir / "templates"))
self.service_template = self.environment.get_template(self.settings.service_template)
self.service_template = (
self.environment.get_template(self.settings.service_template)
if self.settings.service_template
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved
else None
)
self.exporter_service_path = self.settings.service_path
self.exporter_name = self.settings.name

Expand Down Expand Up @@ -132,6 +137,8 @@ def check_health(self) -> bool:

def _render_service(self, params: Dict[str, str]) -> bool:
"""Render and install exporter service file."""
if self.service_template is None:
return True
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
content = self.service_template.render(**params)
return write_to_file(self.exporter_service_path, content)

Expand Down Expand Up @@ -324,6 +331,22 @@ def remove_resources(self) -> bool:
return True


class DCGMExporter(BaseExporter):
    """A class representing the DCGM exporter and the metric endpoints."""

    def __init__(self, charm_dir: Path, config: ConfigData) -> None:
        """Initialize the DCGM Exporter class.

        Args:
            charm_dir: Charm directory, forwarded to BaseExporter.
            config: Charm config providing "dcgm-exporter-port" and
                "dcgm-snap-channel".
        """
        super().__init__(charm_dir, config, DCGM_EXPORTER_SETTINGS)

        self.port = int(config["dcgm-exporter-port"])
        self.strategy = SnapStrategy(HWTool.DCGM, config["dcgm-snap-channel"])

    @staticmethod
    def hw_tools() -> Set[HWTool]:
        """Return hardware tools to watch."""
        return {HWTool.DCGM}

gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved

class HardwareExporter(BaseExporter):
"""A class representing the hardware exporter and the metric endpoints."""

Expand All @@ -333,7 +356,11 @@ def __init__(self, charm_dir: Path, config: ConfigData, available_tools: Set[HWT
"""Initialize the Hardware Exporter class."""
super().__init__(charm_dir, config, HARDWARE_EXPORTER_SETTINGS)

self.config_template = self.environment.get_template(self.settings.config_template)
self.config_template = (
self.environment.get_template(self.settings.config_template)
if self.settings.config_template
else None
)
aieri marked this conversation as resolved.
Show resolved Hide resolved
self.exporter_config_path = self.settings.config_path
self.port = int(config["hardware-exporter-port"])
self.config = config
Expand All @@ -348,16 +375,20 @@ def _render_config_content(self) -> str:
collector = HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(tool)
if collector is not None:
collectors.add(collector)
content = self.config_template.render(
PORT=self.port,
LEVEL=self.log_level,
COLLECT_TIMEOUT=self.collect_timeout,
COLLECTORS=collectors,
REDFISH_ENABLE=HWTool.REDFISH in self.enabled_tools,
REDFISH_HOST=self.redfish_conn_params.get("host", ""),
REDFISH_USERNAME=self.redfish_conn_params.get("username", ""),
REDFISH_PASSWORD=self.redfish_conn_params.get("password", ""),
REDFISH_CLIENT_TIMEOUT=self.redfish_conn_params.get("timeout", ""),
content = (
self.config_template.render(
PORT=self.port,
LEVEL=self.log_level,
COLLECT_TIMEOUT=self.collect_timeout,
COLLECTORS=collectors,
REDFISH_ENABLE=HWTool.REDFISH in self.enabled_tools,
REDFISH_HOST=self.redfish_conn_params.get("host", ""),
REDFISH_USERNAME=self.redfish_conn_params.get("username", ""),
REDFISH_PASSWORD=self.redfish_conn_params.get("password", ""),
REDFISH_CLIENT_TIMEOUT=self.redfish_conn_params.get("timeout", ""),
)
if self.config_template
else ""
aieri marked this conversation as resolved.
Show resolved Hide resolved
)
return content

Expand Down
20 changes: 15 additions & 5 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import charm
from charm import ExporterError, HardwareObserverCharm
from config import HWTool
from service import HardwareExporter
from service import DCGMExporter, HardwareExporter


class TestCharm(unittest.TestCase):
Expand Down Expand Up @@ -55,15 +55,18 @@ def test_harness(self) -> None:
@parameterized.expand(
[
(
"Enable two exporters",
{HWTool.IPMI_SEL, HWTool.SMARTCTL},
{"hardware-exporter", "smartctl-exporter"},
"Enable three exporters",
{HWTool.IPMI_SEL, HWTool.SMARTCTL, HWTool.DCGM},
{"hardware-exporter", "smartctl-exporter", "dcgm"},
)
]
)
@mock.patch("charm.SmartCtlExporter.__init__", return_value=None)
@mock.patch("charm.HardwareExporter.__init__", return_value=None)
def test_exporters(self, _, stored_tools, expect, mock_hw_exporter, mock_smart_exporter):
@mock.patch("charm.DCGMExporter.__init__", return_value=None)
def test_exporters(
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
self, _, stored_tools, expect, mock_dcgm_exporter, mock_hw_exporter, mock_smart_exporter
):
self.harness.begin()
self.harness.charm.get_stored_tools = mock.MagicMock()
self.harness.charm.get_stored_tools.return_value = stored_tools
Expand All @@ -90,6 +93,13 @@ def test_exporters(self, _, stored_tools, expect, mock_hw_exporter, mock_smart_e
self.harness.charm.model.config,
)

# The parameterized case names this exporter "dcgm", so key the check on that value;
# otherwise the assertions below are silently skipped.
if "dcgm" in expect:
    self.assertTrue(any(isinstance(exporter, DCGMExporter) for exporter in exporters))
    mock_dcgm_exporter.assert_called_with(
        self.harness.charm.charm_dir,
        self.harness.charm.model.config,
    )

@parameterized.expand(
[
(
Expand Down
Loading
Loading