Skip to content

Commit

Permalink
Setup synthetic metrics and test alerts on COS
Browse files Browse the repository at this point in the history
  • Loading branch information
sudeephb committed Mar 13, 2024
1 parent 98557fb commit 1e8c5b7
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 18 deletions.
37 changes: 37 additions & 0 deletions tests/integration/export_mock_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.

# This file is supposed to run on the hardware observer unit.

import time

from mock_data import SAMPLE_METRICS
from prometheus_client import REGISTRY, start_http_server
from prometheus_client.core import GaugeMetricFamily

# Default exporter port — the synthetic metrics server binds the same port the
# real hardware-observer exporter uses, so existing scrape configs keep working.
PORT = 10200


class SyntheticCollector:
    """Prometheus collector that serves synthetic (mock) metrics.

    Each entry of SAMPLE_METRICS is exposed as a gauge with a single
    sample, giving the COS integration tests predictable metric values.
    """

    def collect(self):
        """Yield one GaugeMetricFamily per entry in SAMPLE_METRICS."""
        for spec in SAMPLE_METRICS:
            label_names = list(spec["labels"].keys())
            label_values = list(spec["labels"].values())
            gauge = GaugeMetricFamily(
                name=spec["name"],
                documentation=spec["documentation"],
                labels=label_names,
            )
            gauge.add_metric(labels=label_values, value=spec["value"])
            yield gauge


if __name__ == "__main__":
start_http_server(PORT)
REGISTRY.register(SyntheticCollector())

while True:
time.sleep(10) # Keep the server running
46 changes: 46 additions & 0 deletions tests/integration/mock_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Metrics served by export_mock_metrics.py. Each entry describes one gauge:
# its Prometheus name, help text, label set, and the fixed sample value.
SAMPLE_METRICS = [
    {
        "name": "ipmi_dcmi_command_success",
        "documentation": "Indicates if the ipmi dcmi command is successful or not",
        "labels": {},
        # 0.0 = failure; chosen to trigger the IPMIDCMICommandFailed alert.
        "value": 0.0,
    },
    {
        "name": "redfish_call_success",
        "documentation": "Indicates if call to the redfish API succeeded or not",
        "labels": {},
        # 1.0 = success; this metric should NOT fire an alert.
        "value": 1.0,
    },
    {
        "name": "ipmi_temperature_celsius",
        "documentation": "Temperature measure from temperature sensors",
        # "state": "Critical" is what the temperature alert rule keys on.
        "labels": {"name": "testname", "state": "Critical", "unit": "C"},
        "value": 200,
    },
]


# Expected alerts based on above metrics. Used by test_alerts to check the
# Prometheus /api/v1/alerts response; only the listed labels, the state and
# the value are compared (see _is_same_alert in test_cos_integration.py).
SAMPLE_ALERTS = [
    {
        "labels": {
            "alertname": "IPMIDCMICommandFailed",
            "juju_application": "hardware-observer",
            "juju_unit": "hardware-observer/0",
            "severity": "critical",
        },
        "state": "firing",
        # Matches the ipmi_dcmi_command_success sample value of 0.0.
        "value": 0.0,
    },
    {
        "labels": {
            "alertname": "IPMITemperatureStateNotOk",
            "juju_application": "hardware-observer",
            "juju_unit": "hardware-observer/0",
            "severity": "critical",
        },
        "state": "firing",
        # Matches the ipmi_temperature_celsius sample value of 200.
        "value": 200,
    },
]
4 changes: 3 additions & 1 deletion tests/integration/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
jinja2
juju~=3.3.0 # must be compatible with the juju CLI version installed by CI
pytest
pytest-operator
prometheus-client
pyinstaller
130 changes: 113 additions & 17 deletions tests/integration/test_cos_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
# See LICENSE file for licensing details.

import asyncio
import json
import logging
import os
import subprocess
from pathlib import Path

import time

import pytest
from juju.controller import Controller
from mock_data import SAMPLE_ALERTS
from pytest_operator.plugin import OpsTest
from utils import get_or_add_model

Expand All @@ -22,35 +26,127 @@


@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
async def test_setup_and_deploy(ops_test: OpsTest, series, channel):
    """Setup models and then deploy Hardware Observer and COS.

    Removes the diff-residue duplicate statements (the pre-rename
    `*_mdl` lines) so each controller/model is connected and configured
    exactly once.
    """
    if LXD_CTL_NAME is None or K8S_CTL_NAME is None:
        pytest.fail("LXD_CONTROLLER and K8S_CONTROLLER env variables should be provided")

    # The current model name is generated by pytest-operator from the test name + random suffix.
    # Use the same model name in both controllers.
    k8s_model_name = lxd_model_name = ops_test.model_name

    # Assuming a lxd controller is ready and its name is stored in $LXD_CONTROLLER.
    lxd_ctl = Controller()
    await lxd_ctl.connect(LXD_CTL_NAME)
    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
    await lxd_model.set_config(MODEL_CONFIG)

    # Assuming a k8s controller is ready and its name is stored in $K8S_CONTROLLER.
    k8s_ctl = Controller()
    await k8s_ctl.connect(K8S_CTL_NAME)
    k8s_model = await get_or_add_model(ops_test, k8s_ctl, k8s_model_name)
    await k8s_model.set_config(MODEL_CONFIG)

    await _deploy_cos(channel, k8s_model)

    await _deploy_hardware_observer(series, channel, lxd_model)

    await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model)

    # This verifies that the cross-controller relation with COS is successful
    assert lxd_model.applications["grafana-agent"].status == "active"


async def test_alerts(ops_test: OpsTest):
    """Verify that the required alerts are fired.

    Replaces the real hardware exporter with a process serving synthetic
    metrics, waits for Prometheus to evaluate its rules, then queries the
    alerts endpoint (via traefik) and checks every expected alert appears.
    """
    await _disable_hardware_exporter(ops_test)
    await _export_mock_metrics(ops_test)

    # Sometimes alerts take some time to show after the metrics are exposed on the host.
    # Fix: use asyncio.sleep instead of time.sleep — a blocking sleep inside an
    # async test stalls the event loop (and any background libjuju tasks) for
    # the full 5 minutes.
    await asyncio.sleep(300)

    model_name = ops_test.model_name
    k8s_ctl = Controller()
    await k8s_ctl.connect(K8S_CTL_NAME)
    k8s_model = await get_or_add_model(ops_test, k8s_ctl, model_name)

    model_status = await k8s_model.get_status()
    traefik_ip = model_status["applications"]["traefik"].public_address

    prometheus_alerts_endpoint = f"http://{traefik_ip}/{model_name}-prometheus-0/api/v1/alerts"

    cmd = ["curl", prometheus_alerts_endpoint]
    try:
        alerts_response = subprocess.check_output(cmd)
    except subprocess.CalledProcessError:
        logger.error("Failed to fetch alerts data from COS")
        raise

    alerts = json.loads(alerts_response)["data"]["alerts"]

    # Every expected alert must match at least one alert Prometheus reports.
    for expected_alert in SAMPLE_ALERTS:
        assert any(_is_same_alert(expected_alert, received_alert) for received_alert in alerts)


def _is_same_alert(expected_alert, received_alert):
"""Compare the alert dictionaries only based on relevant fields."""
if expected_alert["state"] != received_alert["state"]:
return False
if float(expected_alert["value"]) != float(received_alert["value"]):
return False
for key, value in expected_alert.get("labels").items():
if received_alert.get("labels").get(key) != value:
return False
return True


async def _disable_hardware_exporter(
    ops_test: OpsTest,
):
    """Disable the hardware exporter service.

    Stops the hardware-exporter systemd unit on the first
    hardware-observer unit in the LXD model.
    """
    model_name = ops_test.model_name

    controller = Controller()
    await controller.connect(LXD_CTL_NAME)
    model = await get_or_add_model(ops_test, controller, model_name)

    # Target the first (and only) hardware-observer unit.
    unit = model.applications.get("hardware-observer").units[0]

    action = await unit.run("sudo systemctl stop hardware-exporter.service")
    await action.wait()


async def _export_mock_metrics(ops_test: OpsTest):
    """Expose the mock metrics for further testing.

    Bundles export_mock_metrics.py into a standalone executable with
    pyinstaller, copies it to the hardware-observer unit, and starts it
    there in the background (it serves on the exporter's default port).
    """
    lxd_model_name = ops_test.model_name
    lxd_ctl = Controller()
    await lxd_ctl.connect(LXD_CTL_NAME)
    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
    hardware_observer = lxd_model.applications.get("hardware-observer")
    hardware_observer_unit = hardware_observer.units[0]

    # Create an executable from `export_mock_metrics.py`
    bundle_cmd = [
        "pyinstaller",
        "--onefile",
        str(Path(__file__).parent.resolve() / "export_mock_metrics.py"),
    ]
    try:
        # Fix: check=True is required here. subprocess.run never raises
        # CalledProcessError on a non-zero exit by itself, so without it the
        # except branch below was unreachable and bundling failures were
        # silently ignored.
        subprocess.run(bundle_cmd, check=True)
    except subprocess.CalledProcessError:
        logger.error("Failed to bundle export_mock_metrics")
        raise

    # scp the executable to hardware-observer unit
    await hardware_observer_unit.scp_to("./dist/export_mock_metrics", "/home/ubuntu")

    # Run the executable in the background without waiting.
    run_export_mock_metrics_cmd = "/home/ubuntu/export_mock_metrics"
    await hardware_observer_unit.run(run_export_mock_metrics_cmd)


async def _deploy_cos(channel, model):
Expand Down Expand Up @@ -91,7 +187,7 @@ async def _deploy_hardware_observer(series, channel, model):
await model.block_until(lambda: model.applications["hardware-observer"].status == "active")


async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model):
    """Add relations between Grafana Agent and COS.

    Consumes each COS SAAS offer into the LXD model via the juju CLI, relates
    grafana-agent to it, then waits for both models to settle. Cleans up the
    diff-residue duplicate `*_mdl` lines and drops the stray trailing comma
    that wrapped the awaited add_relation result in a one-element tuple.
    """
    cos_saas_names = ["prometheus-receive-remote-write", "loki-logging", "grafana-dashboards"]
    for saas in cos_saas_names:
        cmd = [
            "juju",
            "consume",
            "--model",
            f"{lxd_ctl.controller_name}:{k8s_model.name}",
            f"{k8s_ctl.controller_name}:admin/{k8s_model.name}.{saas}",
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        await lxd_model.add_relation("grafana-agent", saas)

    # `idle_period` needs to be greater than the scrape interval to make sure metrics ingested.
    await asyncio.gather(
        # First, we wait for the critical phase to pass with raise_on_error=False.
        # (In CI, using github runners, we often see unreproducible hook failures.)
        lxd_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
        k8s_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
    )

    await asyncio.gather(
        # Then we wait for "active", without raise_on_error=False, so the test fails sooner in case
        # there is a persistent error status.
        lxd_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
        k8s_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
    )

0 comments on commit 1e8c5b7

Please sign in to comment.