From 1e8c5b71d3e77e3439fab26bcb9e4c4050b6e0c3 Mon Sep 17 00:00:00 2001
From: sudeephb
Date: Wed, 13 Mar 2024 21:30:10 +0545
Subject: [PATCH] Setup synthetic metrics and test alerts on COS

---
 tests/integration/export_mock_metrics.py  |  37 ++++++
 tests/integration/mock_data.py            |  46 ++++++++
 tests/integration/requirements.txt        |   4 +-
 tests/integration/test_cos_integration.py | 127 +++++++++++++++++++---
 4 files changed, 196 insertions(+), 18 deletions(-)
 create mode 100644 tests/integration/export_mock_metrics.py
 create mode 100644 tests/integration/mock_data.py

diff --git a/tests/integration/export_mock_metrics.py b/tests/integration/export_mock_metrics.py
new file mode 100644
index 00000000..a2488f85
--- /dev/null
+++ b/tests/integration/export_mock_metrics.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+# This script is meant to run on the hardware-observer unit.
+
+import time
+
+from mock_data import SAMPLE_METRICS
+from prometheus_client import REGISTRY, start_http_server
+from prometheus_client.core import GaugeMetricFamily
+
+PORT = 10200  # Default exporter port
+
+
+class SyntheticCollector:
+    """Collector for creating synthetic (mock) metrics."""
+
+    def collect(self):
+        for sample_metric in SAMPLE_METRICS:
+            metric = GaugeMetricFamily(
+                name=sample_metric["name"],
+                documentation=sample_metric["documentation"],
+                labels=list(sample_metric["labels"].keys()),
+            )
+            metric.add_metric(  # type: ignore[attr-defined]
+                labels=list(sample_metric["labels"].values()), value=sample_metric["value"]
+            )
+            yield metric
+
+
+if __name__ == "__main__":
+    start_http_server(PORT)
+    REGISTRY.register(SyntheticCollector())
+
+    while True:
+        time.sleep(10)  # Keep the server running
diff --git a/tests/integration/mock_data.py b/tests/integration/mock_data.py
new file mode 100644
index 00000000..18e60428
--- /dev/null
+++ b/tests/integration/mock_data.py
@@ -0,0 +1,46 @@
+# Metrics
+SAMPLE_METRICS = [
+    {
+        "name": "ipmi_dcmi_command_success",
+        "documentation": "Indicates if the ipmi dcmi command is successful or not",
+        "labels": {},
+        "value": 0.0,
+    },
+    {
+        "name": "redfish_call_success",
+        "documentation": "Indicates if call to the redfish API succeeded or not",
+        "labels": {},
+        "value": 1.0,
+    },
+    {
+        "name": "ipmi_temperature_celsius",
+        "documentation": "Temperature measure from temperature sensors",
+        "labels": {"name": "testname", "state": "Critical", "unit": "C"},
+        "value": 200,
+    },
+]
+
+
+# Expected alerts based on the metrics above
+SAMPLE_ALERTS = [
+    {
+        "labels": {
+            "alertname": "IPMIDCMICommandFailed",
+            "juju_application": "hardware-observer",
+            "juju_unit": "hardware-observer/0",
+            "severity": "critical",
+        },
+        "state": "firing",
+        "value": 0.0,
+    },
+    {
+        "labels": {
+            "alertname": "IPMITemperatureStateNotOk",
+            "juju_application": "hardware-observer",
+            "juju_unit": "hardware-observer/0",
+            "severity": "critical",
+        },
+        "state": "firing",
+        "value": 200,
+    },
+]
diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt
index 79c84a6b..81a7496e 100644
--- a/tests/integration/requirements.txt
+++ b/tests/integration/requirements.txt
@@ -1,4 +1,6 @@
 jinja2
 juju~=3.3.0 # must be compatible with the juju CLI version installed by CI
 pytest
-pytest-operator
\ No newline at end of file
+pytest-operator
+prometheus-client
+pyinstaller
\ No newline at end of file
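Aside: a quick local sanity check of the SyntheticCollector above is possible without starting the HTTP server, because prometheus_client can render any registry straight to the exposition format. A minimal sketch, not part of the patch, assuming export_mock_metrics.py is importable from the working directory:

    from prometheus_client import CollectorRegistry
    from prometheus_client.exposition import generate_latest

    from export_mock_metrics import SyntheticCollector

    # A fresh registry keeps the default process/platform metrics out of the output.
    registry = CollectorRegistry()
    registry.register(SyntheticCollector())
    # Prints the same text the exporter serves on :10200/metrics, e.g.
    # ipmi_temperature_celsius{name="testname",state="Critical",unit="C"} 200.0
    print(generate_latest(registry).decode())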
diff --git a/tests/integration/test_cos_integration.py b/tests/integration/test_cos_integration.py
index fff35196..ad0bf20e 100644
--- a/tests/integration/test_cos_integration.py
+++ b/tests/integration/test_cos_integration.py
@@ -3,13 +3,15 @@
 # See LICENSE file for licensing details.
 
 import asyncio
+import json
 import logging
 import os
 import subprocess
 from pathlib import Path
 
 import pytest
 from juju.controller import Controller
+from mock_data import SAMPLE_ALERTS
 from pytest_operator.plugin import OpsTest
 from utils import get_or_add_model
 
@@ -22,6 +24,7 @@
 
 
 @pytest.mark.abort_on_fail
+@pytest.mark.skip_if_deployed
 async def test_setup_and_deploy(ops_test: OpsTest, series, channel):
     """Setup models and then deploy Hardware Observer and COS."""
     if LXD_CTL_NAME is None or K8S_CTL_NAME is None:
@@ -29,28 +32,118 @@
     # The current model name is generated by pytest-operator from the test name + random suffix.
     # Use the same model name in both controllers.
-    k8s_mdl_name = lxd_mdl_name = ops_test.model_name
+    k8s_model_name = lxd_model_name = ops_test.model_name
 
     # Assuming a lxd controller is ready and its name is stored in $LXD_CONTROLLER.
     lxd_ctl = Controller()
     await lxd_ctl.connect(LXD_CTL_NAME)
-    lxd_mdl = await get_or_add_model(ops_test, lxd_ctl, lxd_mdl_name)
-    await lxd_mdl.set_config(MODEL_CONFIG)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    await lxd_model.set_config(MODEL_CONFIG)
 
     # Assuming a k8s controller is ready and its name is stored in $K8S_CONTROLLER.
     k8s_ctl = Controller()
     await k8s_ctl.connect(K8S_CTL_NAME)
-    k8s_mdl = await get_or_add_model(ops_test, k8s_ctl, k8s_mdl_name)
-    await k8s_mdl.set_config(MODEL_CONFIG)
+    k8s_model = await get_or_add_model(ops_test, k8s_ctl, k8s_model_name)
+    await k8s_model.set_config(MODEL_CONFIG)
 
-    await _deploy_cos(channel, k8s_mdl)
+    await _deploy_cos(channel, k8s_model)
 
-    await _deploy_hardware_observer(series, channel, lxd_mdl)
+    await _deploy_hardware_observer(series, channel, lxd_model)
 
-    await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl)
+    await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model)
 
     # This verifies that the cross-controller relation with COS is successful
-    assert lxd_mdl.applications["grafana-agent"].status == "active"
+    assert lxd_model.applications["grafana-agent"].status == "active"
+
+
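For context on the test_alerts test added below: Prometheus's /api/v1/alerts endpoint returns JSON shaped roughly like the Python literal sketched here (abridged, values illustrative, not captured output). Note that "value" arrives as a string such as "2e+02", which is why the _is_same_alert helper below casts both sides to float before comparing:

    # Approximate shape of the /api/v1/alerts response body parsed by test_alerts.
    EXAMPLE_ALERTS_RESPONSE = {
        "status": "success",
        "data": {
            "alerts": [
                {
                    "labels": {
                        "alertname": "IPMITemperatureStateNotOk",
                        "juju_application": "hardware-observer",
                        "juju_unit": "hardware-observer/0",
                        "severity": "critical",
                    },
                    "annotations": {},
                    "state": "firing",
                    "activeAt": "2024-03-13T16:00:00Z",
                    "value": "2e+02",
                },
            ],
        },
    }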
+async def test_alerts(ops_test: OpsTest):
+    """Verify that the required alerts are fired."""
+    await _disable_hardware_exporter(ops_test)
+    await _export_mock_metrics(ops_test)
+
+    # Sometimes alerts take some time to show up after the metrics are exposed on the
+    # host. Sleep asynchronously so the juju websocket connections stay serviced.
+    await asyncio.sleep(300)
+
+    model_name = ops_test.model_name
+    k8s_ctl = Controller()
+    await k8s_ctl.connect(K8S_CTL_NAME)
+    k8s_model = await get_or_add_model(ops_test, k8s_ctl, model_name)
+
+    model_status = await k8s_model.get_status()
+    traefik_ip = model_status["applications"]["traefik"].public_address
+
+    prometheus_alerts_endpoint = f"http://{traefik_ip}/{model_name}-prometheus-0/api/v1/alerts"
+
+    cmd = ["curl", prometheus_alerts_endpoint]
+    try:
+        alerts_response = subprocess.check_output(cmd)
+    except subprocess.CalledProcessError:
+        logger.error("Failed to fetch alerts data from COS")
+        raise
+
+    alerts = json.loads(alerts_response)["data"]["alerts"]
+
+    expected_alerts = SAMPLE_ALERTS
+
+    for expected_alert in expected_alerts:
+        assert any(_is_same_alert(expected_alert, received_alert) for received_alert in alerts)
+
+
+def _is_same_alert(expected_alert, received_alert):
+    """Compare the alert dictionaries based only on the relevant fields."""
+    if expected_alert["state"] != received_alert["state"]:
+        return False
+    if float(expected_alert["value"]) != float(received_alert["value"]):
+        return False
+    for key, value in expected_alert.get("labels", {}).items():
+        if received_alert.get("labels", {}).get(key) != value:
+            return False
+    return True
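The fixed five-minute wait in test_alerts above is conservative. A polling variant such as the hypothetical sketch below, which is not part of the patch and reuses the module's existing asyncio/json/subprocess imports, would pass as soon as the alerts fire and fail faster when they never do:

    # Hypothetical replacement for the fixed `await asyncio.sleep(300)` above.
    async def _wait_for_alerts(endpoint: str, timeout: int = 600, interval: int = 30) -> list:
        """Poll the Prometheus alerts endpoint until alerts appear or we time out."""
        remaining = timeout
        while remaining > 0:
            response = subprocess.check_output(["curl", endpoint])
            alerts = json.loads(response)["data"]["alerts"]
            if alerts:
                return alerts
            await asyncio.sleep(interval)
            remaining -= interval
        raise TimeoutError(f"No alerts at {endpoint} after {timeout}s")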
+
+
+async def _disable_hardware_exporter(ops_test: OpsTest):
+    """Disable the hardware exporter service."""
+    disable_cmd = "sudo systemctl stop hardware-exporter.service"
+    lxd_model_name = ops_test.model_name
+
+    lxd_ctl = Controller()
+    await lxd_ctl.connect(LXD_CTL_NAME)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    hardware_observer = lxd_model.applications.get("hardware-observer")
+    hardware_observer_unit = hardware_observer.units[0]
+
+    disable_action = await hardware_observer_unit.run(disable_cmd)
+    await disable_action.wait()
+
+
+async def _export_mock_metrics(ops_test: OpsTest):
+    """Expose the mock metrics for further testing."""
+    lxd_model_name = ops_test.model_name
+    lxd_ctl = Controller()
+    await lxd_ctl.connect(LXD_CTL_NAME)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    hardware_observer = lxd_model.applications.get("hardware-observer")
+    hardware_observer_unit = hardware_observer.units[0]
+
+    # Create a standalone executable from `export_mock_metrics.py`
+    bundle_cmd = [
+        "pyinstaller",
+        "--onefile",
+        str(Path(__file__).parent.resolve() / "export_mock_metrics.py"),
+    ]
+    try:
+        subprocess.run(bundle_cmd, check=True)
+    except subprocess.CalledProcessError:
+        logger.error("Failed to bundle export_mock_metrics")
+        raise
+
+    # scp the executable to the hardware-observer unit
+    await hardware_observer_unit.scp_to("./dist/export_mock_metrics", "/home/ubuntu")
+
+    # Start the exporter via an action; we deliberately skip awaiting its result.
+    run_export_mock_metrics_cmd = "/home/ubuntu/export_mock_metrics"
+    await hardware_observer_unit.run(run_export_mock_metrics_cmd)
 
 
 async def _deploy_cos(channel, model):
@@ -91,7 +184,7 @@ async def _deploy_hardware_observer(series, channel, model):
     await model.block_until(lambda: model.applications["hardware-observer"].status == "active")
 
 
-async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl):
+async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model):
     """Add relations between Grafana Agent and COS."""
     cos_saas_names = ["prometheus-receive-remote-write", "loki-logging", "grafana-dashboards"]
     for saas in cos_saas_names:
@@ -101,23 +194,23 @@ async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl):
             "juju",
             "consume",
             "--model",
-            f"{lxd_ctl.controller_name}:{k8s_mdl.name}",
-            f"{k8s_ctl.controller_name}:admin/{k8s_mdl.name}.{saas}",
+            f"{lxd_ctl.controller_name}:{k8s_model.name}",
+            f"{k8s_ctl.controller_name}:admin/{k8s_model.name}.{saas}",
         ]
         subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-        await lxd_mdl.add_relation("grafana-agent", saas),
+        await lxd_model.add_relation("grafana-agent", saas)
 
     # `idle_period` needs to be greater than the scrape interval to make sure metrics are ingested.
     await asyncio.gather(
         # First, we wait for the critical phase to pass with raise_on_error=False.
         # (In CI, using github runners, we often see unreproducible hook failures.)
-        lxd_mdl.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
-        k8s_mdl.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
+        lxd_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
+        k8s_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
     )
     await asyncio.gather(
         # Then we wait for "active", without raise_on_error=False, so the test fails sooner in case
         # there is a persistent error status.
-        lxd_mdl.wait_for_idle(status="active", timeout=7200, idle_period=180),
-        k8s_mdl.wait_for_idle(status="active", timeout=7200, idle_period=180),
+        lxd_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
+        k8s_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
     )
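Nothing in the patch restores the real exporter once test_alerts finishes, so later tests on the same model would still see the mock metrics. A possible cleanup helper, hypothetical and mirroring the structure of _disable_hardware_exporter above:

    async def _restore_hardware_exporter(ops_test: OpsTest):
        """Kill the mock exporter and restart the real hardware-exporter service."""
        lxd_ctl = Controller()
        await lxd_ctl.connect(LXD_CTL_NAME)
        lxd_model = await get_or_add_model(ops_test, lxd_ctl, ops_test.model_name)
        unit = lxd_model.applications.get("hardware-observer").units[0]
        action = await unit.run(
            "sudo pkill -f export_mock_metrics; sudo systemctl start hardware-exporter.service"
        )
        await action.wait()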