From 1e8c5b71d3e77e3439fab26bcb9e4c4050b6e0c3 Mon Sep 17 00:00:00 2001
From: sudeephb
Date: Wed, 13 Mar 2024 21:30:10 +0545
Subject: [PATCH] Setup synthetic metrics and test alerts on COS

---
 tests/integration/export_mock_metrics.py  |  37 ++++++
 tests/integration/mock_data.py            |  46 ++++++++
 tests/integration/requirements.txt        |   4 +-
 tests/integration/test_cos_integration.py | 127 +++++++++++++++++++---
 4 files changed, 196 insertions(+), 18 deletions(-)
 create mode 100644 tests/integration/export_mock_metrics.py
 create mode 100644 tests/integration/mock_data.py

diff --git a/tests/integration/export_mock_metrics.py b/tests/integration/export_mock_metrics.py
new file mode 100644
index 00000000..a2488f85
--- /dev/null
+++ b/tests/integration/export_mock_metrics.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+# This script is meant to run on the hardware-observer unit.
+
+import time
+
+from mock_data import SAMPLE_METRICS
+from prometheus_client import REGISTRY, start_http_server
+from prometheus_client.core import GaugeMetricFamily
+
+PORT = 10200  # Default exporter port
+
+
+class SyntheticCollector:
+    """Collector for creating synthetic (mock) metrics."""
+
+    def collect(self):
+        for sample_metric in SAMPLE_METRICS:
+            metric = GaugeMetricFamily(
+                name=sample_metric["name"],
+                documentation=sample_metric["documentation"],
+                labels=list(sample_metric["labels"].keys()),
+            )
+            metric.add_metric(  # type: ignore[attr-defined]
+                labels=list(sample_metric["labels"].values()), value=sample_metric["value"]
+            )
+            yield metric
+
+
+if __name__ == "__main__":
+    start_http_server(PORT)
+    REGISTRY.register(SyntheticCollector())
+
+    while True:
+        time.sleep(10)  # Keep the server running
diff --git a/tests/integration/mock_data.py b/tests/integration/mock_data.py
new file mode 100644
index 00000000..18e60428
--- /dev/null
+++ b/tests/integration/mock_data.py
@@ -0,0 +1,46 @@
+# Metrics
+SAMPLE_METRICS = [
+    {
+        "name": "ipmi_dcmi_command_success",
+        "documentation": "Indicates if the ipmi dcmi command is successful or not",
+        "labels": {},
+        "value": 0.0,
+    },
+    {
+        "name": "redfish_call_success",
+        "documentation": "Indicates if call to the redfish API succeeded or not",
+        "labels": {},
+        "value": 1.0,
+    },
+    {
+        "name": "ipmi_temperature_celsius",
+        "documentation": "Temperature measure from temperature sensors",
+        "labels": {"name": "testname", "state": "Critical", "unit": "C"},
+        "value": 200,
+    },
+]
+
+
+# Expected alerts based on the metrics above
+SAMPLE_ALERTS = [
+    {
+        "labels": {
+            "alertname": "IPMIDCMICommandFailed",
+            "juju_application": "hardware-observer",
+            "juju_unit": "hardware-observer/0",
+            "severity": "critical",
+        },
+        "state": "firing",
+        "value": 0.0,
+    },
+    {
+        "labels": {
+            "alertname": "IPMITemperatureStateNotOk",
+            "juju_application": "hardware-observer",
+            "juju_unit": "hardware-observer/0",
+            "severity": "critical",
+        },
+        "state": "firing",
+        "value": 200,
+    },
+]
diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt
index 79c84a6b..81a7496e 100644
--- a/tests/integration/requirements.txt
+++ b/tests/integration/requirements.txt
@@ -1,4 +1,6 @@
 jinja2
 juju~=3.3.0 # must be compatible with the juju CLI version installed by CI
 pytest
-pytest-operator
\ No newline at end of file
+pytest-operator
+prometheus-client
+pyinstaller
\ No newline at end of file
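Aside: a quick local sanity check of the SyntheticCollector above is possible without starting the HTTP server, because prometheus_client can render any registry straight to the exposition format. A minimal sketch, not part of the patch, assuming export_mock_metrics.py is importable from the working directory:

    from prometheus_client import CollectorRegistry
    from prometheus_client.exposition import generate_latest

    from export_mock_metrics import SyntheticCollector

    # A fresh registry keeps the default process/platform metrics out of the output.
    registry = CollectorRegistry()
    registry.register(SyntheticCollector())
    # Prints the same text the exporter serves on :10200/metrics, e.g.
    # ipmi_temperature_celsius{name="testname",state="Critical",unit="C"} 200.0
    print(generate_latest(registry).decode())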
diff --git a/tests/integration/test_cos_integration.py b/tests/integration/test_cos_integration.py
index fff35196..ad0bf20e 100644
--- a/tests/integration/test_cos_integration.py
+++ b/tests/integration/test_cos_integration.py
@@ -3,13 +3,15 @@
 # See LICENSE file for licensing details.
 
 import asyncio
+import json
 import logging
 import os
 import subprocess
 from pathlib import Path
 
 import pytest
 from juju.controller import Controller
+from mock_data import SAMPLE_ALERTS
 from pytest_operator.plugin import OpsTest
 from utils import get_or_add_model
 
@@ -22,6 +24,7 @@
 
 
 @pytest.mark.abort_on_fail
+@pytest.mark.skip_if_deployed
 async def test_setup_and_deploy(ops_test: OpsTest, series, channel):
     """Setup models and then deploy Hardware Observer and COS."""
     if LXD_CTL_NAME is None or K8S_CTL_NAME is None:
@@ -29,28 +32,118 @@
     # The current model name is generated by pytest-operator from the test name + random suffix.
     # Use the same model name in both controllers.
-    k8s_mdl_name = lxd_mdl_name = ops_test.model_name
+    k8s_model_name = lxd_model_name = ops_test.model_name
 
     # Assuming a lxd controller is ready and its name is stored in $LXD_CONTROLLER.
     lxd_ctl = Controller()
     await lxd_ctl.connect(LXD_CTL_NAME)
-    lxd_mdl = await get_or_add_model(ops_test, lxd_ctl, lxd_mdl_name)
-    await lxd_mdl.set_config(MODEL_CONFIG)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    await lxd_model.set_config(MODEL_CONFIG)
 
     # Assuming a k8s controller is ready and its name is stored in $K8S_CONTROLLER.
     k8s_ctl = Controller()
     await k8s_ctl.connect(K8S_CTL_NAME)
-    k8s_mdl = await get_or_add_model(ops_test, k8s_ctl, k8s_mdl_name)
-    await k8s_mdl.set_config(MODEL_CONFIG)
+    k8s_model = await get_or_add_model(ops_test, k8s_ctl, k8s_model_name)
+    await k8s_model.set_config(MODEL_CONFIG)
 
-    await _deploy_cos(channel, k8s_mdl)
+    await _deploy_cos(channel, k8s_model)
 
-    await _deploy_hardware_observer(series, channel, lxd_mdl)
+    await _deploy_hardware_observer(series, channel, lxd_model)
 
-    await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl)
+    await _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model)
 
     # This verifies that the cross-controller relation with COS is successful
-    assert lxd_mdl.applications["grafana-agent"].status == "active"
+    assert lxd_model.applications["grafana-agent"].status == "active"
+
+
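For context on the test_alerts test added below: Prometheus's /api/v1/alerts endpoint returns JSON shaped roughly like the Python literal sketched here (abridged, values illustrative, not captured output). Note that "value" arrives as a string such as "2e+02", which is why the _is_same_alert helper below casts both sides to float before comparing:

    # Approximate shape of the /api/v1/alerts response body parsed by test_alerts.
    EXAMPLE_ALERTS_RESPONSE = {
        "status": "success",
        "data": {
            "alerts": [
                {
                    "labels": {
                        "alertname": "IPMITemperatureStateNotOk",
                        "juju_application": "hardware-observer",
                        "juju_unit": "hardware-observer/0",
                        "severity": "critical",
                    },
                    "annotations": {},
                    "state": "firing",
                    "activeAt": "2024-03-13T16:00:00Z",
                    "value": "2e+02",
                },
            ],
        },
    }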
+async def test_alerts(ops_test: OpsTest):
+    """Verify that the required alerts are fired."""
+    await _disable_hardware_exporter(ops_test)
+    await _export_mock_metrics(ops_test)
+
+    # Sometimes alerts take some time to show up after the metrics are exposed on the
+    # host. Sleep asynchronously so the juju websocket connections stay serviced.
+    await asyncio.sleep(300)
+
+    model_name = ops_test.model_name
+    k8s_ctl = Controller()
+    await k8s_ctl.connect(K8S_CTL_NAME)
+    k8s_model = await get_or_add_model(ops_test, k8s_ctl, model_name)
+
+    model_status = await k8s_model.get_status()
+    traefik_ip = model_status["applications"]["traefik"].public_address
+
+    prometheus_alerts_endpoint = f"http://{traefik_ip}/{model_name}-prometheus-0/api/v1/alerts"
+
+    cmd = ["curl", prometheus_alerts_endpoint]
+    try:
+        alerts_response = subprocess.check_output(cmd)
+    except subprocess.CalledProcessError:
+        logger.error("Failed to fetch alerts data from COS")
+        raise
+
+    alerts = json.loads(alerts_response)["data"]["alerts"]
+
+    expected_alerts = SAMPLE_ALERTS
+
+    for expected_alert in expected_alerts:
+        assert any(_is_same_alert(expected_alert, received_alert) for received_alert in alerts)
+
+
+def _is_same_alert(expected_alert, received_alert):
+    """Compare the alert dictionaries based only on the relevant fields."""
+    if expected_alert["state"] != received_alert["state"]:
+        return False
+    if float(expected_alert["value"]) != float(received_alert["value"]):
+        return False
+    for key, value in expected_alert.get("labels", {}).items():
+        if received_alert.get("labels", {}).get(key) != value:
+            return False
+    return True
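The fixed five-minute wait in test_alerts above is conservative. A polling variant such as the hypothetical sketch below, which is not part of the patch and reuses the module's existing asyncio/json/subprocess imports, would pass as soon as the alerts fire and fail faster when they never do:

    # Hypothetical replacement for the fixed `await asyncio.sleep(300)` above.
    async def _wait_for_alerts(endpoint: str, timeout: int = 600, interval: int = 30) -> list:
        """Poll the Prometheus alerts endpoint until alerts appear or we time out."""
        remaining = timeout
        while remaining > 0:
            response = subprocess.check_output(["curl", endpoint])
            alerts = json.loads(response)["data"]["alerts"]
            if alerts:
                return alerts
            await asyncio.sleep(interval)
            remaining -= interval
        raise TimeoutError(f"No alerts at {endpoint} after {timeout}s")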
+
+
+async def _disable_hardware_exporter(ops_test: OpsTest):
+    """Disable the hardware exporter service."""
+    disable_cmd = "sudo systemctl stop hardware-exporter.service"
+    lxd_model_name = ops_test.model_name
+
+    lxd_ctl = Controller()
+    await lxd_ctl.connect(LXD_CTL_NAME)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    hardware_observer = lxd_model.applications.get("hardware-observer")
+    hardware_observer_unit = hardware_observer.units[0]
+
+    disable_action = await hardware_observer_unit.run(disable_cmd)
+    await disable_action.wait()
+
+
+async def _export_mock_metrics(ops_test: OpsTest):
+    """Expose the mock metrics for further testing."""
+    lxd_model_name = ops_test.model_name
+    lxd_ctl = Controller()
+    await lxd_ctl.connect(LXD_CTL_NAME)
+    lxd_model = await get_or_add_model(ops_test, lxd_ctl, lxd_model_name)
+    hardware_observer = lxd_model.applications.get("hardware-observer")
+    hardware_observer_unit = hardware_observer.units[0]
+
+    # Create a standalone executable from `export_mock_metrics.py`
+    bundle_cmd = [
+        "pyinstaller",
+        "--onefile",
+        str(Path(__file__).parent.resolve() / "export_mock_metrics.py"),
+    ]
+    try:
+        subprocess.run(bundle_cmd, check=True)
+    except subprocess.CalledProcessError:
+        logger.error("Failed to bundle export_mock_metrics")
+        raise
+
+    # scp the executable to the hardware-observer unit
+    await hardware_observer_unit.scp_to("./dist/export_mock_metrics", "/home/ubuntu")
+
+    # Start the exporter via an action; we deliberately skip awaiting its result.
+    run_export_mock_metrics_cmd = "/home/ubuntu/export_mock_metrics"
+    await hardware_observer_unit.run(run_export_mock_metrics_cmd)
 
 
 async def _deploy_cos(channel, model):
@@ -91,7 +184,7 @@ async def _deploy_hardware_observer(series, channel, model):
     await model.block_until(lambda: model.applications["hardware-observer"].status == "active")
 
 
-async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl):
+async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_model, lxd_model):
     """Add relations between Grafana Agent and COS."""
     cos_saas_names = ["prometheus-receive-remote-write", "loki-logging", "grafana-dashboards"]
     for saas in cos_saas_names:
@@ -101,23 +194,23 @@ async def _add_cross_controller_relations(k8s_ctl, lxd_ctl, k8s_mdl, lxd_mdl):
             "juju",
             "consume",
             "--model",
-            f"{lxd_ctl.controller_name}:{k8s_mdl.name}",
-            f"{k8s_ctl.controller_name}:admin/{k8s_mdl.name}.{saas}",
+            f"{lxd_ctl.controller_name}:{k8s_model.name}",
+            f"{k8s_ctl.controller_name}:admin/{k8s_model.name}.{saas}",
         ]
         subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-        await lxd_mdl.add_relation("grafana-agent", saas),
+        await lxd_model.add_relation("grafana-agent", saas)
 
     # `idle_period` needs to be greater than the scrape interval to make sure metrics are ingested.
     await asyncio.gather(
         # First, we wait for the critical phase to pass with raise_on_error=False.
         # (In CI, using github runners, we often see unreproducible hook failures.)
-        lxd_mdl.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
-        k8s_mdl.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
+        lxd_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
+        k8s_model.wait_for_idle(timeout=1800, idle_period=180, raise_on_error=False),
     )
     await asyncio.gather(
         # Then we wait for "active", without raise_on_error=False, so the test fails sooner in case
         # there is a persistent error status.
-        lxd_mdl.wait_for_idle(status="active", timeout=7200, idle_period=180),
-        k8s_mdl.wait_for_idle(status="active", timeout=7200, idle_period=180),
+        lxd_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
+        k8s_model.wait_for_idle(status="active", timeout=7200, idle_period=180),
     )
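Nothing in the patch restores the real exporter once test_alerts finishes, so later tests on the same model would still see the mock metrics. A possible cleanup helper, hypothetical and mirroring the structure of _disable_hardware_exporter above:

    async def _restore_hardware_exporter(ops_test: OpsTest):
        """Kill the mock exporter and restart the real hardware-exporter service."""
        lxd_ctl = Controller()
        await lxd_ctl.connect(LXD_CTL_NAME)
        lxd_model = await get_or_add_model(ops_test, lxd_ctl, ops_test.model_name)
        unit = lxd_model.applications.get("hardware-observer").units[0]
        action = await unit.run(
            "sudo pkill -f export_mock_metrics; sudo systemctl start hardware-exporter.service"
        )
        await action.wait()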