From fd3366a5da11a9dd14ae875834b9d28b1e0ef1fc Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 21:15:40 -0500
Subject: [PATCH 01/13] Add test to check if smartctl-exporter snap is being installed

---
 tests/functional/test_charm.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index b3d4bb0d..ead3b921 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -262,6 +262,19 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
 
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
+
+    async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
+        """Test if smartctl exporter snap is installed and running on the unit."""
+        snap_name = "smartctl-exporter"
+        cmd = f"snap list {snap_name}"
+        results = await run_command_on_unit(ops_test, unit.name, cmd)
+        assert results.get("return-code") == 0
+        assert snap_name in results.get("stdout").strip()
+
+        check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
+        results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
+        assert results.get("return-code") == 0
+        assert results.get("stdout").strip() == "active"
 
     async def test_metrics_available(self, app, unit, ops_test):
         """Test if metrics are available at the expected endpoint on unit."""

From a3b156cc31563b6a7b0d6e66669d795c3e7475cc Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 21:17:32 -0500
Subject: [PATCH 02/13] Reformat

---
 tests/functional/test_charm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index ead3b921..c818438b 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -262,7 +262,7 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
 
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
-
+
     async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         """Test if smartctl exporter snap is installed and running on the unit."""
         snap_name = "smartctl-exporter"
@@ -270,7 +270,7 @@ async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         results = await run_command_on_unit(ops_test, unit.name, cmd)
         assert results.get("return-code") == 0
         assert snap_name in results.get("stdout").strip()
-
+
         check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
         results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"

From a953228e51975833dacb3d99c9f43b8d00ca1424 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 23:04:53 -0500
Subject: [PATCH 03/13] Remove realhw marks

---
 tests/functional/test_charm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index c818438b..ae6d0be4 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -162,7 +162,6 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
         assert unit.workload_status_message == AppStatus.MISSING_RELATION
 
 
-@pytest.mark.realhw
 @pytest.mark.abort_on_fail
 async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
     """Test adding relation with grafana-agent."""
@@ -225,7 +224,6 @@ async def test_redfish_credential_validation(ops_test: 
OpsTest, provided_collect assert unit.workload_status_message == AppStatus.READY -@pytest.mark.realhw class TestCharmWithHW: """Run functional tests that require specific hardware.""" From dfac9d129595615ee2053ad748026016db16f72d Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 18:34:17 -0500 Subject: [PATCH 04/13] Test bug fixes --- tests/functional/conftest.py | 8 ++++---- tests/functional/test_charm.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 219964ef..4c93e1c6 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -100,7 +100,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.STORCLI), file_name="storcli.deb", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI).replace( "collector.", "" ), bin_name=HWTool.STORCLI.value, @@ -108,7 +108,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.PERCCLI), file_name="perccli.deb", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI).replace( "collector.", "" ), bin_name=HWTool.PERCCLI.value, @@ -116,7 +116,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.SAS2IRCU), file_name="sas2ircu", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU).replace( "collector.", "" ), bin_name=HWTool.SAS2IRCU.value, @@ -124,7 +124,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.SAS3IRCU), file_name="sas3ircu", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU).replace( "collector.", "" ), bin_name=HWTool.SAS3IRCU.value, diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index ae6d0be4..ad102727 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -469,8 +469,10 @@ async def test_resource_in_correct_location(self, ops_test, unit, required_resou results = await run_command_on_unit(ops_test, unit.name, check_resource_cmd) assert results.get("return-code") == 0, f"{symlink_bin} resource doesn't exist" - async def test_redfish_config(self, app, unit, ops_test): + async def test_redfish_config(self, ops_test, app, unit, provided_collectors): """Test Redfish options.""" + if "redfish" not in provided_collectors: + pytest.skip("redfish not in provided collectors, skipping test") # initially Redfish is available and enabled cmd = "cat /etc/hardware-exporter-config.yaml" results_before = await run_command_on_unit(ops_test, unit.name, cmd) @@ -584,7 +586,6 @@ async def test_resource_clean_up(self, ops_test, app, unit, required_resources): ) -@pytest.mark.realhw class TestCharm: """Perform tests that require one or more exporters to be present.""" From 62d3ea3ddc9392b98db2f282b325f01525150b91 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 19:09:45 -0500 Subject: [PATCH 05/13] Remove extra build step --- tests/functional/conftest.py | 30 ++++++++++++++++++++++++++++-- tests/functional/test_charm.py | 10 +++------- tox.ini | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git 
a/tests/functional/conftest.py b/tests/functional/conftest.py
index bc56aae5..098d6ea4 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -1,5 +1,7 @@
 import logging
+import os
 import platform
+from pathlib import Path
 
 import pytest
 from utils import RESOURCES_DIR, Resource
@@ -8,13 +10,19 @@
 
 log = logging.getLogger(__name__)
 
+BASES = {
+    "ubuntu@20.04": "focal",
+    "ubuntu@22.04": "jammy",
+    "ubuntu@24.04": "noble",
+}
+
 
 def pytest_addoption(parser):
     parser.addoption(
         "--base",
         type=str.lower,
-        default="ubuntu@22.04",
-        choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"],
+        default="ubuntu@20.04",
+        choices=BASES.keys(),
         help="Set base for the applications.",
     )
 
@@ -146,3 +154,21 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l
             required_resources.append(resource)
 
     return required_resources
+
+
+@pytest.fixture()
+def charm_path(base: str) -> Path:
+    """Fixture to determine the charm path based on the base."""
+    env_charm_path = f"CHARM_PATH_{BASES[base].upper()}"
+    path = os.getenv(env_charm_path)
+
+    if not path:
+        raise EnvironmentError(
+            f"Environment variable '{env_charm_path}' is not set for base '{base}'."
+        )
+    if not Path(path).exists():
+        raise FileNotFoundError(
+            f"The path specified in '{env_charm_path}' ({path}) does not exist."
+        )
+
+    return Path(path)
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 837665cd..aed738f1 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -56,17 +56,13 @@ class AppStatus(str, Enum):
 @pytest.mark.abort_on_fail
 @pytest.mark.skip_if_deployed
 async def test_build_and_deploy(  # noqa: C901, function is too complex
-    ops_test: OpsTest, base, architecture, provided_collectors, required_resources
+    ops_test: OpsTest, base, architecture, provided_collectors, required_resources, charm_path
 ):
-    """Build the charm-under-test and deploy it together with related charms.
+    """Deploy the charm together with related charms.
 
     Assert on the unit status before any relations/configurations take place.
     Optionally attach required resources when testing with real hardware.
     """
-    # Build and deploy charm from local source folder
-    charm = await ops_test.build_charm(".")
-    assert charm, "Charm was not built successfully."
-
     # This is required for the subordinate application to choose the right revision
     # on different architecture. 
# See issue: https://bugs.launchpad.net/juju/+bug/2067749 @@ -77,7 +73,7 @@ async def test_build_and_deploy( # noqa: C901, function is too complex logger.info("Rendering bundle %s", bundle_template_path) bundle = ops_test.render_bundle( bundle_template_path, - charm=charm, + charm=charm_path, base=base, resources={ "storcli-deb": "empty-resource", diff --git a/tox.ini b/tox.ini index a388a52c..894148dd 100644 --- a/tox.ini +++ b/tox.ini @@ -69,6 +69,7 @@ deps = passenv = REDFISH_USERNAME REDFISH_PASSWORD + CHARM_PATH_* [testenv:integration] description = Run integration tests with COS From 63f560382741420ccd13e96aacb7c04fbe25b392 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 20:02:41 -0500 Subject: [PATCH 06/13] Tests restructure --- tests/functional/test_charm.py | 248 ++++++++++++++++----------------- 1 file changed, 121 insertions(+), 127 deletions(-) diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index aed738f1..907b528a 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -188,9 +188,10 @@ async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors): # Test with cos-agent relation logging.info("Check whether hardware-exporter is active after creating relation.") for unit in ops_test.model.applications[APP_NAME].units: - results = await run_command_on_unit(ops_test, unit.name, check_active_cmd) - assert results.get("return-code") == 0 - assert results.get("stdout").strip() == "active" + if provided_collectors: + results = await run_command_on_unit(ops_test, unit.name, check_active_cmd) + assert results.get("return-code") == 0 + assert results.get("stdout").strip() == "active" if redfish_present: assert unit.workload_status_message == AppStatus.INVALID_REDFISH_CREDS else: @@ -220,9 +221,104 @@ async def test_redfish_credential_validation(ops_test: OpsTest, provided_collect assert unit.workload_status_message == AppStatus.READY +@pytest.mark.realhw class TestCharmWithHW: """Run functional tests that require specific hardware.""" + async def test_config_file_permissions(self, unit, ops_test): + """Check config file permissions are set correctly.""" + expected_file_mode = "600" + cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + assert results.get("stdout").rstrip("\n") == expected_file_mode + + async def test_config_changed_port(self, app, unit, ops_test): + """Test changing the config option: hardware-exporter-port.""" + new_port = "10001" + await asyncio.gather( + app.set_config({"hardware-exporter-port": new_port}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["port"] == int(new_port) + + await app.reset_config(["hardware-exporter-port"]) + + async def test_no_redfish_config(self, unit, ops_test): + """Test that there is no Redfish options because it's not available on lxd machines.""" + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config.get("redfish_host") is None + assert config.get("redfish_username") is None + assert config.get("redfish_client_timeout") is None + + async def 
test_config_changed_log_level(self, app, unit, ops_test): + """Test changing the config option: exporter-log-level.""" + new_log_level = "DEBUG" + await asyncio.gather( + app.set_config({"exporter-log-level": new_log_level}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["level"] == new_log_level + + await app.reset_config(["exporter-log-level"]) + + async def test_config_changed_collect_timeout(self, app, unit, ops_test): + """Test changing the config option: collect-timeout.""" + new_collect_timeout = "20" + await asyncio.gather( + app.set_config({"collect-timeout": new_collect_timeout}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["collect_timeout"] == int(new_collect_timeout) + + await app.reset_config(["collect-timeout"]) + + async def test_start_and_stop_exporter(self, app, unit, ops_test): + """Test starting and stopping the exporter results in correct charm status.""" + # Stop the exporter, and the exporter should auto-restart after update status fire. + stop_cmd = "systemctl stop hardware-exporter" + async with ops_test.fast_forward(): + await asyncio.gather( + unit.run(stop_cmd), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.READY + + async def test_exporter_failed(self, app, unit, ops_test): + """Test failure in the exporter results in correct charm status.""" + # Setting incorrect log level will crash the exporter + async with ops_test.fast_forward(): + await asyncio.gather( + app.set_config({"exporter-log-level": "RANDOM_LEVEL"}), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.INVALID_CONFIG_EXPORTER_LOG_LEVEL + + async with ops_test.fast_forward(): + await asyncio.gather( + app.reset_config(["exporter-log-level"]), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.READY + async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" cmd = "cat /etc/hardware-exporter-config.yaml" @@ -582,131 +678,29 @@ async def test_resource_clean_up(self, ops_test, app, unit, required_resources): ) -class TestCharm: - """Perform tests that require one or more exporters to be present.""" - - async def test_config_file_permissions(self, unit, ops_test): - """Check config file permissions are set correctly.""" - expected_file_mode = "600" - cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - assert results.get("stdout").rstrip("\n") == expected_file_mode - - async def test_config_changed_port(self, app, unit, ops_test): - """Test changing the config option: hardware-exporter-port.""" - new_port = "10001" - await asyncio.gather( - app.set_config({"hardware-exporter-port": new_port}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat 
/etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["port"] == int(new_port) - - await app.reset_config(["hardware-exporter-port"]) - - async def test_no_redfish_config(self, unit, ops_test): - """Test that there is no Redfish options because it's not available on lxd machines.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config.get("redfish_host") is None - assert config.get("redfish_username") is None - assert config.get("redfish_client_timeout") is None - - async def test_config_changed_log_level(self, app, unit, ops_test): - """Test changing the config option: exporter-log-level.""" - new_log_level = "DEBUG" - await asyncio.gather( - app.set_config({"exporter-log-level": new_log_level}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["level"] == new_log_level - - await app.reset_config(["exporter-log-level"]) - - async def test_config_changed_collect_timeout(self, app, unit, ops_test): - """Test changing the config option: collect-timeout.""" - new_collect_timeout = "20" - await asyncio.gather( - app.set_config({"collect-timeout": new_collect_timeout}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["collect_timeout"] == int(new_collect_timeout) - - await app.reset_config(["collect-timeout"]) - - async def test_start_and_stop_exporter(self, app, unit, ops_test): - """Test starting and stopping the exporter results in correct charm status.""" - # Stop the exporter, and the exporter should auto-restart after update status fire. 
- stop_cmd = "systemctl stop hardware-exporter" - async with ops_test.fast_forward(): - await asyncio.gather( - unit.run(stop_cmd), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.READY - - async def test_exporter_failed(self, app, unit, ops_test): - """Test failure in the exporter results in correct charm status.""" - # Setting incorrect log level will crash the exporter - async with ops_test.fast_forward(): - await asyncio.gather( - app.set_config({"exporter-log-level": "RANDOM_LEVEL"}), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.INVALID_CONFIG_EXPORTER_LOG_LEVEL - - async with ops_test.fast_forward(): - await asyncio.gather( - app.reset_config(["exporter-log-level"]), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.READY - - async def test_on_remove_event(self, app, ops_test): - """Test _on_remove event cleans up the service on the host machine.""" - await asyncio.gather( - app.remove_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), - ops_test.model.wait_for_idle( - apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT - ), - ) - principal_unit = ops_test.model.applications[PRINCIPAL_APP_NAME].units[0] +@pytest.mark.abort_on_fail +async def test_on_remove_event(app, ops_test): + """Test _on_remove event cleans up the service on the host machine.""" + await asyncio.gather( + app.remove_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), + ops_test.model.wait_for_idle(apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT), + ) + principal_unit = ops_test.model.applications[PRINCIPAL_APP_NAME].units[0] - # Wait for cleanup activities to finish - await ops_test.model.block_until( - lambda: ops_test.model.applications[APP_NAME].status == "unknown" - ) + # Wait for cleanup activities to finish + await ops_test.model.block_until( + lambda: ops_test.model.applications[APP_NAME].status == "unknown" + ) - cmd = "ls /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, principal_unit.name, cmd) - assert results.get("return-code") > 0 + cmd = "ls /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, principal_unit.name, cmd) + assert results.get("return-code") > 0 - cmd = "ls /etc/systemd/system/hardware-exporter.service" - results = await run_command_on_unit(ops_test, principal_unit.name, cmd) - assert results.get("return-code") > 0 + cmd = "ls /etc/systemd/system/hardware-exporter.service" + results = await run_command_on_unit(ops_test, principal_unit.name, cmd) + assert results.get("return-code") > 0 - await asyncio.gather( - ops_test.model.add_relation( - f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info" - ), - ops_test.model.wait_for_idle( - apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT - ), - ) + await asyncio.gather( + ops_test.model.add_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), + ops_test.model.wait_for_idle(apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT), + ) From 0e5cc37695d84f2f89f9c594e44ccd25bdc01cdd Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 20 Nov 2024 11:42:56 -0500 Subject: [PATCH 07/13] Revert default base choice --- tests/functional/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/functional/conftest.py b/tests/functional/conftest.py index 098d6ea4..73733e9f 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -21,7 +21,7 @@ def pytest_addoption(parser): parser.addoption( "--base", type=str.lower, - default="ubuntu@20.04", + default="ubuntu@22.04", choices=BASES.keys(), help="Set base for the applications.", ) From edd1c0ab7c4693abbe51eab86eede484c57103ac Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 21 Nov 2024 18:09:29 -0500 Subject: [PATCH 08/13] Find built charm by glob --- tests/functional/conftest.py | 26 +++--------- tests/functional/test_charm.py | 78 +++++++++++++++++----------------- tests/functional/utils.py | 16 +++++++ 3 files changed, 63 insertions(+), 57 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 73733e9f..517436f3 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -10,19 +10,13 @@ log = logging.getLogger(__name__) -BASES = { - "ubuntu@20.04": "focal", - "ubuntu@22.04": "jammy", - "ubuntu@24.04": "noble", -} - def pytest_addoption(parser): parser.addoption( "--base", type=str.lower, default="ubuntu@22.04", - choices=BASES.keys(), + choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"], help="Set base for the applications.", ) @@ -157,18 +151,12 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l @pytest.fixture() -def charm_path(base: str) -> Path: +def charm_path(base: str, architecture: str) -> Path: """Fixture to determine the charm path based on the base.""" - env_charm_path = f"CHARM_PATH_{BASES[base].upper()}" - path = os.getenv(env_charm_path) + glob_path = f"hardware-observer_*{base.replace('@', '-')}-{architecture}*.charm" + paths = list(Path(".").glob(glob_path)) - if not path: - raise EnvironmentError( - f"Environment variable '{env_charm_path}' is not set for base '{base}'." - ) - if not Path(path).exists(): - raise FileNotFoundError( - f"The path specified in '{env_charm_path}' ({path}) does not exist." 
- ) + if not paths: + raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.") - return Path(path) + return os.getcwd() / paths[0] diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index 907b528a..1162b883 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -16,8 +16,10 @@ from tenacity import AsyncRetrying, RetryError, stop_after_attempt, wait_fixed from utils import ( RESOURCES_DIR, + HardwareExporterConfigError, MetricsFetchError, assert_metrics, + get_hardware_exporter_config, get_metrics_output, run_command_on_unit, ) @@ -241,20 +243,20 @@ async def test_config_changed_port(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["port"] == int(new_port) await app.reset_config(["hardware-exporter-port"]) async def test_no_redfish_config(self, unit, ops_test): """Test that there is no Redfish options because it's not available on lxd machines.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config.get("redfish_host") is None assert config.get("redfish_username") is None assert config.get("redfish_client_timeout") is None @@ -267,10 +269,10 @@ async def test_config_changed_log_level(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["level"] == new_log_level await app.reset_config(["exporter-log-level"]) @@ -283,10 +285,10 @@ async def test_config_changed_collect_timeout(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["collect_timeout"] == int(new_collect_timeout) await app.reset_config(["collect-timeout"]) @@ -321,10 +323,10 @@ async def test_exporter_failed(self, app, unit, ops_test): async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = 
yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") collectors_in_config = { collector.replace("collector.", "") for collector in config.get("enable_collectors") } @@ -345,10 +347,10 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["redfish_client_timeout"] == int(new_timeout) await app.reset_config(["collect-timeout"]) @@ -566,13 +568,13 @@ async def test_redfish_config(self, ops_test, app, unit, provided_collectors): if "redfish" not in provided_collectors: pytest.skip("redfish not in provided collectors, skipping test") # initially Redfish is available and enabled - cmd = "cat /etc/hardware-exporter-config.yaml" - results_before = await run_command_on_unit(ops_test, unit.name, cmd) - assert results_before.get("return-code") == 0 - config = yaml.safe_load(results_before.get("stdout").strip()) - assert config.get("redfish_host") is not None - assert config.get("redfish_username") is not None - assert config.get("redfish_client_timeout") is not None + try: + config_before = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") + assert config_before.get("redfish_host") is not None + assert config_before.get("redfish_username") is not None + assert config_before.get("redfish_client_timeout") is not None # Disable Redfish and see if the config is not present await asyncio.gather( @@ -580,13 +582,13 @@ async def test_redfish_config(self, ops_test, app, unit, provided_collectors): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results_after = await run_command_on_unit(ops_test, unit.name, cmd) - assert results_before.get("return-code") == 0 - config = yaml.safe_load(results_after.get("stdout").strip()) - assert config.get("redfish_host") is None - assert config.get("redfish_username") is None - assert config.get("redfish_client_timeout") is None + try: + config_after = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") + assert config_after.get("redfish_host") is None + assert config_after.get("redfish_username") is None + assert config_after.get("redfish_client_timeout") is None await app.reset_config(["redfish-disable"]) diff --git a/tests/functional/utils.py b/tests/functional/utils.py index 7c948cf3..a6421d61 100644 --- a/tests/functional/utils.py +++ b/tests/functional/utils.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional +import yaml from async_lru import alru_cache RESOURCES_DIR = Path("./resources/") @@ -44,6 +45,12 @@ class MetricsFetchError(Exception): pass +class HardwareExporterConfigError(Exception): + """Raise if something goes wrong when getting hardware-exporter config.""" + + pass + + async def run_command_on_unit(ops_test, unit_name, command): 
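     """Run a command on the given unit via juju and return a dict with 'return-code' and 'stdout'."""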
    complete_command = ["exec", "--unit", unit_name, "--", *command.split()]
     return_code, stdout, _ = await ops_test.juju(*complete_command)
@@ -54,6 +61,15 @@ async def run_command_on_unit(ops_test, unit_name, command):
     return results
 
 
+async def get_hardware_exporter_config(ops_test, unit_name) -> dict:
+    """Return hardware-exporter config from endpoint on unit."""
+    command = "cat /etc/hardware-exporter-config.yaml"
+    results = await run_command_on_unit(ops_test, unit_name, command)
+    if results.get("return-code") > 0:
+        raise HardwareExporterConfigError
+    return yaml.safe_load(results.get("stdout"))
+
+
 @alru_cache
 async def get_metrics_output(ops_test, unit_name) -> Optional[dict[str, list[Metric]]]:
     """Return parsed prometheus metric output from endpoint on unit.

From 5f10c8864edd64d9e95b72ceee2003aadac2aa2a Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 22 Nov 2024 13:43:32 -0500
Subject: [PATCH 09/13] Log about the charm path used and raise if multiple paths found

---
 tests/functional/conftest.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index 517436f3..e3757818 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import platform
 from pathlib import Path
 
@@ -152,11 +151,19 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l
 
 @pytest.fixture()
 def charm_path(base: str, architecture: str) -> Path:
-    """Fixture to determine the charm path based on the base."""
+    """Fixture to determine the charm path based on the base and architecture."""
     glob_path = f"hardware-observer_*{base.replace('@', '-')}-{architecture}*.charm"
     paths = list(Path(".").glob(glob_path))
 
     if not paths:
         raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.")
 
-    return os.getcwd() / paths[0]
+    if len(paths) > 1:
+        raise FileNotFoundError(
+            f"Multiple charms found for {base}-{architecture}. Please provide only one." 
+        )
+
+    # The bundle will need the full path to the charm
+    path = paths[0].absolute()
+    log.info(f"Using charm path: {path}")
+    return path

From 70fb2e18288d4df6f3f90030bed6091cd688b819 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 22 Nov 2024 15:45:14 -0500
Subject: [PATCH 10/13] Add dcgm collector and test for the exporter snap

---
 tests/functional/conftest.py   |  1 +
 tests/functional/test_charm.py | 46 +++++++++++++++++++++++++++++++---
 tests/functional/utils.py      |  9 +++++++
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index e3757818..ab413d09 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -33,6 +33,7 @@ def pytest_addoption(parser):
             "poweredge_raid",
             "lsi_sas_2",
             "lsi_sas_3",
+            "dcgm",
         ],
         help="Provide space-separated list of collectors for testing with real hardware.",
     )
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 1162b883..99fc23f4 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -19,6 +19,7 @@
     HardwareExporterConfigError,
     MetricsFetchError,
     assert_metrics,
+    assert_snap_installed,
     get_hardware_exporter_config,
     get_metrics_output,
     run_command_on_unit,
 )
@@ -53,6 +54,12 @@ class AppStatus(str, Enum):
     CHECKSUM_ERROR = "Fail strategies: "
     INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'"
     INVALID_REDFISH_CREDS = "Invalid config: 'redfish-username' or 'redfish-password'"
+    NO_NVIDIA_DRIVER_DETECTED = (
+        "Failed to communicate with NVIDIA driver. See more details in the logs"
+    )
+    MANUAL_NVIDIA_DRIVER_INSTALL = (
+        "No drivers for the NVIDIA GPU were found. Manual installation is necessary"
+    )
 
 
 @pytest.mark.abort_on_fail
@@ -160,6 +167,25 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
         assert unit.workload_status_message == AppStatus.MISSING_RELATION
 
 
+@pytest.mark.abort_on_fail
+async def test_nvidia_driver_installation(ops_test: OpsTest, provided_collectors, unit):
+    """Test nvidia driver installation."""
+    if "dcgm" not in provided_collectors:
+        pytest.skip("dcgm not in provided collectors, skipping test")
+
+    check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
+    results = await run_command_on_unit(ops_test, unit.name, check_nvidia_driver_cmd)
+    exists = results.get("return-code") == 0
+
+    if not exists:
+        if unit.workload_status_message == AppStatus.MANUAL_NVIDIA_DRIVER_INSTALL:
+            pytest.fail("This machine requires manual installation of NVIDIA driver.")
+        elif unit.workload_status_message == AppStatus.NO_NVIDIA_DRIVER_DETECTED:
+            pytest.fail("Nvidia GPU detected, the tests should be run on the real hardware.")
+        else:
+            pytest.fail("Error occurred during the driver installation.")
+
+
 @pytest.mark.abort_on_fail
 async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
     """Test adding relation with grafana-agent."""
@@ -358,16 +384,28 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
 
     async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         """Test if smartctl exporter snap is installed and running on the unit."""
         snap_name = "smartctl-exporter"
-        cmd = f"snap list {snap_name}"
-        results = await run_command_on_unit(ops_test, unit.name, cmd)
-        assert results.get("return-code") == 0
-        assert snap_name in results.get("stdout").strip()
+        if not await assert_snap_installed(ops_test, unit.name, snap_name):
+            pytest.fail(f"{snap_name} snap is not installed on the unit.")
 
         check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
         results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"
 
+    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, provided_collectors):
+        """Test if dcgm exporter snap is installed and running on the unit."""
+        if "dcgm" not in provided_collectors:
+            pytest.skip("dcgm not in provided collectors, skipping test")
+
+        snap_name = "dcgm"
+        if not await assert_snap_installed(ops_test, unit.name, snap_name):
+            pytest.fail(f"{snap_name} snap is not installed on the unit.")
+
+        check_active_cmd = "systemctl is-active snap.dcgm.dcgm-exporter"
+        results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
+        assert results.get("return-code") == 0
+        assert results.get("stdout").strip() == "active"
+
     async def test_metrics_available(self, app, unit, ops_test):
         """Test if metrics are available at the expected endpoint on unit."""
         # takes some time for exporter to start and metrics to be available
diff --git a/tests/functional/utils.py b/tests/functional/utils.py
index a6421d61..70df8af4 100644
--- a/tests/functional/utils.py
+++ b/tests/functional/utils.py
@@ -84,6 +84,15 @@ async def get_metrics_output(ops_test, unit_name) -> Optional[dict[str, list[Met
     return parsed_metrics
 
 
+async def assert_snap_installed(ops_test, unit_name: str, snap_name: str) -> bool:
+    """Check whether the snap is installed on the unit."""
+    cmd = f"snap list {snap_name}"
+    results = await run_command_on_unit(ops_test, unit_name, cmd)
+    if results.get("return-code") > 0 or snap_name not in results.get("stdout"):
+        return False
+    return True
+
+
 def assert_metrics(metrics: list[Metric], expected_metric_values_map: dict[str, float]) -> bool:
     """Assert whether values in obtained list of metrics for a collector
     are as expected. 
From ac141508a747f78b90f4ba64bcc1f24bd7cc4723 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 29 Nov 2024 16:56:27 -0500
Subject: [PATCH 11/13] Use nvidia flag instead of collector

---
 tests/functional/conftest.py   | 12 +++++++++++-
 tests/functional/test_charm.py |  8 ++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index ab413d09..7f5ba60a 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -19,6 +19,12 @@ def pytest_addoption(parser):
         help="Set base for the applications.",
     )
 
+    parser.addoption(
+        "--nvidia",
+        action="store_true",
+        help="Enable NVIDIA GPU support for testing with real hardware.",
+    )
+
     parser.addoption(
         "--collectors",
         nargs="+",
@@ -33,7 +39,6 @@ def pytest_addoption(parser):
             "poweredge_raid",
             "lsi_sas_2",
             "lsi_sas_3",
-            "dcgm",
         ],
         help="Provide space-separated list of collectors for testing with real hardware.",
     )
@@ -44,6 +49,11 @@ def base(request):
     return request.config.getoption("--base")
 
 
+@pytest.fixture(scope="module")
+def nvidia_present(request):
+    return request.config.getoption("--nvidia")
+
+
 @pytest.fixture(scope="module")
 def architecture():
     machine = platform.machine()
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 99fc23f4..508ab139 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -168,9 +168,9 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
 
 
 @pytest.mark.abort_on_fail
-async def test_nvidia_driver_installation(ops_test: OpsTest, provided_collectors, unit):
+async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit):
     """Test nvidia driver installation."""
-    if "dcgm" not in provided_collectors:
+    if not nvidia_present:
         pytest.skip("dcgm not in provided collectors, skipping test")
 
     check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
@@ -392,9 +392,9 @@ async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"
 
-    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, provided_collectors):
+    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, nvidia_present):
         """Test if dcgm exporter snap is installed and running on the unit."""
-        if "dcgm" not in provided_collectors:
+        if not nvidia_present:
             pytest.skip("dcgm not in provided collectors, skipping test")

From 2422ecb19d99fa0c1dac5c90e0ae30569efc1ce9 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 29 Nov 2024 17:24:16 -0500
Subject: [PATCH 12/13] Add --realhw flag

---
 tests/functional/conftest.py   | 26 ++++++++++++--------------
 tests/functional/test_charm.py |  5 +++--
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index 7f5ba60a..b29025aa 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -19,6 +19,12 @@ def pytest_addoption(parser):
         help="Set base for the applications.",
     )
 
+    parser.addoption(
+        "--realhw",
+        action="store_true",
+        help="Enable real hardware testing.",
+    )
+
     parser.addoption(
         "--nvidia",
         action="store_true",
         help="Enable NVIDIA GPU support for testing with real hardware.",
     )
 
@@ -54,6 +60,11 @@ def nvidia_present(request):
     return request.config.getoption("--nvidia")
 
 
+@pytest.fixture(scope="module")
+def realhw(request):
+    return request.config.getoption("--realhw")
+
+
 @pytest.fixture(scope="module")
 def architecture():
     machine = platform.machine()
@@ 
-72,20 +83,7 @@ def pytest_configure(config): def pytest_collection_modifyitems(config, items): - if config.getoption("collectors"): - # --collectors provided, skip hw independent tests - skip_hw_independent = pytest.mark.skip( - reason="Hardware independent tests are skipped since --collectors was provided." - ) - for item in items: - # skip TestCharm tests where "realhw" marker is not present - # we don't want to skip test_setup_and_build, test_required_resources, - # test_cos_agent_relation and test_redfish_credential_validation - # even for hw independent tests - # so we also check for the abort_on_fail marker - if "realhw" not in item.keywords and "abort_on_fail" not in item.keywords: - item.add_marker(skip_hw_independent) - else: + if not config.getoption("--realhw"): # skip hw dependent tests in TestCharmWithHW marked with "realhw" skip_hw_dependent = pytest.mark.skip( reason="Hardware dependent test. Provide collectors with the --collectors option." diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index 508ab139..54e23584 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -65,7 +65,7 @@ class AppStatus(str, Enum): @pytest.mark.abort_on_fail @pytest.mark.skip_if_deployed async def test_build_and_deploy( # noqa: C901, function is too complex - ops_test: OpsTest, base, architecture, provided_collectors, required_resources, charm_path + ops_test: OpsTest, base, architecture, realhw, required_resources, charm_path ): """Deploy the charm together with related charms. @@ -96,7 +96,7 @@ async def test_build_and_deploy( # noqa: C901, function is too complex # deploy bundle to already added machine instead of provisioning new one # when testing with real hardware - if provided_collectors: + if realhw: juju_cmd.append("--map-machines=existing") logging.info("Deploying bundle...") @@ -168,6 +168,7 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir @pytest.mark.abort_on_fail +@pytest.mark.realhw async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit): """Test nvidia driver installation.""" if not nvidia_present: From 292c60379cea9977d5be5ad5782ddbd4bdee8ed0 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 29 Nov 2024 21:54:04 -0500 Subject: [PATCH 13/13] Update docs and the realhw test logic --- tests/functional/README.md | 23 +++++---- tests/functional/test_charm.py | 92 ++++++++++++++++++++-------------- 2 files changed, 66 insertions(+), 49 deletions(-) diff --git a/tests/functional/README.md b/tests/functional/README.md index 5992a576..8f4e2fa7 100644 --- a/tests/functional/README.md +++ b/tests/functional/README.md @@ -3,33 +3,33 @@ There are 2 main types of functional tests for the Hardware Observer charm - tho Here, "real hardware" refers to machines that are not VMs or containers and have access to real hardware resources like RAID cards and BMC management tools. +Note: the built charm must be present in the root of the project's directory for the tests to run. + ## Hardware Independent Tests These are the tests for hardware observer that do not require any real hardware. -Hardware independent tests are run on every PR / weekly scheduled test run. They belong to the `TestCharm` class in the `test_charm.py` module. +Hardware independent tests are run on every PR / weekly scheduled test run. 
 These include:
 * Testing whether juju config changes produce the required results
-Check whether the exporter systemd service starts and stops correctly
-Test exporter is stopped and related files removed on removal of charm
 
 and more.
 
-Running these tests is as simple as executing the `make functional` command.
+Running these tests is as simple as executing `tox -e func -- -v`.
 
 ## Hardware Dependent Tests
 These are the tests that depend on real hardware to be executed. This is performed manually when required, for example - validating the charm's full functionality before a new release.
 
 Hardware dependent tests are present in the `TestCharmWithHW` class in the `test_charm.py` module. The pytest marker `realhw` has been added to this class (which would include all the tests in this class).
 
-These tests will only be executed if the `--collectors` option for pytest is provided some value. Otherwise, all these tests are skipped (this is done by checking for the presence of the `realhw` marker mentioned earlier.)
+These tests will only be executed if the `--realhw` option for pytest is provided. Additionally, the `--collectors` option with space-separated values can be provided if specific hardware is present. Check the `conftest.py` for options. Otherwise, all these tests are skipped (this is done by checking for the presence of the `realhw` marker mentioned earlier.)
 
-Note: The `test_build_and_deploy` function sets up the test environment for both types of tests.
+Note: The operator must set up a test model with the machine added beforehand. The machine must be an actual host; containers or VMs won't work.
 
 Some of these tests include:
 * Check if all collectors are detected in the exporter config file
 * Test if metrics are available at the expected endpoint
 * Test if metrics specific to the collectors being tested are available
+* Test if smartctl-exporter snap is installed and running
+* Test if Nvidia drivers and dcgm-exporter snap are installed
 
 and more.
 
 In order to run these tests, a couple of prerequisite steps need to be completed:
 1. Setup test environment
 2. Add environment variables for Redfish credentials.
 3. Setup required resource files
-4. Find supported collectors
+4. Determine if the machine has Nvidia GPUs and add the `--nvidia` flag if they are present.
+5. Find supported collectors
 
 ### 1. Setup test environment
 For the hardware dependent tests, we add the test machine beforehand and the bundle only handles deploying the applications to this machine.
@@ -82,7 +83,7 @@ Note: The tests expect these resources to be named exactly in the manner provide
 ### 4. Find supported collectors
 Note down all the collectors supported by the machine as they need to be provided to pytest as part of its CLI arguments.
 
-This is done by passing the required collectors in a space-separated manner via the `FUNC_ARGS` environment variable to the make target.
+This is done by passing the required collectors in a space-separated manner via the `--collectors` option to the tox target.
 
 The supported collectors can be found by checking the output of the `lshw` command (for RAID cards) or checking availability of Redfish and IPMI on the BMC.
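+For example, a quick way to list the machine's RAID controllers is shown below (a sketch, assuming `lshw` is installed):
+
+```
+sudo lshw -class storage -short
+```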
@@ -92,7 +93,7 @@ The supported collectors can be found by checking the output of the `lshw` comma
 After ensuring the prerequisite steps are complete, the final command to run the tests would look something like this:
 
 ```
-FUNC_ARGS="--model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid" make functional
+tox -e func -- -v --realhw --model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid --keep-models
 ```
 
 This would pass the required collectors to tox which then sends them to the pytest command and starts the hardware dependent tests.
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 54e23584..aba5b4fe 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -54,12 +54,6 @@ class AppStatus(str, Enum):
     CHECKSUM_ERROR = "Fail strategies: "
     INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'"
     INVALID_REDFISH_CREDS = "Invalid config: 'redfish-username' or 'redfish-password'"
-    NO_NVIDIA_DRIVER_DETECTED = (
-        "Failed to communicate with NVIDIA driver. See more details in the logs"
-    )
-    MANUAL_NVIDIA_DRIVER_INSTALL = (
-        "No drivers for the NVIDIA GPU were found. Manual installation is necessary"
-    )
 
 
 @pytest.mark.abort_on_fail
@@ -133,7 +127,7 @@ async def test_build_and_deploy(  # noqa: C901, function is too complex
 
 
 @pytest.mark.abort_on_fail
-async def test_required_resources(ops_test: OpsTest, provided_collectors, required_resources):
+async def test_required_resources(ops_test: OpsTest, required_resources):
     if not required_resources:
         pytest.skip("No required resources to be attached, skipping test")
 
@@ -179,12 +173,7 @@ async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, uni
     exists = results.get("return-code") == 0
 
     if not exists:
-        if unit.workload_status_message == AppStatus.MANUAL_NVIDIA_DRIVER_INSTALL:
-            pytest.fail("This machine requires manual installation of NVIDIA driver.")
-        elif unit.workload_status_message == AppStatus.NO_NVIDIA_DRIVER_DETECTED:
-            pytest.fail("Nvidia GPU detected, the tests should be run on the real hardware.")
-        else:
-            pytest.fail("Error occurred during the driver installation.")
+        pytest.fail("Error occurred during the driver installation. 
Check the logs.") @pytest.mark.abort_on_fail @@ -254,16 +243,22 @@ async def test_redfish_credential_validation(ops_test: OpsTest, provided_collect class TestCharmWithHW: """Run functional tests that require specific hardware.""" - async def test_config_file_permissions(self, unit, ops_test): + async def test_config_file_permissions(self, unit, ops_test, provided_collectors): """Check config file permissions are set correctly.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + expected_file_mode = "600" cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" results = await run_command_on_unit(ops_test, unit.name, cmd) assert results.get("return-code") == 0 assert results.get("stdout").rstrip("\n") == expected_file_mode - async def test_config_changed_port(self, app, unit, ops_test): + async def test_config_changed_port(self, app, unit, ops_test, provided_collectors): """Test changing the config option: hardware-exporter-port.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_port = "10001" await asyncio.gather( app.set_config({"hardware-exporter-port": new_port}), @@ -278,8 +273,11 @@ async def test_config_changed_port(self, app, unit, ops_test): await app.reset_config(["hardware-exporter-port"]) - async def test_no_redfish_config(self, unit, ops_test): + async def test_no_redfish_config(self, unit, ops_test, provided_collectors): """Test that there is no Redfish options because it's not available on lxd machines.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + try: config = await get_hardware_exporter_config(ops_test, unit.name) except HardwareExporterConfigError: @@ -288,8 +286,11 @@ async def test_no_redfish_config(self, unit, ops_test): assert config.get("redfish_username") is None assert config.get("redfish_client_timeout") is None - async def test_config_changed_log_level(self, app, unit, ops_test): + async def test_config_changed_log_level(self, app, unit, ops_test, provided_collectors): """Test changing the config option: exporter-log-level.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_log_level = "DEBUG" await asyncio.gather( app.set_config({"exporter-log-level": new_log_level}), @@ -304,8 +305,11 @@ async def test_config_changed_log_level(self, app, unit, ops_test): await app.reset_config(["exporter-log-level"]) - async def test_config_changed_collect_timeout(self, app, unit, ops_test): + async def test_config_changed_collect_timeout(self, app, unit, ops_test, provided_collectors): """Test changing the config option: collect-timeout.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_collect_timeout = "20" await asyncio.gather( app.set_config({"collect-timeout": new_collect_timeout}), @@ -320,8 +324,11 @@ async def test_config_changed_collect_timeout(self, app, unit, ops_test): await app.reset_config(["collect-timeout"]) - async def test_start_and_stop_exporter(self, app, unit, ops_test): + async def test_start_and_stop_exporter(self, app, unit, ops_test, provided_collectors): """Test starting and stopping the exporter results in correct charm status.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + # Stop the exporter, and the exporter should auto-restart after update status fire. 
stop_cmd = "systemctl stop hardware-exporter" async with ops_test.fast_forward(): @@ -331,8 +338,11 @@ async def test_start_and_stop_exporter(self, app, unit, ops_test): ) assert unit.workload_status_message == AppStatus.READY - async def test_exporter_failed(self, app, unit, ops_test): + async def test_exporter_failed(self, app, unit, ops_test, provided_collectors): """Test failure in the exporter results in correct charm status.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + # Setting incorrect log level will crash the exporter async with ops_test.fast_forward(): await asyncio.gather( @@ -350,6 +360,9 @@ async def test_exporter_failed(self, app, unit, ops_test): async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + try: config = await get_hardware_exporter_config(ops_test, unit.name) except HardwareExporterConfigError: @@ -363,6 +376,27 @@ async def test_config_collector_enabled(self, app, unit, ops_test, provided_coll ) assert provided_collectors == collectors_in_config, error_msg + async def test_metrics_available(self, app, unit, ops_test, provided_collectors): + """Test if metrics are available at the expected endpoint on unit.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + + # takes some time for exporter to start and metrics to be available + try: + async for attempt in AsyncRetrying( + stop=stop_after_attempt(5), + wait=wait_fixed(10), + ): + with attempt: + logging.info(f"Fetching metrics attempt #{attempt.retry_state.attempt_number}") + get_metrics_output.cache_clear() # clear empty metrics from cache + metrics = await get_metrics_output(ops_test, unit.name) + await ops_test.model.wait_for_idle(apps=[APP_NAME]) + except RetryError: + pytest.fail("Not able to obtain metrics!") + + assert metrics, "Metrics result should not be empty" + async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided_collectors): """Test whether the redfish client's timeout depends on collect-timeout charm config.""" if "redfish" not in provided_collectors: @@ -407,24 +441,6 @@ async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, nvidia_pr assert results.get("return-code") == 0 assert results.get("stdout").strip() == "active" - async def test_metrics_available(self, app, unit, ops_test): - """Test if metrics are available at the expected endpoint on unit.""" - # takes some time for exporter to start and metrics to be available - try: - async for attempt in AsyncRetrying( - stop=stop_after_attempt(5), - wait=wait_fixed(10), - ): - with attempt: - logging.info(f"Fetching metrics attempt #{attempt.retry_state.attempt_number}") - get_metrics_output.cache_clear() # clear empty metrics from cache - metrics = await get_metrics_output(ops_test, unit.name) - await ops_test.model.wait_for_idle(apps=[APP_NAME]) - except RetryError: - pytest.fail("Not able to obtain metrics!") - - assert metrics, "Metrics result should not be empty" - @pytest.mark.parametrize( "collector", [