From fd3366a5da11a9dd14ae875834b9d28b1e0ef1fc Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 21:15:40 -0500
Subject: [PATCH 01/13] Add test to check if smartctl-exporter snap is being installed

---
 tests/functional/test_charm.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index b3d4bb0d..ead3b921 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -262,6 +262,19 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
 
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
+
+    async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
+        """Test if smartctl exporter snap is installed and running on the unit."""
+        snap_name = "smartctl-exporter"
+        cmd = f"snap list {snap_name}"
+        results = await run_command_on_unit(ops_test, unit.name, cmd)
+        assert results.get("return-code") == 0
+        assert snap_name in results.get("stdout").strip()
+
+        check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
+        results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
+        assert results.get("return-code") == 0
+        assert results.get("stdout").strip() == "active"
 
     async def test_metrics_available(self, app, unit, ops_test):
         """Test if metrics are available at the expected endpoint on unit."""

From a3b156cc31563b6a7b0d6e66669d795c3e7475cc Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 21:17:32 -0500
Subject: [PATCH 02/13] Reformat

---
 tests/functional/test_charm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index ead3b921..c818438b 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -262,7 +262,7 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
 
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
-
+
     async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         """Test if smartctl exporter snap is installed and running on the unit."""
         snap_name = "smartctl-exporter"
@@ -270,7 +270,7 @@ async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         results = await run_command_on_unit(ops_test, unit.name, cmd)
         assert results.get("return-code") == 0
         assert snap_name in results.get("stdout").strip()
-
+
         check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
         results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"

From a953228e51975833dacb3d99c9f43b8d00ca1424 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Thu, 14 Nov 2024 23:04:53 -0500
Subject: [PATCH 03/13] Remove realhw marks

---
 tests/functional/test_charm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index c818438b..ae6d0be4 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -162,7 +162,6 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
         assert unit.workload_status_message == AppStatus.MISSING_RELATION
 
 
-@pytest.mark.realhw
 @pytest.mark.abort_on_fail
 async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
     """Test adding relation with grafana-agent."""
@@ -225,7 +224,6 @@ async def test_redfish_credential_validation(ops_test: 
OpsTest, provided_collect assert unit.workload_status_message == AppStatus.READY -@pytest.mark.realhw class TestCharmWithHW: """Run functional tests that require specific hardware.""" From dfac9d129595615ee2053ad748026016db16f72d Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 18:34:17 -0500 Subject: [PATCH 04/13] Test bug fixes --- tests/functional/conftest.py | 8 ++++---- tests/functional/test_charm.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 219964ef..4c93e1c6 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -100,7 +100,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.STORCLI), file_name="storcli.deb", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.STORCLI).replace( "collector.", "" ), bin_name=HWTool.STORCLI.value, @@ -108,7 +108,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.PERCCLI), file_name="perccli.deb", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.PERCCLI).replace( "collector.", "" ), bin_name=HWTool.PERCCLI.value, @@ -116,7 +116,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.SAS2IRCU), file_name="sas2ircu", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS2IRCU).replace( "collector.", "" ), bin_name=HWTool.SAS2IRCU.value, @@ -124,7 +124,7 @@ def resources() -> list[Resource]: Resource( resource_name=TPR_RESOURCES.get(HWTool.SAS3IRCU), file_name="sas3ircu", - collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU)[0].replace( + collector_name=HARDWARE_EXPORTER_COLLECTOR_MAPPING.get(HWTool.SAS3IRCU).replace( "collector.", "" ), bin_name=HWTool.SAS3IRCU.value, diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index ae6d0be4..ad102727 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -469,8 +469,10 @@ async def test_resource_in_correct_location(self, ops_test, unit, required_resou results = await run_command_on_unit(ops_test, unit.name, check_resource_cmd) assert results.get("return-code") == 0, f"{symlink_bin} resource doesn't exist" - async def test_redfish_config(self, app, unit, ops_test): + async def test_redfish_config(self, ops_test, app, unit, provided_collectors): """Test Redfish options.""" + if "redfish" not in provided_collectors: + pytest.skip("redfish not in provided collectors, skipping test") # initially Redfish is available and enabled cmd = "cat /etc/hardware-exporter-config.yaml" results_before = await run_command_on_unit(ops_test, unit.name, cmd) @@ -584,7 +586,6 @@ async def test_resource_clean_up(self, ops_test, app, unit, required_resources): ) -@pytest.mark.realhw class TestCharm: """Perform tests that require one or more exporters to be present.""" From 62d3ea3ddc9392b98db2f282b325f01525150b91 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 19:09:45 -0500 Subject: [PATCH 05/13] Remove extra build step --- tests/functional/conftest.py | 30 ++++++++++++++++++++++++++++-- tests/functional/test_charm.py | 10 +++------- tox.ini | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git 
a/tests/functional/conftest.py b/tests/functional/conftest.py
index bc56aae5..098d6ea4 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -1,5 +1,7 @@
 import logging
+import os
 import platform
+from pathlib import Path
 
 import pytest
 from utils import RESOURCES_DIR, Resource
@@ -8,13 +10,19 @@
 
 log = logging.getLogger(__name__)
 
+BASES = {
+    "ubuntu@20.04": "focal",
+    "ubuntu@22.04": "jammy",
+    "ubuntu@24.04": "noble",
+}
+
 
 def pytest_addoption(parser):
     parser.addoption(
         "--base",
         type=str.lower,
-        default="ubuntu@22.04",
-        choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"],
+        default="ubuntu@20.04",
+        choices=BASES.keys(),
         help="Set base for the applications.",
     )
 
@@ -146,3 +154,21 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l
             required_resources.append(resource)
 
     return required_resources
+
+
+@pytest.fixture()
+def charm_path(base: str) -> Path:
+    """Fixture to determine the charm path based on the base."""
+    env_charm_path = f"CHARM_PATH_{BASES[base].upper()}"
+    path = os.getenv(env_charm_path)
+
+    if not path:
+        raise EnvironmentError(
+            f"Environment variable '{env_charm_path}' is not set for base '{base}'."
+        )
+    if not Path(path).exists():
+        raise FileNotFoundError(
+            f"The path specified in '{env_charm_path}' ({path}) does not exist."
+        )
+
+    return Path(path)
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 837665cd..aed738f1 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -56,17 +56,13 @@ class AppStatus(str, Enum):
 @pytest.mark.abort_on_fail
 @pytest.mark.skip_if_deployed
 async def test_build_and_deploy(  # noqa: C901, function is too complex
-    ops_test: OpsTest, base, architecture, provided_collectors, required_resources
+    ops_test: OpsTest, base, architecture, provided_collectors, required_resources, charm_path
 ):
-    """Build the charm-under-test and deploy it together with related charms.
+    """Deploy the charm together with related charms.
 
     Assert on the unit status before any relations/configurations take place.
     Optionally attach required resources when testing with real hardware.
     """
-    # Build and deploy charm from local source folder
-    charm = await ops_test.build_charm(".")
-    assert charm, "Charm was not built successfully."
-
     # This is required for the subordinate application to choose the right revision
     # on different architecture. 
# See issue: https://bugs.launchpad.net/juju/+bug/2067749 @@ -77,7 +73,7 @@ async def test_build_and_deploy( # noqa: C901, function is too complex logger.info("Rendering bundle %s", bundle_template_path) bundle = ops_test.render_bundle( bundle_template_path, - charm=charm, + charm=charm_path, base=base, resources={ "storcli-deb": "empty-resource", diff --git a/tox.ini b/tox.ini index a388a52c..894148dd 100644 --- a/tox.ini +++ b/tox.ini @@ -69,6 +69,7 @@ deps = passenv = REDFISH_USERNAME REDFISH_PASSWORD + CHARM_PATH_* [testenv:integration] description = Run integration tests with COS From 63f560382741420ccd13e96aacb7c04fbe25b392 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 19 Nov 2024 20:02:41 -0500 Subject: [PATCH 06/13] Tests restructure --- tests/functional/test_charm.py | 248 ++++++++++++++++----------------- 1 file changed, 121 insertions(+), 127 deletions(-) diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index aed738f1..907b528a 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -188,9 +188,10 @@ async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors): # Test with cos-agent relation logging.info("Check whether hardware-exporter is active after creating relation.") for unit in ops_test.model.applications[APP_NAME].units: - results = await run_command_on_unit(ops_test, unit.name, check_active_cmd) - assert results.get("return-code") == 0 - assert results.get("stdout").strip() == "active" + if provided_collectors: + results = await run_command_on_unit(ops_test, unit.name, check_active_cmd) + assert results.get("return-code") == 0 + assert results.get("stdout").strip() == "active" if redfish_present: assert unit.workload_status_message == AppStatus.INVALID_REDFISH_CREDS else: @@ -220,9 +221,104 @@ async def test_redfish_credential_validation(ops_test: OpsTest, provided_collect assert unit.workload_status_message == AppStatus.READY +@pytest.mark.realhw class TestCharmWithHW: """Run functional tests that require specific hardware.""" + async def test_config_file_permissions(self, unit, ops_test): + """Check config file permissions are set correctly.""" + expected_file_mode = "600" + cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + assert results.get("stdout").rstrip("\n") == expected_file_mode + + async def test_config_changed_port(self, app, unit, ops_test): + """Test changing the config option: hardware-exporter-port.""" + new_port = "10001" + await asyncio.gather( + app.set_config({"hardware-exporter-port": new_port}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["port"] == int(new_port) + + await app.reset_config(["hardware-exporter-port"]) + + async def test_no_redfish_config(self, unit, ops_test): + """Test that there is no Redfish options because it's not available on lxd machines.""" + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config.get("redfish_host") is None + assert config.get("redfish_username") is None + assert config.get("redfish_client_timeout") is None + + async def 
test_config_changed_log_level(self, app, unit, ops_test): + """Test changing the config option: exporter-log-level.""" + new_log_level = "DEBUG" + await asyncio.gather( + app.set_config({"exporter-log-level": new_log_level}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["level"] == new_log_level + + await app.reset_config(["exporter-log-level"]) + + async def test_config_changed_collect_timeout(self, app, unit, ops_test): + """Test changing the config option: collect-timeout.""" + new_collect_timeout = "20" + await asyncio.gather( + app.set_config({"collect-timeout": new_collect_timeout}), + ops_test.model.wait_for_idle(apps=[APP_NAME]), + ) + + cmd = "cat /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, unit.name, cmd) + assert results.get("return-code") == 0 + config = yaml.safe_load(results.get("stdout").strip()) + assert config["collect_timeout"] == int(new_collect_timeout) + + await app.reset_config(["collect-timeout"]) + + async def test_start_and_stop_exporter(self, app, unit, ops_test): + """Test starting and stopping the exporter results in correct charm status.""" + # Stop the exporter, and the exporter should auto-restart after update status fire. + stop_cmd = "systemctl stop hardware-exporter" + async with ops_test.fast_forward(): + await asyncio.gather( + unit.run(stop_cmd), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.READY + + async def test_exporter_failed(self, app, unit, ops_test): + """Test failure in the exporter results in correct charm status.""" + # Setting incorrect log level will crash the exporter + async with ops_test.fast_forward(): + await asyncio.gather( + app.set_config({"exporter-log-level": "RANDOM_LEVEL"}), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.INVALID_CONFIG_EXPORTER_LOG_LEVEL + + async with ops_test.fast_forward(): + await asyncio.gather( + app.reset_config(["exporter-log-level"]), + ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), + ) + assert unit.workload_status_message == AppStatus.READY + async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" cmd = "cat /etc/hardware-exporter-config.yaml" @@ -582,131 +678,29 @@ async def test_resource_clean_up(self, ops_test, app, unit, required_resources): ) -class TestCharm: - """Perform tests that require one or more exporters to be present.""" - - async def test_config_file_permissions(self, unit, ops_test): - """Check config file permissions are set correctly.""" - expected_file_mode = "600" - cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - assert results.get("stdout").rstrip("\n") == expected_file_mode - - async def test_config_changed_port(self, app, unit, ops_test): - """Test changing the config option: hardware-exporter-port.""" - new_port = "10001" - await asyncio.gather( - app.set_config({"hardware-exporter-port": new_port}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat 
/etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["port"] == int(new_port) - - await app.reset_config(["hardware-exporter-port"]) - - async def test_no_redfish_config(self, unit, ops_test): - """Test that there is no Redfish options because it's not available on lxd machines.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config.get("redfish_host") is None - assert config.get("redfish_username") is None - assert config.get("redfish_client_timeout") is None - - async def test_config_changed_log_level(self, app, unit, ops_test): - """Test changing the config option: exporter-log-level.""" - new_log_level = "DEBUG" - await asyncio.gather( - app.set_config({"exporter-log-level": new_log_level}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["level"] == new_log_level - - await app.reset_config(["exporter-log-level"]) - - async def test_config_changed_collect_timeout(self, app, unit, ops_test): - """Test changing the config option: collect-timeout.""" - new_collect_timeout = "20" - await asyncio.gather( - app.set_config({"collect-timeout": new_collect_timeout}), - ops_test.model.wait_for_idle(apps=[APP_NAME]), - ) - - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) - assert config["collect_timeout"] == int(new_collect_timeout) - - await app.reset_config(["collect-timeout"]) - - async def test_start_and_stop_exporter(self, app, unit, ops_test): - """Test starting and stopping the exporter results in correct charm status.""" - # Stop the exporter, and the exporter should auto-restart after update status fire. 
- stop_cmd = "systemctl stop hardware-exporter" - async with ops_test.fast_forward(): - await asyncio.gather( - unit.run(stop_cmd), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.READY - - async def test_exporter_failed(self, app, unit, ops_test): - """Test failure in the exporter results in correct charm status.""" - # Setting incorrect log level will crash the exporter - async with ops_test.fast_forward(): - await asyncio.gather( - app.set_config({"exporter-log-level": "RANDOM_LEVEL"}), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="blocked", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.INVALID_CONFIG_EXPORTER_LOG_LEVEL - - async with ops_test.fast_forward(): - await asyncio.gather( - app.reset_config(["exporter-log-level"]), - ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=TIMEOUT), - ) - assert unit.workload_status_message == AppStatus.READY - - async def test_on_remove_event(self, app, ops_test): - """Test _on_remove event cleans up the service on the host machine.""" - await asyncio.gather( - app.remove_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), - ops_test.model.wait_for_idle( - apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT - ), - ) - principal_unit = ops_test.model.applications[PRINCIPAL_APP_NAME].units[0] +@pytest.mark.abort_on_fail +async def test_on_remove_event(app, ops_test): + """Test _on_remove event cleans up the service on the host machine.""" + await asyncio.gather( + app.remove_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), + ops_test.model.wait_for_idle(apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT), + ) + principal_unit = ops_test.model.applications[PRINCIPAL_APP_NAME].units[0] - # Wait for cleanup activities to finish - await ops_test.model.block_until( - lambda: ops_test.model.applications[APP_NAME].status == "unknown" - ) + # Wait for cleanup activities to finish + await ops_test.model.block_until( + lambda: ops_test.model.applications[APP_NAME].status == "unknown" + ) - cmd = "ls /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, principal_unit.name, cmd) - assert results.get("return-code") > 0 + cmd = "ls /etc/hardware-exporter-config.yaml" + results = await run_command_on_unit(ops_test, principal_unit.name, cmd) + assert results.get("return-code") > 0 - cmd = "ls /etc/systemd/system/hardware-exporter.service" - results = await run_command_on_unit(ops_test, principal_unit.name, cmd) - assert results.get("return-code") > 0 + cmd = "ls /etc/systemd/system/hardware-exporter.service" + results = await run_command_on_unit(ops_test, principal_unit.name, cmd) + assert results.get("return-code") > 0 - await asyncio.gather( - ops_test.model.add_relation( - f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info" - ), - ops_test.model.wait_for_idle( - apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT - ), - ) + await asyncio.gather( + ops_test.model.add_relation(f"{APP_NAME}:general-info", f"{PRINCIPAL_APP_NAME}:juju-info"), + ops_test.model.wait_for_idle(apps=[PRINCIPAL_APP_NAME], status="active", timeout=TIMEOUT), + ) From 0e5cc37695d84f2f89f9c594e44ccd25bdc01cdd Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 20 Nov 2024 11:42:56 -0500 Subject: [PATCH 07/13] Revert default base choice --- tests/functional/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/functional/conftest.py b/tests/functional/conftest.py index 098d6ea4..73733e9f 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -21,7 +21,7 @@ def pytest_addoption(parser): parser.addoption( "--base", type=str.lower, - default="ubuntu@20.04", + default="ubuntu@22.04", choices=BASES.keys(), help="Set base for the applications.", ) From edd1c0ab7c4693abbe51eab86eede484c57103ac Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 21 Nov 2024 18:09:29 -0500 Subject: [PATCH 08/13] Find built charm by glob --- tests/functional/conftest.py | 26 +++--------- tests/functional/test_charm.py | 78 +++++++++++++++++----------------- tests/functional/utils.py | 16 +++++++ 3 files changed, 63 insertions(+), 57 deletions(-) diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 73733e9f..517436f3 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -10,19 +10,13 @@ log = logging.getLogger(__name__) -BASES = { - "ubuntu@20.04": "focal", - "ubuntu@22.04": "jammy", - "ubuntu@24.04": "noble", -} - def pytest_addoption(parser): parser.addoption( "--base", type=str.lower, default="ubuntu@22.04", - choices=BASES.keys(), + choices=["ubuntu@20.04", "ubuntu@22.04", "ubuntu@24.04"], help="Set base for the applications.", ) @@ -157,18 +151,12 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l @pytest.fixture() -def charm_path(base: str) -> Path: +def charm_path(base: str, architecture: str) -> Path: """Fixture to determine the charm path based on the base.""" - env_charm_path = f"CHARM_PATH_{BASES[base].upper()}" - path = os.getenv(env_charm_path) + glob_path = f"hardware-observer_*{base.replace('@', '-')}-{architecture}*.charm" + paths = list(Path(".").glob(glob_path)) - if not path: - raise EnvironmentError( - f"Environment variable '{env_charm_path}' is not set for base '{base}'." - ) - if not Path(path).exists(): - raise FileNotFoundError( - f"The path specified in '{env_charm_path}' ({path}) does not exist." 
- ) + if not paths: + raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.") - return Path(path) + return os.getcwd() / paths[0] diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index 907b528a..1162b883 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -16,8 +16,10 @@ from tenacity import AsyncRetrying, RetryError, stop_after_attempt, wait_fixed from utils import ( RESOURCES_DIR, + HardwareExporterConfigError, MetricsFetchError, assert_metrics, + get_hardware_exporter_config, get_metrics_output, run_command_on_unit, ) @@ -241,20 +243,20 @@ async def test_config_changed_port(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["port"] == int(new_port) await app.reset_config(["hardware-exporter-port"]) async def test_no_redfish_config(self, unit, ops_test): """Test that there is no Redfish options because it's not available on lxd machines.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config.get("redfish_host") is None assert config.get("redfish_username") is None assert config.get("redfish_client_timeout") is None @@ -267,10 +269,10 @@ async def test_config_changed_log_level(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["level"] == new_log_level await app.reset_config(["exporter-log-level"]) @@ -283,10 +285,10 @@ async def test_config_changed_collect_timeout(self, app, unit, ops_test): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["collect_timeout"] == int(new_collect_timeout) await app.reset_config(["collect-timeout"]) @@ -321,10 +323,10 @@ async def test_exporter_failed(self, app, unit, ops_test): async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = 
yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") collectors_in_config = { collector.replace("collector.", "") for collector in config.get("enable_collectors") } @@ -345,10 +347,10 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results = await run_command_on_unit(ops_test, unit.name, cmd) - assert results.get("return-code") == 0 - config = yaml.safe_load(results.get("stdout").strip()) + try: + config = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") assert config["redfish_client_timeout"] == int(new_timeout) await app.reset_config(["collect-timeout"]) @@ -566,13 +568,13 @@ async def test_redfish_config(self, ops_test, app, unit, provided_collectors): if "redfish" not in provided_collectors: pytest.skip("redfish not in provided collectors, skipping test") # initially Redfish is available and enabled - cmd = "cat /etc/hardware-exporter-config.yaml" - results_before = await run_command_on_unit(ops_test, unit.name, cmd) - assert results_before.get("return-code") == 0 - config = yaml.safe_load(results_before.get("stdout").strip()) - assert config.get("redfish_host") is not None - assert config.get("redfish_username") is not None - assert config.get("redfish_client_timeout") is not None + try: + config_before = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") + assert config_before.get("redfish_host") is not None + assert config_before.get("redfish_username") is not None + assert config_before.get("redfish_client_timeout") is not None # Disable Redfish and see if the config is not present await asyncio.gather( @@ -580,13 +582,13 @@ async def test_redfish_config(self, ops_test, app, unit, provided_collectors): ops_test.model.wait_for_idle(apps=[APP_NAME]), ) - cmd = "cat /etc/hardware-exporter-config.yaml" - results_after = await run_command_on_unit(ops_test, unit.name, cmd) - assert results_before.get("return-code") == 0 - config = yaml.safe_load(results_after.get("stdout").strip()) - assert config.get("redfish_host") is None - assert config.get("redfish_username") is None - assert config.get("redfish_client_timeout") is None + try: + config_after = await get_hardware_exporter_config(ops_test, unit.name) + except HardwareExporterConfigError: + pytest.fail("Not able to obtain hardware-exporter config!") + assert config_after.get("redfish_host") is None + assert config_after.get("redfish_username") is None + assert config_after.get("redfish_client_timeout") is None await app.reset_config(["redfish-disable"]) diff --git a/tests/functional/utils.py b/tests/functional/utils.py index 7c948cf3..a6421d61 100644 --- a/tests/functional/utils.py +++ b/tests/functional/utils.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional +import yaml from async_lru import alru_cache RESOURCES_DIR = Path("./resources/") @@ -44,6 +45,12 @@ class MetricsFetchError(Exception): pass +class HardwareExporterConfigError(Exception): + """Raise if something goes wrong when getting hardware-exporter config.""" + + pass + + async def run_command_on_unit(ops_test, unit_name, command): 
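     """Run a command on the given unit via juju and return a dict with 'return-code' and 'stdout'."""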
    complete_command = ["exec", "--unit", unit_name, "--", *command.split()]
     return_code, stdout, _ = await ops_test.juju(*complete_command)
@@ -54,6 +61,15 @@ async def run_command_on_unit(ops_test, unit_name, command):
     return results
 
 
+async def get_hardware_exporter_config(ops_test, unit_name) -> dict:
+    """Return hardware-exporter config from endpoint on unit."""
+    command = "cat /etc/hardware-exporter-config.yaml"
+    results = await run_command_on_unit(ops_test, unit_name, command)
+    if results.get("return-code") > 0:
+        raise HardwareExporterConfigError
+    return yaml.safe_load(results.get("stdout"))
+
+
 @alru_cache
 async def get_metrics_output(ops_test, unit_name) -> Optional[dict[str, list[Metric]]]:
     """Return parsed prometheus metric output from endpoint on unit.

From 5f10c8864edd64d9e95b72ceee2003aadac2aa2a Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 22 Nov 2024 13:43:32 -0500
Subject: [PATCH 09/13] Log about the charm path used and raise if multiple paths found

---
 tests/functional/conftest.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index 517436f3..e3757818 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import platform
 from pathlib import Path
 
@@ -152,11 +151,19 @@ def required_resources(resources: list[Resource], provided_collectors: set) -> l
 
 @pytest.fixture()
 def charm_path(base: str, architecture: str) -> Path:
-    """Fixture to determine the charm path based on the base."""
+    """Fixture to determine the charm path based on the base and architecture."""
     glob_path = f"hardware-observer_*{base.replace('@', '-')}-{architecture}*.charm"
     paths = list(Path(".").glob(glob_path))
 
     if not paths:
         raise FileNotFoundError(f"The path for the charm for {base}-{architecture} is not found.")
 
-    return os.getcwd() / paths[0]
+    if len(paths) > 1:
+        raise FileNotFoundError(
+            f"Multiple charms found for {base}-{architecture}. Please provide only one." 
+        )
+
+    # The bundle will need the full path to the charm
+    path = paths[0].absolute()
+    log.info(f"Using charm path: {path}")
+    return path

From 70fb2e18288d4df6f3f90030bed6091cd688b819 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 22 Nov 2024 15:45:14 -0500
Subject: [PATCH 10/13] Add dcgm collector and test for the exporter snap

---
 tests/functional/conftest.py   |  1 +
 tests/functional/test_charm.py | 46 +++++++++++++++++++++++++++++++---
 tests/functional/utils.py      |  9 +++++++
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index e3757818..ab413d09 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -33,6 +33,7 @@ def pytest_addoption(parser):
             "poweredge_raid",
             "lsi_sas_2",
             "lsi_sas_3",
+            "dcgm",
         ],
         help="Provide space-separated list of collectors for testing with real hardware.",
     )
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 1162b883..99fc23f4 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -19,6 +19,7 @@
     HardwareExporterConfigError,
     MetricsFetchError,
     assert_metrics,
+    assert_snap_installed,
     get_hardware_exporter_config,
     get_metrics_output,
     run_command_on_unit,
 )
@@ -53,6 +54,12 @@ class AppStatus(str, Enum):
     CHECKSUM_ERROR = "Fail strategies: "
     INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'"
     INVALID_REDFISH_CREDS = "Invalid config: 'redfish-username' or 'redfish-password'"
+    NO_NVIDIA_DRIVER_DETECTED = (
+        "Failed to communicate with NVIDIA driver. See more details in the logs"
+    )
+    MANUAL_NVIDIA_DRIVER_INSTALL = (
+        "No drivers for the NVIDIA GPU were found. Manual installation is necessary"
+    )
 
 
 @pytest.mark.abort_on_fail
@@ -160,6 +167,25 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
         assert unit.workload_status_message == AppStatus.MISSING_RELATION
 
 
+@pytest.mark.abort_on_fail
+async def test_nvidia_driver_installation(ops_test: OpsTest, provided_collectors, unit):
+    """Test nvidia driver installation."""
+    if "dcgm" not in provided_collectors:
+        pytest.skip("dcgm not in provided collectors, skipping test")
+
+    check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
+    results = await run_command_on_unit(ops_test, unit.name, check_nvidia_driver_cmd)
+    exists = results.get("return-code") == 0
+
+    if not exists:
+        if unit.workload_status_message == AppStatus.MANUAL_NVIDIA_DRIVER_INSTALL:
+            pytest.fail("This machine requires manual installation of NVIDIA driver.")
+        elif unit.workload_status_message == AppStatus.NO_NVIDIA_DRIVER_DETECTED:
+            pytest.fail("Nvidia GPU detected, the tests should be run on the real hardware.")
+        else:
+            pytest.fail("Error occurred during the driver installation.")
+
+
 @pytest.mark.abort_on_fail
 async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
     """Test adding relation with grafana-agent."""
@@ -358,16 +384,28 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
         assert config["redfish_client_timeout"] == int(new_timeout)
         await app.reset_config(["collect-timeout"])
 
     async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         """Test if smartctl exporter snap is installed and running on the unit."""
         snap_name = "smartctl-exporter"
-        cmd = f"snap list {snap_name}"
-        results = await run_command_on_unit(ops_test, unit.name, cmd)
-        assert results.get("return-code") == 0
-        assert snap_name in results.get("stdout").strip()
+        if not await assert_snap_installed(ops_test, unit.name, snap_name):
+            pytest.fail(f"{snap_name} snap is not installed on the unit.")
 
         check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
         results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"
 
+    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, provided_collectors):
+        """Test if dcgm exporter snap is installed and running on the unit."""
+        if "dcgm" not in provided_collectors:
+            pytest.skip("dcgm not in provided collectors, skipping test")
+
+        snap_name = "dcgm"
+        if not await assert_snap_installed(ops_test, unit.name, snap_name):
+            pytest.fail(f"{snap_name} snap is not installed on the unit.")
+
+        check_active_cmd = "systemctl is-active snap.dcgm.dcgm-exporter"
+        results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
+        assert results.get("return-code") == 0
+        assert results.get("stdout").strip() == "active"
+
     async def test_metrics_available(self, app, unit, ops_test):
         """Test if metrics are available at the expected endpoint on unit."""
         # takes some time for exporter to start and metrics to be available
diff --git a/tests/functional/utils.py b/tests/functional/utils.py
index a6421d61..70df8af4 100644
--- a/tests/functional/utils.py
+++ b/tests/functional/utils.py
@@ -84,6 +84,15 @@ async def get_metrics_output(ops_test, unit_name) -> Optional[dict[str, list[Met
     return parsed_metrics
 
 
+async def assert_snap_installed(ops_test, unit_name: str, snap_name: str) -> bool:
+    """Check whether the snap is installed on the unit."""
+    cmd = f"snap list {snap_name}"
+    results = await run_command_on_unit(ops_test, unit_name, cmd)
+    if results.get("return-code") > 0 or snap_name not in results.get("stdout"):
+        return False
+    return True
+
+
 def assert_metrics(metrics: list[Metric], expected_metric_values_map: dict[str, float]) -> bool:
     """Assert whether values in obtained list of metrics for a collector
     are as expected. 
From ac141508a747f78b90f4ba64bcc1f24bd7cc4723 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 29 Nov 2024 16:56:27 -0500
Subject: [PATCH 11/13] Use nvidia flag instead of collector

---
 tests/functional/conftest.py   | 12 +++++++++++-
 tests/functional/test_charm.py |  8 ++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index ab413d09..7f5ba60a 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -19,6 +19,12 @@ def pytest_addoption(parser):
         help="Set base for the applications.",
     )
 
+    parser.addoption(
+        "--nvidia",
+        action="store_true",
+        help="Enable NVIDIA GPU support for testing with real hardware.",
+    )
+
     parser.addoption(
         "--collectors",
         nargs="+",
@@ -33,7 +39,6 @@ def pytest_addoption(parser):
             "poweredge_raid",
             "lsi_sas_2",
             "lsi_sas_3",
-            "dcgm",
         ],
         help="Provide space-separated list of collectors for testing with real hardware.",
     )
@@ -44,6 +49,11 @@ def base(request):
     return request.config.getoption("--base")
 
 
+@pytest.fixture(scope="module")
+def nvidia_present(request):
+    return request.config.getoption("--nvidia")
+
+
 @pytest.fixture(scope="module")
 def architecture():
     machine = platform.machine()
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 99fc23f4..508ab139 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -168,9 +168,9 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
 
 
 @pytest.mark.abort_on_fail
-async def test_nvidia_driver_installation(ops_test: OpsTest, provided_collectors, unit):
+async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit):
     """Test nvidia driver installation."""
-    if "dcgm" not in provided_collectors:
+    if not nvidia_present:
         pytest.skip("dcgm not in provided collectors, skipping test")
 
     check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
@@ -392,9 +392,9 @@ async def test_smartctl_exporter_snap_available(self, ops_test, app, unit):
         assert results.get("return-code") == 0
         assert results.get("stdout").strip() == "active"
 
-    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, provided_collectors):
+    async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, nvidia_present):
         """Test if dcgm exporter snap is installed and running on the unit."""
-        if "dcgm" not in provided_collectors:
+        if not nvidia_present:
             pytest.skip("dcgm not in provided collectors, skipping test")

From 2422ecb19d99fa0c1dac5c90e0ae30569efc1ce9 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 29 Nov 2024 17:24:16 -0500
Subject: [PATCH 12/13] Add --realhw flag

---
 tests/functional/conftest.py   | 26 ++++++++++++--------------
 tests/functional/test_charm.py |  5 +++--
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index 7f5ba60a..b29025aa 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -19,6 +19,12 @@ def pytest_addoption(parser):
         help="Set base for the applications.",
     )
 
+    parser.addoption(
+        "--realhw",
+        action="store_true",
+        help="Enable real hardware testing.",
+    )
+
     parser.addoption(
         "--nvidia",
         action="store_true",
         help="Enable NVIDIA GPU support for testing with real hardware.",
     )
 
@@ -54,6 +60,11 @@ def nvidia_present(request):
     return request.config.getoption("--nvidia")
 
 
+@pytest.fixture(scope="module")
+def realhw(request):
+    return request.config.getoption("--realhw")
+
+
 @pytest.fixture(scope="module")
 def architecture():
     machine = platform.machine()
@@ 
-72,20 +83,7 @@ def pytest_configure(config): def pytest_collection_modifyitems(config, items): - if config.getoption("collectors"): - # --collectors provided, skip hw independent tests - skip_hw_independent = pytest.mark.skip( - reason="Hardware independent tests are skipped since --collectors was provided." - ) - for item in items: - # skip TestCharm tests where "realhw" marker is not present - # we don't want to skip test_setup_and_build, test_required_resources, - # test_cos_agent_relation and test_redfish_credential_validation - # even for hw independent tests - # so we also check for the abort_on_fail marker - if "realhw" not in item.keywords and "abort_on_fail" not in item.keywords: - item.add_marker(skip_hw_independent) - else: + if not config.getoption("--realhw"): # skip hw dependent tests in TestCharmWithHW marked with "realhw" skip_hw_dependent = pytest.mark.skip( reason="Hardware dependent test. Provide collectors with the --collectors option." diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index 508ab139..54e23584 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -65,7 +65,7 @@ class AppStatus(str, Enum): @pytest.mark.abort_on_fail @pytest.mark.skip_if_deployed async def test_build_and_deploy( # noqa: C901, function is too complex - ops_test: OpsTest, base, architecture, provided_collectors, required_resources, charm_path + ops_test: OpsTest, base, architecture, realhw, required_resources, charm_path ): """Deploy the charm together with related charms. @@ -96,7 +96,7 @@ async def test_build_and_deploy( # noqa: C901, function is too complex # deploy bundle to already added machine instead of provisioning new one # when testing with real hardware - if provided_collectors: + if realhw: juju_cmd.append("--map-machines=existing") logging.info("Deploying bundle...") @@ -168,6 +168,7 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir @pytest.mark.abort_on_fail +@pytest.mark.realhw async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit): """Test nvidia driver installation.""" if not nvidia_present: From 292c60379cea9977d5be5ad5782ddbd4bdee8ed0 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 29 Nov 2024 21:54:04 -0500 Subject: [PATCH 13/13] Update docs and the realhw test logic --- tests/functional/README.md | 23 +++++---- tests/functional/test_charm.py | 92 ++++++++++++++++++++-------------- 2 files changed, 66 insertions(+), 49 deletions(-) diff --git a/tests/functional/README.md b/tests/functional/README.md index 5992a576..8f4e2fa7 100644 --- a/tests/functional/README.md +++ b/tests/functional/README.md @@ -3,33 +3,33 @@ There are 2 main types of functional tests for the Hardware Observer charm - tho Here, "real hardware" refers to machines that are not VMs or containers and have access to real hardware resources like RAID cards and BMC management tools. +Note: the built charm must be present in the root of the project's directory for the tests to run. + ## Hardware Independent Tests These are the tests for hardware observer that do not require any real hardware. -Hardware independent tests are run on every PR / weekly scheduled test run. They belong to the `TestCharm` class in the `test_charm.py` module. +Hardware independent tests are run on every PR / weekly scheduled test run. 
 These include:
 * Testing whether juju config changes produce the required results
-Check whether the exporter systemd service starts and stops correctly
-Test exporter is stopped and related files removed on removal of charm
 
 and more.
 
-Running these tests is as simple as executing the `make functional` command.
+Running these tests is as simple as executing `tox -e func -- -v`.
 
 ## Hardware Dependent Tests
 These are the tests that depend on real hardware to be executed. This is performed manually when required, for example - validating the charm's full functionality before a new release.
 
 Hardware dependent tests are present in the `TestCharmWithHW` class in the `test_charm.py` module. The pytest marker `realhw` has been added to this class (which would include all the tests in this class).
 
-These tests will only be executed if the `--collectors` option for pytest is provided some value. Otherwise, all these tests are skipped (this is done by checking for the presence of the `realhw` marker mentioned earlier.)
+These tests will only be executed if the `--realhw` option for pytest is provided. Additionally, the `--collectors` option with space-separated values can be provided if specific hardware is present. Check the `conftest.py` for options. Otherwise, all these tests are skipped (this is done by checking for the presence of the `realhw` marker mentioned earlier.)
 
-Note: The `test_build_and_deploy` function sets up the test environment for both types of tests.
+Note: The operator must set up a test model with the machine added beforehand. The machine must be an actual host; containers or VMs won't work.
 
 Some of these tests include:
 * Check if all collectors are detected in the exporter config file
 * Test if metrics are available at the expected endpoint
 * Test if metrics specific to the collectors being tested are available
+* Test if smartctl-exporter snap is installed and running
+* Test if Nvidia drivers and dcgm-exporter snap are installed
 
 and more.
 
 In order to run these tests, a couple of prerequisite steps need to be completed:
 1. Setup test environment
 2. Add environment variables for Redfish credentials.
 3. Setup required resource files
-4. Find supported collectors
+4. Determine if the machine has Nvidia GPUs and add the `--nvidia` flag if they are present.
+5. Find supported collectors
 
 ### 1. Setup test environment
 For the hardware dependent tests, we add the test machine beforehand and the bundle only handles deploying the applications to this machine.
@@ -82,7 +83,7 @@ Note: The tests expect these resources to be named exactly in the manner provide
 ### 4. Find supported collectors
 Note down all the collectors supported by the machine as they need to be provided to pytest as part of its CLI arguments.
 
-This is done by passing the required collectors in a space-separated manner via the `FUNC_ARGS` environment variable to the make target.
+This is done by passing the required collectors in a space-separated manner via the `--collectors` option to the tox target.
 
 The supported collectors can be found by checking the output of the `lshw` command (for RAID cards) or checking availability of Redfish and IPMI on the BMC.
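+For example, a quick way to list the machine's RAID controllers is shown below (a sketch, assuming `lshw` is installed):
+
+```
+sudo lshw -class storage -short
+```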
@@ -92,7 +93,7 @@ The supported collectors can be found by checking the output of the `lshw` comma
 After ensuring the prerequisite steps are complete, the final command to run the tests would look something like this:
 
 ```
-FUNC_ARGS="--model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid" make functional
+tox -e func -- -v --realhw --model test --collectors ipmi_dcmi ipmi_sel ipmi_sensor redfish mega_raid --keep-models
 ```
 
 This would pass the required collectors to tox which then sends them to the pytest command and starts the hardware dependent tests.
diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py
index 54e23584..aba5b4fe 100644
--- a/tests/functional/test_charm.py
+++ b/tests/functional/test_charm.py
@@ -54,12 +54,6 @@ class AppStatus(str, Enum):
     CHECKSUM_ERROR = "Fail strategies: "
     INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'"
     INVALID_REDFISH_CREDS = "Invalid config: 'redfish-username' or 'redfish-password'"
-    NO_NVIDIA_DRIVER_DETECTED = (
-        "Failed to communicate with NVIDIA driver. See more details in the logs"
-    )
-    MANUAL_NVIDIA_DRIVER_INSTALL = (
-        "No drivers for the NVIDIA GPU were found. Manual installation is necessary"
-    )
 
 
 @pytest.mark.abort_on_fail
@@ -133,7 +127,7 @@ async def test_build_and_deploy(  # noqa: C901, function is too complex
 
 
 @pytest.mark.abort_on_fail
-async def test_required_resources(ops_test: OpsTest, provided_collectors, required_resources):
+async def test_required_resources(ops_test: OpsTest, required_resources):
     if not required_resources:
         pytest.skip("No required resources to be attached, skipping test")
 
@@ -179,12 +173,7 @@ async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, uni
     exists = results.get("return-code") == 0
 
     if not exists:
-        if unit.workload_status_message == AppStatus.MANUAL_NVIDIA_DRIVER_INSTALL:
-            pytest.fail("This machine requires manual installation of NVIDIA driver.")
-        elif unit.workload_status_message == AppStatus.NO_NVIDIA_DRIVER_DETECTED:
-            pytest.fail("Nvidia GPU detected, the tests should be run on the real hardware.")
-        else:
-            pytest.fail("Error occurred during the driver installation.")
+        pytest.fail("Error occurred during the driver installation. 
Check the logs.") @pytest.mark.abort_on_fail @@ -254,16 +243,22 @@ async def test_redfish_credential_validation(ops_test: OpsTest, provided_collect class TestCharmWithHW: """Run functional tests that require specific hardware.""" - async def test_config_file_permissions(self, unit, ops_test): + async def test_config_file_permissions(self, unit, ops_test, provided_collectors): """Check config file permissions are set correctly.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + expected_file_mode = "600" cmd = "stat -c '%a' /etc/hardware-exporter-config.yaml" results = await run_command_on_unit(ops_test, unit.name, cmd) assert results.get("return-code") == 0 assert results.get("stdout").rstrip("\n") == expected_file_mode - async def test_config_changed_port(self, app, unit, ops_test): + async def test_config_changed_port(self, app, unit, ops_test, provided_collectors): """Test changing the config option: hardware-exporter-port.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_port = "10001" await asyncio.gather( app.set_config({"hardware-exporter-port": new_port}), @@ -278,8 +273,11 @@ async def test_config_changed_port(self, app, unit, ops_test): await app.reset_config(["hardware-exporter-port"]) - async def test_no_redfish_config(self, unit, ops_test): + async def test_no_redfish_config(self, unit, ops_test, provided_collectors): """Test that there is no Redfish options because it's not available on lxd machines.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + try: config = await get_hardware_exporter_config(ops_test, unit.name) except HardwareExporterConfigError: @@ -288,8 +286,11 @@ async def test_no_redfish_config(self, unit, ops_test): assert config.get("redfish_username") is None assert config.get("redfish_client_timeout") is None - async def test_config_changed_log_level(self, app, unit, ops_test): + async def test_config_changed_log_level(self, app, unit, ops_test, provided_collectors): """Test changing the config option: exporter-log-level.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_log_level = "DEBUG" await asyncio.gather( app.set_config({"exporter-log-level": new_log_level}), @@ -304,8 +305,11 @@ async def test_config_changed_log_level(self, app, unit, ops_test): await app.reset_config(["exporter-log-level"]) - async def test_config_changed_collect_timeout(self, app, unit, ops_test): + async def test_config_changed_collect_timeout(self, app, unit, ops_test, provided_collectors): """Test changing the config option: collect-timeout.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + new_collect_timeout = "20" await asyncio.gather( app.set_config({"collect-timeout": new_collect_timeout}), @@ -320,8 +324,11 @@ async def test_config_changed_collect_timeout(self, app, unit, ops_test): await app.reset_config(["collect-timeout"]) - async def test_start_and_stop_exporter(self, app, unit, ops_test): + async def test_start_and_stop_exporter(self, app, unit, ops_test, provided_collectors): """Test starting and stopping the exporter results in correct charm status.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + # Stop the exporter, and the exporter should auto-restart after update status fire. 
stop_cmd = "systemctl stop hardware-exporter" async with ops_test.fast_forward(): @@ -331,8 +338,11 @@ async def test_start_and_stop_exporter(self, app, unit, ops_test): ) assert unit.workload_status_message == AppStatus.READY - async def test_exporter_failed(self, app, unit, ops_test): + async def test_exporter_failed(self, app, unit, ops_test, provided_collectors): """Test failure in the exporter results in correct charm status.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + # Setting incorrect log level will crash the exporter async with ops_test.fast_forward(): await asyncio.gather( @@ -350,6 +360,9 @@ async def test_exporter_failed(self, app, unit, ops_test): async def test_config_collector_enabled(self, app, unit, ops_test, provided_collectors): """Test whether provided collectors are present in exporter config.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + try: config = await get_hardware_exporter_config(ops_test, unit.name) except HardwareExporterConfigError: @@ -363,6 +376,27 @@ async def test_config_collector_enabled(self, app, unit, ops_test, provided_coll ) assert provided_collectors == collectors_in_config, error_msg + async def test_metrics_available(self, app, unit, ops_test, provided_collectors): + """Test if metrics are available at the expected endpoint on unit.""" + if not provided_collectors: + pytest.skip("No collectors provided, skipping test") + + # takes some time for exporter to start and metrics to be available + try: + async for attempt in AsyncRetrying( + stop=stop_after_attempt(5), + wait=wait_fixed(10), + ): + with attempt: + logging.info(f"Fetching metrics attempt #{attempt.retry_state.attempt_number}") + get_metrics_output.cache_clear() # clear empty metrics from cache + metrics = await get_metrics_output(ops_test, unit.name) + await ops_test.model.wait_for_idle(apps=[APP_NAME]) + except RetryError: + pytest.fail("Not able to obtain metrics!") + + assert metrics, "Metrics result should not be empty" + async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided_collectors): """Test whether the redfish client's timeout depends on collect-timeout charm config.""" if "redfish" not in provided_collectors: @@ -407,24 +441,6 @@ async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, nvidia_pr assert results.get("return-code") == 0 assert results.get("stdout").strip() == "active" - async def test_metrics_available(self, app, unit, ops_test): - """Test if metrics are available at the expected endpoint on unit.""" - # takes some time for exporter to start and metrics to be available - try: - async for attempt in AsyncRetrying( - stop=stop_after_attempt(5), - wait=wait_fixed(10), - ): - with attempt: - logging.info(f"Fetching metrics attempt #{attempt.retry_state.attempt_number}") - get_metrics_output.cache_clear() # clear empty metrics from cache - metrics = await get_metrics_output(ops_test, unit.name) - await ops_test.model.wait_for_idle(apps=[APP_NAME]) - except RetryError: - pytest.fail("Not able to obtain metrics!") - - assert metrics, "Metrics result should not be empty" - @pytest.mark.parametrize( "collector", [