Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop managing the installation of the NVIDIA driver. #380

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import subprocess
import typing as t
from pathlib import Path

from charms.operator_libs_linux.v0 import apt

Expand Down Expand Up @@ -102,3 +103,8 @@ def hwinfo(*args: str) -> t.Dict[str, str]:
key = item.splitlines()[0].strip()
hardware[key] = item
return hardware


def is_nvidia_driver_loaded() -> bool:
"""Determine if an NVIDIA driver has been loaded."""
return Path("/proc/driver/nvidia/version").exists()
110 changes: 9 additions & 101 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import logging
import os
import re
import shutil
import stat
import subprocess
Expand Down Expand Up @@ -43,6 +42,7 @@
LSHW_SUPPORTED_STORAGES,
get_bmc_address,
hwinfo,
is_nvidia_driver_loaded,
lshw,
)
from keys import HP_KEYS
Expand All @@ -62,14 +62,6 @@ def __init__(self, tool: HWTool, path: Path):
self.message = f"Tool: {tool} path: {path} size is zero"


class ResourceInstallationError(Exception):
"""Exception raised when a hardware tool installation fails."""

def __init__(self, tool: HWTool):
"""Init."""
super().__init__(f"Installation failed for tool: {tool}")


def copy_to_snap_common_bin(source: Path, filename: str) -> None:
"""Copy file to $SNAP_COMMON/bin folder."""
Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True)
Expand Down Expand Up @@ -266,54 +258,6 @@ def _create_custom_metrics(self) -> None:
raise err


class NVIDIADriverStrategy(APTStrategyABC):
"""NVIDIA driver strategy class."""

_name = HWTool.NVIDIA_DRIVER
pkg_pattern = r"nvidia(?:-[a-zA-Z-]*)?-(\d+)(?:-[a-zA-Z]*)?"

def install(self) -> None:
"""Install the NVIDIA driver if not present."""
if Path("/proc/driver/nvidia/version").exists():
logger.info("NVIDIA driver already installed in the machine")
return

with open("/proc/modules", encoding="utf-8") as modules:
if "nouveau" in modules.read():
logger.error("Nouveau driver is loaded. Unload it before installing NVIDIA driver")
raise ResourceInstallationError(self._name)

logger.info("Installing NVIDIA driver")
apt.add_package("ubuntu-drivers-common", update_cache=True)

try:
# This can be changed to check_call and not rely in the output if this is fixed
# https://github.com/canonical/ubuntu-drivers-common/issues/106
# https://bugs.launchpad.net/ubuntu/+source/ubuntu-drivers-common/+bug/2090502
result = subprocess.check_output("ubuntu-drivers --gpgpu install".split(), text=True)
subprocess.check_call("modprobe nvidia".split())

except subprocess.CalledProcessError as err:
logger.error("Failed to install the NVIDIA driver: %s", err)
raise err

if "No drivers found for installation" in result:
logger.warning(
"No drivers for the NVIDIA GPU were found. Manual installation is necessary"
)
raise ResourceInstallationError(self._name)

logger.info("NVIDIA driver installed")

def remove(self) -> None:
"""Drivers shouldn't be removed by the strategy."""
return None

def check(self) -> bool:
"""Check if driver was installed."""
return Path("/proc/driver/nvidia/version").exists()


class SmartCtlExporterStrategy(SnapStrategy):
"""SmartCtl strategy class."""

Expand Down Expand Up @@ -670,61 +614,25 @@ def disk_hw_verifier() -> Set[HWTool]:


def nvidia_gpu_verifier() -> Set[HWTool]:
"""Verify if the hardware has NVIDIA gpu and the driver is not blacklisted.
"""Verify if an NVIDIA gpu is present and the driver is loaded.

If the sysadmin has blacklisted the nvidia driver (e.g. to configure pci passthrough)
DCGM won't be able to manage the GPU
Depending on the usage of the node (local gpu usage, vgpu configuration,
pci passthrough), a driver must or must not be installed. Since hardware
observer has no way to know what is the intention of the operator, we don't
automate the graphics driver installation. This task should be left to the
principal charm that is going to use the gpu.
"""
gpus = lshw(class_filter="display")
if any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus):
logger.debug("NVIDIA GPU(s) detected")
if not _is_nvidia_module_blacklisted():
if is_nvidia_driver_loaded():
logger.debug("Enabling DCGM.")
return {HWTool.DCGM}

logger.debug("the NVIDIA driver has been blacklisted. Not enabling DCGM.")
logger.debug("no NVIDIA driver has been loaded. Not enabling DCGM.")
return set()


def _is_nvidia_module_blacklisted() -> bool:
"""Verify if the NVIDIA driver has been blacklisted.

This is currently done by looking into modprobe config and kernel parameters
NOTE: we can't simply try loading the module with `modprobe -n <module>` because:
* the driver may not be installed
* we don't know the full name of the module
"""
return (
_is_nvidia_module_blacklisted_via_modprobe() or _is_nvidia_module_blacklisted_via_cmdline()
)


def _is_nvidia_module_blacklisted_via_modprobe() -> bool:
"""Verify if the NVIDIA driver has been blacklisted via modprobe config.

see the manpages of modprobe and modprobe.d for more details
"""
modprobe_config = subprocess.check_output(["modprobe", "-c"], text=True).split("\n")

# modprobe normalizes config options to "blacklist MODULE" so no need to
# worry about extra whitespace
return any(opt.startswith("blacklist nvidia") for opt in modprobe_config)


def _is_nvidia_module_blacklisted_via_cmdline() -> bool:
"""Verify if the NVIDIA driver has been blacklisted via kernel parameters.

possible formats: module_blacklist= or modprobe.blacklist= followed by a
comma-separated list of modules. See:
https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html
"""
cmdline = Path("/proc/cmdline").read_text(encoding="utf-8")

return bool(
re.search(r"((?<=module_blacklist)|(?<=modprobe\.blacklist))=[\w,]*nvidia", cmdline)
)


def detect_available_tools() -> Set[HWTool]:
"""Return HWTool detected after checking the hardware."""
return raid_hw_verifier() | bmc_hw_verifier() | disk_hw_verifier() | nvidia_gpu_verifier()
Expand Down
17 changes: 5 additions & 12 deletions src/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@
ExporterSettings,
HWTool,
)
from hardware import get_bmc_address
from hw_tools import (
APTStrategyABC,
DCGMExporterStrategy,
NVIDIADriverStrategy,
SmartCtlExporterStrategy,
SnapStrategy,
)
from hardware import get_bmc_address, is_nvidia_driver_loaded
from hw_tools import APTStrategyABC, DCGMExporterStrategy, SmartCtlExporterStrategy, SnapStrategy

logger = getLogger(__name__)

Expand Down Expand Up @@ -424,7 +418,6 @@ def __init__(self, config: ConfigData):
"""Init."""
self.strategies = [
DCGMExporterStrategy(str(config["dcgm-snap-channel"])),
NVIDIADriverStrategy(),
]
super().__init__(config)

Expand All @@ -434,15 +427,15 @@ def hw_tools() -> Set[HWTool]:
return {HWTool.DCGM}

def validate_exporter_configs(self) -> Tuple[bool, str]:
"""Validate the if the DCGM exporter is able to run."""
"""Validate if the DCGM exporter is able to run."""
valid, msg = super().validate_exporter_configs()
if not valid:
return False, msg

if not NVIDIADriverStrategy().check():
if not is_nvidia_driver_loaded():
return (
False,
"Failed to communicate with NVIDIA driver. See more details in the logs",
"The NVIDIA driver isn't installed or loaded. See more details in the logs",
)
return valid, msg

Expand Down
15 changes: 0 additions & 15 deletions tests/functional/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,21 +161,6 @@ async def test_required_resources(ops_test: OpsTest, required_resources):
assert unit.workload_status_message == AppStatus.MISSING_RELATION


@pytest.mark.abort_on_fail
@pytest.mark.realhw
async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit):
"""Test nvidia driver installation."""
if not nvidia_present:
pytest.skip("dcgm not in provided collectors, skipping test")

check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
results = await run_command_on_unit(ops_test, unit.name, check_nvidia_driver_cmd)
exists = results.get("return-code") == 0

if not exists:
pytest.fail("Error occured during the driver installation. Check the logs.")


@pytest.mark.abort_on_fail
async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
"""Test adding relation with grafana-agent."""
Expand Down
9 changes: 8 additions & 1 deletion tests/unit/test_hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pytest

from hardware import get_bmc_address, hwinfo, lshw
from hardware import get_bmc_address, hwinfo, is_nvidia_driver_loaded, lshw


class TestHwinfo:
Expand Down Expand Up @@ -173,3 +173,10 @@ def test_get_bmc_address(self, mock_check_output, mock_apt):
def test_get_bmc_address_error_handling(self, mock_subprocess, mock_apt):
output = get_bmc_address()
self.assertEqual(output, None)


@pytest.mark.parametrize("path_exists,expected", [(True, True), (False, False)])
@mock.patch("hardware.Path.exists")
def test_is_nvidia_driver_loaded(mock_path, path_exists, expected):
mock_path.return_value = path_exists
assert is_nvidia_driver_loaded() == expected
Loading
Loading