Skip to content

Commit

Permalink
Don't install DCGM if the driver has been blacklisted
Browse files Browse the repository at this point in the history
If the sysadmin wants to pass the GPU to a virtual instance via PCI
passthrough, they will need to make the GPU unavailable to the host
system by blacklisting[0] the kernel driver. On such a system DCGM would
not be able to function and should therefore not be deployed.

This commit makes the NVIDIA GPU verifier stricter by only marking
DCGM as an available tool if both an NVIDIA GPU is detected *and* the
kernel module is not blacklisted.

[0] https://wiki.debian.org/KernelModuleBlacklisting
  • Loading branch information
aieri committed Dec 4, 2024
1 parent 174389c commit 0868bce
Showing 1 changed file with 27 additions and 2 deletions.
29 changes: 27 additions & 2 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@

import logging
import os
import re
import shutil
import stat
import subprocess
from abc import ABCMeta, abstractmethod
from glob import iglob
from itertools import chain
from pathlib import Path
from typing import Dict, List, Set, Tuple

Expand Down Expand Up @@ -669,9 +672,31 @@ def disk_hw_verifier() -> Set[HWTool]:


def nvidia_gpu_verifier() -> Set[HWTool]:
    """Verify if the hardware has an NVIDIA GPU and its driver is not blacklisted.

    If the sysadmin has blacklisted the nvidia driver (e.g. to configure PCI
    passthrough), DCGM won't be able to manage the GPU, so it is not reported
    as an available tool.

    Returns:
        ``{HWTool.DCGM}`` when at least one display device reports an NVIDIA
        vendor and the nvidia kernel module is not blacklisted; otherwise an
        empty set.
    """
    gpus = lshw(class_filter="display")
    has_nvidia_gpu = any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus)
    # Consult the modprobe configuration at most once, and only when an NVIDIA
    # GPU is actually present, instead of re-scanning it for every GPU.
    if has_nvidia_gpu and not _is_nvidia_module_blacklisted():
        return {HWTool.DCGM}
    return set()


def _is_nvidia_module_blacklisted(
    conf_globs: Tuple[str, ...] = ("/etc/modprobe.d/*.conf", "/etc/modprobe.conf"),
) -> bool:
    """Return True if the ``nvidia`` kernel module is blacklisted for modprobe.

    Every file matched by ``conf_globs`` is scanned line by line for a
    ``blacklist nvidia`` directive.

    Args:
        conf_globs: glob patterns (or plain paths) of the modprobe configuration
            files to inspect. Patterns that match nothing are silently skipped,
            so listing an optional file such as ``/etc/modprobe.conf`` is safe.

    Returns:
        True if any readable configuration file blacklists the ``nvidia``
        module, False otherwise.
    """
    # Leading whitespace is tolerated, commented-out directives are not matched,
    # and the module name must be exactly "nvidia": a bare prefix match would
    # also trigger on unrelated modules such as "nvidiafb".
    module_re = re.compile(r"^\s*blacklist\s+nvidia(?:\s|$)")
    # Expand each pattern separately; passing a bare string to chain() would
    # iterate over its characters instead of yielding the path.
    for conffile in chain.from_iterable(iglob(pattern) for pattern in conf_globs):
        try:
            # errors="replace" keeps a stray non-UTF-8 byte (e.g. in a comment)
            # from aborting the whole scan.
            with open(conffile, "r", encoding="utf-8", errors="replace") as fd:
                # Iterate over the lines of the file, not the characters of
                # its first line.
                if any(module_re.match(line) for line in fd):
                    return True
        except OSError:
            # glob may match directories, and files may vanish or be unreadable;
            # none of these should prevent the remaining files from being checked.
            continue
    return False


def detect_available_tools() -> Set[HWTool]:
Expand Down

0 comments on commit 0868bce

Please sign in to comment.