Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Ubuntu24.04] Install the driver in a single step #156

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions ubuntu24.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@ ADD install.sh /tmp
RUN apt-key del 7fa2af80 && OS_ARCH=${TARGETARCH/amd64/x86_64} && OS_ARCH=${OS_ARCH/arm64/sbsa} && \
apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${OS_ARCH}/3bf863cc.pub"

RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
chmod +x /usr/local/bin/donkey
RUN /tmp/install.sh depinstall
tariq1890 marked this conversation as resolved.
Show resolved Hide resolved

COPY nvidia-driver /usr/local/bin

Expand Down
22 changes: 1 addition & 21 deletions ubuntu24.04/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,11 @@ dep_install () {
fi
}

# Rewrite /etc/apt/sources.list with the noble, noble-updates and noble-security
# entries (main + universe) for the architecture named by TARGETARCH, then run
# usermod on the _apt account with uid 0 / gid 0.
# Globals:  TARGETARCH (read) - "amd64" or "arm64"
# Outputs:  prints a message to stdout when TARGETARCH is not recognised
# Returns:  status of the write/usermod chain; exits the shell with 1 on an
#           unrecognised TARGETARCH
repo_setup () {
  local mirror
  local entry_prefix

  # Each supported architecture is served from a different archive URL.
  case "$TARGETARCH" in
    amd64)
      mirror="http://archive.ubuntu.com/ubuntu/"
      ;;
    arm64)
      mirror="http://ports.ubuntu.com/ubuntu-ports"
      ;;
    *)
      echo "TARGETARCH doesn't match a known arch target"
      exit 1
      ;;
  esac

  entry_prefix="deb [arch=${TARGETARCH}] ${mirror}"

  # usermod only runs when the sources file was written successfully.
  printf '%s\n' \
    "${entry_prefix} noble main universe" \
    "${entry_prefix} noble-updates main universe" \
    "${entry_prefix} noble-security main universe" \
    > /etc/apt/sources.list && \
    usermod -o -u 0 -g 0 _apt
}

if [ "$1" = "reposetup" ]; then
repo_setup
elif [ "$1" = "depinstall" ]; then
if [ "$1" = "depinstall" ]; then
dep_install
elif [ "$1" = "download_installer" ]; then
download_installer
else
echo "Unknown function: $1"
exit 1
fi

173 changes: 31 additions & 142 deletions ubuntu24.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -120,71 +120,22 @@ _kernel_requires_package() {
return 0
}

# Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer.
_create_driver_package() (
local pkg_name="nvidia-modules-${KERNEL_VERSION%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}"
local nvidia_sign_args=""
local nvidia_modeset_sign_args=""
local nvidia_uvm_sign_args=""

trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT

echo "Compiling NVIDIA driver kernel modules..."
cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}

# This is required because the GPU driver installer currently does not expect headers in the x86_64 folder,
# only in either the default or the kernel-version folder.
_link_ofa_kernel() (
if _gpu_direct_rdma_enabled; then
ln -s /run/mellanox/drivers/usr/src/ofa_kernel /usr/src/
# If the arch directory exists (MOFED >= 5.5), create a symlink as expected by the GPU driver installer.
# This is required as currently GPU driver installer doesn't expect headers in x86_64 folder, but only in either default or kernel-version folder.
# ls -ltr /usr/src/ofa_kernel/
# lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers
# drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64
# lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/
if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/`uname -r` ]]; then
if [[ ! -e /usr/src/ofa_kernel/`uname -r` ]]; then
ln -s /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/`uname -r` /usr/src/ofa_kernel/
if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/$(uname -r) ]]; then
if [[ ! -e /usr/src/ofa_kernel/$(uname -r) ]]; then
ln -s /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/$(uname -r) /usr/src/ofa_kernel/
fi
fi
fi

export IGNORE_CC_MISMATCH=1
make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By removing this code, we are no longer using the MAX_THREADS variable which was introduced here: f18cc79

The runfile installer does expose a similar option that controls the concurrency level when building the kernel modules. I am wondering if we can convert MAX_THREADS (if set) to the relevant option and add it to the list of install_args used when installing the run file below.

-j CONCURRENCY-LEVEL, --concurrency-level=CONCURRENCY-LEVEL
      Set the concurrency level for operations such as building the kernel module which may be parallelized on SMP systems. By default, this will be set to the number of detected CPUs, or to '1', if nvidia-installer fails to detect the number of CPUs. Systems with a large number of CPUs will have the default concurrency level limited to 32; setting a higher level on the command line will override this limit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch, thanks!

Copy link
Contributor Author

@tariq1890 tariq1890 Nov 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been fixed. Please check now


echo "Relinking NVIDIA driver kernel modules..."
rm -f nvidia.ko nvidia-modeset.ko
ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary
ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary

if [ -n "${PRIVATE_KEY}" ]; then
echo "Signing NVIDIA driver kernel modules..."
donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/linux-headers-${KERNEL_VERSION}/scripts && \
sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \
sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \
sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko"
nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign"
nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign"
nvidia_uvm_sign_args="--signed"
fi

echo "Building NVIDIA driver package ${pkg_name}..."
../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \
--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc \
--driver-version ${DRIVER_VERSION} \
--kernel-interface nv-linux.o \
--linked-module-name nvidia.ko \
--core-object-name nvidia/nv-kernel.o_binary \
${nvidia_sign_args} \
--target-directory . \
--kernel-interface nv-modeset-linux.o \
--linked-module-name nvidia-modeset.ko \
--core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \
${nvidia_modeset_sign_args} \
--target-directory . \
--kernel-module nvidia-uvm.ko \
${nvidia_uvm_sign_args} \
--target-directory .
mkdir -p precompiled
mv ${pkg_name} precompiled
)

_assert_nvswitch_system() {
Expand Down Expand Up @@ -420,18 +371,33 @@ _unload_driver() {
_install_driver() {
local install_args=()

echo "Installing NVIDIA driver kernel modules..."
cd /usr/src/nvidia-${DRIVER_VERSION}
if [ -d /lib/modules/${KERNEL_VERSION}/kernel/drivers/video ]; then
rm -rf /lib/modules/${KERNEL_VERSION}/kernel/drivers/video
else
rm -rf /lib/modules/${KERNEL_VERSION}/video
fi
Comment on lines -424 to -429
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question -- is it safe to remove this? I am unfamiliar with why this logic was required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to remove this. It wouldn't work otherwise


if [ "${ACCEPT_LICENSE}" = "yes" ]; then
install_args+=("--accept-license")
fi
nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}

if [ -n "${MAX_THREADS}" ]; then
install_args+=("--concurrency-level=${MAX_THREADS}")
fi

# Install the NVIDIA driver in one step
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
./nvidia-installer --silent \
--ui=none \
--no-drm \
--no-nouveau-check \
--no-nvidia-modprobe \
--no-rpms \
--no-backup \
--no-check-for-alternate-installs \
--no-libglx-indirect \
--no-install-libglvnd \
--x-prefix=/tmp/null \
--x-module-path=/tmp/null \
--x-library-path=/tmp/null \
--x-sysconfig-path=/tmp/null \
-m="${KERNEL_TYPE}" \
${install_args[@]+"${install_args[@]}"}
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
Expand Down Expand Up @@ -524,26 +490,6 @@ init() {
_find_vgpu_driver_version || exit 1
fi

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
./nvidia-installer --silent \
--no-kernel-module \
--no-nouveau-check \
--no-nvidia-modprobe \
--no-rpms \
--no-backup \
--no-check-for-alternate-installs \
--no-libglx-indirect \
--no-install-libglvnd \
--x-prefix=/tmp/null \
--x-module-path=/tmp/null \
--x-library-path=/tmp/null \
--x-sysconfig-path=/tmp/null && \
mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

Expand All @@ -565,7 +511,7 @@ init() {
_update_package_cache
_resolve_kernel_version || exit 1
_install_prerequisites
_create_driver_package
_link_ofa_kernel
#_remove_prerequisites
#_cleanup_package_cache
fi
Expand All @@ -583,63 +529,6 @@ init() {
exit 0
}

# Entry point for the "update" flow: optionally mirrors its output into an
# already-running process, makes sure the extracted driver sources are present
# (non-vgpu only), then builds a driver package when _kernel_requires_package
# says one is needed.
# Globals:  PID_FILE, DRIVER_TYPE, DRIVER_ARCH, DRIVER_VERSION, KERNEL_TYPE,
#           KERNEL_VERSION (all read)
# Returns:  never returns to the caller; exits 0 on completion, 1 when
#           _resolve_kernel_version fails or a trapped signal arrives.
update() {
# Keep a copy of the original stderr on fd 3 so it can be restored below.
exec 3>&2
# Silence stderr and try to open PID_FILE for reading on fd 4.
# NOTE(review): PID_FILE is defined outside this view; presumably it holds the
# pid of a running instance of this script — confirm.
if exec 2> /dev/null 4< ${PID_FILE}; then
# When the lock on fd 4 cannot be taken, a pid can be read from the file and
# kill -0 succeeds for it, copy this function's stdout/stderr into that
# process's fd 1 / fd 2 (stderr is additionally sent to the saved fd 3).
if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then
exec > >(tee -a "/proc/${pid}/fd/1")
exec 2> >(tee -a "/proc/${pid}/fd/2" >&3)
else
# Otherwise just put the original stderr back.
exec 2>&3
fi
exec 4>&-
fi
exec 3>&-

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
# Install the userspace components and copy the kernel module sources.
# Skipped when mkprecompiled is already present under /usr/src/nvidia-${DRIVER_VERSION}.
if [ ! -e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then
# The chain below extracts the .run archive, runs nvidia-installer with
# --no-kernel-module, moves LICENSE, mkprecompiled and the ${KERNEL_TYPE}
# directory into /usr/src/nvidia-${DRIVER_VERSION}, and writes a .manifest that
# keeps the first 8 lines plus only the later lines starting with "kernel" or
# "LICENSE". Each step runs only if the previous one succeeded.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
./nvidia-installer --silent \
--no-kernel-module \
--no-nouveau-check \
--no-nvidia-modprobe \
--no-rpms \
--no-backup \
--no-check-for-alternate-installs \
--no-libglx-indirect \
--no-install-libglvnd \
--x-prefix=/tmp/null \
--x-module-path=/tmp/null \
--x-library-path=/tmp/null \
--x-sysconfig-path=/tmp/null && \
mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest
fi
fi

echo -e "\n========== NVIDIA Software Updater ==========\n"
echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

# Report and exit with status 1 on any of the listed signals.
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM

_update_package_cache
_resolve_kernel_version || exit 1
_install_prerequisites
# Only build a package when the helper reports that this kernel needs one.
if _kernel_requires_package; then
_create_driver_package
fi
_remove_prerequisites
_cleanup_package_cache

echo "Done"
exit 0
}

# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates
reload_nvidia_peermem() {
if [ "$USE_HOST_MOFED" = "true" ]; then
Expand Down