diff --git a/ci/fedora/.gitlab-ci-fcos.yml b/ci/fedora/.gitlab-ci-fcos.yml index a37feada..c4557d1a 100644 --- a/ci/fedora/.gitlab-ci-fcos.yml +++ b/ci/fedora/.gitlab-ci-fcos.yml @@ -35,9 +35,9 @@ # the commit sha). This release stage is purely to test out the CICD code that # would for the 'fedora' branch publish to a remote repository. # -# Branches == "fedora" and tags == .*fedora$ +# Tags == .*fedora$ # -# The protected branch 'fedora' will cause container image builds on all three +# Matching pipelines will cause container image builds on all three # fcos runner types and build ALL_DRIVER_VERSIONS. The images will then be scan- # ned and providing there are no detected vulnerabilities will be pushed to the # remote repository defined by RELEASE_REGISTRY_PROJECT. @@ -49,7 +49,7 @@ # # Branches == "fedora.+" # -# Any other protected branch with the word fedora in it will do the same - build +# Any protected branch whose name starts with fedora will do the same - build # all the NVIDIA driver versions on all the fcos releases - and scan them, but # will not publish them to the remote registry. 
# @@ -96,10 +96,10 @@ variables: # To survey latest Data Center driver versions available: # https://www.nvidia.com/Download/Find.aspx # https://www.nvidia.com/en-us/drivers/unix/ - DRIVER_VERSION: "535.154.05" - DRIVER_VERSIONS: 535.154.05 525.147.05 + DRIVER_VERSION: "550.90.07" + DRIVER_VERSIONS: 550.90.07 535.183.01 - CUDA_VERSION: 12.2.0 + CUDA_VERSION: 12.4.1 CVE_UPDATES: "curl libc6" @@ -115,9 +115,9 @@ variables: RELEASE_REGISTRY_TOKEN: "" default: - image: docker:20.10.10-git + image: docker:25.0.2-git services: - - name: docker:20.10.10-dind + - name: docker:25.0.2-dind stages: - build @@ -199,8 +199,9 @@ build-push-next-one-only: - for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null build-push: stage: build @@ -212,8 +213,8 @@ build-push: - STREAM: [next, testing, stable] tags: - fcos-${STREAM} - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ .common-scan: image: registry.gitlab.com/security-products/container-scanning:6 @@ -273,8 +274,9 @@ scan-next-one-only: - scan_fn ${DRIVER_VERSION} ${CI_COMMIT_SHORT_SHA}- tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. 
# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 @@ -288,24 +290,24 @@ scan-next: needs: ["build-push: [next]"] tags: - fcos-next - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ scan-testing: extends: .common-scan needs: ["build-push: [testing]"] tags: - fcos-testing - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ scan-stable: extends: .common-scan needs: ["build-push: [stable]"] tags: - fcos-stable - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ .common-release-fn-script: &common-release-fn-script - | @@ -347,7 +349,7 @@ scan-stable: - docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}" - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done rules: - - if: $CI_COMMIT_TAG =~ /fedora$/ || $CI_COMMIT_REF_NAME == 'fedora' + - if: $CI_COMMIT_TAG =~ /fedora$/ release-next-one-only: stage: release @@ -370,8 +372,9 @@ release-next-one-only: - for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. 
# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 diff --git a/fedora/Dockerfile b/fedora/Dockerfile index 44841d34..42afd078 100644 --- a/fedora/Dockerfile +++ b/fedora/Dockerfile @@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"] RUN dnf install -y git wget -ENV GOLANG_VERSION=1.21.5 +ENV GOLANG_VERSION=1.22.2 # download appropriate binary based on the target architecture for multi-arch builds RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ @@ -63,6 +63,8 @@ ENV NVIDIA_VISIBLE_DEVICES=void # getopt etc. RUN dnf install -y util-linux 'dnf-command(download)' +RUN dnf install -y patch + ADD install.sh /tmp/ RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \ diff --git a/fedora/README.md b/fedora/README.md index 54174658..acb0738d 100644 --- a/fedora/README.md +++ b/fedora/README.md @@ -27,7 +27,7 @@ Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml` The driver container is privileged, and here we choose to launch via podman instead of docker although both work. ```bash -$ DRIVER_VERSION=535.104.12 # Check ci/fedora/.common-ci-fcos.yml for latest +$ DRIVER_VERSION=550.90.07 # Check ci/fedora/.common-ci-fcos.yml for latest driver versions $ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2) $ podman run -d --privileged --pid=host \ -v /run/nvidia:/run/nvidia:shared \ @@ -36,13 +36,14 @@ $ podman run -d --privileged --pid=host \ registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION}-fedora$$FEDORA_VERSION_ID ``` -Or, on FCOS registering as a systemd unit via an ignition snippet, and using an image with kernel headers pre-installed for faster start up: +Or, on FCOS registering as a systemd unit via an ignition snippet. 
In this unit we attempt to pull a driver image matching the running kernel version (with pre-compiled kernel headers), but fall back to a generic Fedora version if one does not exist. Furthermore, we +mount a single patch file from a host directory that, if detected, will be applied to the generic Fedora version. ```yaml variant: fcos -version: 1.4.0 -storage: - files: +version: 1.5.0 +systemd: + units: - name: acme-nvidia-driver.service enabled: true contents: | @@ -57,18 +58,32 @@ storage: ExecStartPre=-/bin/podman rm nvidia-driver ExecStartPre=-setenforce 0 ExecStartPre=-/bin/mkdir -p /run/nvidia - ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID' + # 5/17/24 - Without the following line the nvidia driver container will crash with no meaningful error message ExecStartPre=-/usr/sbin/modprobe video - ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman run --name nvidia-driver \ - -v /run/nvidia:/run/nvidia:shared \ - -v /var/log:/var/log \ - --privileged --pid=host \ - # No need for network IF using container image with pre-built kernel headers \ - --network=none \ - registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \ - --accept-license' + + # If there is a kernel-specific image (with pre-compiled kernel headers) then + # use it, otherwise fallback to the generic Fedora image mounting any patches that exist. 
+ # + # Replace registry.gitlab.com/container-toolkit-fcos/driver with the registry name + # of your built/published driver images, or perhaps, docker.io/fifofonix/driver + ExecStart=/bin/sh -c ' \ + FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ + KERNEL_VERSION=$(/bin/uname -r); \ + if /bin/podman manifest inspect registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID > /dev/null; then \ + IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID; \ + else \ + IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-fedora$$FEDORA_VERSION_ID; \ + PATCH_MOUNT="-v /var/acme/nvidia-driver-patch:/patch"; \ + fi; \ + /bin/podman pull $$IMAGE_NAME; \ + /bin/podman run --name nvidia-driver \ + -v /run/nvidia:/run/nvidia:shared \ + -v /var/log:/var/log \ + $$PATCH_MOUNT \ + --privileged \ + --pid host \ + $$IMAGE_NAME \ + --accept-license' ExecStop=/bin/podman stop nvidia-driver Restart=on-failure @@ -84,47 +99,64 @@ You should be able to step into the driver container and run the `nvidia-smi` to ```bash $ # Assumes you're running the driver container via podman and named nvidia-driver as above... -$ podman exec -it nvidia-driver bash -[root@8dc88dad905e nvidia-510.47.03]# nvidia-smi -Wed May 25 15:24:00 2022 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -| | | MIG M. 
| -|===============================+======================+======================| -| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | -| 0% 39C P0 197W / 300W | 22022MiB / 23028MiB | 96% Default | -| | | N/A | -+-------------------------------+----------------------+----------------------+ - -+-----------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=============================================================================| -| No running processes found | -+-----------------------------------------------------------------------------+ -[root@8dc88dad905e]# +$ podman exec -it nvidia-driver sh +sh-5.2# nvidia-smi +Tue Jun 11 19:55:25 2024 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+========================+======================| +| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 | +| N/A 47C P0 46W / 150W | 7131MiB / 7680MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ ``` ### Install Container Runtime / Toolkit To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. docker) following NVIDIA's instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). 
-On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, docker is shown, but containerd works too for example): +On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, containerd is shown, but docker works too for example): ```yaml variant: fcos -version: 1.4.0 +version: 1.5.0 storage: files: - - name: acme-layer-nvidia-container-runtime.service + - path: /etc/nvidia-container-runtime/config.toml + mode: 0644 + contents: + inline: | + [nvidia-container-cli] + #debug = "/var/log/nvidia-container-toolkit.log" + root = "/run/nvidia/driver" + path = "/usr/bin/nvidia-container-cli" + # Improvements made in NVIDIA container toolkit 1.15.0 do not yet seem to correctly + # support FCOS so we still need to explicitly add the driver path to ld.so.conf + - path: /etc/ld.so.conf.d/container-toolkit.conf + mode: 0644 + contents: + inline: | + /run/nvidia/driver/usr/lib64 +systemd: + units: + - name: acme-layer-nvidia-container-toolkit.service enabled: true # We run before `zincati.service` to avoid conflicting rpm-ostree transactions. contents: | [Unit] + Wants=network-online.target After=network-online.target Before=zincati.service ConditionPathExists=!/var/lib/%N.stamp @@ -137,13 +169,12 @@ storage: ExecStartPre=-/bin/sh -c 'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ > /etc/yum.repos.d/nvidia-container-toolkit.repo' # Perhaps consider pinning the rpm version here depending on change aversion... - ExecStart=/usr/bin/rpm-ostree install --idempotent --allow-inactive --apply-live nvidia-container-toolkit - ExecStart=/bin/sh -c 'echo "/run/nvidia/driver/usr/lib64" > /etc/ld.so.conf.d/nv.conf; ldconfig' - # If we see that the nvidia-ctk is present, then we can configure docker... 
+ ExecStart=/usr/bin/rpm-ostree install -y --idempotent --allow-inactive nvidia-container-toolkit ExecStart=/bin/sh -c 'if [[ -f /usr/bin/nvidia-ctk ]]; then \ - /usr/bin/nvidia-ctk runtime configure --runtime=docker --nvidia-set-as-default; \ - systemctl restart docker; \ + /usr/bin/nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default; \ + systemctl restart containerd; \ /bin/touch /var/lib/%N.stamp; fi' + ExecStart=/bin/systemctl --no-block reboot Restart=on-failure RestartSec=60 diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver index a8468ad8..c18a3b21 100755 --- a/fedora/nvidia-driver +++ b/fedora/nvidia-driver @@ -129,7 +129,7 @@ _install_prerequisites() ( rm ./*.rpm echo "Installing Linux kernel-modules-core files..." - if ! dnf -q -y download kernel-modules-core${KERNEL_VERSION} > /dev/null; then + if ! dnf -q -y download kernel-modules-core-${KERNEL_VERSION} > /dev/null; then echo "Failed to find kernel-modules-core-${KERNEL_VERSION} in repositories." echo "Trying to download kernel-modules-core from koji..." KOJI_KERNEL_CORE_RPM=$KOJI_BASE_URL/packages/kernel/$KERNEL_RPM_VERSION/$KERNEL_RPM_RELEASE/$KERNEL_RPM_ARCH/kernel-modules-core-$KERNEL_VERSION.rpm @@ -240,9 +240,9 @@ _create_driver_package() ( # lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers # drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64 # lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/ - if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` ]]; then - if [[ ! -e /usr/src/ofa_kernel/`uname -r` ]]; then - ln -s /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` /usr/src/ofa_kernel/ + if [[ -d "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" ]]; then + if [[ ! 
-e "/usr/src/ofa_kernel/$(uname -r)" ]]; then + ln -s "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" /usr/src/ofa_kernel/ fi fi fi @@ -595,14 +595,35 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_apply_patch () { + # Apply a single *.patch file that has been mounted to /patch + if [ -d /patch ]; then + # Exit if multiple patches are found + if [ $(ls -1 /patch/*.patch 2> /dev/null | wc -l) -gt 1 ]; then + echo "Multiple patches found, only one patch is supported" + exit 1 + fi + for patch in /patch/*.patch; do + # Skip the literal, unexpanded glob when /patch contains no *.patch file + [ -e "${patch}" ] || continue + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run --apply-patch "${patch}" -m=${KERNEL_TYPE} + # The patched installer has by default the file name ending '-custom.run' + PATCHED_SUFFIX="-custom" + echo -e "NVIDIA Software installer patched with '${patch}'\n" + done + fi +} + _prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 fi + _apply_patch + # Install the userspace components and copy the kernel module sources. - sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \ - cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-}.run -x -m=${KERNEL_TYPE} && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-} && \ sh /tmp/install.sh nvinstall && \ mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \