Skip to content

ubuntu24.04 ci pipeline fix #592

ubuntu24.04 ci pipeline fix

ubuntu24.04 ci pipeline fix #592

Workflow file for this run

# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run this workflow on a schedule
name: Precompiled images
# on:
# schedule:
# - cron: '00 09 * * *' # scheduled job
on:
pull_request:
types:
- opened
- synchronize
branches:
- test-ubuntu24.04
push:
branches:
- test-ubuntu24.04
jobs:
set-driver-version-matrix:
runs-on: linux-amd64-cpu4
outputs:
driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
dist: ${{ steps.extract_driver_branch.outputs.dist }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Read driver versions
id: extract_driver_branch
run: |
# get driver_branch
# DRIVER_BRANCH=("535" "550")
DRIVER_BRANCH=("550")
driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT
# get kernel flavors
# KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
KERNEL_FLAVORS=("aws")
# KERNEL_FLAVORS=("aws" "generic")
kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
# get ubuntu distributions
DIST=("ubuntu22.04" "ubuntu24.04")
dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .)
echo "dist=$dist_json" >> $GITHUB_OUTPUT
precompiled-build-image:
needs: set-driver-version-matrix
runs-on: linux-amd64-cpu4
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
exclude:
- dist: ubuntu24.04
driver_branch: 535
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
env:
DIST: ${{ matrix.dist }}
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.repository }}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
GENERATE_ARTIFACTS="false"
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
if [ "$DIST" == "ubuntu22.04" ]; then
echo "BASE_TARGET=jammy" >> $GITHUB_OUTPUT
echo "LTS_KERNEL=5.15" >> $GITHUB_OUTPUT
elif [ "$DIST" == "ubuntu24.04" ]; then
echo "BASE_TARGET=noble" >> $GITHUB_OUTPUT
echo "LTS_KERNEL=6.8" >> $GITHUB_OUTPUT
fi
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: ${{ steps.vars.outputs.BASE_TARGET }}
LTS_KERNEL: ${{ steps.vars.outputs.LTS_KERNEL }}
run: |
make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} LTS_KERNEL=${LTS_KERNEL} build-base-${BASE_TARGET}
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver_branch }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_${{ matrix.dist }}
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION}
- name: Save build image and kernel version file
env:
DIST: ${{ matrix.dist }}
PRIVATE_REGISTRY: "ghcr.io"
run: |
source kernel_version.txt
tar -cvf kernel-version-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar kernel_version.txt
docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \
-o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar
# set env for artifacts upload
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=$DIST" >> $GITHUB_ENV
- name: Upload build image as an artifact
uses: actions/upload-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1
- name: Upload kernel version as an artifact
uses: actions/upload-artifact@v4
with:
name: kernel-version-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./kernel-version-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1
determine-e2e-test-matrix:
runs-on: linux-amd64-cpu4
strategy:
matrix:
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
needs:
- precompiled-build-image
- set-driver-version-matrix
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set kernel version
env:
DIST: ${{ matrix.dist }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ "$DIST" == "ubuntu22.04" ]; then
export BASE_TARGET="jammy"
export LTS_KERNEL="5.15"
elif [ "$DIST" == "ubuntu24.04" ]; then
export BASE_TARGET="noble"
export LTS_KERNEL="6.8"
fi
kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
# remove 535 driver branch for ubuntu24.04
if [ "$DIST" == "ubuntu24.04" ]; then
DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do
[[ $branch != "535" ]] && echo "$branch"
done))
fi
source ./tests/scripts/ci-precompiled-helpers.sh
KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
if [ -z "$KERNEL_VERSIONS" ]; then
# no new kernel release
echo "Skipping e2e tests"
exit 0
fi
for i in "${!KERNEL_VERSIONS[@]}"; do
KERNEL_VERSIONS[$i]="${KERNEL_VERSIONS[$i]}-$DIST"
done
# Convert array to JSON format and assign
echo "[]" > ./matrix_values_$DIST.json
printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > ./matrix_values_$DIST.json
- name: Upload kernel matrix values as artifacts
uses: actions/upload-artifact@v4
with:
name: matrix-values-${{ matrix.dist }}
path: ./matrix_values_${{ matrix.dist }}.json
collect-e2e-test-matrix:
runs-on: linux-amd64-cpu4
needs:
- determine-e2e-test-matrix
- set-driver-version-matrix
outputs:
matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Set and append matrix values for ubuntu
id: set_kernel_version
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
# combined_values="[]"
kernel_versions=()
# Read and merge kernel_version values from dist files
DIST=("ubuntu22.04" "ubuntu24.04")
for d in "${DIST[@]}"; do
artifact_name="matrix-values-${d}"
file_path="./matrix_values_${d}.json"
echo "Attempting to download artifact: $artifact_name"
if gh run download --name "$artifact_name" --dir ./; then
echo "Successfully downloaded artifact: $artifact_name"
value=$(cat "$file_path")
kernel_versions+=($(echo "$value" | jq -r '.[]'))
echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
fi
done
echo "Collected Kernel Versions: ${kernel_versions[@]}"
combined_values=$(printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s .)
echo "matrix_values=$combined_values" >> $GITHUB_OUTPUT
e2e-tests-nvidiadriver:
runs-on: linux-amd64-cpu4
needs:
- collect-e2e-test-matrix
- set-driver-version-matrix
if: ${{ needs.collect-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
strategy:
matrix:
kernel_version: ${{ fromJson(needs.collect-e2e-test-matrix.outputs.matrix_values) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
echo "Full Kernel Version: ${{ matrix.kernel_version }}"
KERNEL_VERSION2="${{ matrix.kernel_version }}"
CLEANED_KERNEL_VERSION=$(echo "$KERNEL_VERSION1" | sed -E 's/^\\[\\s*\"|\"\\s*\\]$//g')
echo "Cleaned Kernel Version: $CLEANED_KERNEL_VERSION"
KERNEL_VERSION="${{ matrix.kernel_version }}"
echo "SHIVA1 $KERNEL_VERSION"
KERNEL_VERSION1="$KERNEL_VERSION"
echo "SHIVA3 ${KERNEL_VERSION1}"
KERNEL_VERSION=$(echo "$KERNEL_VERSION" | sed -E 's/^\[|\]$//g')
echo "SHIVA4 ${KERNEL_VERSION[0]}"
# Extract the last segment after the last dash
DIST=$(echo "$KERNEL_VERSION" | awk -F'-' '{print $NF}')
DIST=$(echo "$DIST" | sed 's/]$//') # Remove trailing bracket if present
echo "SHIVA5 $DIST"
# Persist DIST for subsequent steps
echo "DIST=$DIST" >> $GITHUB_ENV
echo "SHIVA6 Extracted DIST: $DIST"
# Debug the values
echo "KERNEL_VERSION=$KERNEL_VERSION"
echo "SHIVA4 $KERNEL_VERSION"
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "SHIVA3 $KERNEL_VERSION"
driver_branch_json="${{ needs.set-driver-version-matrix.outputs.driver_branch }}"
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV
- name: Set up Holodeck
uses: NVIDIA/[email protected]
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
DIST: ${{ env.DIST }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
holodeck_config: "tests/holodeck_${{ env.DIST }}.yaml"
- name: Get public dns name
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
# step added to skip azure e2e tests and publish the image
# FIXME -- remove step once azure kernel upgrade starts working
# - name: Skip azure e2e
# run: |
# if [[ "${KERNEL_VERSION}" == *-azure ]]; then
# echo "e2e test for azure flavor skipped, as kernel upgrade AWS => azure is not supported"
# exit 0
# fi
- name: Install GitHub CLI
run: |
sudo apt-get update
sudo apt-get install -y gh
- name: Upgrade the kernel for Precompiled e2e test
env:
UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
run: |
status=0
./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
# On the target system, all scripts/test-case exit with code 1 for error handling.
# However, since reboot-related disconnections break the SSH connection
# and can cause the entire job to exit, we should ignore all errors except
# exit code 1. During a reboot, exit code 1 will not be thrown, so handling
# other errors as code 1 will ensure proper management of reboot scenarios
if [ $status -eq 1 ]; then
echo "Kernel version $KERNEL_VERSION upgrade failed"
exit 1
fi
./tests/scripts/remote_retry.sh || status=$?
if [ $status -ne 0 ]; then
echo "Failed to connect to remote instance"
exit $status
fi
- name: Precompiled e2e test gpu driver validation
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true \
--set driver.imagePullPolicy=Never"
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
rc=0
# for precompiled driver we are setting driver branch as driver version
DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }})
# remove 535 driver branch for ubuntu24.04
if [ "$DIST" == "ubuntu24.04" ]; then
DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do
[[ $branch != "535" ]] && echo "$branch"
done))
fi
for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
image="driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}"
echo "Downloading $image in tests directory"
gh run download --name $image --dir ./tests/
status=0
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# add escape character for space
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}.tar"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
rm -f $IMAGE_PATH
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
publish-precompiled-image:
runs-on: linux-amd64-cpu4
needs:
- set-driver-version-matrix
- collect-e2e-test-matrix
- e2e-tests-nvidiadriver
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
kernel_version: ${{ fromJson(needs.collect-e2e-test-matrix.outputs.matrix_values) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set image vars
run: |
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
- name: Download built image artifact
uses: actions/download-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}
path: ./
- name: Publish image
run: |
image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar"
echo "uploading $image_path"
docker load -i $image_path
docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }}