Skip to content

ubuntu24.04 ci pipeline fix #539

ubuntu24.04 ci pipeline fix

ubuntu24.04 ci pipeline fix #539

Workflow file for this run

# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run this workflow on a schedule
name: Precompiled images
# on:
# schedule:
# - cron: '00 09 * * *' # scheduled job
on:
pull_request:
types:
- opened
- synchronize
branches:
- ci-precompile-ubuntu24.04
push:
branches:
- ci-precompile-ubuntu24.04
jobs:
set-driver-version-matrix:
runs-on: linux-amd64-cpu4
outputs:
driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
dist: ${{ steps.extract_driver_branch.outputs.dist }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Read driver versions
id: extract_driver_branch
run: |
# get driver_branch
# SHIVA
# DRIVER_BRANCH=("535" "550")
DRIVER_BRANCH=("550")
driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT
# get kernel flavors
# SHIVA
# KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
# KERNEL_FLAVORS=("azure" "generic" "nvidia" "oracle")
# KERNEL_FLAVORS=("aws")
KERNEL_FLAVORS=("generic")
kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
# get ubuntu distributions
# SHIVA
# DIST=("ubuntu22.04" "ubuntu24.04")
# DIST=("ubuntu22.04")
DIST=("ubuntu24.04")
dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .)
echo "dist=$dist_json" >> $GITHUB_OUTPUT
precompiled-build-image:
needs: set-driver-version-matrix
runs-on: linux-amd64-cpu4
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
env:
DIST: ${{ matrix.dist }}
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.repository }}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
GENERATE_ARTIFACTS="false"
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
if [ "$DIST" == "ubuntu22.04" ]; then
echo "BASE_TARGET=jammy" >> $GITHUB_OUTPUT
echo "LTS_KERNEL=5.15" >> $GITHUB_OUTPUT
elif [ "$DIST" == "ubuntu24.04" ]; then
echo "BASE_TARGET=noble" >> $GITHUB_OUTPUT
echo "LTS_KERNEL=6.8" >> $GITHUB_OUTPUT
fi
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: ${{ steps.vars.outputs.BASE_TARGET }}
LTS_KERNEL: ${{ steps.vars.outputs.LTS_KERNEL }}
run: |
make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} LTS_KERNEL=${LTS_KERNEL} build-base-${BASE_TARGET}
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver_branch }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_${{ matrix.dist }}
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION}
- name: Save build image and kernel version file
env:
DIST: ${{ matrix.dist }}
PRIVATE_REGISTRY: "ghcr.io"
run: |
source kernel_version.txt
tar -cvf kernel-version-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar kernel_version.txt
docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \
-o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar
# set env for artifacts upload
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=$DIST" >> $GITHUB_ENV
- name: Upload build image as an artifact
uses: actions/upload-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1
- name: Upload kernel version as an artifact
uses: actions/upload-artifact@v4
with:
name: kernel-version-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./kernel-version-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1
determine-e2e-test-matrix:
runs-on: linux-amd64-cpu4
strategy:
matrix:
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
needs:
- precompiled-build-image
- set-driver-version-matrix
outputs:
matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
dist: ${{ steps.set-driver-version-matrix.outputs.dist }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set kernel version
id: set_kernel_version
env:
DIST: ${{ matrix.dist }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
if [ "$DIST" == "ubuntu22.04" ]; then
export BASE_TARGET="jammy"
export LTS_KERNEL="5.15"
elif [ "$DIST" == "ubuntu24.04" ]; then
export BASE_TARGET="noble"
export LTS_KERNEL="6.8"
fi
kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
source ./tests/scripts/ci-precompiled-helpers.sh
KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
if [ -z "$KERNEL_VERSIONS" ]; then
# no new kernel release
echo "Skipping e2e tests"
exit 0
fi
# Convert array to JSON format and assign
echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
# determine-e2e-test-matrix-first-release:
# runs-on: linux-amd64-cpu4
# if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '0' }}
# strategy:
# matrix:
# dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
# needs:
# - precompiled-build-image
# - set-driver-version-matrix
# - determine-e2e-test-matrix
# outputs:
# matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
# matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
# dist: ${{ steps.set-driver-version-matrix.outputs.dist }}
# steps:
# - name: Check out code
# uses: actions/checkout@v4
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v3
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}
# # download kernel version file for first release
# - name: Download kernel version artifact
# env:
# DIST: ${{ matrix.dist }}
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# run: |
# if [ "$DIST" == "ubuntu22.04" ]; then
# export BASE_TARGET="jammy"
# export LTS_KERNEL="5.15"
# elif [ "$DIST" == "ubuntu24.04" ]; then
# export BASE_TARGET="noble"
# export LTS_KERNEL="6.8"
# fi
# echo "SHIVA1"
# gh api -X GET /repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts --jq '.artifacts[].name'
# echo "SHIVA2"
# echo "SHIVA"
# DRIVER_BRANCH="550"
# prefix="kernel-version-550-6.8"
# suffix="generic-ubuntu24.04"
# artifacts=$(gh api -X GET /repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/artifacts --jq '.artifacts[].name')
# echo "SHIVA 11111 $artifacts"
# # Use a loop or a pattern to find the matching artifact dynamically
# for artifact in $artifacts; do
# echo "SHIVA 22222 $artifact"
# if [[ $artifact == $prefix*-$suffix ]]; then
# echo "Found matching artifact: $artifact"
# # Download the artifact
# gh run download --name "$artifact" --dir ./
# ls
# tar -xvf $artifact.tar
# ls
# cat kernel_version.txt
# # Exit loop once downloaded
# break
# fi
# done
# - name: Set kernel version
# id: set_kernel_version
# env:
# DIST: ${{ matrix.dist }}
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# run: |
# echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
# if [ "$DIST" == "ubuntu22.04" ]; then
# export BASE_TARGET="jammy"
# export LTS_KERNEL="5.15"
# elif [ "$DIST" == "ubuntu24.04" ]; then
# export BASE_TARGET="noble"
# export LTS_KERNEL="6.8"
# fi
# kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
# KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
# driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
# DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
# source ./tests/scripts/ci-precompiled-helpers.sh
# KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
# if [ -z "$KERNEL_VERSIONS" ]; then
# # no new kernel release
# echo "Skipping e2e tests"
# exit 0
# fi
# # Convert array to JSON format and assign
# echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
# printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
# echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
# echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
e2e-tests-nvidiadriver:
runs-on: linux-amd64-cpu4
needs:
- determine-e2e-test-matrix
- set-driver-version-matrix
# - determine-e2e-test-matrix-first-release
if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
strategy:
matrix:
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Holodeck
uses: NVIDIA/[email protected]
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
holodeck_config: "tests/holodeck_${{ matrix.dist }}.yaml"
- name: Get public dns name
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
KERNEL_VERSION="${{ matrix.kernel_version }}"
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
driver_branch_json="${{ needs.set-driver-version-matrix.outputs.driver_branch }}"
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV
# step added to skip azure e2e tests and publish the image
# FIXME -- remove step once azure kernel upgrade starts working
- name: Skip azure e2e
run: |
if [[ "${KERNEL_VERSION}" == *-azure ]]; then
echo "e2e test for azure flavor skipped, as kernel upgrade AWS => azure is not supported"
exit 0
fi
- name: Install GitHub CLI
run: |
sudo apt-get update
sudo apt-get install -y gh
- name: Upgrade the kernel for Precompiled e2e test
env:
UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
run: |
status=0
./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
# On the target system, all scripts/test-case exit with code 1 for error handling.
# However, since reboot-related disconnections break the SSH connection
# and can cause the entire job to exit, we should ignore all errors except
# exit code 1. During a reboot, exit code 1 will not be thrown, so handling
# other errors as code 1 will ensure proper management of reboot scenarios
if [ $status -eq 1 ]; then
echo "Kernel version $KERNEL_VERSION upgrade failed"
exit 1
fi
./tests/scripts/remote_retry.sh || status=$?
if [ $status -ne 0 ]; then
echo "Failed to connect to remote instance"
exit $status
fi
- name: Precompiled e2e test gpu driver validation
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true \
--set driver.imagePullPolicy=Never"
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
rc=0
# for precompiled driver we are setting driver branch as driver version
DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }})
for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
image="driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}"
echo "Downloading $image in tests directory"
gh run download --name $image --dir ./tests/
status=0
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# add escape character for space
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
rm -f $IMAGE_PATH
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
publish-precompiled-image:
runs-on: linux-amd64-cpu4
needs:
- set-driver-version-matrix
- determine-e2e-test-matrix
- e2e-tests-nvidiadriver
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set image vars
run: |
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
- name: Download built image artifact
uses: actions/download-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
path: ./
- name: Publish image
run: |
image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}.tar"
echo "uploading $image_path"
docker load -i $image_path
docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}