From 065efd1b25b34e45f19cbc3e4470e74a95068bfc Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 14 Apr 2022 14:31:14 +0200 Subject: [PATCH 01/48] Add scripts to support CUDA CUDA itself is not shipped by EESSI and has to be installed on the host. The scripts perform various checks and download and install the CUDA compat libs. Modules with CUDA as a dependency are hidden in Lmod, unless the CUDA compat libs are installed which is only done when CUDA itself is installed on the host. This aspect still has to be tested with an updated Lmod version in the EESSI compat layer. --- EESSI-pilot-install-software.sh | 3 + eb_hooks.py | 21 ++- gpu_support/add_gpu_support.sh | 183 ++++++++++++++++++++++ gpu_support/get_latest_cuda_compatlibs.sh | 21 +++ gpu_support/setup.sh | 171 ++++++++++++++++++++ gpu_support/test_cuda | 39 +++++ init/SitePackage.lua | 29 ++++ init/bash | 5 + 8 files changed, 471 insertions(+), 1 deletion(-) create mode 100755 gpu_support/add_gpu_support.sh create mode 100755 gpu_support/get_latest_cuda_compatlibs.sh create mode 100644 gpu_support/setup.sh create mode 100644 gpu_support/test_cuda create mode 100644 init/SitePackage.lua diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index a25c3f1d9e..8b6db2cc92 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -108,6 +108,9 @@ module --force purge # ignore current $MODULEPATH entirely module unuse $MODULEPATH module use $EASYBUILD_INSTALLPATH/modules/all +if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then + module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all +fi if [[ -z ${MODULEPATH} ]]; then fatal_error "Failed to set up \$MODULEPATH?!" 
else diff --git a/eb_hooks.py b/eb_hooks.py index 653094266d..2c2e6f16cc 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -7,7 +7,7 @@ from easybuild.tools.systemtools import AARCH64, POWER, get_cpu_architecture EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' - +CUDA_ENABLED_TOOLCHAINS = ["pmvmklc", "gmvmklc", "gmvapich2c", "pmvapich2c"] def get_eessi_envvar(eessi_envvar): """Get an EESSI environment variable from the environment""" @@ -41,6 +41,23 @@ def get_rpath_override_dirs(software_name): return rpath_injection_dirs +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ( + "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])] + or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS + ): + key = "modluafooter" + value = 'add_property("arch","gpu")' + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + ec.log.info("[parse hook] Injecting gpu as Lmod arch property") + + return ec def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" @@ -48,6 +65,8 @@ def parse_hook(ec, *args, **kwargs): # determine path to Prefix installation in compat layer via $EPREFIX eprefix = get_eessi_envvar('EPREFIX') + ec = inject_gpu_property(ec) + if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh new file mode 100755 index 0000000000..730089776a --- /dev/null +++ b/gpu_support/add_gpu_support.sh @@ -0,0 +1,183 @@ +# Drop into the prefix shell or pipe this script into a Prefix shell with +# $EPREFIX/startprefix <<< /path/to/this_script.sh + +# verify existence of nvidia-smi or this is a waste of time +# Check if nvidia-smi exists and can be executed without error +if command -v nvidia-smi > /dev/null 2>&1; then + nvidia-smi > /dev/null 2>&1 + if [ $? 
-ne 0 ]; then + echo "nvidia-smi was found but returned error code, exiting now..." >&2 + exit 1 + fi + echo "nvidia-smi found, continue setup." +else + echo "nvidia-smi not found, exiting now..." >&2 + exit 1 +fi + +# set up basic environment variables, EasyBuild and Lmod +# TODO: copied necessary parts from EESSI-pilot-install-software.sh, trim further down? +source setup.sh + +# Get arch type from EESSI environment +eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" + +# Get OS family +# TODO: needs more thorough testing +os_family=$(uname | tr '[:upper:]' '[:lower:]') + +# Get OS version +# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348 +if [ -f /etc/os-release ]; then + # freedesktop.org and systemd + . /etc/os-release + os=$NAME + ver=$VERSION_ID + if [[ "$os" == *"Rocky"* ]]; then + os="rhel" + fi + if [[ "$os" == *"Debian"* ]]; then + os="debian" + fi +elif type lsb_release >/dev/null 2>&1; then + # linuxbase.org + os=$(lsb_release -si) + ver=$(lsb_release -sr) +elif [ -f /etc/lsb-release ]; then + # For some versions of Debian/Ubuntu without lsb_release command + . /etc/lsb-release + os=$DISTRIB_ID + ver=$DISTRIB_RELEASE +elif [ -f /etc/debian_version ]; then + # Older Debian/Ubuntu/etc. + os=Debian + ver=$(cat /etc/debian_version) +else + # Fall back to uname, e.g. "Linux ", also works for BSD, etc. + os=$(uname -s) + ver=$(uname -r) +fi +# Convert OS version to major versions, e.g. rhel8.5 -> rhel8 +# TODO: needs testing for e.g. 
Ubuntu 20.04 +ver=${ver%.*} + +############################################################################################## +# Check that the CUDA driver version is adequate +# ( +# needs to be r450 or r470 which are LTS, other production branches are acceptable but not +# recommended, below r450 is not compatible [with an exception we will not explore, see +# https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers] +# ) +# only check first number in case of multiple GPUs +driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) +driver_version="${driver_version%%.*}" +# Now check driver_version for compatibility +# Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers +if (( $driver_version < 450 )); then + echo "Your NVIDIA driver version is too old, please update first.." + exit 1 +fi + + +# Check if the CUDA compat libraries are installed and compatible with the target CUDA version +# if not find the latest version of the compatibility libraries and install them + +# get URL to latest CUDA compat libs, exit if URL is invalid +latest_cuda_compat_url="$(./get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})" +ret=$? +if [ $ret -ne 0 ]; then + echo $latest_cuda_compat_url + exit 1 +fi + +# Create a general space for our NVIDIA compat drivers +if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then + mkdir -p /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia + cd /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia +else + echo "Cannot write to eessi host_injections space, exiting now..." 
>&2 + exit 1 +fi + +# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest + +driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +eessi_cuda_version=$(LD_LIBRARY_PATH=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi + +# Check if our target CUDA is satisfied by what is installed already +# TODO: Find required CUDA version and see if we need an update + +# If not, grab the latest compat library RPM or deb +# download and unpack in temporary directory, easier cleanup after installation +mkdir -p tmp +cd tmp +compat_file=${latest_cuda_compat_url##*/} +wget ${latest_cuda_compat_url} + +# Unpack it +# (the requirements here are OS dependent, can we get around that?) +# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e) +# (deb files can be unpacked with ar and tar) +file_extension=${compat_file##*.} +if [[ ${file_extension} == "rpm" ]]; then + rpm2cpio ${compat_file} | cpio -idmv +elif [[ ${file_extension} == "deb" ]]; then + ar x ${compat_file} + tar xf data.tar.* +else + echo "File extension of cuda compat lib not supported, exiting now..." >&2 + exit 1 +fi +cd .. +# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir +mv -n tmp/usr/local/cuda-* . +rm -r tmp + +# Add a symlink that points to the latest version +latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1) +echo $latest_cuda_dir +ln -sf ${latest_cuda_dir} latest + +if [ ! 
-e latest ] ; then + echo "Symlink to latest cuda compat lib version is broken, exiting now..." + exit 1 +fi + +# Create the space to host the libraries +mkdir -p /cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} +# Symlink in the path to the latest libraries +if [ ! -d "/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}/lib" ]; then + ln -s /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat /cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}/lib +fi + +# return to initial dir +cd $current_dir + +############################################################################################### +############################################################################################### +# Install CUDA +# TODO: Can we do a trimmed install? +# if modules dir exists, load it for usage within Lmod +if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then + module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all +fi +# only install CUDA if specified version is not found +install_cuda_version="11.3.1" +module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> ${ml_av_easybuild_out} +if [[ $? -eq 0 ]]; then + echo_green ">> CUDA module found!" +else + # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need to worry too much about this) + # TODO: The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed + avail_space=$(df --output=avail /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ | tail -n 1 | awk '{print $1}') + if (( ${avail_space} < 16000000 )); then + echo "Need more disk space to install CUDA, exiting now..." 
+ exit 1 + fi + # install cuda in host_injections + eb --installpath=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ CUDA-${install_cuda_version}.eb +fi + +source test_cuda diff --git a/gpu_support/get_latest_cuda_compatlibs.sh b/gpu_support/get_latest_cuda_compatlibs.sh new file mode 100755 index 0000000000..91680f5e2e --- /dev/null +++ b/gpu_support/get_latest_cuda_compatlibs.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +os=$1 +ver=$2 +eessi_cpu_family=$3 + +# build URL for CUDA libraries +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" +# get latest version, files are sorted by date +# TODO: probably better to explicitly check version numbers than trusting that it is sorted +latest_file=$(curl -s "${cuda_url}" | grep 'cuda-compat' | tail -1) +if [[ -z "${latest_file// }" ]]; then + echo "Could not find any compat lib files under" ${cuda_url} + exit 1 +fi +# extract actual file name from html snippet +file=$(echo $latest_file | sed 's/<\/\?[^>]\+>//g') +# build final URL for wget +cuda_url="${cuda_url}$file" +# simply echo the URL, result will be used by add_gpu_support.sh +echo $cuda_url diff --git a/gpu_support/setup.sh b/gpu_support/setup.sh new file mode 100644 index 0000000000..15689fcbad --- /dev/null +++ b/gpu_support/setup.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# Script to install EESSI pilot software stack (version 2021.12) +# +current_dir=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$current_dir") + +function echo_green() { + echo -e "\e[32m$1\e[0m" +} + +function echo_red() { + echo -e "\e[31m$1\e[0m" +} + +function echo_yellow() { + echo -e "\e[33m$1\e[0m" +} + +function fatal_error() { + echo_red "ERROR: $1" >&2 + exit 1 +} + +function check_exit_code { + ec=$1 + ok_msg=$2 + fail_msg=$3 + + if [[ $ec -eq 0 ]]; then + echo_green "${ok_msg}" + else + fatal_error "${fail_msg}" + fi +} + +# honor $TMPDIR if it is already defined, use /tmp otherwise +if [ -z $TMPDIR ]; then + export WORKDIR=/tmp/$USER +else 
+ export WORKDIR=$TMPDIR/$USER +fi + +TMPDIR=$(mktemp -d) + +echo ">> Setting up environment..." + +source $TOPDIR/init/minimal_eessi_env + +if [ -d $EESSI_CVMFS_REPO ]; then + echo_green "$EESSI_CVMFS_REPO available, OK!" +else + fatal_error "$EESSI_CVMFS_REPO is not available!" +fi + +# make sure we're in Prefix environment by checking $SHELL +if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then + echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" +else + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" +fi + +# avoid that pyc files for EasyBuild are stored in EasyBuild installation directory +export PYTHONPYCACHEPREFIX=$TMPDIR/pycache + +DETECTION_PARAMETERS='' +GENERIC=0 +EB='eb' +if [[ "$1" == "--generic" || "$EASYBUILD_OPTARCH" == "GENERIC" ]]; then + echo_yellow ">> GENERIC build requested, taking appropriate measures!" + DETECTION_PARAMETERS="$DETECTION_PARAMETERS --generic" + GENERIC=1 + EB='eb --optarch=GENERIC' +fi + +echo ">> Determining software subdirectory to use for current build host..." +export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) + +# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) +# $EESSI_SILENT - don't print any messages +# $EESSI_BASIC_ENV - give a basic set of environment variables +EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables + +if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then + fatal_error "Failed to determine software subdirectory?!" +elif [[ "${EESSI_SOFTWARE_SUBDIR}" != "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then + fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!" +else + echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!" +fi + +echo ">> Initializing Lmod..." 
+source $EPREFIX/usr/share/Lmod/init/bash +ml_version_out=$TMPDIR/ml.out +ml --version &> $ml_version_out +if [[ $? -eq 0 ]]; then + echo_green ">> Found Lmod ${LMOD_VERSION}" +else + fatal_error "Failed to initialize Lmod?! (see output in ${ml_version_out}" +fi + +echo ">> Configuring EasyBuild..." +# need to actually change dir because of the way configure_easybuild is written +cd .. +source configure_easybuild +cd - + +echo ">> Setting up \$MODULEPATH..." +# make sure no modules are loaded +module --force purge +# ignore current $MODULEPATH entirely +module unuse $MODULEPATH +module use $EASYBUILD_INSTALLPATH/modules/all +if [[ -z ${MODULEPATH} ]]; then + fatal_error "Failed to set up \$MODULEPATH?!" +else + echo_green ">> MODULEPATH set up: ${MODULEPATH}" +fi + +REQ_EB_VERSION='4.5.0' + +echo ">> Checking for EasyBuild module..." +ml_av_easybuild_out=$TMPDIR/ml_av_easybuild.out +module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} +if [[ $? -eq 0 ]]; then + echo_green ">> EasyBuild module found!" +else + echo_yellow ">> No EasyBuild module yet, installing it..." + + EB_TMPDIR=${TMPDIR}/ebtmp + echo ">> Temporary installation (in ${EB_TMPDIR})..." + pip_install_out=${TMPDIR}/pip_install.out + pip3 install --prefix $EB_TMPDIR easybuild &> ${pip_install_out} + + echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..." + export PATH=${EB_TMPDIR}/bin:$PATH + export PYTHONPATH=$(ls -d ${EB_TMPDIR}/lib/python*/site-packages):$PYTHONPATH + eb_install_out=${TMPDIR}/eb_install.out + eb --install-latest-eb-release &> ${eb_install_out} + + eb --search EasyBuild-${REQ_EB_VERSION}.eb | grep EasyBuild-${REQ_EB_VERSION}.eb > /dev/null + if [[ $? -eq 0 ]]; then + eb EasyBuild-${REQ_EB_VERSION}.eb >> ${eb_install_out} 2>&1 + fi + + module avail easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} + if [[ $? -eq 0 ]]; then + echo_green ">> EasyBuild module installed!" + else + fatal_error "EasyBuild/${REQ_EB_VERSION} module failed to install?! 
(output of 'pip install' in ${pip_install_out}, output of 'eb' in ${eb_install_out}, output of 'ml av easybuild' in ${ml_av_easybuild_out})" + fi +fi + +echo ">> Loading EasyBuild module..." +module load EasyBuild/$REQ_EB_VERSION +eb_show_system_info_out=${TMPDIR}/eb_show_system_info.out +$EB --show-system-info > ${eb_show_system_info_out} +if [[ $? -eq 0 ]]; then + echo_green ">> EasyBuild seems to be working!" + $EB --version | grep "${REQ_EB_VERSION}" + if [[ $? -eq 0 ]]; then + echo_green "Found EasyBuild version ${REQ_EB_VERSION}, looking good!" + else + $EB --version + fatal_error "Expected to find EasyBuild version ${REQ_EB_VERSION}, giving up here..." + fi + $EB --show-config +else + cat ${eb_show_system_info_out} + fatal_error "EasyBuild not working?!" +fi diff --git a/gpu_support/test_cuda b/gpu_support/test_cuda new file mode 100644 index 0000000000..cf4a7476a7 --- /dev/null +++ b/gpu_support/test_cuda @@ -0,0 +1,39 @@ +#!/bin/bash + +# Test CUDA +module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/ +module load CUDA +tmp_dir=$(mktemp -d) +cp -r $EBROOTCUDA/samples $tmp_dir +current_dir=$PWD +cd $tmp_dir/samples/1_Utilities/deviceQuery +make HOST_COMPILER=$(which g++) -j +./deviceQuery + +if [ $? -eq 0 ] +then + # Set the color variable + green='\033[0;32m' + # Clear the color after that + clear='\033[0m' + echo -e ${green} + echo "Congratulations, your GPU is working with EESSI!" 
+ echo " - To build CUDA enabled modules use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/" + echo -e ${clear} +else + echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 +fi + +# Clean up +cd $current_dir +rm -r $tmp_dir + +# Test building something with CUDA and running +# TODO: Use samples from installation directory, `device_query` is a good option + +# Test a CUDA-enabled module from EESSI +# TODO: GROMACS? +# TODO: Include a GDR copy test? +############################################################################################### diff --git a/init/SitePackage.lua b/init/SitePackage.lua new file mode 100644 index 0000000000..6e9720a17f --- /dev/null +++ b/init/SitePackage.lua @@ -0,0 +1,29 @@ +require("strict") +local hook = require("Hook") + +-- from https://stackoverflow.com/a/40195356 +--- Check if a file or directory exists in this path +function exists(file) + local ok, err, code = os.rename(file, file) + if not ok then + if code == 13 then + -- Permission denied, but it exists + return true + end + end + return ok, err +end + +local function visible_hook(modT) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local cudaDir = exists('/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/') + if not cudaDir then + local haveGpu = mt:haveProperty(modT.sn,"arch","gpu") + if haveGpu then + modT.isVisible = false + end + end +end + +hook.register("isVisibleHook", visible_hook) diff --git a/init/bash b/init/bash index ea605db0b5..c5c1a583e7 100644 --- a/init/bash +++ b/init/bash @@ -19,6 +19,11 @@ if [ $? -eq 0 ]; then # see https://github.com/EESSI/software-layer/issues/52 export PATH=$EPREFIX/usr/bin:$EPREFIX/bin:$PATH + # used for EESSI specific SitePackage.lua, hide GPU modules if CUDA is not installed + # TODO: better place to store SitePackage file? 
+ # TODO: another method to define path to lua file? + export LMOD_PACKAGE_PATH=$(dirname "$BASH_SOURCE") + # init Lmod echo "Initializing Lmod..." >> $output source $EESSI_EPREFIX/usr/share/Lmod/init/bash From caa43bf638c09d1250f6a3f5f5bca19fab539da2 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 12 May 2022 14:45:03 +0200 Subject: [PATCH 02/48] Fix dump location of check whether CUDA module is installed --- gpu_support/add_gpu_support.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 730089776a..9fbd9cf28c 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -164,7 +164,7 @@ if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then fi # only install CUDA if specified version is not found install_cuda_version="11.3.1" -module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> ${ml_av_easybuild_out} +module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null if [[ $? -eq 0 ]]; then echo_green ">> CUDA module found!" else From d7212a068e0b7a64349001af7071912b57c6b935 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 12 May 2022 14:49:06 +0200 Subject: [PATCH 03/48] Remove setup script, use shipped init script to set env vars etc. instead --- gpu_support/add_gpu_support.sh | 7 +- gpu_support/setup.sh | 171 --------------------------------- 2 files changed, 5 insertions(+), 173 deletions(-) delete mode 100644 gpu_support/setup.sh diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 9fbd9cf28c..4773f97dbc 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -16,8 +16,9 @@ else fi # set up basic environment variables, EasyBuild and Lmod -# TODO: copied necessary parts from EESSI-pilot-install-software.sh, trim further down? 
-source setup.sh +EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash + +current_dir=$(dirname $(realpath $0)) # Get arch type from EESSI environment eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" @@ -177,7 +178,9 @@ else exit 1 fi # install cuda in host_injections + module load EasyBuild eb --installpath=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ CUDA-${install_cuda_version}.eb fi +cd $current_dir source test_cuda diff --git a/gpu_support/setup.sh b/gpu_support/setup.sh deleted file mode 100644 index 15689fcbad..0000000000 --- a/gpu_support/setup.sh +++ /dev/null @@ -1,171 +0,0 @@ -#!/bin/bash -# -# Script to install EESSI pilot software stack (version 2021.12) -# -current_dir=$(dirname $(realpath $0)) -TOPDIR=$(dirname "$current_dir") - -function echo_green() { - echo -e "\e[32m$1\e[0m" -} - -function echo_red() { - echo -e "\e[31m$1\e[0m" -} - -function echo_yellow() { - echo -e "\e[33m$1\e[0m" -} - -function fatal_error() { - echo_red "ERROR: $1" >&2 - exit 1 -} - -function check_exit_code { - ec=$1 - ok_msg=$2 - fail_msg=$3 - - if [[ $ec -eq 0 ]]; then - echo_green "${ok_msg}" - else - fatal_error "${fail_msg}" - fi -} - -# honor $TMPDIR if it is already defined, use /tmp otherwise -if [ -z $TMPDIR ]; then - export WORKDIR=/tmp/$USER -else - export WORKDIR=$TMPDIR/$USER -fi - -TMPDIR=$(mktemp -d) - -echo ">> Setting up environment..." - -source $TOPDIR/init/minimal_eessi_env - -if [ -d $EESSI_CVMFS_REPO ]; then - echo_green "$EESSI_CVMFS_REPO available, OK!" -else - fatal_error "$EESSI_CVMFS_REPO is not available!" -fi - -# make sure we're in Prefix environment by checking $SHELL -if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then - echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" -else - fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" 
-fi - -# avoid that pyc files for EasyBuild are stored in EasyBuild installation directory -export PYTHONPYCACHEPREFIX=$TMPDIR/pycache - -DETECTION_PARAMETERS='' -GENERIC=0 -EB='eb' -if [[ "$1" == "--generic" || "$EASYBUILD_OPTARCH" == "GENERIC" ]]; then - echo_yellow ">> GENERIC build requested, taking appropriate measures!" - DETECTION_PARAMETERS="$DETECTION_PARAMETERS --generic" - GENERIC=1 - EB='eb --optarch=GENERIC' -fi - -echo ">> Determining software subdirectory to use for current build host..." -export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) - -# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) -# $EESSI_SILENT - don't print any messages -# $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables - -if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then - fatal_error "Failed to determine software subdirectory?!" -elif [[ "${EESSI_SOFTWARE_SUBDIR}" != "${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" ]]; then - fatal_error "Values for EESSI_SOFTWARE_SUBDIR_OVERRIDE (${EESSI_SOFTWARE_SUBDIR_OVERRIDE}) and EESSI_SOFTWARE_SUBDIR (${EESSI_SOFTWARE_SUBDIR}) differ!" -else - echo_green ">> Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory!" -fi - -echo ">> Initializing Lmod..." -source $EPREFIX/usr/share/Lmod/init/bash -ml_version_out=$TMPDIR/ml.out -ml --version &> $ml_version_out -if [[ $? -eq 0 ]]; then - echo_green ">> Found Lmod ${LMOD_VERSION}" -else - fatal_error "Failed to initialize Lmod?! (see output in ${ml_version_out}" -fi - -echo ">> Configuring EasyBuild..." -# need to actually change dir because of the way configure_easybuild is written -cd .. -source configure_easybuild -cd - - -echo ">> Setting up \$MODULEPATH..." 
-# make sure no modules are loaded -module --force purge -# ignore current $MODULEPATH entirely -module unuse $MODULEPATH -module use $EASYBUILD_INSTALLPATH/modules/all -if [[ -z ${MODULEPATH} ]]; then - fatal_error "Failed to set up \$MODULEPATH?!" -else - echo_green ">> MODULEPATH set up: ${MODULEPATH}" -fi - -REQ_EB_VERSION='4.5.0' - -echo ">> Checking for EasyBuild module..." -ml_av_easybuild_out=$TMPDIR/ml_av_easybuild.out -module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} -if [[ $? -eq 0 ]]; then - echo_green ">> EasyBuild module found!" -else - echo_yellow ">> No EasyBuild module yet, installing it..." - - EB_TMPDIR=${TMPDIR}/ebtmp - echo ">> Temporary installation (in ${EB_TMPDIR})..." - pip_install_out=${TMPDIR}/pip_install.out - pip3 install --prefix $EB_TMPDIR easybuild &> ${pip_install_out} - - echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..." - export PATH=${EB_TMPDIR}/bin:$PATH - export PYTHONPATH=$(ls -d ${EB_TMPDIR}/lib/python*/site-packages):$PYTHONPATH - eb_install_out=${TMPDIR}/eb_install.out - eb --install-latest-eb-release &> ${eb_install_out} - - eb --search EasyBuild-${REQ_EB_VERSION}.eb | grep EasyBuild-${REQ_EB_VERSION}.eb > /dev/null - if [[ $? -eq 0 ]]; then - eb EasyBuild-${REQ_EB_VERSION}.eb >> ${eb_install_out} 2>&1 - fi - - module avail easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} - if [[ $? -eq 0 ]]; then - echo_green ">> EasyBuild module installed!" - else - fatal_error "EasyBuild/${REQ_EB_VERSION} module failed to install?! (output of 'pip install' in ${pip_install_out}, output of 'eb' in ${eb_install_out}, output of 'ml av easybuild' in ${ml_av_easybuild_out})" - fi -fi - -echo ">> Loading EasyBuild module..." -module load EasyBuild/$REQ_EB_VERSION -eb_show_system_info_out=${TMPDIR}/eb_show_system_info.out -$EB --show-system-info > ${eb_show_system_info_out} -if [[ $? -eq 0 ]]; then - echo_green ">> EasyBuild seems to be working!" 
- $EB --version | grep "${REQ_EB_VERSION}" - if [[ $? -eq 0 ]]; then - echo_green "Found EasyBuild version ${REQ_EB_VERSION}, looking good!" - else - $EB --version - fatal_error "Expected to find EasyBuild version ${REQ_EB_VERSION}, giving up here..." - fi - $EB --show-config -else - cat ${eb_show_system_info_out} - fatal_error "EasyBuild not working?!" -fi From 48f445595f751ca909a1b537fe8694404f4f61ba Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 13 May 2022 09:21:49 +0200 Subject: [PATCH 04/48] Check return values and path existence in CUDA tests --- gpu_support/test_cuda | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gpu_support/test_cuda b/gpu_support/test_cuda index cf4a7476a7..098efdc049 100644 --- a/gpu_support/test_cuda +++ b/gpu_support/test_cuda @@ -1,8 +1,18 @@ #!/bin/bash # Test CUDA -module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/ +if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then + module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/ +else + echo "Cannot test CUDA, modules path does not exist, exiting now..." + exit 1 +fi module load CUDA +ret=$? +if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." 
+ exit 1 +fi tmp_dir=$(mktemp -d) cp -r $EBROOTCUDA/samples $tmp_dir current_dir=$PWD From c50daa2efea4968117f1a5c0f7c94fc487e097b1 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 13 May 2022 09:43:43 +0200 Subject: [PATCH 05/48] Check return value of eb install, improve source of other scripts --- gpu_support/add_gpu_support.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 4773f97dbc..04656384e9 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -84,7 +84,7 @@ fi # if not find the latest version of the compatibility libraries and install them # get URL to latest CUDA compat libs, exit if URL is invalid -latest_cuda_compat_url="$(./get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})" +latest_cuda_compat_url="$($(dirname "$BASH_SOURCE")/get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})" ret=$? if [ $ret -ne 0 ]; then echo $latest_cuda_compat_url @@ -180,7 +180,12 @@ else # install cuda in host_injections module load EasyBuild eb --installpath=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ CUDA-${install_cuda_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "CUDA installation failed, please check EasyBuild logs..." 
+ exit 1 + fi fi cd $current_dir -source test_cuda +source $(dirname "$BASH_SOURCE")/test_cuda From d4e85cce3b53767d8ccdd27f016241c742c7d97b Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 13 May 2022 09:44:37 +0200 Subject: [PATCH 06/48] Use mktemp to create temporary directory to install compat libs --- gpu_support/add_gpu_support.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 04656384e9..dd38c6487b 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -111,8 +111,9 @@ if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to # If not, grab the latest compat library RPM or deb # download and unpack in temporary directory, easier cleanup after installation -mkdir -p tmp -cd tmp +host_injections_dir=$(dirname $(realpath $0)) +tmpdir=$(mktemp -d) +cd $tmpdir compat_file=${latest_cuda_compat_url##*/} wget ${latest_cuda_compat_url} @@ -130,10 +131,10 @@ else echo "File extension of cuda compat lib not supported, exiting now..." >&2 exit 1 fi -cd .. +cd $host_injections_dir # TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir -mv -n tmp/usr/local/cuda-* . -rm -r tmp +mv -n ${tmpdir}/usr/local/cuda-* . +rm -r ${tmpdir} # Add a symlink that points to the latest version latest_cuda_dir=$(find . 
-maxdepth 1 -type d | grep -i cuda | sort | tail -n1) From 590e04291b26bdcada279f215951dacfb184664b Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 13 May 2022 09:45:06 +0200 Subject: [PATCH 07/48] Fix echo --- gpu_support/add_gpu_support.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index dd38c6487b..38737b1743 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -168,7 +168,7 @@ fi install_cuda_version="11.3.1" module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null if [[ $? -eq 0 ]]; then - echo_green ">> CUDA module found!" + echo "CUDA module found! No need to install CUDA again, proceeding with tests" else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about this) From 7b9bb4946fbb0aaccaa5cd5e2e97f2d9753409ea Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 16 May 2022 13:18:09 +0200 Subject: [PATCH 08/48] Replace explicit dir names with variables, check symlink destination --- gpu_support/add_gpu_support.sh | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 38737b1743..c2dc86cb6d 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -92,9 +92,10 @@ if [ $ret -ne 0 ]; then fi # Create a general space for our NVIDIA compat drivers +host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then - mkdir -p /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia - cd /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia + mkdir -p ${host_injections_dir} + cd ${host_injections_dir} else echo "Cannot write to eessi host_injections space, exiting now..." 
>&2 exit 1 @@ -103,7 +104,7 @@ fi # Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -eessi_cuda_version=$(LD_LIBRARY_PATH=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi # Check if our target CUDA is satisfied by what is installed already @@ -111,7 +112,6 @@ if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to # If not, grab the latest compat library RPM or deb # download and unpack in temporary directory, easier cleanup after installation -host_injections_dir=$(dirname $(realpath $0)) tmpdir=$(mktemp -d) cd $tmpdir compat_file=${latest_cuda_compat_url##*/} @@ -138,7 +138,6 @@ rm -r ${tmpdir} # Add a symlink that points to the latest version latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1) -echo $latest_cuda_dir ln -sf ${latest_cuda_dir} latest if [ ! -e latest ] ; then @@ -147,10 +146,15 @@ if [ ! -e latest ] ; then fi # Create the space to host the libraries -mkdir -p /cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} +host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} +mkdir -p ${host_injection_libs_dir} # Symlink in the path to the latest libraries -if [ ! 
-d "/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}/lib" ]; then - ln -s /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat /cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}/lib +if [ ! -d "${host_injection_libs_dir}/lib" ]; then + ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib +elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then + echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." + echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat" + exit 1 fi # return to initial dir @@ -161,8 +165,8 @@ cd $current_dir # Install CUDA # TODO: Can we do a trimmed install? # if modules dir exists, load it for usage within Lmod -if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then - module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all +if [ -d ${host_injections_dir}/modules/all ]; then + module use ${host_injections_dir}/modules/all fi # only install CUDA if specified version is not found install_cuda_version="11.3.1" @@ -173,14 +177,14 @@ else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about this) # TODO: The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed - avail_space=$(df --output=avail /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail ${host_injections_dir}/ | tail -n 1 | awk '{print $1}') if (( ${avail_space} < 16000000 )); then echo "Need more disk space to install CUDA, exiting now..." 
exit 1 fi # install cuda in host_injections module load EasyBuild - eb --installpath=/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ CUDA-${install_cuda_version}.eb + eb --installpath=${host_injections_dir}/ CUDA-${install_cuda_version}.eb ret=$? if [ $ret -ne 0 ]; then echo "CUDA installation failed, please check EasyBuild logs..." From 01844c6e30ba43902769bf3157e12d9d40cb204b Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 16 May 2022 15:50:21 +0200 Subject: [PATCH 09/48] Install CUDA in modified version of EESSI_SOFTWARE_PATH --- gpu_support/add_gpu_support.sh | 10 ++++++---- gpu_support/test_cuda | 5 +++-- init/SitePackage.lua | 5 +++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index c2dc86cb6d..09a6f18623 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -165,8 +165,10 @@ cd $current_dir # Install CUDA # TODO: Can we do a trimmed install? # if modules dir exists, load it for usage within Lmod -if [ -d ${host_injections_dir}/modules/all ]; then - module use ${host_injections_dir}/modules/all +cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" +mkdir -p ${cuda_install_dir} +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all fi # only install CUDA if specified version is not found install_cuda_version="11.3.1" @@ -177,14 +179,14 @@ else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about this) # TODO: The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed - avail_space=$(df --output=avail ${host_injections_dir}/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') if (( ${avail_space} < 16000000 )); then echo 
"Need more disk space to install CUDA, exiting now..." exit 1 fi # install cuda in host_injections module load EasyBuild - eb --installpath=${host_injections_dir}/ CUDA-${install_cuda_version}.eb + eb --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb ret=$? if [ $ret -ne 0 ]; then echo "CUDA installation failed, please check EasyBuild logs..." diff --git a/gpu_support/test_cuda b/gpu_support/test_cuda index 098efdc049..6aa41f7d3c 100644 --- a/gpu_support/test_cuda +++ b/gpu_support/test_cuda @@ -1,8 +1,9 @@ #!/bin/bash # Test CUDA -if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then - module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/ +cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ else echo "Cannot test CUDA, modules path does not exist, exiting now..." exit 1 diff --git a/init/SitePackage.lua b/init/SitePackage.lua index 6e9720a17f..25f9f2eed7 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -17,8 +17,9 @@ end local function visible_hook(modT) local frameStk = require("FrameStk"):singleton() local mt = frameStk:mt() - local cudaDir = exists('/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/') - if not cudaDir then + local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + local cudaDirExists = exists(cudaDir) + if not cudaDirExists then local haveGpu = mt:haveProperty(modT.sn,"arch","gpu") if haveGpu then modT.isVisible = false From 0e8861f15e3752f7444a2692e96f0f0e7e7f5e62 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 16 May 2022 15:50:59 +0200 Subject: [PATCH 10/48] If CUDA install dir exists, add it to EESSI_MODULE_PATH --- init/eessi_environment_variables | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 
34dc8f9f98..0ef323d475 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -32,6 +32,9 @@ if [ -d $EESSI_PREFIX ]; then echo "Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory." >> $output export EESSI_SOFTWARE_PATH=$EESSI_PREFIX/software/$EESSI_OS_TYPE/$EESSI_SOFTWARE_SUBDIR + if [ -d "${EESSI_SOFTWARE_PATH/versions/host_injections}" ]; then + eessi_gpu_software_path="${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all" + fi if [ ! -z $EESSI_BASIC_ENV ]; then echo "Only setting up basic environment, so we're done" >> $output elif [ -d $EESSI_SOFTWARE_PATH ]; then @@ -51,7 +54,7 @@ if [ -d $EESSI_PREFIX ]; then fi if [ -d $EESSI_MODULEPATH ]; then - export EESSI_MODULEPATH=$EESSI_MODULEPATH + export EESSI_MODULEPATH="$EESSI_MODULEPATH $eessi_gpu_software_path" echo "Using ${EESSI_MODULEPATH} as the directory to be added to MODULEPATH." >> $output else error "EESSI module path at $EESSI_MODULEPATH not found!" From 7d6af69655b0d19eba8c85dcc001e4f8e37cc3b6 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 16 May 2022 17:02:59 +0200 Subject: [PATCH 11/48] Use env var to check for GPU support and add this to module path --- init/bash | 4 ++++ init/eessi_environment_variables | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/init/bash b/init/bash index c5c1a583e7..b4b7367fe7 100644 --- a/init/bash +++ b/init/bash @@ -31,6 +31,10 @@ if [ $? -eq 0 ]; then # prepend location of modules for EESSI software stack to $MODULEPATH echo "Prepending $EESSI_MODULEPATH to \$MODULEPATH..." >> $output module use $EESSI_MODULEPATH + if [[ ! -z "${EESSI_SITE_MODULEPATH}" ]]; then + echo "Add ${EESSI_SITE_MODULEPATH} to \$MODULEPATH for GPU support..." 
+ module use ${EESSI_SITE_MODULEPATH} + fi #echo >> $output #echo "*** Known problems in the ${EESSI_PILOT_VERSION} pilot software stack ***" >> $output diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 0ef323d475..63903c2cf7 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -32,8 +32,9 @@ if [ -d $EESSI_PREFIX ]; then echo "Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory." >> $output export EESSI_SOFTWARE_PATH=$EESSI_PREFIX/software/$EESSI_OS_TYPE/$EESSI_SOFTWARE_SUBDIR - if [ -d "${EESSI_SOFTWARE_PATH/versions/host_injections}" ]; then - eessi_gpu_software_path="${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all" + eessi_site_modulepath="${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all" + if [ -d "$eessi_site_modulepath" ]; then + export EESSI_SITE_MODULEPATH="$eessi_site_modulepath" fi if [ ! -z $EESSI_BASIC_ENV ]; then echo "Only setting up basic environment, so we're done" >> $output From 2cc5ce9645b11b457364194dd565473d91d64eb0 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 18 May 2022 14:25:49 +0200 Subject: [PATCH 12/48] Move (conditional) installation of cuda compat libs to external script Only install cuda compat libs when either they are not installed yet or they are outdated --- gpu_support/add_gpu_support.sh | 74 +++++--------------------- gpu_support/install_cuda_compatlibs.sh | 73 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 60 deletions(-) create mode 100644 gpu_support/install_cuda_compatlibs.sh diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_gpu_support.sh index 09a6f18623..abe694f968 100755 --- a/gpu_support/add_gpu_support.sh +++ b/gpu_support/add_gpu_support.sh @@ -90,76 +90,30 @@ if [ $ret -ne 0 ]; then echo $latest_cuda_compat_url exit 1 fi +latest_driver_version="${latest_cuda_compat_url%-*}" +latest_driver_version="${latest_driver_version##*_}" -# Create a general space for our NVIDIA compat drivers 
+install_compat_libs=false host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" -if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then - mkdir -p ${host_injections_dir} - cd ${host_injections_dir} +# libcuda.so points to actual cuda compat lib with driver version in its name +# if this file exists, cuda compat libs are installed and we can compare the version +if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then + eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) + eessi_driver_version="${eessi_driver_version##*so.}" else - echo "Cannot write to eessi host_injections space, exiting now..." >&2 - exit 1 + eessi_driver_version=0 fi -# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest - -driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi - -# Check if our target CUDA is satisfied by what is installed already -# TODO: Find required CUDA version and see if we need an update - -# If not, grab the latest compat library RPM or deb -# download and unpack in temporary directory, easier cleanup after installation -tmpdir=$(mktemp -d) -cd $tmpdir -compat_file=${latest_cuda_compat_url##*/} -wget ${latest_cuda_compat_url} - -# Unpack it -# (the requirements here are OS dependent, can we get around that?) 
-# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e) -# (deb files can be unpacked with ar and tar) -file_extension=${compat_file##*.} -if [[ ${file_extension} == "rpm" ]]; then - rpm2cpio ${compat_file} | cpio -idmv -elif [[ ${file_extension} == "deb" ]]; then - ar x ${compat_file} - tar xf data.tar.* +if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then + install_compat_libs=true else - echo "File extension of cuda compat lib not supported, exiting now..." >&2 - exit 1 + echo "CUDA compat libs are up-to-date, skip installation." fi -cd $host_injections_dir -# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir -mv -n ${tmpdir}/usr/local/cuda-* . -rm -r ${tmpdir} - -# Add a symlink that points to the latest version -latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1) -ln -sf ${latest_cuda_dir} latest -if [ ! -e latest ] ; then - echo "Symlink to latest cuda compat lib version is broken, exiting now..." - exit 1 -fi - -# Create the space to host the libraries -host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} -mkdir -p ${host_injection_libs_dir} -# Symlink in the path to the latest libraries -if [ ! -d "${host_injection_libs_dir}/lib" ]; then - ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib -elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then - echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." 
- echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat" - exit 1 +if [ "${install_compat_libs}" == true ]; then + source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url fi -# return to initial dir -cd $current_dir - ############################################################################################### ############################################################################################### # Install CUDA diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh new file mode 100644 index 0000000000..dc93a30dbd --- /dev/null +++ b/gpu_support/install_cuda_compatlibs.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +libs_url=$1 + +current_dir=$(dirname $(realpath $0)) + +# Create a general space for our NVIDIA compat drivers +if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then + mkdir -p ${host_injections_dir} + cd ${host_injections_dir} +else + echo "Cannot write to eessi host_injections space, exiting now..." 
>&2 + exit 1 +fi + +# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest + +driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi + +# Check if our target CUDA is satisfied by what is installed already +# TODO: Find required CUDA version and see if we need an update + +# If not, grab the latest compat library RPM or deb +# download and unpack in temporary directory, easier cleanup after installation +tmpdir=$(mktemp -d) +cd $tmpdir +compat_file=${libs_url##*/} +wget ${libs_url} + +# Unpack it +# (the requirements here are OS dependent, can we get around that?) +# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e) +# (deb files can be unpacked with ar and tar) +file_extension=${compat_file##*.} +if [[ ${file_extension} == "rpm" ]]; then + rpm2cpio ${compat_file} | cpio -idmv +elif [[ ${file_extension} == "deb" ]]; then + ar x ${compat_file} + tar xf data.tar.* +else + echo "File extension of cuda compat lib not supported, exiting now..." >&2 + exit 1 +fi +cd $host_injections_dir +# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir +mv -n ${tmpdir}/usr/local/cuda-* . +rm -r ${tmpdir} + +# Add a symlink that points to the latest version +latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1) +ln -sf ${latest_cuda_dir} latest + +if [ ! -e latest ] ; then + echo "Symlink to latest cuda compat lib version is broken, exiting now..." 
+ exit 1 +fi + +# Create the space to host the libraries +host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} +mkdir -p ${host_injection_libs_dir} +# Symlink in the path to the latest libraries +if [ ! -d "${host_injection_libs_dir}/lib" ]; then + ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib +elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then + echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." + echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat" + exit 1 +fi + +# return to initial dir +cd $current_dir From d53e80ed47bf59bb64d1d2d334aff7d3907dff41 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 18 May 2022 15:17:59 +0200 Subject: [PATCH 13/48] Consistently use EESSI_SITE_MODULEPATH to set up GPU support for Lmod --- EESSI-pilot-install-software.sh | 5 +++-- init/eessi_environment_variables | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 8b6db2cc92..2dfffcdbb4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -108,8 +108,9 @@ module --force purge # ignore current $MODULEPATH entirely module unuse $MODULEPATH module use $EASYBUILD_INSTALLPATH/modules/all -if [ -d /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all ]; then - module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all +if [ ! -z "${EESSI_SITE_MODULEPATH}" ]; then + echo_green "Add ${EESSI_SITE_MODULEPATH} to \$MODULEPATH for GPU support!" + module use ${EESSI_SITE_MODULEPATH} fi if [[ -z ${MODULEPATH} ]]; then fatal_error "Failed to set up \$MODULEPATH?!" 
diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 63903c2cf7..831ba30537 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -55,7 +55,7 @@ if [ -d $EESSI_PREFIX ]; then fi if [ -d $EESSI_MODULEPATH ]; then - export EESSI_MODULEPATH="$EESSI_MODULEPATH $eessi_gpu_software_path" + export EESSI_MODULEPATH=$EESSI_MODULEPATH echo "Using ${EESSI_MODULEPATH} as the directory to be added to MODULEPATH." >> $output else error "EESSI module path at $EESSI_MODULEPATH not found!" From 850c20ed0b23429d52878b333aa9732d9d63641a Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 09:39:46 +0200 Subject: [PATCH 14/48] Rename script to add (NVIDIA) GPU support, add dummy script for AMD GPUs --- gpu_support/add_amd_gpu_support.sh | 12 ++++++++++++ ...{add_gpu_support.sh => add_nvidia_gpu_support.sh} | 0 2 files changed, 12 insertions(+) create mode 100755 gpu_support/add_amd_gpu_support.sh rename gpu_support/{add_gpu_support.sh => add_nvidia_gpu_support.sh} (100%) diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh new file mode 100755 index 0000000000..02266bf25a --- /dev/null +++ b/gpu_support/add_amd_gpu_support.sh @@ -0,0 +1,12 @@ +cat << EOF +This is not implemented yet :( + +If you would like to contribute this support there are a few things you will +need to consider: +- We will need to change the Lmod property added to GPU software so we can + distinguish AMD and Nvidia GPUs +- Support should be implemented in user space, if this is not possible (e.g., + requires a driver update) you need to tell the user what to do +- Support needs to be _verified_ and a trigger put in place (like the existence + of a particular path) so we can tell Lmod to display the associated modules +EOF diff --git a/gpu_support/add_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh similarity index 100% rename from gpu_support/add_gpu_support.sh rename to 
gpu_support/add_nvidia_gpu_support.sh From 9b2e72fa6babeed6cedce0944a63a577096ee2b5 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 10:04:24 +0200 Subject: [PATCH 15/48] Add shebang --- gpu_support/add_amd_gpu_support.sh | 2 ++ gpu_support/add_nvidia_gpu_support.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh index 02266bf25a..29c8abdc88 100755 --- a/gpu_support/add_amd_gpu_support.sh +++ b/gpu_support/add_amd_gpu_support.sh @@ -1,3 +1,5 @@ +#!/bin/bash + cat << EOF This is not implemented yet :( diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index abe694f968..bd1c27865a 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh From 5f82658ee4ea9e5e038ab5d34dacc8b2f8b4b708 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 14:51:58 +0200 Subject: [PATCH 16/48] Add option to disable checks, enables installation on nodes w/o GPUs --- gpu_support/add_nvidia_gpu_support.sh | 63 ++++++++++++++++++++------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index bd1c27865a..7b78625fb9 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,18 +3,27 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh +# If you want to install CUDA support on login nodes (typically without GPUs), +# set this variable to true. 
This will skip all GPU-dependent checks +install_wo_gpu=false + # verify existence of nvidia-smi or this is a waste of time # Check if nvidia-smi exists and can be executed without error -if command -v nvidia-smi > /dev/null 2>&1; then - nvidia-smi > /dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "nvidia-smi was found but returned error code, exiting now..." >&2 +if [[ "${install_wo_gpu}" != "true" ]]; then + if command -v nvidia-smi > /dev/null 2>&1; then + nvidia-smi > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "nvidia-smi was found but returned error code, exiting now..." >&2 + exit 1 + fi + echo "nvidia-smi found, continue setup." + else + echo "nvidia-smi not found, exiting now..." >&2 exit 1 fi - echo "nvidia-smi found, continue setup." else - echo "nvidia-smi not found, exiting now..." >&2 - exit 1 + echo "You requested to install CUDA without GPUs present." + echo "This means that all GPU-dependent tests/checks will be skipped!" fi # set up basic environment variables, EasyBuild and Lmod @@ -72,13 +81,15 @@ ver=${ver%.*} # https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers] # ) # only check first number in case of multiple GPUs -driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) -driver_version="${driver_version%%.*}" -# Now check driver_version for compatability -# Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers -if (( $driver_version < 450 )); then - echo "Your NVIDIA driver version is too old, please update first.." 
- exit 1 +if [[ "${install_wo_gpu}" != "true" ]]; then + driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) + driver_version="${driver_version%%.*}" + # Now check driver_version for compatability + # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers + if (( $driver_version < 450 )); then + echo "Your NVIDIA driver version is too old, please update first.." + exit 1 + fi fi @@ -151,4 +162,26 @@ else fi cd $current_dir -source $(dirname "$BASH_SOURCE")/test_cuda +if [[ "${install_wo_gpu}" != "true" ]]; then + source $(dirname "$BASH_SOURCE")/test_cuda +else + echo "Requested to install CUDA without GPUs present, so we skip final tests." + echo "Instead we test if module load CUDA works as expected..." + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + else + echo "Cannot load CUDA, modules path does not exist, exiting now..." + exit 1 + fi + module load CUDA + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." + exit 1 + else + echo "Successfully loaded CUDA, you are good to go! 
:)" + echo " - To build CUDA enabled modules use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/" + fi +fi From 16e87af0f314be2bcd2513757ab7c6f0ad793d43 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 15:36:46 +0200 Subject: [PATCH 17/48] Allow using an environment variable to skip GPU checks --- gpu_support/add_nvidia_gpu_support.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 7b78625fb9..5b85617ae6 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -6,6 +6,7 @@ # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks install_wo_gpu=false +[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true # verify existence of nvidia-smi or this is a waste of time # Check if nvidia-smi exists and can be executed without error @@ -19,6 +20,8 @@ if [[ "${install_wo_gpu}" != "true" ]]; then echo "nvidia-smi found, continue setup." else echo "nvidia-smi not found, exiting now..." 
>&2 + echo "If you do not have a GPU on this device but wish to force the installation," + echo "please set the environment variable INSTALL_WO_GPU=true" exit 1 fi else From cf65a373527a5b17fad7d208e0337505c09adb08 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 19:00:49 +0200 Subject: [PATCH 18/48] Update list of CUDA enabled toolchains --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 2c2e6f16cc..efe1f77f03 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -7,7 +7,7 @@ from easybuild.tools.systemtools import AARCH64, POWER, get_cpu_architecture EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' -CUDA_ENABLED_TOOLCHAINS = ["pmvmklc", "gmvmklc", "gmvapich2c", "pmvapich2c"] +CUDA_ENABLED_TOOLCHAINS = ["fosscuda", "gcccuda", "gimpic", "giolfc", "gmklc", "golfc", "gomklc", "gompic", "goolfc", "iccifortcuda", "iimklc", "iimpic", "intelcuda", "iomklc", "iompic", "nvompic", "nvpsmpic"] def get_eessi_envvar(eessi_envvar): """Get an EESSI environment variable from the environment""" From 7319db246c5df5bb13e4802096a00a6802d3548a Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 19 May 2022 19:02:56 +0200 Subject: [PATCH 19/48] Tell users to use the updated path to enable CUDA support --- gpu_support/add_nvidia_gpu_support.sh | 4 ++-- gpu_support/test_cuda | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 5b85617ae6..f8d6cff84a 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -183,8 +183,8 @@ else exit 1 else echo "Successfully loaded CUDA, you are good to go! 
:)" - echo " - To build CUDA enabled modules use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ as your EasyBuild prefix" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" echo " - To use these modules:" - echo " module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" fi fi diff --git a/gpu_support/test_cuda b/gpu_support/test_cuda index 6aa41f7d3c..ed7f3187ae 100644 --- a/gpu_support/test_cuda +++ b/gpu_support/test_cuda @@ -29,9 +29,9 @@ then clear='\033[0m' echo -e ${green} echo "Congratulations, your GPU is working with EESSI!" - echo " - To build CUDA enabled modules use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ as your EasyBuild prefix" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" echo " - To use these modules:" - echo " module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" echo -e ${clear} else echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 From 6537725b5991eb06f769c27c52a07f196922593a Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 20 May 2022 11:12:05 +0200 Subject: [PATCH 20/48] Add protection against warning if CUDA is not installed on host --- gpu_support/install_cuda_compatlibs.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh index dc93a30dbd..cbef381928 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/install_cuda_compatlibs.sh @@ -17,7 +17,9 @@ fi driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH 
nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi +if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then + if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi +fi # Check if our target CUDA is satisfied by what is installed already # TODO: Find required CUDA version and see if we need an update From 2ba47e478c9a2e619c405710819796a1e6790fa6 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 2 Jun 2022 16:42:01 +0200 Subject: [PATCH 21/48] Add README for GPU support --- gpu_support/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 gpu_support/README.md diff --git a/gpu_support/README.md b/gpu_support/README.md new file mode 100644 index 0000000000..48da497b3f --- /dev/null +++ b/gpu_support/README.md @@ -0,0 +1,10 @@ +# How to add GPU support +The collection of scripts in this directory enables you to add GPU support to your setup. +Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free to contribute that though!). +To enable the usage of CUDA in your setup, simply run the following script: +``` +./add_nvidia_gpu_support.sh +``` +## Prerequisites and tips +* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. +* If you want to install CUDA on a node without GPUs (e.g. 
on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. From ac268b1090241994b8f259aa4c98b722e5935fdc Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Jun 2022 17:34:01 +0200 Subject: [PATCH 22/48] Iterate over compat libs until we find something that works --- gpu_support/add_nvidia_gpu_support.sh | 194 ++++++++++------------ gpu_support/get_cuda_compatlibs.sh | 60 +++++++ gpu_support/get_latest_cuda_compatlibs.sh | 21 --- gpu_support/install_cuda_compatlibs.sh | 26 +-- gpu_support/{test_cuda => test_cuda.sh} | 14 +- 5 files changed, 175 insertions(+), 140 deletions(-) create mode 100755 gpu_support/get_cuda_compatlibs.sh delete mode 100755 gpu_support/get_latest_cuda_compatlibs.sh rename gpu_support/{test_cuda => test_cuda.sh} (92%) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index f8d6cff84a..6983817452 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,6 +3,8 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh +install_cuda_version="11.3.1" + # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks install_wo_gpu=false @@ -29,53 +31,8 @@ else echo "This means that all GPU-dependent tests/checks will be skipped!" 
fi -# set up basic environment variables, EasyBuild and Lmod EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash -current_dir=$(dirname $(realpath $0)) - -# Get arch type from EESSI environment -eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" - -# Get OS family -# TODO: needs more thorough testing -os_family=$(uname | tr '[:upper:]' '[:lower:]') - -# Get OS version -# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348 -if [ -f /etc/os-release ]; then - # freedesktop.org and systemd - . /etc/os-release - os=$NAME - ver=$VERSION_ID - if [[ "$os" == *"Rocky"* ]]; then - os="rhel" - fi - if [[ "$os" == *"Debian"* ]]; then - os="debian" - fi -elif type lsb_release >/dev/null 2>&1; then - # linuxbase.org - os=$(lsb_release -si) - ver=$(lsb_release -sr) -elif [ -f /etc/lsb-release ]; then - # For some versions of Debian/Ubuntu without lsb_release command - . /etc/lsb-release - os=$DISTRIB_ID - ver=$DISTRIB_RELEASE -elif [ -f /etc/debian_version ]; then - # Older Debian/Ubuntu/etc. - os=Debian - ver=$(cat /etc/debian_version) -else - # Fall back to uname, e.g. "Linux ", also works for BSD, etc. - os=$(uname -s) - ver=$(uname -r) -fi -# Convert OS version to major versions, e.g. rhel8.5 -> rhel8 -# TODO: needs testing for e.g. Ubuntu 20.04 -ver=${ver%.*} - ############################################################################################## # Check that the CUDA driver version is adequate # ( @@ -95,41 +52,6 @@ if [[ "${install_wo_gpu}" != "true" ]]; then fi fi - -# Check if the CUDA compat libraries are installed and compatible with the target CUDA version -# if not find the latest version of the compatibility libraries and install them - -# get URL to latest CUDA compat libs, exit if URL is invalid -latest_cuda_compat_url="$($(dirname "$BASH_SOURCE")/get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})" -ret=$? 
-if [ $ret -ne 0 ]; then - echo $latest_cuda_compat_url - exit 1 -fi -latest_driver_version="${latest_cuda_compat_url%-*}" -latest_driver_version="${latest_driver_version##*_}" - -install_compat_libs=false -host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" -# libcuda.so points to actual cuda compat lib with driver version in its name -# if this file exists, cuda compat libs are installed and we can compare the version -if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then - eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) - eessi_driver_version="${eessi_driver_version##*so.}" -else - eessi_driver_version=0 -fi - -if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then - install_compat_libs=true -else - echo "CUDA compat libs are up-to-date, skip installation." -fi - -if [ "${install_compat_libs}" == true ]; then - source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url -fi - ############################################################################################### ############################################################################################### # Install CUDA @@ -141,7 +63,6 @@ if [ -d ${cuda_install_dir}/modules/all ]; then module use ${cuda_install_dir}/modules/all fi # only install CUDA if specified version is not found -install_cuda_version="11.3.1" module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null if [[ $? -eq 0 ]]; then echo "CUDA module found! No need to install CUDA again, proceeding with tests" @@ -164,27 +85,92 @@ else fi fi -cd $current_dir -if [[ "${install_wo_gpu}" != "true" ]]; then - source $(dirname "$BASH_SOURCE")/test_cuda -else - echo "Requested to install CUDA without GPUs present, so we skip final tests." - echo "Instead we test if module load CUDA works as expected..." 
- if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ - else - echo "Cannot load CUDA, modules path does not exist, exiting now..." - exit 1 - fi - module load CUDA - ret=$? - if [ $ret -ne 0 ]; then - echo "Could not load CUDA even though modules path exists..." - exit 1 - else - echo "Successfully loaded CUDA, you are good to go! :)" - echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" - echo " - To use these modules:" - echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" - fi +# Check if the CUDA compat libraries are installed and compatible with the target CUDA version +# if not find the latest version of the compatibility libraries and install them + +# get URL to latest CUDA compat libs, exit if URL is invalid +cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" +ret=$? +if [ $ret -ne 0 ]; then + echo $cuda_compat_urls + exit 1 fi + +# loop over the compat library versions until we get one that works for us +keep_driver_check=1 +# Do a maximum of five attempts +for value in {1..5} +do + latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1) + # Chomp that value out of the list + cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) + latest_driver_version="${latest_cuda_compat_url%-*}" + latest_driver_version="${latest_driver_version##*-}" + + install_compat_libs=false + host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" + # libcuda.so points to actual cuda compat lib with driver version in its name + # if this file exists, cuda compat libs are installed and we can compare the version + if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then + eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) + eessi_driver_version="${eessi_driver_version##*so.}" + else + eessi_driver_version=0 + fi + + if [ $keep_driver_check -eq 1 ] + then + # only 
keep the driver check for the latest version + keep_driver_check=0 + else + eessi_driver_version=0 + fi + + if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then + install_compat_libs=true + else + echo "CUDA compat libs are up-to-date, skip installation." + fi + + if [ "${install_compat_libs}" == true ]; then + source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url + fi + + if [[ "${install_wo_gpu}" != "true" ]]; then + source $(dirname "$BASH_SOURCE")/test_cuda.sh + if [ $? -eq 0 ] + then + exit 0 + else + echo + echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" + echo "I'll try an older release to see if that will work..." + echo + fi + else + echo "Requested to install CUDA without GPUs present, so we skip final tests." + echo "Instead we test if module load CUDA works as expected..." + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + else + echo "Cannot load CUDA, modules path does not exist, exiting now..." + exit 1 + fi + module load CUDA + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." + exit 1 + else + echo "Successfully loaded CUDA, you are good to go! :)" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + fi + break + fi +done + +echo "Tried to install 5 different generations of compat libraries and none worked," +echo "this usually means your driver is very out of date!" 
+exit 1 diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/get_cuda_compatlibs.sh new file mode 100755 index 0000000000..208d55d676 --- /dev/null +++ b/gpu_support/get_cuda_compatlibs.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +current_dir=$(dirname $(realpath $0)) + +# Get arch type from EESSI environment +if [[ -z "${EESSI_CPU_FAMILY}" ]]; then + # set up basic environment variables, EasyBuild and Lmod + echo Here!! + EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash +fi +eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" + +# Get OS family +# TODO: needs more thorough testing +os_family=$(uname | tr '[:upper:]' '[:lower:]') + +# Get OS version +# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348 +if [ -f /etc/os-release ]; then + # freedesktop.org and systemd + . /etc/os-release + os=$NAME + ver=$VERSION_ID + if [[ "$os" == *"Rocky"* ]]; then + os="rhel" + fi + if [[ "$os" == *"Debian"* ]]; then + os="debian" + fi +elif type lsb_release >/dev/null 2>&1; then + # linuxbase.org + os=$(lsb_release -si) + ver=$(lsb_release -sr) +elif [ -f /etc/lsb-release ]; then + # For some versions of Debian/Ubuntu without lsb_release command + . /etc/lsb-release + os=$DISTRIB_ID + ver=$DISTRIB_RELEASE +elif [ -f /etc/debian_version ]; then + # Older Debian/Ubuntu/etc. + os=Debian + ver=$(cat /etc/debian_version) +else + # Fall back to uname, e.g. "Linux ", also works for BSD, etc. + os=$(uname -s) + ver=$(uname -r) +fi +# Convert OS version to major versions, e.g. rhel8.5 -> rhel8 +# TODO: needs testing for e.g. 
Ubuntu 20.04 +ver=${ver%.*} + +# build URL for CUDA libraries +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" +# get all versions in decending order +files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | sort -r ) +if [[ -z "${files// }" ]]; then + echo "Could not find any compat lib files under" ${cuda_url} + exit 1 +fi +for file in $files; do echo "${cuda_url}$file"; done diff --git a/gpu_support/get_latest_cuda_compatlibs.sh b/gpu_support/get_latest_cuda_compatlibs.sh deleted file mode 100755 index 91680f5e2e..0000000000 --- a/gpu_support/get_latest_cuda_compatlibs.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -os=$1 -ver=$2 -eessi_cpu_family=$3 - -# build URL for CUDA libraries -cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" -# get latest version, files are sorted by date -# TODO: probably better to explicitly check version numbers than trusting that it is sorted -latest_file=$(curl -s "${cuda_url}" | grep 'cuda-compat' | tail -1) -if [[ -z "${latest_file// }" ]]; then - echo "Could not find any compat lib files under" ${cuda_url} - exit 1 -fi -# extract actual file name from html snippet -file=$(echo $latest_file | sed 's/<\/\?[^>]\+>//g') -# build final URL for wget -cuda_url="${cuda_url}$file" -# simply echo the URL, result will be used by add_gpu_support.sh -echo $cuda_url diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh index cbef381928..7f5c036a99 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/install_cuda_compatlibs.sh @@ -3,15 +3,17 @@ libs_url=$1 current_dir=$(dirname $(realpath $0)) +host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} # Create a general space for our NVIDIA compat drivers if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; 
then mkdir -p ${host_injections_dir} - cd ${host_injections_dir} else echo "Cannot write to eessi host_injections space, exiting now..." >&2 exit 1 fi +cd ${host_injections_dir} # Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest @@ -30,6 +32,7 @@ tmpdir=$(mktemp -d) cd $tmpdir compat_file=${libs_url##*/} wget ${libs_url} +echo $compat_file # Unpack it # (the requirements here are OS dependent, can we get around that?) @@ -46,13 +49,13 @@ else exit 1 fi cd $host_injections_dir +cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*) # TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir mv -n ${tmpdir}/usr/local/cuda-* . rm -r ${tmpdir} -# Add a symlink that points to the latest version -latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1) -ln -sf ${latest_cuda_dir} latest +# Add a symlink that points the latest version to the version we just installed +ln -sfn ${cuda_dir} latest if [ ! -e latest ] ; then echo "Symlink to latest cuda compat lib version is broken, exiting now..." @@ -60,16 +63,19 @@ if [ ! -e latest ] ; then fi # Create the space to host the libraries -host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family} -mkdir -p ${host_injection_libs_dir} +mkdir -p ${host_injection_linker_dir} # Symlink in the path to the latest libraries -if [ ! -d "${host_injection_libs_dir}/lib" ]; then - ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib -elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then +if [ ! -d "${host_injection_linker_dir}/lib" ]; then + ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib +elif [ ! 
"${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." - echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat" + echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" exit 1 fi # return to initial dir cd $current_dir + +echo +echo CUDA driver compatability drivers installed for CUDA version: +echo ${cuda_dir/cuda-/} diff --git a/gpu_support/test_cuda b/gpu_support/test_cuda.sh similarity index 92% rename from gpu_support/test_cuda rename to gpu_support/test_cuda.sh index ed7f3187ae..0aee9892f6 100644 --- a/gpu_support/test_cuda +++ b/gpu_support/test_cuda.sh @@ -33,14 +33,18 @@ then echo " - To use these modules:" echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" echo -e ${clear} + + # Clean up + cd $current_dir + rm -r $tmp_dir else - echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 + echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 + # Clean up + cd $current_dir + rm -r $tmp_dir + false fi -# Clean up -cd $current_dir -rm -r $tmp_dir - # Test building something with CUDA and running # TODO: Use samples from installation directory, `device_query` is a good option From bb5301b325e3ff1a647bbcb6c95c53bd4a5a5a25 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Jun 2022 17:45:07 +0200 Subject: [PATCH 23/48] Don't use source when we don't need to --- gpu_support/add_nvidia_gpu_support.sh | 12 ++++++------ gpu_support/test_cuda.sh | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 6983817452..8f709a6f13 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -137,15 +137,15 @@ do fi if [[ "${install_wo_gpu}" != "true" ]]; then - source $(dirname 
"$BASH_SOURCE")/test_cuda.sh + bash $(dirname "$BASH_SOURCE")/test_cuda.sh if [ $? -eq 0 ] then - exit 0 + exit 0 else - echo - echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" - echo "I'll try an older release to see if that will work..." - echo + echo + echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" + echo "I'll try an older release to see if that will work..." + echo fi else echo "Requested to install CUDA without GPUs present, so we skip final tests." diff --git a/gpu_support/test_cuda.sh b/gpu_support/test_cuda.sh index 0aee9892f6..28d21355ce 100644 --- a/gpu_support/test_cuda.sh +++ b/gpu_support/test_cuda.sh @@ -42,7 +42,7 @@ else # Clean up cd $current_dir rm -r $tmp_dir - false + exit 1 fi # Test building something with CUDA and running From 17b7662dee1e3c8fdec5aa954b60037df5462f88 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 8 Jun 2022 16:57:31 +0200 Subject: [PATCH 24/48] Small adjustments to make things work on Debian10, remove debug statement --- gpu_support/add_nvidia_gpu_support.sh | 4 ++++ gpu_support/get_cuda_compatlibs.sh | 3 +-- gpu_support/install_cuda_compatlibs.sh | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 8f709a6f13..626ac2c1f8 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -106,6 +106,10 @@ do cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) latest_driver_version="${latest_cuda_compat_url%-*}" latest_driver_version="${latest_driver_version##*-}" + # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed + if [[ ! 
$latest_driver_version =~ ^[0-9]+$ ]]; then + latest_driver_version="${latest_driver_version##*_}" + fi install_compat_libs=false host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/get_cuda_compatlibs.sh index 208d55d676..6c0b888042 100755 --- a/gpu_support/get_cuda_compatlibs.sh +++ b/gpu_support/get_cuda_compatlibs.sh @@ -5,7 +5,6 @@ current_dir=$(dirname $(realpath $0)) # Get arch type from EESSI environment if [[ -z "${EESSI_CPU_FAMILY}" ]]; then # set up basic environment variables, EasyBuild and Lmod - echo Here!! EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash fi eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" @@ -52,7 +51,7 @@ ver=${ver%.*} # build URL for CUDA libraries cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" # get all versions in decending order -files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | sort -r ) +files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort ) if [[ -z "${files// }" ]]; then echo "Could not find any compat lib files under" ${cuda_url} exit 1 diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh index 7f5c036a99..7d7e86d9ee 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/install_cuda_compatlibs.sh @@ -51,6 +51,7 @@ fi cd $host_injections_dir cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*) # TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir +rm -rf ${cuda_dir} mv -n ${tmpdir}/usr/local/cuda-* . 
rm -r ${tmpdir} From 03b01f16c98658c5db41bec286bc2cbe07c24213 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 8 Jun 2022 16:58:57 +0200 Subject: [PATCH 25/48] Make installed CUDA version configurable via env var with a default --- gpu_support/add_nvidia_gpu_support.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 626ac2c1f8..3c5dde4791 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,7 +3,7 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh -install_cuda_version="11.3.1" +install_cuda_version="${INSTALL_CUDA_VERSION:=11.3.1}" # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks From dadb170d91a1c6d5ee9725991ab282ef523fbe5b Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 8 Jun 2022 17:05:03 +0200 Subject: [PATCH 26/48] Use generic latest symlink when sourcing init/bash instead specific version --- gpu_support/add_nvidia_gpu_support.sh | 2 +- gpu_support/get_cuda_compatlibs.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 3c5dde4791..d4c46133ca 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -31,7 +31,7 @@ else echo "This means that all GPU-dependent tests/checks will be skipped!" 
fi -EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash +EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash ############################################################################################## # Check that the CUDA driver version is adequate diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/get_cuda_compatlibs.sh index 6c0b888042..eb4b38a63a 100755 --- a/gpu_support/get_cuda_compatlibs.sh +++ b/gpu_support/get_cuda_compatlibs.sh @@ -5,7 +5,7 @@ current_dir=$(dirname $(realpath $0)) # Get arch type from EESSI environment if [[ -z "${EESSI_CPU_FAMILY}" ]]; then # set up basic environment variables, EasyBuild and Lmod - EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash + EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash fi eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" From 0f5884fea083f93d2970aed6407368c1202b004f Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 13 Jun 2022 14:45:19 +0200 Subject: [PATCH 27/48] Implement suggested changes (don't source when not needed, update README) --- gpu_support/README.md | 2 +- gpu_support/add_nvidia_gpu_support.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu_support/README.md b/gpu_support/README.md index 48da497b3f..48172641af 100644 --- a/gpu_support/README.md +++ b/gpu_support/README.md @@ -7,4 +7,4 @@ To enable the usage of CUDA in your setup, simply run the following script: ``` ## Prerequisites and tips * You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. -* If you want to install CUDA on a node without GPUs (e.g. 
on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. +* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index d4c46133ca..9127cfa707 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -137,7 +137,7 @@ do fi if [ "${install_compat_libs}" == true ]; then - source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url + bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url fi if [[ "${install_wo_gpu}" != "true" ]]; then From 5f2c1f669bfa6987466c2f9228567ca642a35b9e Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 17 Jun 2022 09:33:20 +0200 Subject: [PATCH 28/48] Add exit code and more detailed message when installing without GPUs --- gpu_support/add_nvidia_gpu_support.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 9127cfa707..1ff3edc428 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -170,6 +170,9 @@ do echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" echo " - To use these modules:" echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + echo " - Please keep in mind that we just installed the latest 
CUDA compat libs." + echo " Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)." + exit 0 fi break fi From ab9587334261101737e104a156731ac0e8db0d22 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 19 Jul 2022 12:27:18 +0200 Subject: [PATCH 29/48] Update error message when nvidia-smi returns an error code --- gpu_support/add_nvidia_gpu_support.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 1ff3edc428..c5bcf3ec33 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -17,6 +17,8 @@ if [[ "${install_wo_gpu}" != "true" ]]; then nvidia-smi > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "nvidia-smi was found but returned error code, exiting now..." >&2 + echo "If you do not have a GPU on this device but wish to force the installation," + echo "please set the environment variable INSTALL_WO_GPU=true" exit 1 fi echo "nvidia-smi found, continue setup." From 63fded601ac1b24997073cf8fe19ba254a12ca7d Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Mon, 25 Jul 2022 11:51:54 +0200 Subject: [PATCH 30/48] Convert OS version for Ubuntu systems when getting CUDA compat libs --- gpu_support/get_cuda_compatlibs.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/get_cuda_compatlibs.sh index eb4b38a63a..7821ebebc2 100755 --- a/gpu_support/get_cuda_compatlibs.sh +++ b/gpu_support/get_cuda_compatlibs.sh @@ -22,10 +22,17 @@ if [ -f /etc/os-release ]; then ver=$VERSION_ID if [[ "$os" == *"Rocky"* ]]; then os="rhel" + # Convert OS version to major versions, e.g. 
rhel8.5 -> rhel8 + ver=${ver%.*} fi if [[ "$os" == *"Debian"* ]]; then os="debian" fi + if [[ "$os" == *"Ubuntu"* ]]; then + os="ubuntu" + # Convert OS version + ver=${ver/./} + fi elif type lsb_release >/dev/null 2>&1; then # linuxbase.org os=$(lsb_release -si) @@ -44,9 +51,6 @@ else os=$(uname -s) ver=$(uname -r) fi -# Convert OS version to major versions, e.g. rhel8.5 -> rhel8 -# TODO: needs testing for e.g. Ubuntu 20.04 -ver=${ver%.*} # build URL for CUDA libraries cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" From d3cadb535207eb0fd0af7b679a08b797c09f312a Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 27 Jul 2022 15:43:43 +0200 Subject: [PATCH 31/48] Use rpm files for all OSes and p7zip to unpack them The deb files and rpm files of the CUDA compat libraries contain the same libraries. This means we can use the rpm files for all OSes and unpack them with p7zip which can be installed via EB. The rhel8 folder is used to download the compat libraries. --- gpu_support/add_nvidia_gpu_support.sh | 21 +++++++++++- gpu_support/get_cuda_compatlibs.sh | 46 ++------------------------ gpu_support/install_cuda_compatlibs.sh | 16 ++++++--- 3 files changed, 34 insertions(+), 49 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index c5bcf3ec33..d757e2c6e5 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -4,6 +4,7 @@ # $EPREFIX/startprefix <<< /path/to/this_script.sh install_cuda_version="${INSTALL_CUDA_VERSION:=11.3.1}" +install_p7zip_version="${INSTALL_P7ZIP_VERSION:=17.04-GCCcore-10.3.0}" # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks @@ -67,7 +68,7 @@ fi # only install CUDA if specified version is not found module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null if [[ $? 
-eq 0 ]]; then - echo "CUDA module found! No need to install CUDA again, proceeding with tests" + echo "CUDA module found! No need to install CUDA again, proceeding with tests" else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about this) @@ -87,6 +88,24 @@ else fi fi +# install p7zip, this will be used to install the CUDA compat libraries from rpm +# the rpm and deb files contain the same libraries, so we just stick to the rpm version +module avail 2>&1 | grep -i p7zip &> /dev/null +if [[ $? -eq 0 ]]; then + echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" +else + # install p7zip in host_injections + module load EasyBuild + eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "p7zip installation failed, please check EasyBuild logs..." + exit 1 + fi + # make p7zip known to the environment + module use ${cuda_install_dir}/modules/all +fi + # Check if the CUDA compat libraries are installed and compatible with the target CUDA version # if not find the latest version of the compatibility libraries and install them diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/get_cuda_compatlibs.sh index 7821ebebc2..c0c6631b06 100755 --- a/gpu_support/get_cuda_compatlibs.sh +++ b/gpu_support/get_cuda_compatlibs.sh @@ -9,51 +9,9 @@ if [[ -z "${EESSI_CPU_FAMILY}" ]]; then fi eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" -# Get OS family -# TODO: needs more thorough testing -os_family=$(uname | tr '[:upper:]' '[:lower:]') - -# Get OS version -# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348 -if [ -f /etc/os-release ]; then - # freedesktop.org and systemd - . 
/etc/os-release - os=$NAME - ver=$VERSION_ID - if [[ "$os" == *"Rocky"* ]]; then - os="rhel" - # Convert OS version to major versions, e.g. rhel8.5 -> rhel8 - ver=${ver%.*} - fi - if [[ "$os" == *"Debian"* ]]; then - os="debian" - fi - if [[ "$os" == *"Ubuntu"* ]]; then - os="ubuntu" - # Convert OS version - ver=${ver/./} - fi -elif type lsb_release >/dev/null 2>&1; then - # linuxbase.org - os=$(lsb_release -si) - ver=$(lsb_release -sr) -elif [ -f /etc/lsb-release ]; then - # For some versions of Debian/Ubuntu without lsb_release command - . /etc/lsb-release - os=$DISTRIB_ID - ver=$DISTRIB_RELEASE -elif [ -f /etc/debian_version ]; then - # Older Debian/Ubuntu/etc. - os=Debian - ver=$(cat /etc/debian_version) -else - # Fall back to uname, e.g. "Linux ", also works for BSD, etc. - os=$(uname -s) - ver=$(uname -r) -fi - # build URL for CUDA libraries -cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/" +# take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/"${eessi_cpu_family}"/" # get all versions in decending order files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort ) if [[ -z "${files// }" ]]; then diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh index 7d7e86d9ee..1621cfdf64 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/install_cuda_compatlibs.sh @@ -35,12 +35,20 @@ wget ${libs_url} echo $compat_file # Unpack it -# (the requirements here are OS dependent, can we get around that?) 
-# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e) -# (deb files can be unpacked with ar and tar) +# rpm files are the default for all OSes +# Keep support for deb files in case it is needed in the future file_extension=${compat_file##*.} if [[ ${file_extension} == "rpm" ]]; then - rpm2cpio ${compat_file} | cpio -idmv + # Load p7zip to extract files from rpm file + module load p7zip + # Extract .cpio + 7z x ${compat_file} + # Extract lib* + 7z x ${compat_file/rpm/cpio} + # Restore symlinks + cd usr/local/cuda-*/compat + ls *.so *.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}" + cd - elif [[ ${file_extension} == "deb" ]]; then ar x ${compat_file} tar xf data.tar.* From 81e41357c801a56272f833f9167a8c902c24e856 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 27 Jul 2022 16:04:07 +0200 Subject: [PATCH 32/48] Rename driver_version to driver_major_version --- gpu_support/add_nvidia_gpu_support.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index d757e2c6e5..5b84585a2a 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -45,11 +45,11 @@ EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash # ) # only check first number in case of multiple GPUs if [[ "${install_wo_gpu}" != "true" ]]; then - driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) - driver_version="${driver_version%%.*}" + driver_major_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) + driver_major_version="${driver_major_version%%.*}" # Now check driver_version for compatability # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers - if (( $driver_version < 450 )); then + if 
(( $driver_major_version < 450 )); then echo "Your NVIDIA driver version is too old, please update first.." exit 1 fi From ec9dd698c7904f781a0b47524be51bff4ebfba48 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 29 Jul 2022 16:12:30 +0200 Subject: [PATCH 33/48] Prepare shipping CUDA module file with EESSI CUDA can now be installed in EESSI with the actual software being installed in ${EESSI_SOFTWARE_PATH/versions/host_injections} and the module file being written to ${EESSI_SOFTWARE_PATH}. Since the host_injections directory is not shipped with EESSI, the actual software will not be shipped. Trying to load the CUDA module will fail with a pointer to the CUDA install script, unless the software under host_injections is found. The CUDA install script then installs the software in host_injections, while the module file is written to a tmpdir which is removed in the end. --- eb_hooks.py | 18 ++++++++++++++++++ gpu_support/add_nvidia_gpu_support.sh | 11 +++++------ init/SitePackage.lua | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index efe1f77f03..79cb48fd7b 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -122,6 +122,24 @@ def cgal_toolchainopts_precise(ec, eprefix): raise EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!") +def pre_fetch_hook(self, *args, **kwargs): + """Modify install path for CUDA software.""" + if self.name == 'CUDA': + self.installdir = self.installdir.replace('versions', 'host_injections') + + +def pre_module_hook(self, *args, **kwargs): + """Modify install path for CUDA software.""" + if self.name == 'CUDA': + self.installdir = self.installdir.replace('versions', 'host_injections') + + +def pre_sanitycheck_hook(self, *args, **kwargs): + """Modify install path for CUDA software.""" + if self.name == 'CUDA': + self.installdir = self.installdir.replace('versions', 'host_injections') + + def fontconfig_add_fonts(ec, eprefix): """Inject --with-add-fonts 
configure option for fontconfig.""" if ec.name == 'fontconfig': diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 5b84585a2a..909b80ca09 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -62,12 +62,8 @@ fi # if modules dir exists, load it for usage within Lmod cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" mkdir -p ${cuda_install_dir} -if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all -fi # only install CUDA if specified version is not found -module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null -if [[ $? -eq 0 ]]; then +if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version} ]; then echo "CUDA module found! No need to install CUDA again, proceeding with tests" else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` @@ -80,12 +76,15 @@ else fi # install cuda in host_injections module load EasyBuild - eb --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + # we need the --rebuild option, since the module file is shipped with EESSI + tmpdir=$(mktemp -d) + eb --rebuild --installpath-modules=${tmpdir} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb ret=$? if [ $ret -ne 0 ]; then echo "CUDA installation failed, please check EasyBuild logs..." 
exit 1 fi + rm -rf ${tmpdir} fi # install p7zip, this will be used to install the CUDA compat libraries from rpm diff --git a/init/SitePackage.lua b/init/SitePackage.lua index 25f9f2eed7..0b1da1417f 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -27,4 +27,22 @@ local function visible_hook(modT) end end +local function cuda_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local simpleName = string.match(t.modFullName, "(.-)/") + if string.match(simpleName, 'CUDA') ~= nil then + local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + local cudaDirExists = exists(cudaDir) + if not cudaDirExists then + io.stderr:write("You requested to load ",simpleName,"\n") + io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") + io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") + io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as a simple:\n") + io.stderr:write("./add_nvidia_gpu_support.sh\n") + frameStk:__clear() + end + end +end + +hook.register("load", cuda_load_hook) hook.register("isVisibleHook", visible_hook) From 0c1004d23e074049a8745028917bc8adc3fc513e Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 7 Sep 2022 16:20:33 +0200 Subject: [PATCH 34/48] Remove loading of CUDA specific module locations The CUDA module will now be shipped with EESSI, only the software has to be installed in host_injections. This makes the dedicated handling of CUDA module paths unneccessary. 
--- EESSI-pilot-install-software.sh | 4 ---- init/bash | 4 ---- init/eessi_environment_variables | 4 ---- 3 files changed, 12 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 4cc237d04e..0c832fdee9 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -81,10 +81,6 @@ module --force purge # ignore current $MODULEPATH entirely module unuse $MODULEPATH module use $EASYBUILD_INSTALLPATH/modules/all -if [ ! -z "${EESSI_SITE_MODULEPATH}" ]; then - echo_green "Add ${EESSI_SITE_MODULEPATH} to \$MODULEPATH for GPU support!" - module use ${EESSI_SITE_MODULEPATH} -fi if [[ -z ${MODULEPATH} ]]; then fatal_error "Failed to set up \$MODULEPATH?!" else diff --git a/init/bash b/init/bash index b4b7367fe7..c5c1a583e7 100644 --- a/init/bash +++ b/init/bash @@ -31,10 +31,6 @@ if [ $? -eq 0 ]; then # prepend location of modules for EESSI software stack to $MODULEPATH echo "Prepending $EESSI_MODULEPATH to \$MODULEPATH..." >> $output module use $EESSI_MODULEPATH - if [[ ! -z "${EESSI_SITE_MODULEPATH}" ]]; then - echo "Add ${EESSI_SITE_MODULEPATH} to \$MODULEPATH for GPU support..." - module use ${EESSI_SITE_MODULEPATH} - fi #echo >> $output #echo "*** Known problems in the ${EESSI_PILOT_VERSION} pilot software stack ***" >> $output diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 831ba30537..34dc8f9f98 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -32,10 +32,6 @@ if [ -d $EESSI_PREFIX ]; then echo "Using ${EESSI_SOFTWARE_SUBDIR} as software subdirectory." >> $output export EESSI_SOFTWARE_PATH=$EESSI_PREFIX/software/$EESSI_OS_TYPE/$EESSI_SOFTWARE_SUBDIR - eessi_site_modulepath="${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all" - if [ -d "$eessi_site_modulepath" ]; then - export EESSI_SITE_MODULEPATH="$eessi_site_modulepath" - fi if [ ! 
-z $EESSI_BASIC_ENV ]; then echo "Only setting up basic environment, so we're done" >> $output elif [ -d $EESSI_SOFTWARE_PATH ]; then From 7a9827b8bc92455bcf9cb9b37b5998ca4f0f8ac9 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 7 Sep 2022 16:22:52 +0200 Subject: [PATCH 35/48] Check for full CUDA software path (incl. version) when loading the module --- init/SitePackage.lua | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/init/SitePackage.lua b/init/SitePackage.lua index 0b1da1417f..c6ef9babf6 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -29,9 +29,13 @@ end local function cuda_load_hook(t) local frameStk = require("FrameStk"):singleton() + -- needed to check if we are trying to load the CUDA module local simpleName = string.match(t.modFullName, "(.-)/") if string.match(simpleName, 'CUDA') ~= nil then + -- get the full host_injections path local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the CUDA software should be installed + cudaDir = cudaDir .. "/software/" .. 
t.modFullName local cudaDirExists = exists(cudaDir) if not cudaDirExists then io.stderr:write("You requested to load ",simpleName,"\n") From 6e8664926d02e71d210deb621cbc468774989a35 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Wed, 7 Sep 2022 16:24:38 +0200 Subject: [PATCH 36/48] Refine install of p7zip, keep it until software layer provides it --- gpu_support/add_nvidia_gpu_support.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 909b80ca09..025a1be5e5 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -87,8 +87,11 @@ else rm -rf ${tmpdir} fi -# install p7zip, this will be used to install the CUDA compat libraries from rpm -# the rpm and deb files contain the same libraries, so we just stick to the rpm version +# Install p7zip, this will be used to install the CUDA compat libraries from rpm. +# The rpm and deb files contain the same libraries, so we just stick to the rpm version. +# If p7zip is missing from the software layer (for whatever reason), we need to install it. +# This has to happen in host_injections, so we check first if it is already installed there. +module use ${cuda_install_dir}/modules/all/ module avail 2>&1 | grep -i p7zip &> /dev/null if [[ $? -eq 0 ]]; then echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" @@ -101,8 +104,6 @@ else echo "p7zip installation failed, please check EasyBuild logs..." 
exit 1 fi - # make p7zip known to the environment - module use ${cuda_install_dir}/modules/all fi # Check if the CUDA compat libraries are installed and compatible with the target CUDA version From e38391b9baf585eea33f58cf5f819fa1df11f303 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 09:59:00 +0200 Subject: [PATCH 37/48] Prepend lmod module path only if the dir actually exists --- gpu_support/add_nvidia_gpu_support.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 025a1be5e5..e130355de7 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -91,7 +91,9 @@ fi # The rpm and deb files contain the same libraries, so we just stick to the rpm version. # If p7zip is missing from the software layer (for whatever reason), we need to install it. # This has to happen in host_injections, so we check first if it is already installed there. -module use ${cuda_install_dir}/modules/all/ +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ +fi module avail 2>&1 | grep -i p7zip &> /dev/null if [[ $? -eq 0 ]]; then echo "p7zip module found! 
No need to install p7zip again, proceeding with installation of compat libraries" From b2a4865c1eeee507f9ad72cd0f561c417371147d Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 10:00:41 +0200 Subject: [PATCH 38/48] Make printout of CUDA installation more accurate --- gpu_support/add_nvidia_gpu_support.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index e130355de7..3797beb308 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -64,7 +64,7 @@ cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" mkdir -p ${cuda_install_dir} # only install CUDA if specified version is not found if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version} ]; then - echo "CUDA module found! No need to install CUDA again, proceeding with tests" + echo "CUDA software found! No need to install CUDA again, proceeding with tests" else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about this) From fb73d12ad9deac72ed37f662e7b40b56644e5c11 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 10:03:47 +0200 Subject: [PATCH 39/48] Only install CUDA module in tmpdir if it's already shipped in EESSI --- gpu_support/add_nvidia_gpu_support.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 3797beb308..12f40a11d9 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -76,9 +76,12 @@ else fi # install cuda in host_injections module load EasyBuild - # we need the --rebuild option, since the module file is shipped with EESSI - tmpdir=$(mktemp -d) - eb --rebuild --installpath-modules=${tmpdir} --installpath=${cuda_install_dir}/ 
CUDA-${install_cuda_version}.eb + # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + tmpdir=$(mktemp -d) + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb ret=$? if [ $ret -ne 0 ]; then echo "CUDA installation failed, please check EasyBuild logs..." From 8c8a227260ca39ebbc58d7aff3ccf0e58c6dfa4e Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 19:28:43 +0200 Subject: [PATCH 40/48] Load correct module env as long as p7zip is not part of software layer --- gpu_support/install_cuda_compatlibs.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/install_cuda_compatlibs.sh index 1621cfdf64..b3c10aae86 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/install_cuda_compatlibs.sh @@ -39,6 +39,10 @@ echo $compat_file # Keep support for deb files in case it is needed in the future file_extension=${compat_file##*.} if [[ ${file_extension} == "rpm" ]]; then + # p7zip is installed under host_injections for now, make that known to the environment + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + fi # Load p7zip to extract files from rpm file module load p7zip # Extract .cpio From f90dd6634753fa4fdcd248293603f6e61f9609b9 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 19:30:15 +0200 Subject: [PATCH 41/48] Load CUDA version specified for installation when testing --- gpu_support/add_nvidia_gpu_support.sh | 2 +- gpu_support/test_cuda.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 12f40a11d9..09ad2dbaeb 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++
b/gpu_support/add_nvidia_gpu_support.sh @@ -167,7 +167,7 @@ do fi if [[ "${install_wo_gpu}" != "true" ]]; then - bash $(dirname "$BASH_SOURCE")/test_cuda.sh + bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" if [ $? -eq 0 ] then exit 0 diff --git a/gpu_support/test_cuda.sh b/gpu_support/test_cuda.sh index 28d21355ce..07deecff53 100644 --- a/gpu_support/test_cuda.sh +++ b/gpu_support/test_cuda.sh @@ -1,5 +1,7 @@ #!/bin/bash +install_cuda_version=$1 + # Test CUDA cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" if [ -d ${cuda_install_dir}/modules/all ]; then @@ -8,7 +10,7 @@ else echo "Cannot test CUDA, modules path does not exist, exiting now..." exit 1 fi -module load CUDA +module load CUDA/${install_cuda_version} ret=$? if [ $ret -ne 0 ]; then echo "Could not load CUDA even though modules path exists..." From a24e09c91105503b567a1a1f7aaf631680bc0e96 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 19:31:00 +0200 Subject: [PATCH 42/48] Load GCCcore module when building test executable for CUDA --- gpu_support/test_cuda.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gpu_support/test_cuda.sh b/gpu_support/test_cuda.sh index 07deecff53..17b56198e4 100644 --- a/gpu_support/test_cuda.sh +++ b/gpu_support/test_cuda.sh @@ -20,6 +20,12 @@ tmp_dir=$(mktemp -d) cp -r $EBROOTCUDA/samples $tmp_dir current_dir=$PWD cd $tmp_dir/samples/1_Utilities/deviceQuery +module load GCCcore +ret=$? +if [ $ret -ne 0 ]; then + echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..." 
+ exit 1 +fi make HOST_COMPILER=$(which g++) -j ./deviceQuery From 3d0ebadd0a40b1c631a8fbcc32dae6dee87917b1 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 13 Sep 2022 19:31:46 +0200 Subject: [PATCH 43/48] Add EasyBuild configuration for p7zip installation --- gpu_support/add_nvidia_gpu_support.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 09ad2dbaeb..b9f0d12f20 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -102,6 +102,12 @@ if [[ $? -eq 0 ]]; then echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" else # install p7zip in host_injections + export EASYBUILD_IGNORE_OSDEPS=1 + export EASYBUILD_SYSROOT=${EPREFIX} + export EASYBUILD_RPATH=1 + export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH + export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib + export EASYBUILD_MODULE_EXTENSIONS=1 module load EasyBuild eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb ret=$? From fe1843f7467fb30505a377364f044458ebd32503 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Tue, 27 Sep 2022 13:33:40 +0200 Subject: [PATCH 44/48] Ship whitelisted CUDA libs and rework scripts accordingly Whitelisted CUDA libraries can now be shipped with EESSI. The other libraries and files are replaced with symlinks to host_injections. A compiled CUDA sample can now also be shipped with EESSI. This is relevant if users only need the runtime capabilities and not the whole CUDA suite (which would include the compilers). It is now possible to solely install the compat libs as a user and get access to the runtime environment this way. It is still possible to also install the whole CUDA suite. 
CUDA enabled modules with the gpu property now only load if the compat libs are installed in host_injections. --- EESSI-pilot-install-software.sh | 6 + eb_hooks.py | 61 +++++-- gpu_support/add_nvidia_gpu_support.sh | 163 ++---------------- .../{ => cuda_utils}/get_cuda_compatlibs.sh | 2 - gpu_support/cuda_utils/install_cuda.sh | 38 ++++ .../install_cuda_compatlibs.sh | 10 +- .../install_cuda_compatlibs_loop.sh | 102 +++++++++++ .../cuda_utils/prepare_cuda_compatlibs.sh | 31 ++++ gpu_support/cuda_utils/test_cuda.sh | 82 +++++++++ gpu_support/test_cuda.sh | 62 ------- init/SitePackage.lua | 19 ++ 11 files changed, 338 insertions(+), 238 deletions(-) rename gpu_support/{ => cuda_utils}/get_cuda_compatlibs.sh (95%) create mode 100644 gpu_support/cuda_utils/install_cuda.sh rename gpu_support/{ => cuda_utils}/install_cuda_compatlibs.sh (94%) create mode 100644 gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh create mode 100755 gpu_support/cuda_utils/prepare_cuda_compatlibs.sh create mode 100644 gpu_support/cuda_utils/test_cuda.sh delete mode 100644 gpu_support/test_cuda.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 0c832fdee9..ded3e652f9 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -160,6 +160,12 @@ fail_msg="Installation of ${GCC_EC} failed!" $EB ${GCC_EC} --robot --from-pr 14453 GCCcore-9.3.0.eb check_exit_code $? "${ok_msg}" "${fail_msg}" +# install CUDA +ok_msg="CUDA installed, off to a good (?) start!" +fail_msg="Failed to install CUDA, woopsie..." +$EB CUDA-11.3.1.eb --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + # install CMake with custom easyblock that patches CMake when --sysroot is used echo ">> Install CMake with fixed easyblock to take into account --sysroot" ok_msg="CMake installed!" 
diff --git a/eb_hooks.py b/eb_hooks.py index 79cb48fd7b..886a255b74 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -122,22 +122,53 @@ def cgal_toolchainopts_precise(ec, eprefix): raise EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!") -def pre_fetch_hook(self, *args, **kwargs): - """Modify install path for CUDA software.""" +def post_package_hook(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" if self.name == 'CUDA': - self.installdir = self.installdir.replace('versions', 'host_injections') - - -def pre_module_hook(self, *args, **kwargs): - """Modify install path for CUDA software.""" - if self.name == 'CUDA': - self.installdir = self.installdir.replace('versions', 'host_injections') - - -def pre_sanitycheck_hook(self, *args, **kwargs): - """Modify install path for CUDA software.""" - if self.name == 'CUDA': - self.installdir = self.installdir.replace('versions', 'host_injections') + # install compat libraries and run test + # if the test works, move it to EESSI_SOFTWARE_PATH so we can ship the compiled test + os.system("export SAVE_COMPILED_TEST=true && ./gpu_support/add_nvidia_gpu_support.sh") + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, 'EULA.txt') + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == '2.6. Attachment A': + copy = True + continue + elif line.strip() == '2.7. 
Attachment B': + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, not really needed and they only complicate things + whitelist = [] + file_extensions = ['.so', '.a', '.h', '.bc'] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split('.')[0]) + # add compiled test to whitelist so we can ship it with EESSI + whitelist.append('deviceQuery') + whitelist = list(set(whitelist)) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split('.')[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace('versions', 'host_injections') + os.remove(source) + # have to create subdirs if they don't exit, otherwise the symlink creation fails + if not os.path.isdir(os.path.dirname(target)): + os.makedirs(os.path.dirname(target)) + os.symlink(target, source) def fontconfig_add_fonts(ec, eprefix): diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index b9f0d12f20..314cb83c50 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,6 +3,7 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh +install_cuda="${INSTALL_CUDA:=false}" install_cuda_version="${INSTALL_CUDA_VERSION:=11.3.1}" install_p7zip_version="${INSTALL_P7ZIP_VERSION:=17.04-GCCcore-10.3.0}" @@ -55,161 +56,17 @@ if [[ "${install_wo_gpu}" != "true" ]]; then fi fi 
-############################################################################################### ############################################################################################### # Install CUDA -# TODO: Can we do a trimmed install? -# if modules dir exists, load it for usage within Lmod cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" mkdir -p ${cuda_install_dir} -# only install CUDA if specified version is not found -if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version} ]; then - echo "CUDA software found! No need to install CUDA again, proceeding with tests" -else - # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` - # (CUDA is a binary installation so no need to worry too much about this) - # TODO: The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed - avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') - if (( ${avail_space} < 16000000 )); then - echo "Need more disk space to install CUDA, exiting now..." - exit 1 - fi - # install cuda in host_injections - module load EasyBuild - # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - tmpdir=$(mktemp -d) - extra_args="--rebuild --installpath-modules=${tmpdir}" - fi - eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb - ret=$? - if [ $ret -ne 0 ]; then - echo "CUDA installation failed, please check EasyBuild logs..." - exit 1 - fi - rm -rf ${tmpdir} -fi - -# Install p7zip, this will be used to install the CUDA compat libraries from rpm. -# The rpm and deb files contain the same libraries, so we just stick to the rpm version. -# If p7zip is missing from the software layer (for whatever reason), we need to install it. 
-# This has to happen in host_injections, so we check first if it is already installed there. -if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ -fi -module avail 2>&1 | grep -i p7zip &> /dev/null -if [[ $? -eq 0 ]]; then - echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" -else - # install p7zip in host_injections - export EASYBUILD_IGNORE_OSDEPS=1 - export EASYBUILD_SYSROOT=${EPREFIX} - export EASYBUILD_RPATH=1 - export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH - export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib - export EASYBUILD_MODULE_EXTENSIONS=1 - module load EasyBuild - eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb - ret=$? - if [ $ret -ne 0 ]; then - echo "p7zip installation failed, please check EasyBuild logs..." - exit 1 - fi -fi - -# Check if the CUDA compat libraries are installed and compatible with the target CUDA version -# if not find the latest version of the compatibility libraries and install them - -# get URL to latest CUDA compat libs, exit if URL is invalid -cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" -ret=$? 
-if [ $ret -ne 0 ]; then - echo $cuda_compat_urls - exit 1 +if [ "${install_cuda}" != false ]; then + bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda.sh ${install_cuda_version} ${cuda_install_dir} fi - -# loop over the compat library versions until we get one that works for us -keep_driver_check=1 -# Do a maximum of five attempts -for value in {1..5} -do - latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1) - # Chomp that value out of the list - cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) - latest_driver_version="${latest_cuda_compat_url%-*}" - latest_driver_version="${latest_driver_version##*-}" - # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed - if [[ ! $latest_driver_version =~ ^[0-9]+$ ]]; then - latest_driver_version="${latest_driver_version##*_}" - fi - - install_compat_libs=false - host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" - # libcuda.so points to actual cuda compat lib with driver version in its name - # if this file exists, cuda compat libs are installed and we can compare the version - if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then - eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) - eessi_driver_version="${eessi_driver_version##*so.}" - else - eessi_driver_version=0 - fi - - if [ $keep_driver_check -eq 1 ] - then - # only keep the driver check for the latest version - keep_driver_check=0 - else - eessi_driver_version=0 - fi - - if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then - install_compat_libs=true - else - echo "CUDA compat libs are up-to-date, skip installation." - fi - - if [ "${install_compat_libs}" == true ]; then - bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url - fi - - if [[ "${install_wo_gpu}" != "true" ]]; then - bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" - if [ $? 
-eq 0 ] - then - exit 0 - else - echo - echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" - echo "I'll try an older release to see if that will work..." - echo - fi - else - echo "Requested to install CUDA without GPUs present, so we skip final tests." - echo "Instead we test if module load CUDA works as expected..." - if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ - else - echo "Cannot load CUDA, modules path does not exist, exiting now..." - exit 1 - fi - module load CUDA - ret=$? - if [ $ret -ne 0 ]; then - echo "Could not load CUDA even though modules path exists..." - exit 1 - else - echo "Successfully loaded CUDA, you are good to go! :)" - echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" - echo " - To use these modules:" - echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" - echo " - Please keep in mind that we just installed the latest CUDA compat libs." - echo " Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)." - exit 0 - fi - break - fi -done - -echo "Tried to install 5 different generations of compat libraries and none worked," -echo "this usually means your driver is very out of date!" -exit 1 +############################################################################################### +# Prepare installation of CUDA compat libraries, i.e. install p7zip if it is missing +$(dirname "$BASH_SOURCE")/cuda_utils/prepare_cuda_compatlibs.sh ${install_p7zip_version} ${cuda_install_dir} +############################################################################################### +# Try installing five different versions of CUDA compat libraries until the test works. 
+# Otherwise, give up +bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh ${cuda_install_dir} ${install_cuda_version} diff --git a/gpu_support/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh similarity index 95% rename from gpu_support/get_cuda_compatlibs.sh rename to gpu_support/cuda_utils/get_cuda_compatlibs.sh index c0c6631b06..9639917a27 100755 --- a/gpu_support/get_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh @@ -1,7 +1,5 @@ #!/bin/bash -current_dir=$(dirname $(realpath $0)) - # Get arch type from EESSI environment if [[ -z "${EESSI_CPU_FAMILY}" ]]; then # set up basic environment variables, EasyBuild and Lmod diff --git a/gpu_support/cuda_utils/install_cuda.sh b/gpu_support/cuda_utils/install_cuda.sh new file mode 100644 index 0000000000..ea59f45914 --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +install_cuda_version=$1 + +# TODO: Can we do a trimmed install? +# Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version} ]; then + echo "CUDA software found! No need to install CUDA again, proceeding with tests" +else + # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need to worry too much about this) + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed + avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') + if (( ${avail_space} < 16000000 )); then + echo "Need more disk space to install CUDA, exiting now..." 
+ exit 1 + fi + # install cuda in host_injections + module load EasyBuild + # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + tmpdir=$(mktemp -d) + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "CUDA installation failed, please check EasyBuild logs..." + exit 1 + fi + # clean up tmpdir if it exists + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + rm -rf ${tmpdir} + fi +fi diff --git a/gpu_support/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh similarity index 94% rename from gpu_support/install_cuda_compatlibs.sh rename to gpu_support/cuda_utils/install_cuda_compatlibs.sh index b3c10aae86..11a7aa7e3d 100644 --- a/gpu_support/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -1,6 +1,7 @@ #!/bin/bash libs_url=$1 +cuda_install_dir=$2 current_dir=$(dirname $(realpath $0)) host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" @@ -15,19 +16,16 @@ else fi cd ${host_injections_dir} -# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest - +# Check if our target CUDA is satisfied by what is installed already +# TODO: Find required CUDA version and see if we need an update driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA 
compatability libraries"; fi fi -# Check if our target CUDA is satisfied by what is installed already -# TODO: Find required CUDA version and see if we need an update - # If not, grab the latest compat library RPM or deb -# download and unpack in temporary directory, easier cleanup after installation +# Download and unpack in temporary directory, easier cleanup after installation tmpdir=$(mktemp -d) cd $tmpdir compat_file=${libs_url##*/} diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh new file mode 100644 index 0000000000..c3a17c6f2a --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +cuda_install_dir=$1 +install_cuda_version=$2 + +# Check if the CUDA compat libraries are installed and compatible with the target CUDA version +# if not find the latest version of the compatibility libraries and install them + +# get URL to latest CUDA compat libs, exit if URL is invalid +cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" +ret=$? +if [ $ret -ne 0 ]; then + echo "Couldn't find current URLs of the CUDA compat libraries, instead got:" + echo $cuda_compat_urls + exit 1 +fi + +# loop over the compat library versions until we get one that works for us +keep_driver_check=1 +# Do a maximum of five attempts +for value in {1..5} +do + latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1) + # Chomp that value out of the list + cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) + latest_driver_version="${latest_cuda_compat_url%-*}" + latest_driver_version="${latest_driver_version##*-}" + # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed + if [[ ! 
$latest_driver_version =~ ^[0-9]+$ ]]; then + latest_driver_version="${latest_driver_version##*_}" + fi + + install_compat_libs=false + host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" + # libcuda.so points to actual cuda compat lib with driver version in its name + # if this file exists, cuda compat libs are installed and we can compare the version + if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then + eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) + eessi_driver_version="${eessi_driver_version##*so.}" + else + eessi_driver_version=0 + fi + + if [ $keep_driver_check -eq 1 ] + then + # only keep the driver check for the latest version + keep_driver_check=0 + else + eessi_driver_version=0 + fi + + if (( ${latest_driver_version//./} > ${eessi_driver_version//./} )); then + install_compat_libs=true + else + echo "CUDA compat libs are up-to-date, skip installation." + fi + + if [ "${install_compat_libs}" == true ]; then + bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${cuda_install_dir} + fi + + if [[ "${install_wo_gpu}" != "true" ]]; then + bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" + if [ $? -eq 0 ] + then + exit 0 + else + echo + echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" + echo "I'll try an older release to see if that will work..." + echo + fi + else + echo "Requested to install CUDA without GPUs present, so we skip final tests." + echo "Instead we test if module load CUDA works as expected..." + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + else + echo "Cannot load CUDA, modules path does not exist, exiting now..." + exit 1 + fi + module load CUDA + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." + exit 1 + else + echo "Successfully loaded CUDA, you are good to go! 
:)" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + echo " - Please keep in mind that we just installed the latest CUDA compat libs." + echo " Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)." + exit 0 + fi + break + fi +done + +echo "Tried to install 5 different generations of compat libraries and none worked," +echo "this usually means your driver is very out of date!" +exit 1 diff --git a/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh new file mode 100755 index 0000000000..9efd2b8e66 --- /dev/null +++ b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +install_p7zip_version=$1 +cuda_install_dir=$2 + +# Install p7zip, this will be used to install the CUDA compat libraries from rpm. +# The rpm and deb files contain the same libraries, so we just stick to the rpm version. +# If p7zip is missing from the software layer (for whatever reason), we need to install it. +# This has to happen in host_injections, so we check first if it is already installed there. +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ +fi +module avail 2>&1 | grep -i p7zip &> /dev/null +if [[ $? -eq 0 ]]; then + echo "p7zip module found! 
No need to install p7zip again, proceeding with installation of compat libraries" +else + # install p7zip in host_injections + export EASYBUILD_IGNORE_OSDEPS=1 + export EASYBUILD_SYSROOT=${EPREFIX} + export EASYBUILD_RPATH=1 + export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH + export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib + export EASYBUILD_MODULE_EXTENSIONS=1 + module load EasyBuild + eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "p7zip installation failed, please check EasyBuild logs..." + exit 1 + fi +fi diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh new file mode 100644 index 0000000000..010a97c4d9 --- /dev/null +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +install_cuda_version=$1 +save_compiled_test="${SAVE_COMPILED_TEST:=false}" + +# Test CUDA +cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" +current_dir=$PWD +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ +else + echo "Cannot test CUDA, modules path does not exist, exiting now..." + exit 1 +fi +module load CUDA/${install_cuda_version} +ret=$? +if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." 
+ exit 1 +fi +# if we don't want to save the compiled sample, it means we have a shipped version available +if [ "${save_compiled_test}" != false ]; then + tmp_dir=$(mktemp -d) + # convert cuda version to an integer so we can test if the samples are shipped with this version + # starting from version 11.6 the samples can be found in a github repo + cuda_version=$(echo ${install_cuda_version} | cut -f1,2 -d'.') + cuda_version=${cuda_version//./} + if (( ${cuda_version} < 116 )); then + cp -r $EBROOTCUDA/samples $tmp_dir + cd $tmp_dir/samples/1_Utilities/deviceQuery + else + git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} + cd $tmp_dir/Samples/1_Utilities/deviceQuery + fi + module load GCCcore + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..." + exit 1 + fi + make HOST_COMPILER=$(which g++) -j +else + cd ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} +fi +./deviceQuery + +if [ $? -eq 0 ] +then + # Set the color variable + green='\033[0;32m' + # Clear the color after that + clear='\033[0m' + echo -e ${green} + echo "Congratulations, your GPU is working with EESSI!" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + echo -e ${clear} + + if [ "${save_compiled_test}" != false ]; then + mv deviceQuery ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} + fi + + # Clean up + cd $current_dir + if [ "${save_compiled_test}" != false ]; then + rm -r $tmp_dir + fi +else + echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 + # Clean up + cd $current_dir + if [ "${save_compiled_test}" != false ]; then + rm -r $tmp_dir + fi + exit 1 +fi + +# Test a CUDA-enabled module from EESSI +# TODO: GROMACS? +# TODO: Include a GDR copy test? 
+############################################################################################### diff --git a/gpu_support/test_cuda.sh b/gpu_support/test_cuda.sh deleted file mode 100644 index 17b56198e4..0000000000 --- a/gpu_support/test_cuda.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -install_cuda_version=$1 - -# Test CUDA -cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" -if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ -else - echo "Cannot test CUDA, modules path does not exist, exiting now..." - exit 1 -fi -module load CUDA/${install_cuda_version} -ret=$? -if [ $ret -ne 0 ]; then - echo "Could not load CUDA even though modules path exists..." - exit 1 -fi -tmp_dir=$(mktemp -d) -cp -r $EBROOTCUDA/samples $tmp_dir -current_dir=$PWD -cd $tmp_dir/samples/1_Utilities/deviceQuery -module load GCCcore -ret=$? -if [ $ret -ne 0 ]; then - echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..." - exit 1 -fi -make HOST_COMPILER=$(which g++) -j -./deviceQuery - -if [ $? -eq 0 ] -then - # Set the color variable - green='\033[0;32m' - # Clear the color after that - clear='\033[0m' - echo -e ${green} - echo "Congratulations, your GPU is working with EESSI!" - echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" - echo " - To use these modules:" - echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" - echo -e ${clear} - - # Clean up - cd $current_dir - rm -r $tmp_dir -else - echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 - # Clean up - cd $current_dir - rm -r $tmp_dir - exit 1 -fi - -# Test building something with CUDA and running -# TODO: Use samples from installation directory, `device_query` is a good option - -# Test a CUDA-enabled module from EESSI -# TODO: GROMACS? -# TODO: Include a GDR copy test? 
-############################################################################################### diff --git a/init/SitePackage.lua b/init/SitePackage.lua index c6ef9babf6..ab0e2576d8 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -38,6 +38,24 @@ local function cuda_load_hook(t) cudaDir = cudaDir .. "/software/" .. t.modFullName local cudaDirExists = exists(cudaDir) if not cudaDirExists then + io.stderr:write("You requested to load ",simpleName,"\n") + io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") + io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") + io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as a simple:\n") + io.stderr:write("export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh\n") + frameStk:__clear() + end + end +end + +local function cuda_enabled_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" + local compatDirExists = exists(compatDir) + if not compatDirExists then + local haveGpu = mt:haveProperty(modT.sn,"arch","gpu") + if haveGpu then io.stderr:write("You requested to load ",simpleName,"\n") io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") @@ -49,4 +67,5 @@ local function cuda_load_hook(t) end hook.register("load", cuda_load_hook) +hook.register("load", cuda_enabled_load_hook) hook.register("isVisibleHook", visible_hook) From 70e5dec43adfc2bedc6d6483b8df4e9fb02f5990 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 29 Sep 2022 16:49:04 +0200 Subject: [PATCH 45/48] If EULA file exists, CUDA is inst. 
in host_injections + some fixes --- gpu_support/cuda_utils/install_cuda.sh | 3 ++- init/SitePackage.lua | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda.sh b/gpu_support/cuda_utils/install_cuda.sh index ea59f45914..4eef0ecdf4 100644 --- a/gpu_support/cuda_utils/install_cuda.sh +++ b/gpu_support/cuda_utils/install_cuda.sh @@ -1,13 +1,14 @@ #!/bin/bash install_cuda_version=$1 +cuda_install_dir=$2 # TODO: Can we do a trimmed install? # Only install CUDA if specified version is not found. # This is only relevant for users, the shipped CUDA installation will # always be in versions instead of host_injections and have symlinks pointing # to host_injections for everything we're not allowed to ship -if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version} ]; then +if [ -f ${cuda_install_dir}/software/CUDA/${install_cuda_version}/EULA.txt ]; then echo "CUDA software found! No need to install CUDA again, proceeding with tests" else # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` diff --git a/init/SitePackage.lua b/init/SitePackage.lua index ab0e2576d8..4c35ceb281 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -54,7 +54,8 @@ local function cuda_enabled_load_hook(t) local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" local compatDirExists = exists(compatDir) if not compatDirExists then - local haveGpu = mt:haveProperty(modT.sn,"arch","gpu") + local simpleName = string.match(t.modFullName, "(.-)/") + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") if haveGpu then io.stderr:write("You requested to load ",simpleName,"\n") io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") From 1075d0b664c5b5d19386d33dde6386f50c612192 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 29 Sep 2022 18:46:35 +0200 Subject: [PATCH 46/48] Add check if CUDA compat 
lib version is sufficient for module The CUDA version needed for modules are now written as envvars that will be exported into the module files. The CUDA version for which we have the current compat libs installed is saved in a txt file in ../host_injections/nvidia/latest/version.txt The lmod hook called when loading modules with the gpu property now compares these two versions and exits out if the installed version needs to be updated. --- eb_hooks.py | 10 +++++-- gpu_support/add_nvidia_gpu_support.sh | 3 ++ init/SitePackage.lua | 43 +++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 886a255b74..cf16600163 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -48,15 +48,21 @@ def inject_gpu_property(ec): "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])] or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS ): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") key = "modluafooter" value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict["builddependencies"] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % (cuda_version)]) if key in ec_dict: if not value in ec_dict[key]: ec[key] = "\n".join([ec_dict[key], value]) else: ec[key] = value - ec.log.info("[parse hook] Injecting gpu as Lmod arch property") - return ec def parse_hook(ec, *args, **kwargs): diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 314cb83c50..d9a97fe2d9 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -70,3 +70,6 @@ $(dirname "$BASH_SOURCE")/cuda_utils/prepare_cuda_compatlibs.sh ${install_p7zip_ # Try installing five different versions of CUDA 
compat libraries until the test works. # Otherwise, give up bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh ${cuda_install_dir} ${install_cuda_version} + +cuda_version_file="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt" +echo ${install_cuda_version} > ${cuda_version_file} diff --git a/init/SitePackage.lua b/init/SitePackage.lua index 4c35ceb281..d8de872fef 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -1,5 +1,14 @@ require("strict") local hook = require("Hook") +local open = io.open + +local function read_file(path) + local file = open(path, "rb") -- r read mode and b binary mode + if not file then return nil end + local content = file:read "*a" -- *a or *all reads the whole file + file:close() + return content +end -- from https://stackoverflow.com/a/40195356 --- Check if a file or directory exists in this path @@ -51,12 +60,12 @@ end local function cuda_enabled_load_hook(t) local frameStk = require("FrameStk"):singleton() local mt = frameStk:mt() - local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" - local compatDirExists = exists(compatDir) - if not compatDirExists then - local simpleName = string.match(t.modFullName, "(.-)/") - local haveGpu = mt:haveProperty(simpleName,"arch","gpu") - if haveGpu then + local simpleName = string.match(t.modFullName, "(.-)/") + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then + local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" + local compatDirExists = exists(compatDir) + if not compatDirExists then io.stderr:write("You requested to load ",simpleName,"\n") io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") @@ -64,6 +73,28 @@ local function cuda_enabled_load_hook(t) io.stderr:write("./add_nvidia_gpu_support.sh\n") 
frameStk:__clear() end + local cudaVersion = read_file("/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt") + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + local major, minor, patch = string.match(cudaVersion, "(%d+)%.(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion, "(%d+)%.(%d+)%.(%d+)") + local compat_libs_need_update = false + if major < major_req then + compat_libs_need_update = true + elseif major == major_req then + if minor < minor_req then + compat_libs_need_update = true + elseif minor == minor_req then + if patch < patch_req then + compat_libs_need_update = true + end + end + end + if compat_libs_need_update == true then + io.stderr:write("You requested to load CUDA version ",cudaVersion) + io.stderr:write("but the module you want to load requires CUDA version ",cudaVersion_req,".\n") + io.stderr:write("Please update your CUDA compatibility libraries in order to use ",simpleName,".\n") + frameStk:__clear() + end end end From d65fe3083691ccfec539377dac38d3a916fcba59 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Fri, 30 Sep 2022 12:44:29 +0200 Subject: [PATCH 47/48] Pass CUDA version from eb hook to compat lib script + fix test dir rm The fix for removing the temporary test dir is needed when cloning the samples from github, i.e. for CUDA > 11.6.0. Otherwise, the script call from the eb hook will get stuck. 
--- eb_hooks.py | 4 +++- gpu_support/cuda_utils/test_cuda.sh | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index cf16600163..472533e0bd 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -131,9 +131,11 @@ def cgal_toolchainopts_precise(ec, eprefix): def post_package_hook(self, *args, **kwargs): """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" if self.name == 'CUDA': + cuda_version = self.installdir.split('/')[-1] + print_msg("Attempt to get compat libs for CUDA version: %s" % (cuda_version)) # install compat libraries and run test # if the test works, move it to EESSI_SOFTWARE_PATH so we can ship the compiled test - os.system("export SAVE_COMPILED_TEST=true && ./gpu_support/add_nvidia_gpu_support.sh") + os.system("export INSTALL_CUDA_VERSION=%s && export SAVE_COMPILED_TEST=true && ./gpu_support/add_nvidia_gpu_support.sh" % (cuda_version)) print_msg("Replacing CUDA stuff we cannot ship with symlinks...") # read CUDA EULA eula_path = os.path.join(self.installdir, 'EULA.txt') diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh index 010a97c4d9..6939c77815 100644 --- a/gpu_support/cuda_utils/test_cuda.sh +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -29,7 +29,7 @@ if [ "${save_compiled_test}" != false ]; then cp -r $EBROOTCUDA/samples $tmp_dir cd $tmp_dir/samples/1_Utilities/deviceQuery else - git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} + git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} -q cd $tmp_dir/Samples/1_Utilities/deviceQuery fi module load GCCcore @@ -64,14 +64,14 @@ then # Clean up cd $current_dir if [ "${save_compiled_test}" != false ]; then - rm -r $tmp_dir + rm -rf $tmp_dir fi else echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 # Clean up cd $current_dir if [ "${save_compiled_test}" != false ]; then - rm -r $tmp_dir + rm -rf $tmp_dir fi exit 
1 fi From 2ac267105fab56db74553d83c8769203e6b4c3f9 Mon Sep 17 00:00:00 2001 From: Michael Huebner Date: Thu, 20 Oct 2022 15:49:33 +0200 Subject: [PATCH 48/48] Update documentation and merge both Lmod load hooks --- gpu_support/README.md | 19 ++++++++++++++++++- init/SitePackage.lua | 24 +++++++++++------------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/gpu_support/README.md b/gpu_support/README.md index 48172641af..226a7fca6c 100644 --- a/gpu_support/README.md +++ b/gpu_support/README.md @@ -1,10 +1,27 @@ # How to add GPU support The collection of scripts in this directory enables you to add GPU support to your setup. Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free to contribute that though!). -To enable the usage of CUDA in your setup, simply run the following script: +To enable the usage of the CUDA runtime in your setup, simply run the following script: ``` ./add_nvidia_gpu_support.sh ``` +This script will install the compatibility libraries (and only those by default!) you need to use the shipped runtime environment of CUDA. + +If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script execution as follows: +``` +export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh +``` +This will again install the needed compatibility libraries as well as the whole CUDA suite. + +If you need a different CUDA version than what is shipped with EESSI, you can also specify that particular version for the script: +``` +export INSTALL_CUDA_VERSION=xx.y.z && export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh +``` +Please note, however, that versions for which the runtime is not shipped with EESSI are not installed in the default modules path. 
+Thus, you will have to add the following to your modules path to get access to your custom CUDA version: +``` +module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/ +``` ## Prerequisites and tips * You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. * If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). diff --git a/init/SitePackage.lua b/init/SitePackage.lua index d8de872fef..d7ab2eefad 100644 --- a/init/SitePackage.lua +++ b/init/SitePackage.lua @@ -36,11 +36,14 @@ local function visible_hook(modT) end end -local function cuda_load_hook(t) +local function cuda_enabled_load_hook(t) local frameStk = require("FrameStk"):singleton() - -- needed to check if we are trying to load the CUDA module + local mt = frameStk:mt() local simpleName = string.match(t.modFullName, "(.-)/") - if string.match(simpleName, 'CUDA') ~= nil then + local eprefix = os.getenv('EESSI_PREFIX') .. 
"/init/gpu_support" + -- if we try to load CUDA itself, check if the software exists in host_injections + -- otherwise, refuse to load CUDA and print error message + if simpleName == 'CUDA' then -- get the full host_injections path local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') -- build final path where the CUDA software should be installed @@ -55,21 +58,17 @@ local function cuda_load_hook(t) frameStk:__clear() end end -end - -local function cuda_enabled_load_hook(t) - local frameStk = require("FrameStk"):singleton() - local mt = frameStk:mt() - local simpleName = string.match(t.modFullName, "(.-)/") + -- when loading CUDA enabled modules check if the neccessary matching compatibility libraries are installed + -- otherwise, refuse to load the requested module and print error message local haveGpu = mt:haveProperty(simpleName,"arch","gpu") if haveGpu then local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" local compatDirExists = exists(compatDir) if not compatDirExists then io.stderr:write("You requested to load ",simpleName,"\n") - io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") - io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") - io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as a simple:\n") + io.stderr:write("which relies on the CUDA runtime environment and its compatibility libraries.\n") + io.stderr:write("In order to be able to use the module, please follow the instructions in the\n") + io.stderr:write("gpu_support folder. 
Installing the needed compatibility libraries can be as easy as ~~a simple~~:\n") io.stderr:write("./add_nvidia_gpu_support.sh\n") frameStk:__clear() end @@ -98,6 +97,5 @@ local function cuda_enabled_load_hook(t) end end -hook.register("load", cuda_load_hook) hook.register("load", cuda_enabled_load_hook) hook.register("isVisibleHook", visible_hook)