
Add CUDA support #172

Closed · wants to merge 50 commits
Changes from 17 commits
065efd1
Add scripts to support CUDA
huebner-m Apr 14, 2022
caa43bf
Fix dump location of check whether CUDA module is installed
huebner-m May 12, 2022
d7212a0
Remove setup script, use shipped init script to set env vars etc. ins…
huebner-m May 12, 2022
48f4455
Check return values and path existence in CUDA tests
huebner-m May 13, 2022
c50daa2
Check return value of eb install, improve source of other scripts
huebner-m May 13, 2022
d4e85cc
Use mktemp to create temporary directory to install compat libs
huebner-m May 13, 2022
590e042
Fix echo
huebner-m May 13, 2022
7b9bb49
Replace explicit dir names with variables, check symlink destination
huebner-m May 16, 2022
01844c6
Install CUDA in modified version of EESSI_SOFTWARE_PATH
huebner-m May 16, 2022
0e8861f
If CUDA install dir exists, add it to EESSI_MODULE_PATH
huebner-m May 16, 2022
7d6af69
Use env var to check for GPU support and add this to module path
huebner-m May 16, 2022
2cc5ce9
Move (conditional) installation of cuda compat libs to external script
huebner-m May 18, 2022
d53e80e
Consistently use EESSI_SITE_MODULEPATH to set up GPU support for Lmod
huebner-m May 18, 2022
850c20e
Rename script to add (NVIDIA) GPU support, add dummy script for AMD GPUs
huebner-m May 19, 2022
9b2e72f
Add shebang
huebner-m May 19, 2022
5f82658
Add option to disable checks, enables installation on nodes w/o GPUs
huebner-m May 19, 2022
16e87af
Allow using an environment variable to skip GPU checks
huebner-m May 19, 2022
cf65a37
Update list of CUDA enabled toolchains
huebner-m May 19, 2022
7319db2
Tell users to use the updated path to enable CUDA support
huebner-m May 19, 2022
6537725
Add protection against warning if CUDA is not installed on host
huebner-m May 20, 2022
2ba47e4
Add README for GPU support
huebner-m Jun 2, 2022
ac268b1
Iterate over compat libs until we find something that works
Jun 3, 2022
bb5301b
Don't use source when we don't need to
Jun 3, 2022
75ce850
Merge pull request #1 from ocaisa/add_gpu_support
huebner-m Jun 8, 2022
17b7662
Small adjustments to make things work on Debian10, remove debug state…
huebner-m Jun 8, 2022
03b01f1
Make installed CUDA version configurable via env var with a default
huebner-m Jun 8, 2022
dadb170
Use generic latest symlink when sourcing init/bash instead specific v…
huebner-m Jun 8, 2022
0f5884f
Implement suggested changes (don't source when not needed, update REA…
huebner-m Jun 13, 2022
5f2c1f6
Add exit code and more detailed message when installing without GPUs
huebner-m Jun 17, 2022
efe5f88
Merge branch 'main' of github.com:EESSI/software-layer into add_gpu_s…
huebner-m Jul 19, 2022
ab95873
Update error message when nvidia-smi returns an error code
huebner-m Jul 19, 2022
63fded6
Convert OS version for Ubuntu systems when getting CUDA compat libs
huebner-m Jul 25, 2022
d3cadb5
Use rpm files for all OSes and p7zip to unpack them
huebner-m Jul 27, 2022
81e4135
Rename driver_version to driver_major_version
huebner-m Jul 27, 2022
ec9dd69
Prepare shipping CUDA module file with EESSI
huebner-m Jul 29, 2022
0c1004d
Remove loading of CUDA specific module locations
huebner-m Sep 7, 2022
7a9827b
Check for full CUDA software path (incl. version) when loading the mo…
huebner-m Sep 7, 2022
6e86649
Refine install of p7zip, keep it until software layer provides it
huebner-m Sep 7, 2022
e38391b
Prepend lmod module path only if the dir actually exists
huebner-m Sep 13, 2022
b2a4865
Make printout of CUDA installation more accurate
huebner-m Sep 13, 2022
fb73d12
Only install CUDA module in tmpdir if it's already shipped in EESSI
huebner-m Sep 13, 2022
8c8a227
Load correct module env as long as p7zip is not part of software layer
huebner-m Sep 13, 2022
f90dd66
Load CUDA version specified to for installation when testing
huebner-m Sep 13, 2022
a24e09c
Load GCCcore module when building test executable for CUDA
huebner-m Sep 13, 2022
3d0ebad
Add EasyBuild configuration for p7zip installation
huebner-m Sep 13, 2022
fe1843f
Ship whitelisted CUDA libs and rework scripts accordingly
huebner-m Sep 27, 2022
70e5dec
If EULA file exists, CUDA is inst. in host_injections + some fixes
huebner-m Sep 29, 2022
1075d0b
Add check if CUDA compat lib version is sufficient for module
huebner-m Sep 29, 2022
d65fe30
Pass CUDA version from eb hook to compat lib script + fix test dir rm
huebner-m Sep 30, 2022
2ac2671
Update documentation and merge both Lmod load hooks
huebner-m Oct 20, 2022
4 changes: 4 additions & 0 deletions EESSI-pilot-install-software.sh
@@ -108,6 +108,10 @@ module --force purge
# ignore current $MODULEPATH entirely
module unuse $MODULEPATH
module use $EASYBUILD_INSTALLPATH/modules/all
if [ ! -z "${EESSI_SITE_MODULEPATH}" ]; then
Review comment (Member): This will no longer be necessary once we ship the CUDA module with EESSI (with the actual installation being under host_injections).

echo_green "Add ${EESSI_SITE_MODULEPATH} to \$MODULEPATH for GPU support!"
module use ${EESSI_SITE_MODULEPATH}
fi
if [[ -z ${MODULEPATH} ]]; then
fatal_error "Failed to set up \$MODULEPATH?!"
else
21 changes: 20 additions & 1 deletion eb_hooks.py
@@ -7,7 +7,7 @@
from easybuild.tools.systemtools import AARCH64, POWER, get_cpu_architecture

EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs'

CUDA_ENABLED_TOOLCHAINS = ["pmvmklc", "gmvmklc", "gmvapich2c", "pmvapich2c"]

def get_eessi_envvar(eessi_envvar):
"""Get an EESSI environment variable from the environment"""
@@ -41,13 +41,32 @@ def get_rpath_override_dirs(software_name):

return rpath_injection_dirs

def inject_gpu_property(ec):
ec_dict = ec.asdict()
# Check if CUDA is in the dependencies, if so add the GPU Lmod tag
if (
"CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]
or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS
):
key = "modluafooter"
value = 'add_property("arch","gpu")'
Review comment (Member): I think gpu is a recognised property in Lmod so a good choice for now. Once we add AMD support it will get more complicated.

Reply (Contributor, Author): We can add a new property by extending the property table propT. To do so, we could add a file init/lmodrc.lua with a new property. This file can be loaded using the env var $LMOD_RC. Unfortunately, we do not seem to be able to add entries to arch but rather have to add a new property (or find a way to extend arch that I'm missing).
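The reply above sketches how a custom property could be registered. A minimal, hypothetical illustration in shell: the propT layout follows the structure of Lmod's shipped lmodrc.lua, but the gpu_vendor property name and its values are invented for this sketch and are not part of the PR.

```shell
# Write a hypothetical init/lmodrc.lua that extends Lmod's property table,
# then point $LMOD_RC at it so Lmod picks it up on startup.
lmodrc_dir=$(mktemp -d)
cat > "${lmodrc_dir}/lmodrc.lua" << 'EOF'
propT = {
   gpu_vendor = {
      validT = { nvidia = 1, amd = 1 },
      displayT = {
         nvidia = { short = "(gpu:nvidia)", long = "Built for NVIDIA GPUs" },
         amd    = { short = "(gpu:amd)",    long = "Built for AMD GPUs"    },
      },
   },
}
EOF
export LMOD_RC="${lmodrc_dir}/lmodrc.lua"
# An easyconfig hook could then inject: add_property("gpu_vendor", "nvidia")
grep -q 'gpu_vendor' "$LMOD_RC" && echo "property table ready"
```

Whether arch itself can be extended is left open in the thread; this sketch sidesteps the question by defining a separate property.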

if key in ec_dict:
if not value in ec_dict[key]:
ec[key] = "\n".join([ec_dict[key], value])
else:
ec[key] = value
ec.log.info("[parse hook] Injecting gpu as Lmod arch property")

return ec

def parse_hook(ec, *args, **kwargs):
"""Main parse hook: trigger custom functions based on software name."""

# determine path to Prefix installation in compat layer via $EPREFIX
eprefix = get_eessi_envvar('EPREFIX')

ec = inject_gpu_property(ec)

if ec.name in PARSE_HOOKS:
PARSE_HOOKS[ec.name](ec, eprefix)

14 changes: 14 additions & 0 deletions gpu_support/add_amd_gpu_support.sh
@@ -0,0 +1,14 @@
#!/bin/bash

cat << EOF
This is not implemented yet :(

If you would like to contribute this support there are a few things you will
need to consider:
- We will need to change the Lmod property added to GPU software so we can
distinguish AMD and Nvidia GPUs
- Support should be implemented in user space, if this is not possible (e.g.,
requires a driver update) you need to tell the user what to do
- Support needs to be _verified_ and a trigger put in place (like the existence
of a particular path) so we can tell Lmod to display the associated modules
EOF
190 changes: 190 additions & 0 deletions gpu_support/add_nvidia_gpu_support.sh
@@ -0,0 +1,190 @@
#!/bin/bash

# Drop into the prefix shell or pipe this script into a Prefix shell with
# $EPREFIX/startprefix <<< /path/to/this_script.sh

# If you want to install CUDA support on login nodes (typically without GPUs),
# set this variable to true. This will skip all GPU-dependent checks
install_wo_gpu=false
[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true

# verify existence of nvidia-smi or this is a waste of time
# Check if nvidia-smi exists and can be executed without error
if [[ "${install_wo_gpu}" != "true" ]]; then
if command -v nvidia-smi > /dev/null 2>&1; then
nvidia-smi > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "nvidia-smi was found but returned error code, exiting now..." >&2
exit 1
fi
echo "nvidia-smi found, continue setup."
else
echo "nvidia-smi not found, exiting now..." >&2
echo "If you do not have a GPU on this device but wish to force the installation,"
echo "please set the environment variable INSTALL_WO_GPU=true"
exit 1
fi
else
echo "You requested to install CUDA without GPUs present."
echo "This means that all GPU-dependent tests/checks will be skipped!"
fi

# set up basic environment variables, EasyBuild and Lmod
EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash

current_dir=$(dirname $(realpath $0))

# Get arch type from EESSI environment
eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}"

# Get OS family
# TODO: needs more thorough testing
os_family=$(uname | tr '[:upper:]' '[:lower:]')

# Get OS version
# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348
if [ -f /etc/os-release ]; then
# freedesktop.org and systemd
. /etc/os-release
os=$NAME
ver=$VERSION_ID
if [[ "$os" == *"Rocky"* ]]; then
os="rhel"
fi
if [[ "$os" == *"Debian"* ]]; then
os="debian"
fi
elif type lsb_release >/dev/null 2>&1; then
# linuxbase.org
os=$(lsb_release -si)
ver=$(lsb_release -sr)
elif [ -f /etc/lsb-release ]; then
# For some versions of Debian/Ubuntu without lsb_release command
. /etc/lsb-release
os=$DISTRIB_ID
ver=$DISTRIB_RELEASE
elif [ -f /etc/debian_version ]; then
# Older Debian/Ubuntu/etc.
os=Debian
ver=$(cat /etc/debian_version)
else
# Fall back to uname, e.g. "Linux <version>", also works for BSD, etc.
os=$(uname -s)
ver=$(uname -r)
fi
# Convert OS version to major versions, e.g. rhel8.5 -> rhel8
# TODO: needs testing for e.g. Ubuntu 20.04
ver=${ver%.*}
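As a quick aside on the `${ver%.*}` expansion used above (and the Ubuntu TODO): the shortest-suffix deletion drops only the final version component, which matters for multi-part versions.

```shell
# ${ver%.*} strips the shortest suffix starting at the last dot,
# i.e. it drops only the final version component.
ver="8.5";   echo "${ver%.*}"    # rhel8.5 -> 8
ver="20.04"; echo "${ver%.*}"    # Ubuntu -> 20 (the repo URL needs "2004", handled separately)
ver="10.12"; echo "${ver%.*}"    # Debian -> 10
# For versions with more components only the last part is removed:
ver="11.6.2"; echo "${ver%.*}"   # -> 11.6; use ${ver%%.*} to keep just "11"
```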

##############################################################################################
# Check that the CUDA driver version is adequate
# (
# needs to be r450 or r470 which are LTS, other production branches are acceptable but not
# recommended, below r450 is not compatible [with an exception we will not explore,see
# https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers]
# )
# only check first number in case of multiple GPUs
if [[ "${install_wo_gpu}" != "true" ]]; then
driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1)
driver_version="${driver_version%%.*}"
Review comment (Member): This should be driver_major_version.

# Now check driver_version for compatibility
# Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers
if (( $driver_version < 450 )); then
echo "Your NVIDIA driver version is too old, please update first.."
exit 1
fi
fi


# Check if the CUDA compat libraries are installed and compatible with the target CUDA version
# if not find the latest version of the compatibility libraries and install them

# get URL to latest CUDA compat libs, exit if URL is invalid
latest_cuda_compat_url="$($(dirname "$BASH_SOURCE")/get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})"
ret=$?
if [ $ret -ne 0 ]; then
echo $latest_cuda_compat_url
exit 1
fi
latest_driver_version="${latest_cuda_compat_url%-*}"
latest_driver_version="${latest_driver_version##*_}"
Review comment (Member), suggested change:
-latest_driver_version="${latest_driver_version##*_}"
+latest_driver_version="${latest_driver_version##*-}"
I needed this change for this to work for me.

Reply (Contributor, Author): This has been addressed in your PR (75ce850) with a small modification in commit 17b7662.
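With a hypothetical compat-lib URL (the file name here is illustrative, modeled on NVIDIA's repo naming), the two expansions from the thread behave like this:

```shell
# Illustrative URL; real file names follow NVIDIA's compat-package layout.
url="https://example.com/cuda-compat-11-7-515.65.01-1.x86_64.rpm"
v="${url%-*}"     # drop the suffix after the LAST dash: "-1.x86_64.rpm"
echo "$v"         # https://example.com/cuda-compat-11-7-515.65.01
v="${v##*-}"      # keep what follows the last remaining dash
echo "$v"         # 515.65.01
# The original ${v##*_} matched nothing here (no underscore left after the
# first step), which is why the suggestion swaps "_" for "-".
```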


install_compat_libs=false
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
# libcuda.so points to actual cuda compat lib with driver version in its name
# if this file exists, cuda compat libs are installed and we can compare the version
if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then
eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so)
eessi_driver_version="${eessi_driver_version##*so.}"
else
eessi_driver_version=0
fi

if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then
install_compat_libs=true
else
echo "CUDA compat libs are up-to-date, skip installation."
fi
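The `-gt` comparison above deletes the dots to get plain integers. A small sketch of where that works and where it can mislead, with invented version strings:

```shell
# ${var//./} deletes every dot, turning a dotted version into an integer
a="515.65.01"; b="510.47.03"
[ "${a//./}" -gt "${b//./}" ] && echo "515.65.01 is newer"   # 5156501 > 5104703
# Caveat: this assumes both versions have the same number of components and
# equal-width fields; "515.7" would become 5157 and compare as older than b.
```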

if [ "${install_compat_libs}" == true ]; then
source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url
fi

###############################################################################################
###############################################################################################
# Install CUDA
# TODO: Can we do a trimmed install?
# if modules dir exists, load it for usage within Lmod
cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}"
mkdir -p ${cuda_install_dir}
if [ -d ${cuda_install_dir}/modules/all ]; then
module use ${cuda_install_dir}/modules/all
fi
# only install CUDA if specified version is not found
install_cuda_version="11.3.1"
module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null
if [[ $? -eq 0 ]]; then
echo "CUDA module found! No need to install CUDA again, proceeding with tests"
else
# - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
# (CUDA is a binary installation so no need to worry too much about this)
# TODO: The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed
avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}')
if (( ${avail_space} < 16000000 )); then
echo "Need more disk space to install CUDA, exiting now..."
exit 1
fi
# install cuda in host_injections
module load EasyBuild
eb --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb
ret=$?
if [ $ret -ne 0 ]; then
echo "CUDA installation failed, please check EasyBuild logs..."
exit 1
fi
fi

cd $current_dir
if [[ "${install_wo_gpu}" != "true" ]]; then
source $(dirname "$BASH_SOURCE")/test_cuda
else
echo "Requested to install CUDA without GPUs present, so we skip final tests."
echo "Instead we test if module load CUDA works as expected..."
if [ -d ${cuda_install_dir}/modules/all ]; then
module use ${cuda_install_dir}/modules/all/
else
echo "Cannot load CUDA, modules path does not exist, exiting now..."
exit 1
fi
module load CUDA
ret=$?
if [ $ret -ne 0 ]; then
echo "Could not load CUDA even though modules path exists..."
exit 1
else
echo "Successfully loaded CUDA, you are good to go! :)"
echo " - To build CUDA enabled modules use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/ as your EasyBuild prefix"
echo " - To use these modules:"
echo " module use /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/modules/all/"
fi
fi
21 changes: 21 additions & 0 deletions gpu_support/get_latest_cuda_compatlibs.sh
@@ -0,0 +1,21 @@
#!/bin/bash

os=$1
ver=$2
eessi_cpu_family=$3

# build URL for CUDA libraries
cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/"
# get latest version, files are sorted by date
# TODO: probably better to explicitly check version numbers than trusting that it is sorted
latest_file=$(curl -s "${cuda_url}" | grep 'cuda-compat' | tail -1)
if [[ -z "${latest_file// }" ]]; then
echo "Could not find any compat lib files under" ${cuda_url}
exit 1
fi
# extract actual file name from html snippet
file=$(echo $latest_file | sed 's/<\/\?[^>]\+>//g')
# build final URL for wget
cuda_url="${cuda_url}$file"
# simply echo the URL, result will be used by add_gpu_support.sh
echo $cuda_url
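To see what the `sed` expression does, here is the same tag-stripping applied to an illustrative directory-index snippet (the markup and file name are invented; the pattern uses GNU sed syntax, as in the script):

```shell
# A directory-index line of the kind the NVIDIA repo listing returns
latest_file="<a href='cuda-compat-11-7-515.65.01-1.x86_64.rpm'>cuda-compat-11-7-515.65.01-1.x86_64.rpm</a>"
# Remove every opening and closing HTML tag, leaving only the text content
file=$(echo $latest_file | sed 's/<\/\?[^>]\+>//g')
echo "$file"   # cuda-compat-11-7-515.65.01-1.x86_64.rpm
```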
73 changes: 73 additions & 0 deletions gpu_support/install_cuda_compatlibs.sh
@@ -0,0 +1,73 @@
#!/bin/bash

libs_url=$1

current_dir=$(dirname $(realpath $0))

# Create a general space for our NVIDIA compat drivers
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
mkdir -p ${host_injections_dir}
cd ${host_injections_dir}
else
echo "Cannot write to eessi host_injections space, exiting now..." >&2
exit 1
fi

# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest

driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatibility libraries"; fi

# Check if our target CUDA is satisfied by what is installed already
# TODO: Find required CUDA version and see if we need an update

# If not, grab the latest compat library RPM or deb
# download and unpack in temporary directory, easier cleanup after installation
tmpdir=$(mktemp -d)
cd $tmpdir
compat_file=${libs_url##*/}
wget ${libs_url}

# Unpack it
# (the requirements here are OS dependent, can we get around that?)
# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e)
# (deb files can be unpacked with ar and tar)
file_extension=${compat_file##*.}
if [[ ${file_extension} == "rpm" ]]; then
rpm2cpio ${compat_file} | cpio -idmv
elif [[ ${file_extension} == "deb" ]]; then
ar x ${compat_file}
tar xf data.tar.*
else
echo "File extension of cuda compat lib not supported, exiting now..." >&2
exit 1
fi
cd $host_injections_dir
# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
mv -n ${tmpdir}/usr/local/cuda-* .
rm -r ${tmpdir}

# Add a symlink that points to the latest version
latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1)
ln -sf ${latest_cuda_dir} latest

if [ ! -e latest ] ; then
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
exit 1
fi

# Create the space to host the libraries
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
mkdir -p ${host_injection_libs_dir}
# Symlink in the path to the latest libraries
if [ ! -d "${host_injection_libs_dir}/lib" ]; then
ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
exit 1
fi
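The `-ef` test above is what lets the script tell a correct symlink from a stale one: it checks whether two paths resolve to the same file. A self-contained sketch, using temporary paths invented for the demo:

```shell
# [ a -ef b ] is true when both paths refer to the same device and inode,
# so it follows symlinks -- suitable for validating an existing link target.
workdir=$(mktemp -d)
mkdir "${workdir}/compat"
ln -s "${workdir}/compat" "${workdir}/lib"
if [ "${workdir}/lib" -ef "${workdir}/compat" ]; then
  echo "symlink points at the expected target"
fi
mkdir "${workdir}/other"
ln -sfn "${workdir}/other" "${workdir}/lib"   # repoint the link elsewhere
[ "${workdir}/lib" -ef "${workdir}/compat" ] || echo "link no longer matches"
rm -r "${workdir}"
```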

# return to initial dir
cd $current_dir