diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh
index 0c832fdee9..ded3e652f9 100755
--- a/EESSI-pilot-install-software.sh
+++ b/EESSI-pilot-install-software.sh
@@ -160,6 +160,12 @@ fail_msg="Installation of ${GCC_EC} failed!"
 $EB ${GCC_EC} --robot --from-pr 14453 GCCcore-9.3.0.eb
 check_exit_code $? "${ok_msg}" "${fail_msg}"
 
+# install CUDA
+ok_msg="CUDA installed, off to a good (?) start!"
+fail_msg="Failed to install CUDA, woopsie..."
+$EB CUDA-11.3.1.eb --robot
+check_exit_code $? "${ok_msg}" "${fail_msg}"
+
 # install CMake with custom easyblock that patches CMake when --sysroot is used
 echo ">> Install CMake with fixed easyblock to take into account --sysroot"
 ok_msg="CMake installed!"
diff --git a/eb_hooks.py b/eb_hooks.py
index 653094266d..472533e0bd 100644
--- a/eb_hooks.py
+++ b/eb_hooks.py
@@ -7,7 +7,7 @@ from easybuild.tools.systemtools import AARCH64, POWER, get_cpu_architecture
 
 EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs'
 
-
+CUDA_ENABLED_TOOLCHAINS = ["fosscuda", "gcccuda", "gimpic", "giolfc", "gmklc", "golfc", "gomklc", "gompic", "goolfc", "iccifortcuda", "iimklc", "iimpic", "intelcuda", "iomklc", "iompic", "nvompic", "nvpsmpic"]
 
 def get_eessi_envvar(eessi_envvar):
     """Get an EESSI environment variable from the environment"""
@@ -41,6 +41,29 @@ def get_rpath_override_dirs(software_name):
 
     return rpath_injection_dirs
 
 
+def inject_gpu_property(ec):
+    ec_dict = ec.asdict()
+    # Check if CUDA is in the dependencies, if so add the GPU Lmod tag
+    if (
+        "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]
+        or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS
+    ):
+        ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version")
+        key = "modluafooter"
+        value = 'add_property("arch","gpu")'
+        cuda_version = 0
+        # iterate over a copy of the dependency list: entries are removed while
+        # looping, and mutating the list being iterated over would skip dependencies
+        for dep in list(ec_dict["dependencies"]):
+            if "CUDA" in dep[0]:
+                cuda_version = dep[1]
+                ec_dict["dependencies"].remove(dep)
+                if dep not in ec_dict["builddependencies"]:
+                    ec_dict["builddependencies"].append(dep)
+        value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % (cuda_version)])
+        if key in ec_dict:
+            if value not in ec_dict[key]:
+                ec[key] = "\n".join([ec_dict[key], value])
+        else:
+            ec[key] = value
+    return ec
+
+
 def parse_hook(ec, *args, **kwargs):
     """Main parse hook: trigger custom functions based on software name."""
 
@@ -48,6 +71,8 @@ def parse_hook(ec, *args, **kwargs):
     # determine path to Prefix installation in compat layer via $EPREFIX
     eprefix = get_eessi_envvar('EPREFIX')
 
+    ec = inject_gpu_property(ec)
+
     if ec.name in PARSE_HOOKS:
         PARSE_HOOKS[ec.name](ec, eprefix)
@@ -103,6 +128,57 @@ def cgal_toolchainopts_precise(ec, eprefix):
         raise EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!")
 
 
+def post_package_hook(self, *args, **kwargs):
+    """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections."""
+    if self.name == 'CUDA':
+        cuda_version = self.installdir.split('/')[-1]
+        print_msg("Attempt to get compat libs for CUDA version: %s" % (cuda_version))
+        # install compat libraries and run test
+        # if the test works, move it to EESSI_SOFTWARE_PATH so we can ship the compiled test
+        os.system("export INSTALL_CUDA_VERSION=%s && export SAVE_COMPILED_TEST=true && ./gpu_support/add_nvidia_gpu_support.sh" % (cuda_version))
+        print_msg("Replacing CUDA stuff we cannot ship with symlinks...")
+        # read CUDA EULA
+        eula_path = os.path.join(self.installdir, 'EULA.txt')
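+        # the EULA's "Attachment A" section lists the files that may be redistributed;
+        # grab the text between the two attachment markers below to build a whitelist
+        # of shippable files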
+        tmp_buffer = []
+        with open(eula_path) as infile:
+            copy = False
+            for line in infile:
+                if line.strip() == '2.6. Attachment A':
+                    copy = True
+                    continue
+                elif line.strip() == '2.7. Attachment B':
+                    copy = False
+                    continue
+                elif copy:
+                    tmp_buffer.append(line)
+        # create the whitelist without file extensions; they are not needed and only complicate matching
+        whitelist = []
+        file_extensions = ['.so', '.a', '.h', '.bc']
+        for tmp in tmp_buffer:
+            for word in tmp.split():
+                if any(ext in word for ext in file_extensions):
+                    whitelist.append(word.split('.')[0])
+        # add compiled test to whitelist so we can ship it with EESSI
+        whitelist.append('deviceQuery')
+        whitelist = list(set(whitelist))
+        # iterate over all files in the CUDA path
+        for root, dirs, files in os.walk(self.installdir):
+            for filename in files:
+                # we only really care about real files, i.e. not symlinks
+                if not os.path.islink(os.path.join(root, filename)):
+                    # check if the current file is part of the whitelist
+                    basename = filename.split('.')[0]
+                    if basename not in whitelist:
+                        # if it is not in the whitelist, delete the file and create a symlink to host_injections
+                        source = os.path.join(root, filename)
+                        target = source.replace('versions', 'host_injections')
+                        os.remove(source)
+                        # have to create subdirs if they don't exist, otherwise the symlink creation fails
+                        if not os.path.isdir(os.path.dirname(target)):
+                            os.makedirs(os.path.dirname(target))
+                        os.symlink(target, source)
+
+
 def fontconfig_add_fonts(ec, eprefix):
     """Inject --with-add-fonts configure option for fontconfig."""
     if ec.name == 'fontconfig':
diff --git a/gpu_support/README.md b/gpu_support/README.md
new file mode 100644
index 0000000000..226a7fca6c
--- /dev/null
+++ b/gpu_support/README.md
@@ -0,0 +1,27 @@
+# How to add GPU support
+The collection of scripts in this directory enables you to add GPU support to your setup.
+Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free to contribute that though!).
+To enable the usage of the CUDA runtime in your setup, simply run the following script:
+```
+./add_nvidia_gpu_support.sh
+```
+By default, this script installs only the compatibility libraries you need to use the CUDA runtime environment shipped with EESSI.
+
+If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script execution as follows:
+```
+export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh
+```
+This installs the needed compatibility libraries as well as the whole CUDA suite.
+
+If you need a different CUDA version than what is shipped with EESSI, you can also specify that particular version for the script:
+```
+export INSTALL_CUDA_VERSION=xx.y.z && export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh
+```
+Please note, however, that versions for which the runtime is not shipped with EESSI are not installed in the default modules path.
+Thus, you will have to add the following to your modules path to get access to your custom CUDA version:
+```
+module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/
+```
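+
+As a quick sanity check after the installation, something like the following should work (a sketch: `nvcc` is only available if you also installed the full CUDA suite, and the `module use` line is only needed for custom CUDA versions):
+```
+module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/
+module load CUDA
+nvcc --version
+```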
+## Prerequisites and tips
+* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem.
+* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended: the installed CUDA compatibility library may not be compatible with the CUDA driver on your GPU nodes, and this will not be detected.
diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh
new file mode 100755
index 0000000000..29c8abdc88
--- /dev/null
+++ b/gpu_support/add_amd_gpu_support.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+cat << EOF
+This is not implemented yet :(
+
+If you would like to contribute this support there are a few things you will
+need to consider:
+- We will need to change the Lmod property added to GPU software so we can
+  distinguish AMD and Nvidia GPUs
+- Support should be implemented in user space; if this is not possible (e.g.,
+  it requires a driver update) you need to tell the user what to do
+- Support needs to be _verified_ and a trigger put in place (like the existence
+  of a particular path) so we can tell Lmod to display the associated modules
+EOF
diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh
new file mode 100755
index 0000000000..d9a97fe2d9
--- /dev/null
+++ b/gpu_support/add_nvidia_gpu_support.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Drop into the prefix shell or pipe this script into a Prefix shell with
+#   $EPREFIX/startprefix <<< /path/to/this_script.sh
+
+install_cuda="${INSTALL_CUDA:=false}"
+install_cuda_version="${INSTALL_CUDA_VERSION:=11.3.1}"
+install_p7zip_version="${INSTALL_P7ZIP_VERSION:=17.04-GCCcore-10.3.0}"
+
+# If you want to install CUDA support on login nodes (typically without GPUs),
+# set this variable to true. This will skip all GPU-dependent checks
+install_wo_gpu=false
+[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true
+
+# Verify that nvidia-smi exists and can be executed without error,
+# otherwise the rest of this script is a waste of time
+if [[ "${install_wo_gpu}" != "true" ]]; then
+  if command -v nvidia-smi > /dev/null 2>&1; then
+    nvidia-smi > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+      echo "nvidia-smi was found but returned error code, exiting now..." >&2
+      echo "If you do not have a GPU on this device but wish to force the installation,"
+      echo "please set the environment variable INSTALL_WO_GPU=true"
+      exit 1
+    fi
+    echo "nvidia-smi found, continuing setup."
+  else
+    echo "nvidia-smi not found, exiting now..." >&2
+    echo "If you do not have a GPU on this device but wish to force the installation,"
+    echo "please set the environment variable INSTALL_WO_GPU=true"
+    exit 1
+  fi
+else
+  echo "You requested to install CUDA without GPUs present."
+  echo "This means that all GPU-dependent tests/checks will be skipped!"
+fi
+
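+# initialise the EESSI environment (quietly); this provides $EESSI_SOFTWARE_PATH,
+# $EESSI_CPU_FAMILY and friends, which the helper scripts below rely on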
+EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash
+
+##############################################################################################
+# Check that the CUDA driver version is adequate
+# (
+#  needs to be r450 or r470 which are LTS, other production branches are acceptable but not
+#  recommended, below r450 is not compatible [with an exception we will not explore, see
+#  https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers]
+# )
+# only check the driver of one GPU in case there are multiple GPUs
+if [[ "${install_wo_gpu}" != "true" ]]; then
+  driver_major_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1)
+  driver_major_version="${driver_major_version%%.*}"
+  # Now check driver_version for compatibility
+  # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers
+  if (( $driver_major_version < 450 )); then
+    echo "Your NVIDIA driver version is too old, please update first..."
+    exit 1
+  fi
+fi
+
+###############################################################################################
+# Install CUDA
+cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}"
+mkdir -p ${cuda_install_dir}
+if [ "${install_cuda}" != false ]; then
+  bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda.sh ${install_cuda_version} ${cuda_install_dir}
+fi
+###############################################################################################
+# Prepare installation of CUDA compat libraries, i.e. install p7zip if it is missing
+$(dirname "$BASH_SOURCE")/cuda_utils/prepare_cuda_compatlibs.sh ${install_p7zip_version} ${cuda_install_dir}
+###############################################################################################
+# Try installing five different versions of CUDA compat libraries until the test works.
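+# (newer compat releases generally require newer host drivers, so falling back through
+# older releases gives the best chance of finding one that matches the installed driver)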
+# Otherwise, give up
+bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh ${cuda_install_dir} ${install_cuda_version}
+
+cuda_version_file="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt"
+echo ${install_cuda_version} > ${cuda_version_file}
diff --git a/gpu_support/cuda_utils/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh
new file mode 100755
index 0000000000..9639917a27
--- /dev/null
+++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Get arch type from EESSI environment
+if [[ -z "${EESSI_CPU_FAMILY}" ]]; then
+  # set up basic environment variables, EasyBuild and Lmod
+  EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash
+fi
+eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}"
+
+# build URL for CUDA libraries
+# take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries
+cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/"${eessi_cpu_family}"/"
+# get all versions in descending order
+files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort )
+if [[ -z "${files// }" ]]; then
+  echo "Could not find any compat lib files under" ${cuda_url}
+  exit 1
+fi
+for file in $files; do echo "${cuda_url}$file"; done
diff --git a/gpu_support/cuda_utils/install_cuda.sh b/gpu_support/cuda_utils/install_cuda.sh
new file mode 100644
index 0000000000..4eef0ecdf4
--- /dev/null
+++ b/gpu_support/cuda_utils/install_cuda.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+install_cuda_version=$1
+cuda_install_dir=$2
+
+# TODO: Can we do a trimmed install?
+# Only install CUDA if the specified version is not found.
+# This is only relevant for users, the shipped CUDA installation will
+# always be in versions instead of host_injections and have symlinks pointing
+# to host_injections for everything we're not allowed to ship
+if [ -f ${cuda_install_dir}/software/CUDA/${install_cuda_version}/EULA.txt ]; then
+  echo "CUDA software found! No need to install CUDA again, proceeding with tests"
+else
+  # - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
+  #   (CUDA is a binary installation so no need to worry too much about this)
+  # The install is pretty fat: you need lots of space for download/unpack/install (~3*5GB),
+  # so do a disk space check before we proceed
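+  # (df reports 1K blocks below, so the 16000000 threshold corresponds to roughly 16 GB)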
+  avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}')
+  if (( ${avail_space} < 16000000 )); then
+    echo "Need more disk space to install CUDA, exiting now..."
+    exit 1
+  fi
+  # install cuda in host_injections
+  module load EasyBuild
+  # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI
+  if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then
+    tmpdir=$(mktemp -d)
+    extra_args="--rebuild --installpath-modules=${tmpdir}"
+  fi
+  eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb
+  ret=$?
+  if [ $ret -ne 0 ]; then
+    echo "CUDA installation failed, please check EasyBuild logs..."
+    exit 1
+  fi
+  # clean up tmpdir if it exists
+  if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then
+    rm -rf ${tmpdir}
+  fi
+fi
diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh
new file mode 100644
index 0000000000..11a7aa7e3d
--- /dev/null
+++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+libs_url=$1
+cuda_install_dir=$2
+
+current_dir=$(dirname $(realpath $0))
+host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
+host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections}
+
+# Create a general space for our NVIDIA compat drivers
+if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
+  mkdir -p ${host_injections_dir}
+else
+  echo "Cannot write to EESSI host_injections space, exiting now..." >&2
+  exit 1
+fi
+cd ${host_injections_dir}
+
+# Check if our target CUDA is satisfied by what is installed already
+# TODO: Find required CUDA version and see if we need an update
+driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
+eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
+if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then
+  if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then
+    echo "You need to update your CUDA compatibility libraries"
+  fi
+fi
+
+# If not, grab the latest compat library RPM or deb
+# Download and unpack in temporary directory, easier cleanup after installation
+tmpdir=$(mktemp -d)
+cd $tmpdir
+compat_file=${libs_url##*/}
+wget ${libs_url}
+echo $compat_file
+
+# Unpack it
+# rpm files are the default for all OSes
+# Keep support for deb files in case it is needed in the future
+file_extension=${compat_file##*.}
+if [[ ${file_extension} == "rpm" ]]; then
+  # p7zip is installed under host_injections for now, make that known to the environment
+  if [ -d ${cuda_install_dir}/modules/all ]; then
+    module use ${cuda_install_dir}/modules/all/
+  fi
+  # Load p7zip to extract files from rpm file
+  module load p7zip
+  # Extract .cpio
+  7z x ${compat_file}
+  # Extract lib*
+  7z x ${compat_file/rpm/cpio}
+  # Restore symlinks (7z extracts them from the cpio archive as plain text files
+  # whose content is the link target, so recreate them as real symlinks)
+  cd usr/local/cuda-*/compat
+  ls *.so *.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}"
+  cd -
+elif [[ ${file_extension} == "deb" ]]; then
+  ar x ${compat_file}
+  tar xf data.tar.*
+else
+  echo "File extension of cuda compat lib not supported, exiting now..." >&2
+  exit 1
+fi
+cd $host_injections_dir
+cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*)
+# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
+rm -rf ${cuda_dir}
+mv -n ${tmpdir}/usr/local/cuda-* .
+rm -r ${tmpdir}
+
+# Add a symlink that points the latest version to the version we just installed
+ln -sfn ${cuda_dir} latest
+
+if [ ! -e latest ] ; then
+  echo "Symlink to latest cuda compat lib version is broken, exiting now..."
+  exit 1
+fi
+
+# Create the space to host the libraries
+mkdir -p ${host_injection_linker_dir}
+# Symlink in the path to the latest libraries
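+# (the -ef test below checks whether an existing symlink already resolves to that same
+# directory; if it points somewhere else we refuse to touch it and ask the user to fix it)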
"${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then + echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." + echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" + exit 1 +fi + +# return to initial dir +cd $current_dir + +echo +echo CUDA driver compatability drivers installed for CUDA version: +echo ${cuda_dir/cuda-/} diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh new file mode 100644 index 0000000000..c3a17c6f2a --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +cuda_install_dir=$1 +install_cuda_version=$2 + +# Check if the CUDA compat libraries are installed and compatible with the target CUDA version +# if not find the latest version of the compatibility libraries and install them + +# get URL to latest CUDA compat libs, exit if URL is invalid +cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" +ret=$? +if [ $ret -ne 0 ]; then + echo "Couldn't find current URLs of the CUDA compat libraries, instead got:" + echo $cuda_compat_urls + exit 1 +fi + +# loop over the compat library versions until we get one that works for us +keep_driver_check=1 +# Do a maximum of five attempts +for value in {1..5} +do + latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1) + # Chomp that value out of the list + cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) + latest_driver_version="${latest_cuda_compat_url%-*}" + latest_driver_version="${latest_driver_version##*-}" + # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed + if [[ ! $latest_driver_version =~ ^[0-9]+$ ]]; then + latest_driver_version="${latest_driver_version##*_}" + fi + + install_compat_libs=false + host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" + # libcuda.so points to actual cuda compat lib with driver version in its name + # if this file exists, cuda compat libs are installed and we can compare the version + if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then + eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) + eessi_driver_version="${eessi_driver_version##*so.}" + else + eessi_driver_version=0 + fi + + if [ $keep_driver_check -eq 1 ] + then + # only keep the driver check for the latest version + keep_driver_check=0 + else + eessi_driver_version=0 + fi + + if (( ${latest_driver_version//./} > ${eessi_driver_version//./} )); then + install_compat_libs=true + else + echo "CUDA compat libs are up-to-date, skip installation." + fi + + if [ "${install_compat_libs}" == true ]; then + bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${cuda_install_dir} + fi + + if [[ "${install_wo_gpu}" != "true" ]]; then + bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" + if [ $? -eq 0 ] + then + exit 0 + else + echo + echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" + echo "I'll try an older release to see if that will work..." + echo + fi + else + echo "Requested to install CUDA without GPUs present, so we skip final tests." + echo "Instead we test if module load CUDA works as expected..." 
+  if [[ "${install_wo_gpu}" != "true" ]]; then
+    bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}"
+    if [ $? -eq 0 ]
+    then
+      exit 0
+    else
+      echo
+      echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!"
+      echo "I'll try an older release to see if that will work..."
+      echo
+    fi
+  else
+    echo "Requested to install CUDA without GPUs present, so we skip final tests."
+    echo "Instead we test if module load CUDA works as expected..."
+    if [ -d ${cuda_install_dir}/modules/all ]; then
+      module use ${cuda_install_dir}/modules/all/
+    else
+      echo "Cannot load CUDA, modules path does not exist, exiting now..."
+      exit 1
+    fi
+    module load CUDA
+    ret=$?
+    if [ $ret -ne 0 ]; then
+      echo "Could not load CUDA even though modules path exists..."
+      exit 1
+    else
+      echo "Successfully loaded CUDA, you are good to go! :)"
+      echo "  - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix"
+      echo "  - To use these modules:"
+      echo "      module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/"
+      echo "  - Please keep in mind that we just installed the latest CUDA compat libs."
+      echo "    Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)."
+      exit 0
+    fi
+  fi
+done
+
+echo "Tried to install 5 different generations of compat libraries and none worked,"
+echo "this usually means your driver is very out of date!"
+exit 1
diff --git a/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh
new file mode 100755
index 0000000000..9efd2b8e66
--- /dev/null
+++ b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+install_p7zip_version=$1
+cuda_install_dir=$2
+
+# Install p7zip, this will be used to install the CUDA compat libraries from rpm.
+# The rpm and deb files contain the same libraries, so we just stick to the rpm version.
+# If p7zip is missing from the software layer (for whatever reason), we need to install it.
+# This has to happen in host_injections, so we check first if it is already installed there.
+if [ -d ${cuda_install_dir}/modules/all ]; then
+  module use ${cuda_install_dir}/modules/all/
+fi
+module avail 2>&1 | grep -i p7zip &> /dev/null
+if [[ $? -eq 0 ]]; then
+  echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries"
+else
+  # install p7zip in host_injections
+  export EASYBUILD_IGNORE_OSDEPS=1
+  export EASYBUILD_SYSROOT=${EPREFIX}
+  export EASYBUILD_RPATH=1
+  export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH
+  export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib
+  export EASYBUILD_MODULE_EXTENSIONS=1
+  module load EasyBuild
+  eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb
+  ret=$?
+  if [ $ret -ne 0 ]; then
+    echo "p7zip installation failed, please check EasyBuild logs..."
+    exit 1
+  fi
+fi
diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh
new file mode 100644
index 0000000000..6939c77815
--- /dev/null
+++ b/gpu_support/cuda_utils/test_cuda.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+install_cuda_version=$1
+save_compiled_test="${SAVE_COMPILED_TEST:=false}"
+
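+# Two modes of operation: with SAVE_COMPILED_TEST=true the deviceQuery sample is
+# compiled from the CUDA samples and the binary is shipped with EESSI afterwards;
+# otherwise the deviceQuery binary already shipped under EESSI_SOFTWARE_PATH is reused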
+# Test CUDA
+cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}"
+current_dir=$PWD
+if [ -d ${cuda_install_dir}/modules/all ]; then
+  module use ${cuda_install_dir}/modules/all/
+else
+  echo "Cannot test CUDA, modules path does not exist, exiting now..."
+  exit 1
+fi
+module load CUDA/${install_cuda_version}
+ret=$?
+if [ $ret -ne 0 ]; then
+  echo "Could not load CUDA even though modules path exists..."
+  exit 1
+fi
+# if we are not asked to save the compiled sample, a shipped deviceQuery binary is available
+if [ "${save_compiled_test}" != false ]; then
+  tmp_dir=$(mktemp -d)
+  # convert the cuda version to an integer so we can test if the samples are shipped with this version
+  # starting from version 11.6 the samples can be found in a github repo
+  cuda_version=$(echo ${install_cuda_version} | cut -f1,2 -d'.')
+  cuda_version=${cuda_version//./}
+  if (( ${cuda_version} < 116 )); then
+    cp -r $EBROOTCUDA/samples $tmp_dir
+    cd $tmp_dir/samples/1_Utilities/deviceQuery
+  else
+    git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} -q
+    cd $tmp_dir/Samples/1_Utilities/deviceQuery
+  fi
+  module load GCCcore
+  ret=$?
+  if [ $ret -ne 0 ]; then
+    echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..."
+    exit 1
+  fi
+  make HOST_COMPILER=$(which g++) -j
+else
+  cd ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version}
+fi
+./deviceQuery
+
+if [ $? -eq 0 ]
+then
+  # Set the color variable
+  green='\033[0;32m'
+  # Clear the color after that
+  clear='\033[0m'
+  echo -e ${green}
+  echo "Congratulations, your GPU is working with EESSI!"
+  echo "  - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix"
+  echo "  - To use these modules:"
+  echo "      module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/"
+  echo -e ${clear}
+
+  if [ "${save_compiled_test}" != false ]; then
+    mv deviceQuery ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version}
+  fi
+
+  # Clean up
+  cd $current_dir
+  if [ "${save_compiled_test}" != false ]; then
+    rm -rf $tmp_dir
+  fi
+else
+  echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2
+  # Clean up
+  cd $current_dir
+  if [ "${save_compiled_test}" != false ]; then
+    rm -rf $tmp_dir
+  fi
+  exit 1
+fi
+
+# Test a CUDA-enabled module from EESSI
+# TODO: GROMACS?
+# TODO: Include a GDR copy test?
+###############################################################################################
diff --git a/init/SitePackage.lua b/init/SitePackage.lua
new file mode 100644
index 0000000000..d7ab2eefad
--- /dev/null
+++ b/init/SitePackage.lua
@@ -0,0 +1,101 @@
+require("strict")
+local hook = require("Hook")
+local open = io.open
+
+local function read_file(path)
+    local file = open(path, "rb") -- r read mode and b binary mode
+    if not file then return nil end
+    local content = file:read "*a" -- *a or *all reads the whole file
+    file:close()
+    return content
+end
+
+-- from https://stackoverflow.com/a/40195356
+--- Check if a file or directory exists in this path
+function exists(file)
+    local ok, err, code = os.rename(file, file)
+    if not ok then
+        if code == 13 then
+            -- Permission denied, but it exists
+            return true
+        end
+    end
+    return ok, err
+end
+
+local function visible_hook(modT)
+    local frameStk = require("FrameStk"):singleton()
+    local mt = frameStk:mt()
+    local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
+    local cudaDirExists = exists(cudaDir)
+    if not cudaDirExists then
+        local haveGpu = mt:haveProperty(modT.sn,"arch","gpu")
+        if haveGpu then
+            modT.isVisible = false
+        end
+    end
+end
+
+local function cuda_enabled_load_hook(t)
+    local frameStk = require("FrameStk"):singleton()
+    local mt = frameStk:mt()
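+    -- extract the software name from the full module name, e.g. 'CUDA' from 'CUDA/11.3.1'
+    -- (everything up to the first '/')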
"/init/gpu_support" + -- if we try to load CUDA itself, check if the software exists in host_injections + -- otherwise, refuse to load CUDA and print error message + if simpleName == 'CUDA' then + -- get the full host_injections path + local cudaDir = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the CUDA software should be installed + cudaDir = cudaDir .. "/software/" .. t.modFullName + local cudaDirExists = exists(cudaDir) + if not cudaDirExists then + io.stderr:write("You requested to load ",simpleName,"\n") + io.stderr:write("While the module file exists, the actual software is not shipped with EESSI.\n") + io.stderr:write("In order to be able to use the CUDA module, please follow the instructions in the\n") + io.stderr:write("gpu_support folder. Adding the CUDA software can be as easy as a simple:\n") + io.stderr:write("export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh\n") + frameStk:__clear() + end + end + -- when loading CUDA enabled modules check if the neccessary matching compatibility libraries are installed + -- otherwise, refuse to load the requested module and print error message + local haveGpu = mt:haveProperty(simpleName,"arch","gpu") + if haveGpu then + local compatDir = "/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/compat/" + local compatDirExists = exists(compatDir) + if not compatDirExists then + io.stderr:write("You requested to load ",simpleName,"\n") + io.stderr:write("which relies on the CUDA runtime environment and its compatibility libraries.\n") + io.stderr:write("In order to be able to use the module, please follow the instructions in the\n") + io.stderr:write("gpu_support folder. Installing the needed compatibility libraries can be as easy as ~~a simple~~:\n") + io.stderr:write("./add_nvidia_gpu_support.sh\n") + frameStk:__clear() + end + local cudaVersion = read_file("/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt") + local cudaVersion_req = os.getenv("EESSICUDAVERSION") + local major, minor, patch = string.match(cudaVersion, "(%d+)%.(%d+)%.(%d+)") + local major_req, minor_req, patch_req = string.match(cudaVersion, "(%d+)%.(%d+)%.(%d+)") + local compat_libs_need_update = false + if major < major_req then + compat_libs_need_update = true + elseif major == major_req then + if minor < minor_req then + compat_libs_need_update = true + elseif minor == minor_req then + if patch < patch_req then + compat_libs_need_update = true + end + end + end + if compat_libs_need_update == true then + io.stderr:write("You requested to load CUDA version ",cudaVersion) + io.stderr:write("but the module you want to load requires CUDA version ",cudaVersion_req,".\n") + io.stderr:write("Please update your CUDA compatibility libraries in order to use ",simpleName,".\n") + frameStk:__clear() + end + end +end + +hook.register("load", cuda_enabled_load_hook) +hook.register("isVisibleHook", visible_hook) diff --git a/init/bash b/init/bash index ea605db0b5..c5c1a583e7 100644 --- a/init/bash +++ b/init/bash @@ -19,6 +19,11 @@ if [ $? -eq 0 ]; then # see https://github.com/EESSI/software-layer/issues/52 export PATH=$EPREFIX/usr/bin:$EPREFIX/bin:$PATH + # used for EESSI specific SitePackage.lua, hide GPU modules if CUDA is not installed + # TODO: better place to store SitePackage file? + # TODO: another method to define path to lua file? + export LMOD_PACKAGE_PATH=$(dirname "$BASH_SOURCE") + # init Lmod echo "Initializing Lmod..." >> $output source $EESSI_EPREFIX/usr/share/Lmod/init/bash