diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config
index 51fffc5c..4e463460 100644
--- a/conf/vsc_kul_uhasselt.config
+++ b/conf/vsc_kul_uhasselt.config
@@ -1,52 +1,40 @@
 // Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
 // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
-def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
-
-// Specify the work directory
-workDir = "$scratch_dir/work"
+scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
+tier1_project = System.getenv("SLURM_ACCOUNT") ?: null

 // Perform work directory cleanup when the run has succesfully completed
 // cleanup = true

-// Get the hostname and check some values for tier1
-def hostname = "genius"
-try {
-    hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','')
-} catch (java.io.IOException e) {
-    System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius")
-}
-
-def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
-
-if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) {
-    // Hard-code that Tier 1 cluster dodrio requires a project account
-    System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.")
-    System.exit(1)
-}
-
-
 // Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs
 // Limit queueSize to keep job rate under control and avoid timeouts
 executor {
     submitRateLimit = '50/1min'
-    queueSize = 30
+    queueSize = 50
     exitReadTimeout = "10min"
 }

 // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory
 process {
-    stageInMode = "symlink"
-    stageOutMode = "rsync"
-    errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
-    maxRetries = 5
-    // array = 50
+    executor = 'slurm'
+    stageInMode = "symlink"
+    stageOutMode = "rsync"
+    errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' }
+    maxRetries = 3
+    array = 30
 }

 // Specify that singularity should be used and where the cache dir will be for the images
 singularity {
-    enabled = true
-    autoMounts = true
-    cacheDir = "$scratch_dir/.singularity"
+    enabled = true
+    autoMounts = true
+    cacheDir = "$scratch_dir/.singularity"
+    pullTimeout = "30 min"
+}
+
+params {
+    config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be'
+    config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
 }

 env {
@@ -56,112 +44,137 @@ env {

 // AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
 aws {
-    maxErrorRetry = 3
+    maxErrorRetry = 3
 }

 // Define profiles for each cluster
 profiles {
     genius {
-        params {
-            config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB
-            max_time = 168.h
-            max_cpus = 36
-        }
+        params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.'

         process {
-            resourceLimits = [
-                memory: 703.GB,
-                cpus: 136,
-                time: 168.h
-            ]
-            executor = 'slurm'
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+            beforeScript = 'module load cluster/genius'
+            clusterOptions = { "--clusters=genius --account=$tier1_project" }
+
             queue = {
-                switch (task.memory) {
-                    case { it >= 175.GB }: // max is 180000
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long'
-                            default:
-                                return 'bigmem'
-                        }
-                    default:
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'batch_long'
-                            default:
-                                return 'batch'
-                        }
+                task.memory >= 175.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
+                    (task.time >= 72.h ? 'batch_long' : 'batch')
+            }
+
+            withLabel: '.*gpu.*' {
+                resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+                apptainer.runOptions = '--containall --cleanenv --nv'
+                singularity.runOptions = '--containall --cleanenv --nv'
+                clusterOptions = {
+                    // suggested to use 9 cpus per gpu
+                    def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1)/9) as int)
+                    "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+                }
+
+                queue = {
+                    task.memory >= 175.GB ?
+                        (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
+                        (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
                 }
             }
-            clusterOptions = { "--clusters=genius --account=$tier1_project" }
-            scratch = "$scratch_dir"
         }
     }

-    wice {
-        params {
-            config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 1968.GB // max is 2016000
-            max_cpus = 72
-            max_time = 168.h
+    genius_gpu {
+        params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
+        apptainer.runOptions = '--containall --cleanenv --nv'
+        singularity.runOptions = '--containall --cleanenv --nv'
+
+        process {
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
+            beforeScript = 'module load cluster/genius'
+            clusterOptions = {
+                def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1)/9) as int)
+                "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+            }
+
+            queue = {
+                task.memory >= 175.GB ?
+                    (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
+                    (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
+            }
         }
+    }
+
+    wice {
+        params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'

         process {
-            resourceLimits = [
-                memory: 1968.GB,
-                cpus: 72,
-                time: 168.h
-            ]
-            executor = 'slurm'
+            // max is 2016000
+            resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
+            clusterOptions = { "--clusters=wice --account=$tier1_project" }
+            beforeScript = 'module load cluster/wice'
+
             queue = {
-                switch (task.memory) {
-                    case { it >= 239.GB }: // max is 244800
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'dedicated_big_bigmem'
-                            default:
-                                return 'bigmem,hugemem'
-                        }
-                    default:
-                        switch (task.time) {
-                            case { it >= 72.h }:
-                                return 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
-                            default:
-                                return 'batch,batch_sapphirerapids,batch_icelake'
-                        }
+                task.memory >= 239.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
+                    (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
+            }
+
+            withLabel: '.*gpu.*' {
+                resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
+                apptainer.runOptions = '--containall --cleanenv --nv'
+                singularity.runOptions = '--containall --cleanenv --nv'
+                clusterOptions = {
+                    // suggested to use 16 cpus per gpu
+                    def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1)/16) as int)
+                    "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
+                }
+
+                queue = {
+                    task.memory >= 239.GB ?
+                        (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
+                        (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
                 }
             }
-            clusterOptions = { "--clusters=wice --account=$tier1_project"}
-            scratch = "$scratch_dir"
         }
     }

-    superdome {
-        params {
-            config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.'
-            config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be'
-            config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
-            max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB
-            max_cpus = 14
-            max_time = 168.h
+
+    wice_gpu {
+        params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
+        apptainer.runOptions = '--containall --cleanenv --nv'
+        singularity.runOptions = '--containall --cleanenv --nv'
+
+        process {
+            // 768 - 65 so 65GB for overhead, max is 720000MB
+            resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
+            beforeScript = 'module load cluster/wice'
+            clusterOptions = {
+                def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?: 1)/16) as int)
+                "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
+            }
+
+            queue = {
+                task.memory >= 239.GB ?
+                    (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
+                    (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
+            }
         }
+    }
+
+    superdome {
+        params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

         process {
-            resourceLimits = [
-                memory: 5772.GB,
-                cpus: 14,
-                time: 168.h
-            ]
-            executor = 'slurm'
-            queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
             clusterOptions = {"--clusters=genius --account=$tier1_project"}
-            scratch = "$scratch_dir"
+            beforeScript = 'module load cluster/genius/superdome'
+            // 6000 - 228 so 228GB for overhead, max is 5910888MB
+            resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h ]
+
+            queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
         }
     }
 }
+
+
diff --git a/docs/vsc_kul_uhasselt.md b/docs/vsc_kul_uhasselt.md
index a9caec11..2f31304c 100644
--- a/docs/vsc_kul_uhasselt.md
+++ b/docs/vsc_kul_uhasselt.md
@@ -28,14 +28,14 @@
 export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs"

 # Optional tower key
 # export TOWER_ACCESS_TOKEN=""
-# export NXF_VER="" # make sure it's larger then 24.04.0
+# export NXF_VER="" # make sure it's 24.10.1 or later
 ```

 :::warning
-The current config is setup with array jobs. Make sure nextflow version >= 24.04.0, read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array) you can do this in
+The current config is set up with array jobs. Make sure your Nextflow version is >= 24.10.1 (see [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array)); you can set this with:

 ```bash
-export NXF_VER=24.04.0
+export NXF_VER=24.10.1
 ```

 :::
@@ -64,10 +64,13 @@ nextflow run -profile vsc_kul_uhasselt,

-> **NB:** The vsc_kul_uhasselt profile is based on a selected amount of SLURM partitions. Should you require resources outside of these limits (e.g.gpus) you will need to provide a custom config specifying an appropriate SLURM partition (e.g. 'gpu\*').
+> **NB:** The vsc_kul_uhasselt profile is based on a selected set of SLURM partitions. The profile will select the most appropriate partition for each job to the best of its ability. Modules with a label containing `gpu` will be allocated to a GPU partition even when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force jobs onto a GPU partition.
+
+> **NB:** If a module does not set `accelerator`, the number of GPUs is derived from the requested CPUs (one GPU per 9 CPUs on genius, per 16 CPUs on wice, minimum 1).

 Use the `--cluster` option to specify the cluster you intend to use when submitting the job:
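
For illustration, here is a minimal sketch of how a pipeline module would pick up the new GPU handling under this config. The process name, label, and resource values below are assumptions for the example, not part of this PR; only the `withLabel: '.*gpu.*'` selector, the `accelerator` directive, and the CPU-to-GPU derivation come from the config above.

```groovy
// Hypothetical module, for illustration only -- not part of this PR.
process EXAMPLE_GPU_TASK {
    label 'use_gpu'     // any label matching '.*gpu.*' picks up the GPU clusterOptions/queue above
    accelerator 1       // used directly by the config: --gres=gpu:1
    cpus 9              // without 'accelerator', the GPU count is derived from cpus (9 per GPU on genius, 16 on wice)
    memory 60.GB        // < 175.GB and time < 72.h => 'gpu_p100,amd' queue under the genius profile
    time 12.h

    script:
    """
    nvidia-smi
    """
}
```

Run with `-profile vsc_kul_uhasselt,genius` and this task lands on a GPU partition via the label match; selecting `-profile vsc_kul_uhasselt,genius_gpu` or `wice_gpu` applies the same GPU options to every process in the run.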