From ddf06441e8c98b2a7e8e73d34e61da382f0af923 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Fri, 22 Nov 2024 11:15:58 +0100 Subject: [PATCH 01/10] refactor code & add new gpu_profiles --- conf/vsc_kul_uhasselt.config | 159 ++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 50 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 51fffc5c8..f761c8bfb 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,9 +1,7 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" - -// Specify the work directory -workDir = "$scratch_dir/work" +def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" +def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Perform work directory cleanup when the run has succesfully completed // cleanup = true @@ -16,15 +14,11 @@ try { System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius") } -def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null - if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) { - // Hard-code that Tier 1 cluster dodrio requires a project account System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.") System.exit(1) } - // Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs // Limit queueSize to keep job rate under control and avoid timeouts executor { @@ -35,11 +29,13 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { - stageInMode = "symlink" - stageOutMode = "rsync" + executor = 'slurm' + scratch = "$scratch_dir" + stageInMode = "symlink" + stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } - maxRetries = 5 - // array = 50 + maxRetries = 3 + array = 50 } // Specify that singularity should be used and where the cache dir will be for the images @@ -49,6 +45,11 @@ singularity { cacheDir = "$scratch_dir/.singularity" } +params { + config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' + config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' +} + env { APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp" APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache" @@ -56,28 +57,22 @@ env { // AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time) aws { - maxErrorRetry = 3 + maxErrorRetry = 3 } // Define profiles for each cluster profiles { genius { - params { - config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB - max_time = 168.h - max_cpus = 36 - } + params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.' 
process { + clusterOptions = { "--clusters=genius --account=$tier1_project" } resourceLimits = [ - memory: 703.GB, - cpus: 136, + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 36, time: 168.h ] - executor = 'slurm' + queue = { switch (task.memory) { case { it >= 175.GB }: // max is 180000 @@ -96,29 +91,61 @@ profiles { } } } - clusterOptions = { "--clusters=genius --account=$tier1_project" } - scratch = "$scratch_dir" + } } - wice { + genius_gpu { + + params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' + + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + + process { + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--clusters=genius --account=$tier1_project" } + + resourceLimits = [ + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 36, + time: 168.h, + ] + + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + switch (task.time) { + case { it >= 72.h }: + return 'gpu_v100_long' + default: + return 'gpu_v100' + } + default: + switch (task.time) { + case { it >= 72.h }: + return 'gpu_p100_long,amd_long' + default: + return 'gpu_p100,gpu_p100_debug,amd' + } + } + } - params { - config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 1968.GB // max is 2016000 - max_cpus = 72 - max_time = 168.h } + } + + wice { + params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' process { + clusterOptions = { "--clusters=wice --account=$tier1_project"} resourceLimits = [ - memory: 1968.GB, + memory: 1968.GB, // max is 2016000 cpus: 72, time: 168.h ] - executor = 'slurm' + queue = { switch (task.memory) { case { it >= 239.GB }: // max is 244800 @@ -137,31 +164,63 @@ profiles { } } } - clusterOptions = { "--clusters=wice --account=$tier1_project"} - scratch = "$scratch_dir" + } } - superdome { - params { - config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.' - config_profile_contact = 'GitHub: @Joon-Klaps - Email: joon.klaps@kuleuven.be' - config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html' - max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB - max_cpus = 14 - max_time = 168.h + wice_gpu { + + params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' 
+ + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + + process { + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--clusters=genius --account=$tier1_project" } + resourceLimits = [ + memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB + cpus: 60, + time: 168.h + ] + + queue = { + switch (task.memory) { + case { it >= 478.GB }: // max is 489600 + switch (task.time) { + case { it >= 72.h }: + return 'dedicated_big_gpu_h100,dedicated_big_gpu' + default: + return 'gpu,gpu_h100' + } + default: + switch (task.time) { + case { it >= 72.h }: + return 'gpu_a100' + default: + return 'gpu_a100_debug' + } + } + } + } + } + + superdome { + params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.' process { + clusterOptions = {"--clusters=genius --account=$tier1_project"} resourceLimits = [ - memory: 5772.GB, + memory: 5772.GB, // 6000 - 228 so 228GB for overhead, max is 5910888MB cpus: 14, time: 168.h ] - executor = 'slurm' + queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' } - clusterOptions = {"--clusters=genius --account=$tier1_project"} - scratch = "$scratch_dir" } } } + + From a6ba9846ab37b407f2b895e74aecd861495356e3 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Fri, 22 Nov 2024 15:29:09 +0100 Subject: [PATCH 02/10] update logic & readability vsc_kul_uhasselt GPU --- conf/vsc_kul_uhasselt.config | 150 ++++++++++++----------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index f761c8bfb..fede1181f 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,24 +1,11 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" -def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null +scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" +tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Perform work directory cleanup when the run has succesfully completed // cleanup = true -// Get the hostname and check some values for tier1 -def hostname = "genius" -try { - hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','') -} catch (java.io.IOException e) { - System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius") -} - -if (! 
tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) { - System.err.println("Please specify your VSC project account with environment variable SLURM_ACCOUNT.") - System.exit(1) -} - // Reduce the job submit rate to about 50 per minute, this way the server won't be bombarded with jobs // Limit queueSize to keep job rate under control and avoid timeouts executor { @@ -29,8 +16,6 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { - executor = 'slurm' - scratch = "$scratch_dir" stageInMode = "symlink" stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } @@ -67,34 +52,33 @@ profiles { process { clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 36, - time: 168.h - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + + withLabel: '.*gpu.*'{ + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' + default: + return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' + } + } + } queue = { switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' - default: - return 'bigmem' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'batch_long' + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' default: - return 'batch' - } + return task.time >= 72.h ? 'batch_long' : 'batch' } } - } } + genius_gpu { params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' @@ -107,31 +91,17 @@ profiles { beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 36, - time: 168.h, - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] queue = { switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - switch (task.time) { - case { it >= 72.h }: - return 'gpu_v100_long' + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' default: - return 'gpu_v100' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'gpu_p100_long,amd_long' - default: - return 'gpu_p100,gpu_p100_debug,amd' - } + return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' } } - } } @@ -140,34 +110,35 @@ profiles { process { clusterOptions = { "--clusters=wice --account=$tier1_project"} - resourceLimits = [ - memory: 1968.GB, // max is 2016000 - cpus: 72, - time: 168.h - ] + // max is 2016000 + resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_bigmem' - default: - return 'bigmem,hugemem' + switch (task.memory) { + case { it >= 239.GB }: // max is 244800 + return task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem' + default: + return task.time >= 72.h ? 
'batch_long,batch_icelake_long,batch_sapphirerapids_long': 'batch,batch_sapphirerapids,batch_icelake' } - default: - switch (task.time) { - case { it >= 72.h }: - return 'batch_long,batch_icelake_long,batch_sapphirerapids_long' - default: - return 'batch,batch_sapphirerapids,batch_icelake' + } + + withLabel: '.*gpu.*'{ + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + + queue = { + switch (task.memory) { + case { it >= 239.GB }: // max is 244800 + return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' + default: + return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' } } } - } } + wice_gpu { params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' @@ -179,31 +150,17 @@ profiles { process { beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' clusterOptions = { "--clusters=genius --account=$tier1_project" } - resourceLimits = [ - memory: 703.GB, // 768 - 65 so 65GB for overhead, max is 720000MB - cpus: 60, - time: 168.h - ] + // 768 - 65 so 65GB for overhead, max is 720000MB + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] queue = { switch (task.memory) { - case { it >= 478.GB }: // max is 489600 - switch (task.time) { - case { it >= 72.h }: - return 'dedicated_big_gpu_h100,dedicated_big_gpu' - default: - return 'gpu,gpu_h100' - } - default: - switch (task.time) { - case { it >= 72.h }: - return 'gpu_a100' + case { it >= 478.GB }: // max is 489600 + return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' default: - return 'gpu_a100_debug' - } + return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' } } - } } @@ -212,11 +169,8 @@ profiles { process { clusterOptions = {"--clusters=genius --account=$tier1_project"} - resourceLimits = [ - memory: 5772.GB, // 6000 - 228 so 228GB for overhead, max is 5910888MB - cpus: 14, - time: 168.h - ] + // 6000 - 228 so 228GB for overhead, max is 5910888MB + resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h] queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' } } From 9292d5a1d70719618640c14cd0a9eddc2dc010ee Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 13:05:13 +0100 Subject: [PATCH 03/10] Getting somewhere finally --- conf/vsc_kul_uhasselt.config | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index fede1181f..a74b1952a 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -16,6 +16,7 @@ executor { // Add backoff strategy to catch cluster timeouts and proper symlinks of files in scratch to the work directory process { + executor = 'slurm' stageInMode = "symlink" stageOutMode = "rsync" errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } @@ -51,12 +52,14 @@ profiles { params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.' process { - clusterOptions = { "--clusters=genius --account=$tier1_project" } // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + clusterOptions = { "--clusters=genius --account=$tier1_project" } withLabel: '.*gpu.*'{ beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 
1 : 2} --clusters=genius --account=$tier1_project" } + queue = { switch (task.memory) { case { it >= 175.GB }: // max is 180000 @@ -88,11 +91,10 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' process { - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--clusters=genius --account=$tier1_project" } - // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -125,6 +127,7 @@ profiles { withLabel: '.*gpu.*'{ beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -148,10 +151,10 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' process { - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--clusters=genius --account=$tier1_project" } // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' + clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { From d14a293e9f0e542c20668cae2aa1658918055496 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:22:36 +0100 Subject: [PATCH 04/10] Local tests passing! --- conf/vsc_kul_uhasselt.config | 44 +++++++++++++++++------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index a74b1952a..279e93ad3 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -56,9 +56,20 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] clusterOptions = { "--clusters=genius --account=$tier1_project" } + queue = { + switch (task.memory) { + case { it >= 175.GB }: // max is 180000 + return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' + default: + return task.time >= 72.h ? 'batch_long' : 'batch' + } + } + withLabel: '.*gpu.*'{ - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } + resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' queue = { switch (task.memory) { @@ -69,15 +80,6 @@ profiles { } } } - - queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' - default: - return task.time >= 72.h ? 'batch_long' : 'batch' - } - } } } @@ -85,16 +87,13 @@ profiles { genius_gpu { params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' 
- - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 18 ? 1 : 2} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } queue = { switch (task.memory) { @@ -111,9 +110,9 @@ profiles { params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' process { - clusterOptions = { "--clusters=wice --account=$tier1_project"} // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] + clusterOptions = { "--clusters=wice --account=$tier1_project"} queue = { switch (task.memory) { @@ -125,9 +124,8 @@ profiles { } withLabel: '.*gpu.*'{ - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } queue = { switch (task.memory) { @@ -144,7 +142,7 @@ profiles { wice_gpu { - params.config_profile_description = 'wice_gpu profile for use on the genius cluster of the VSC HPC.' + params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.' docker.runOptions = '-u $(id -u):$(id -g) --gpus all' apptainer.runOptions = '--containall --cleanenv --nv' @@ -153,8 +151,8 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - beforeScript = 'module load cuDNN/8.4.1.50-CUDA-11.7.0' - clusterOptions = { "--gpus-per-node=${task.cpus < 32 ? 1 : 2} --clusters=genius --account=$tier1_project" } + + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } queue = { switch (task.memory) { From bf3d46d0c286888c316f196563e13ee21ec2cb69 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:24:45 +0100 Subject: [PATCH 05/10] remove dockerrunOpts on HPC --- conf/vsc_kul_uhasselt.config | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 279e93ad3..34a1414f6 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -85,7 +85,6 @@ profiles { genius_gpu { - params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' 
apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' @@ -124,8 +123,10 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' queue = { switch (task.memory) { @@ -141,12 +142,9 @@ profiles { wice_gpu { - params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.' - - docker.runOptions = '-u $(id -u):$(id -g) --gpus all' - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' process { // 768 - 65 so 65GB for overhead, max is 720000MB From e3e948e1b105ce6626556268989d597ff0b3e9ae Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 16:49:18 +0100 Subject: [PATCH 06/10] remove switch statement, use floor instead of ceil --- conf/vsc_kul_uhasselt.config | 65 +++++++++++++----------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 34a1414f6..0a9f16d0e 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -57,27 +57,22 @@ profiles { clusterOptions = { "--clusters=genius --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem' - default: - return task.time >= 72.h ? 'batch_long' : 'batch' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') : + (task.time >= 72.h ? 'batch_long' : 'batch') } withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + // suggested to request 9 cpus per gpu + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100' - default: - return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : + (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') } } } @@ -92,15 +87,12 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/9) as int} --clusters=genius --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 175.GB }: // max is 180000 - return task.time >= 72.h ? 
'gpu_v100_long' : 'gpu_v100' - default: - return task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd' - } + task.memory >= 175.GB ? + (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : + (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') } } } @@ -114,27 +106,21 @@ profiles { clusterOptions = { "--clusters=wice --account=$tier1_project"} queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - return task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem' - default: - return task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long': 'batch,batch_sapphirerapids,batch_icelake' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : + (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' queue = { - switch (task.memory) { - case { it >= 239.GB }: // max is 244800 - return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' - default: - return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } } } @@ -149,16 +135,13 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - - clusterOptions = { "--gpus-per-node=${Math.ceil(task.cpus/16) as int} --clusters=wice --account=$tier1_project" } + // suggested to request 16-18 cpus per gpu + clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } queue = { - switch (task.memory) { - case { it >= 478.GB }: // max is 489600 - return task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100' - default: - return task.time >= 72.h ? 'dedicated_big_gpu': 'gpu_a100,gpu' - } + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 
'dedicated_big_gpu' : 'gpu_a100,gpu') } } } From 6974904caddb9b64f4f2344dd80b598db23bb87e Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Mon, 25 Nov 2024 17:15:19 +0100 Subject: [PATCH 07/10] refactor cluster options for GPU resource allocation and update documentation for Nextflow version requirements --- conf/vsc_kul_uhasselt.config | 36 ++++++++++++++++++++++++------------ docs/vsc_kul_uhasselt.md | 11 +++++++---- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0a9f16d0e..0ff37ba4e 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -63,11 +63,14 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] - // suggested to request 9 cpus per gpu - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + clusterOptions = { + // suggested to use 9 cpus per gpu + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" + } queue = { task.memory >= 175.GB ? @@ -87,7 +90,10 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/9)) as int} --clusters=genius --account=$tier1_project" } + clusterOptions = { + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" + } queue = { task.memory >= 175.GB ? @@ -112,10 +118,14 @@ profiles { } withLabel: '.*gpu.*'{ - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } - apptainer.runOptions = '--containall --cleanenv --nv' - singularity.runOptions = '--containall --cleanenv --nv' + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + apptainer.runOptions = '--containall --cleanenv --nv' + singularity.runOptions = '--containall --cleanenv --nv' + clusterOptions = { + // suggested to use 16 cpus per gpu + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + } queue = { task.memory >= 239.GB ? @@ -135,8 +145,10 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - // suggested to request 16-18 cpus per gpu - clusterOptions = { "--gpus-per-node=${Math.max(1,Math.floor(task.cpus/16)) as int} --clusters=wice --account=$tier1_project" } + clusterOptions = { + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + } queue = { task.memory >= 239.GB ? 
diff --git a/docs/vsc_kul_uhasselt.md b/docs/vsc_kul_uhasselt.md index a9caec11b..2f31304ca 100644 --- a/docs/vsc_kul_uhasselt.md +++ b/docs/vsc_kul_uhasselt.md @@ -28,14 +28,14 @@ export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs" # Optional tower key # export TOWER_ACCESS_TOKEN="" -# export NXF_VER="" # make sure it's larger then 24.04.0 +# export NXF_VER="" # make sure it's larger than 24.10.1 ``` :::warning -The current config is setup with array jobs. Make sure nextflow version >= 24.04.0, read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array) you can do this in +The current config is set up with array jobs. Make sure your Nextflow version is >= 24.10.1 (read [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array)); you can do this with ```bash -export NXF_VER=24.04.0 +export NXF_VER=24.10.1 ``` ::: @@ -64,10 +64,13 @@ nextflow run -profile vsc_kul_uhasselt, **NB:** The vsc_kul_uhasselt profile is based on a selected amount of SLURM partitions. Should you require resources outside of these limits (e.g.gpus) you will need to provide a custom config specifying an appropriate SLURM partition (e.g. 'gpu\*'). +> **NB:** The vsc_kul_uhasselt profile is based on a selected number of SLURM partitions. The profile selects, to the best of its ability, the most appropriate partition for each job. Modules with a label containing `gpu` will be allocated to a GPU partition even when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force the job to be allocated to a GPU partition. +> **NB:** If a module does not set the `accelerator` directive, the number of GPUs is derived from the requested resources. Use the `--cluster` option to specify the cluster you intend to use when submitting the job: From dcd9741dd9fb58d64e8a2e37ec99ca9518964cea Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Tue, 26 Nov 2024 16:54:39 +0100 Subject: [PATCH 08/10] reduce job submission limits & add apptainer timeout --- conf/vsc_kul_uhasselt.config | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0ff37ba4e..92e9780d3 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -10,7 +10,7 @@ tier1_project = System.getenv("SLURM_ACCOUNT") ?: null // Limit queueSize to keep job rate under control and avoid timeouts executor { submitRateLimit = '50/1min' - queueSize = 30 + queueSize = 50 exitReadTimeout = "10min" } @@ -19,16 +19,17 @@ process { executor = 'slurm' stageInMode = "symlink" stageOutMode = "rsync" - errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } + errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' } maxRetries = 3 - array = 50 + array = 30 } // Specify that singularity should be used and where the cache dir will be for the images singularity { - enabled = true - autoMounts = true - cacheDir = "$scratch_dir/.singularity" + enabled = true + autoMounts = true + cacheDir = "$scratch_dir/.singularity" + pullTimeout = "30 min" } params { From 595da58ed72c88f7fda65394ef0cbd4a37b66256 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Wed, 27 Nov 2024 10:13:02 +0100 Subject: [PATCH 09/10] add default task.cpus --- conf/vsc_kul_uhasselt.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 92e9780d3..69e938e83 100644 --- a/conf/vsc_kul_uhasselt.config +++
b/conf/vsc_kul_uhasselt.config @@ -69,7 +69,7 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' clusterOptions = { // suggested to use 9 cpus per gpu - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" } @@ -92,7 +92,7 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] clusterOptions = { - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/9) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" } @@ -124,7 +124,7 @@ profiles { singularity.runOptions = '--containall --cleanenv --nv' clusterOptions = { // suggested to use 16 cpus per gpu - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" } @@ -147,7 +147,7 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] clusterOptions = { - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor(task.cpus/16) as int) + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" } From bf238465635f23411fb8fa019038f288c4d9f162 Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Tue, 3 Dec 2024 12:47:35 +0100 Subject: [PATCH 10/10] load default module --- conf/vsc_kul_uhasselt.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 69e938e83..4e4634601 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -55,6 +55,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] + beforeScript = 'module load cluster/genius' clusterOptions = { "--clusters=genius --account=$tier1_project" } queue = { @@ -91,6 +92,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] + beforeScript = 'module load cluster/genius' clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" @@ -111,6 +113,7 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] clusterOptions = { "--clusters=wice --account=$tier1_project"} + beforeScript = 'module load cluster/wice' queue = { task.memory >= 239.GB ? @@ -146,6 +149,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + beforeScript = 'module load cluster/wice' clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" @@ -164,6 +168,7 @@ profiles { process { clusterOptions = {"--clusters=genius --account=$tier1_project"} + beforeScript = 'module load cluster/genius/superdome' // 6000 - 228 so 228GB for overhead, max is 5910888MB resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
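
A minimal sketch of how the GPU logic introduced in these patches is consumed from the pipeline side. It is hypothetical: the process name `GPU_TOOL`, the script body and the resource values are assumptions and do not come from the patches; only the `gpu` label, the optional `accelerator` directive and the requested `cpus`/`memory`/`time` are what the profiles' `withLabel: '.*gpu.*'` selector, `clusterOptions` closure and `queue` closure actually evaluate.

```nextflow
// Hypothetical module, shown only to illustrate how the vsc_kul_uhasselt
// profiles route GPU work; none of these values are taken from the patches.
process GPU_TOOL {
    label 'gpu'          // matched by the profiles' withLabel: '.*gpu.*' selector
    cpus 18
    memory 64.GB
    time 12.h
    accelerator 2        // optional; read by the config as task.accelerator.request

    script:
    """
    nvidia-smi
    """
}
```

Under the `genius` profile such a task would be submitted with `--gres=gpu:2 --clusters=genius --account=$tier1_project`, where `tier1_project` is taken from the `SLURM_ACCOUNT` environment variable. If the `accelerator` directive were omitted, the GPU count would fall back to `Math.max(1, Math.floor((task.cpus ?: 1)/9))`, which is also 2 for 18 CPUs (on wice the divisor is 16). Because the task requests less than 175 GB of memory and less than 72 hours, the `queue` closure resolves to `gpu_p100,amd`.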