Extend vsc_kul_uhasselt to support gpu partitions #807

Draft
wants to merge 10 commits into
base: master
231 changes: 122 additions & 109 deletions conf/vsc_kul_uhasselt.config
@@ -1,52 +1,40 @@
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
def scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"

// Specify the work directory
workDir = "$scratch_dir/work"
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null

// Perform work directory cleanup when the run has successfully completed
// cleanup = true

// Get the hostname and check some values for tier1
def hostname = "genius"
try {
hostname = ['/bin/bash', '-c', 'sinfo --clusters=genius,wice -s | head -n 1'].execute().text.replace('CLUSTER: ','')
} catch (java.io.IOException e) {
System.err.println("WARNING: Could not run sinfo to determine current cluster, defaulting to genius")
}

def tier1_project = System.getenv("SLURM_ACCOUNT") ?: null

if (! tier1_project && (hostname.contains("genius") || hostname.contains("wice"))) {
    // The genius and wice clusters require a project account
    System.err.println("Please specify your VSC project account with the environment variable SLURM_ACCOUNT.")
    System.exit(1)
}
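// The account can be exported before launching Nextflow, e.g. (placeholder value):
//   export SLURM_ACCOUNT="<your_project_account>"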


// Reduce the job submit rate to about 50 per minute so the scheduler isn't bombarded with jobs
// Limit queueSize to keep job rate under control and avoid timeouts
executor {
submitRateLimit = '50/1min'
queueSize = 30
queueSize = 50
exitReadTimeout = "10min"
}

// Add a backoff strategy to catch cluster timeouts and ensure files in scratch are properly symlinked into the work directory
process {
stageInMode = "symlink"
stageOutMode = "rsync"
errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' }
maxRetries = 5
// array = 50
executor = 'slurm'
stageInMode = "symlink"
stageOutMode = "rsync"
errorStrategy = { sleep(Math.pow(2, task.attempt ?: 1) * 200 as long); return 'retry' }
maxRetries = 3
array = 30
}

// Specify that singularity should be used and where the cache dir will be for the images
singularity {
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
pullTimeout = "30 min"
}

params {
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
}

env {
@@ -56,112 +44,137 @@ env {

// AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
aws {
maxErrorRetry = 3
maxErrorRetry = 3
}

// Define profiles for each cluster
profiles {
genius {
params {
config_profile_description = 'HPC_GENIUS profile for use on the genius cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 703.GB // 768 - 65 so 65GB for overhead, max is 720000MB
max_time = 168.h
max_cpus = 36
}
params.config_profile_description = 'genius profile for use on the genius cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 703.GB,
cpus: 136,
time: 168.h
]
executor = 'slurm'
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = 'module load cluster/genius'
clusterOptions = { "--clusters=genius --account=$tier1_project" }

queue = {
switch (task.memory) {
case { it >= 175.GB }: // max is 180000
switch (task.time) {
case { it >= 72.h }:
return 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long'
default:
return 'bigmem'
}
default:
switch (task.time) {
case { it >= 72.h }:
return 'batch_long'
default:
return 'batch'
}
task.memory >= 175.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
(task.time >= 72.h ? 'batch_long' : 'batch')
}

withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
}
}
clusterOptions = { "--clusters=genius --account=$tier1_project" }
scratch = "$scratch_dir"
}
}

wice {

params {
config_profile_description = 'HPC_WICE profile for use on the Wice cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 1968.GB // max is 2016000
max_cpus = 72
max_time = 168.h
genius_gpu {
params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = 'module load cluster/genius'
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
}
}
}

wice {
params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 1968.GB,
cpus: 72,
time: 168.h
]
executor = 'slurm'
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
clusterOptions = { "--clusters=wice --account=$tier1_project"}
beforeScript = 'module load cluster/wice'

queue = {
switch (task.memory) {
case { it >= 239.GB }: // max is 244800
switch (task.time) {
case { it >= 72.h }:
return 'dedicated_big_bigmem'
default:
return 'bigmem,hugemem'
}
default:
switch (task.time) {
case { it >= 72.h }:
return 'batch_long,batch_icelake_long,batch_sapphirerapids_long'
default:
return 'batch,batch_sapphirerapids,batch_icelake'
}
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
}

withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}
}
clusterOptions = { "--clusters=wice --account=$tier1_project"}
scratch = "$scratch_dir"
}
}

superdome {
params {
config_profile_description = 'HPC_SUPERDOME profile for use on the genius cluster of the VSC HPC.'
config_profile_contact = 'GitHub: @Joon-Klaps - Email: [email protected]'
config_profile_url = 'https://docs.vscentrum.be/en/latest/index.html'
max_memory = 5772.GB // 6000 - 228 so 228GB for overhead, max is 5910888MB
max_cpus = 14
max_time = 168.h

wice_gpu {
params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
}
}
}

superdome {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
resourceLimits = [
memory: 5772.GB,
cpus: 14,
time: 168.h
]
executor = 'slurm'
queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
clusterOptions = {"--clusters=genius --account=$tier1_project"}
scratch = "$scratch_dir"
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]

queue = { task.time <= 72.h ? 'superdome' : 'superdome_long' }
}
}
}


11 changes: 7 additions & 4 deletions docs/vsc_kul_uhasselt.md
@@ -28,14 +28,14 @@ export NXF_CONDA_CACHEDIR="$VSC_SCRATCH/miniconda3/envs"

# Optional tower key
# export TOWER_ACCESS_TOKEN="<your_tower_access_token>"
# export NXF_VER="<version>" # make sure it is at least 24.04.0
# export NXF_VER="<version>" # make sure it is at least 24.10.1
```

:::warning
The current config is set up with array jobs. Make sure your Nextflow version is >= 24.04.0; see [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array). You can set this with
The current config is set up with array jobs. Make sure your Nextflow version is >= 24.10.1; see [array jobs in nextflow](https://www.nextflow.io/docs/latest/process.html#array). You can set this with

```bash
export NXF_VER=24.04.0
export NXF_VER=24.10.1
```

:::
@@ -64,10 +64,13 @@ nextflow run <pipeline> -profile vsc_kul_uhasselt,<CLUSTER> <Add your other parameters>
Here the cluster options are:

- genius
- genius_gpu
- wice
- wice_gpu
- superdome

> **NB:** The vsc_kul_uhasselt profile is based on a selected set of SLURM partitions. Should you require resources outside of these limits (e.g. GPUs), you will need to provide a custom config specifying an appropriate SLURM partition (e.g. 'gpu\*').
> **NB:** The vsc_kul_uhasselt profile is based on a selected set of SLURM partitions and will, to the best of its ability, select the most appropriate partition for each job. Processes with a label containing `gpu` are allocated to a GPU partition even when the 'normal' `genius` profile is selected. Select the `genius_gpu` or `wice_gpu` profile to force jobs onto a GPU partition.
> **NB:** If a process does not set the `accelerator` directive, the number of GPUs is derived from the requested CPUs (roughly one GPU per 9 CPUs on genius and one per 16 CPUs on wice).
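
For illustration, below is a minimal sketch of a GPU-enabled process as it could appear on the pipeline side. The process name and resource values are hypothetical; only the `gpu` label convention and the standard Nextflow `accelerator` directive are taken from the config above.

```nextflow
process EXAMPLE_GPU_TASK {
    // Matched by the "withLabel: '.*gpu.*'" selector in this config
    label 'gpu'

    // Explicit GPU request; if omitted, the config derives the GPU count
    // from the requested CPUs (about 1 per 9 CPUs on genius, 1 per 16 on wice)
    accelerator 2
    cpus 18
    memory 64.GB
    time 12.h

    script:
    """
    nvidia-smi
    """
}
```

Running such a pipeline with `nextflow run <pipeline> -profile vsc_kul_uhasselt,genius_gpu` (or `wice_gpu`) forces the jobs onto the GPU partitions of the corresponding cluster.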

Use the `--cluster` option to specify the cluster you intend to use when submitting the job:
