fmalmeida · fmalmeida · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/conf/base.config b/conf/base.config
@@ -1,9 +1,9 @@
 process {
 
     // The defaults for all processes (without labels)
-    cpus   = { params.max_cpus   }
-    memory = { params.max_memory }
-    time   = { params.max_time   }
+    cpus   = 2
+    memory = 4.GB
+    time   = { params.max_time }
 
     errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
     maxRetries    = 1
@@ -18,7 +18,7 @@ process {
     withLabel:process_low {
         cpus   = { check_max( 2 * task.attempt, 'cpus'      ) }
         memory = { check_max( 4.GB * task.attempt, 'memory' ) }
-        time   = { check_max( 1.h * task.attempt, 'time'    ) }
+        time   = { check_max( 2.h * task.attempt, 'time'    ) }
 
         errorStrategy = { task.exitStatus in [21,143,137,104,134,139,247] ? 'retry' : 'finish' }
         maxRetries    = 1
@@ -35,9 +35,8 @@ process {
     // Assemblies will first try to adjust themselves to a parallel execution
     // If it is not possible, then it waits to use all the resources allowed
     withLabel:process_assembly {
-      cpus   = {  if (task.attempt == 1) { check_max( 6 * task.attempt, 'cpus'       ) } else { params.max_cpus   } }
-      memory = {  if (task.attempt == 1) { check_max( 20.GB * task.attempt, 'memory' ) } else { params.max_memory } }
-      time   = {  if (task.attempt == 1) { check_max( 24.h * task.attempt, 'time'    ) } else { params.max_time   } }
+      cpus   = {  if (task.attempt == 1) { check_max( params.start_asm_cpus, 'cpus'   ) } else { params.max_cpus   } }
+      memory = {  if (task.attempt == 1) { check_max( params.start_asm_mem , 'memory' ) } else { params.max_memory } }
 
       // retry at least once to try it with full resources
       errorStrategy = { task.exitStatus in [1,21,143,137,104,134,139,247] ? 'retry' : 'finish' }
@@ -46,10 +45,12 @@ process {
     }
 
     // Quast sometimes can take too long
+    def quast_mem  = ((params.start_asm_mem  / 2) > 6.GB) ? (params.start_asm_mem / 2) : 6.GB  // 1st-try memory for quast: half of start_asm_mem, never below 6.GB
+    def quast_cpus = Math.max( (params.start_asm_cpus as int).intdiv(2), 4 )                    // 1st-try cpus for quast: half of start_asm_cpus (integer division), never below 4
     withName:quast {
-      cpus   = {  if (task.attempt == 1) { check_max( 4 * task.attempt, 'cpus'       ) } else { params.max_cpus   } }
-      memory = {  if (task.attempt == 1) { check_max( 10.GB * task.attempt, 'memory' ) } else { params.max_memory } }
-      time   = {  if (task.attempt == 1) { check_max( 12.h * task.attempt, 'time'    ) } else { params.max_time   } }
+      cpus   = {  if (task.attempt == 1) { check_max( quast_cpus, 'cpus'   ) } else { params.max_cpus   } }  // local quast_cpus defined above (not a pipeline param)
+      memory = {  if (task.attempt == 1) { check_max( quast_mem , 'memory' ) } else { params.max_memory } }  // local quast_mem defined above (not a pipeline param)
+      time   = {  if (task.attempt == 1) { check_max( 12.h * task.attempt, 'time' ) } else { params.max_time   } }
 
       // retry at least once to try it with full resources
       errorStrategy = { task.exitStatus in [21,143,137,104,134,139,247] ? 'retry' : 'finish' }

diff --git a/conf/defaults.config b/conf/defaults.config
@@ -134,10 +134,32 @@ params {
   skip_shasta    = false                      // Nanopore longreads only assemblies
   shasta_additional_parameters = null         // Must be given as shown in shasta manual. E.g. " --Reads.minReadLength 5000 "
 
-// Max resource options
-// Defaults only, expecting to be overwritten
-  max_memory                 = '20.GB'
-  max_cpus                   = 6
+
+            /*
+             * Resources controlling parameters
+             * 
+             * Here are some parameters that allow the user to better tune the resources used by the pipeline.
+             *
+             * The start_asm_{mem,cpus} parameters tell the pipeline how much memory and how many cpus
+             * the assembly modules and quast should request in the first try. This is essential for bigger
+             * genomes in order to avoid failing the first try due to lack of resources and then running
+             * again (automatically) using all the max values allowed with the max_{memory,cpus} parameters.
+             *
+             * The max_memory and max_cpus parameters tell the pipeline the maximum amount of memory and
+             * number of cpus that is allowed per job. The pipeline starts by requesting less mem & cpus than
+             * what is defined by these params and, in case the first try fails, it then maxes out the job
+             * to use the maximum values you allowed.
+             *
+             * The max_time parameter defines how long a single job is allowed to run.
+             */
+
+  // starting values for the assembly jobs (and quast) to ask for in the very first try
+  start_asm_mem              = 20.GB
+  start_asm_cpus             = 6
+
+  // maximum values to be used on automatic second try in case of lack of memory (all jobs)
+  max_memory                 = 40.GB
+  max_cpus                   = 10
   max_time                   = '40.h'
 
 }
diff --git a/docs/assets/defaults.config b/docs/assets/defaults.config
@@ -56,14 +56,16 @@ params {
 
 // Select the appropriate shasta config to use for assembly
 // Since shasta v0.8 (Oct/2021) this parameter is now mandatory.
+// You can check availability at: https://paoloshasta.github.io/shasta/Configurations.html
   shasta_config = "Nanopore-Oct2021"
 
 // Tells the pipeline to interpret the long reads as "corrected" long reads.
-// This will activate (if available) the options for corrected reads in the
-// assemblers: -corrected (in canu), --pacbio-corr|--nano-corr (in flye), etc.
-// Be cautious when using this parameter. If your reads are not corrected, and
+// This will activate (if available) the options for corrected or even high
+// quality (hq) reads in the assemblers.
+// Be cautious when using this parameter. If your reads are not corrected|hq, and
 // you use this parameter, you will probably do not generate any contig.
-  corrected_long_reads = false
+  corrected_longreads    = false
+  high_quality_longreads = false
 
 // This parameter below (hybrid_strategy) is to select the hybrid strategies adopted by the pipeline.
 // Read the documentation https://mpgap.readthedocs.io/en/latest/manual.html to know more about the hybrid strategies.
@@ -132,10 +134,32 @@ params {
   skip_shasta    = false                      // Nanopore longreads only assemblies
   shasta_additional_parameters = null         // Must be given as shown in shasta manual. E.g. " --Reads.minReadLength 5000 "
 
-// Max resource options
-// Defaults only, expecting to be overwritten
-  max_memory                 = '14.GB'
-  max_cpus                   = 6
+
+            /*
+             * Resources controlling parameters
+             * 
+             * Here are some parameters that allow the user to better tune the resources used by the pipeline.
+             *
+             * The start_asm_{mem,cpus} parameters tell the pipeline how much memory and how many cpus
+             * the assembly modules and quast should request in the first try. This is essential for bigger
+             * genomes in order to avoid failing the first try due to lack of resources and then running
+             * again (automatically) using all the max values allowed with the max_{memory,cpus} parameters.
+             *
+             * The max_memory and max_cpus parameters tell the pipeline the maximum amount of memory and
+             * number of cpus that is allowed per job. The pipeline starts by requesting less mem & cpus than
+             * what is defined by these params and, in case the first try fails, it then maxes out the job
+             * to use the maximum values you allowed.
+             *
+             * The max_time parameter defines how long a single job is allowed to run.
+             */
+
+  // starting values for the assembly jobs (and quast) to ask for in the very first try
+  start_asm_mem              = 20.GB
+  start_asm_cpus             = 6
+
+  // maximum values to be used on automatic second try in case of lack of memory (all jobs)
+  max_memory                 = 40.GB
+  max_cpus                   = 10
   max_time                   = '40.h'
 
 }
diff --git a/docs/manual.md b/docs/manual.md
@@ -84,13 +84,15 @@ Please note that, through the command line, the parameters that are boolean (tru
 | `--input`   | :material-check: | NA      | Path to input [samplesheet](samplesheet.md#) in YAML format |
 | `--output`  | :material-check: | NA      | Directory to store output files                             |
 
-## Max job request
+## Start/Max resources on job request
 
 | <div style="width:120px">Parameter</div> | Required | Default | Description |
 | :--------------------------------------- | :------- | :------ | :---------- |
-| `--max_cpus`   | :material-close: | 4    | Max number of threads a job can use across attempts |
-| `--max_memory` | :material-close: | 6.GB | Max amount of memory a job can use across attempts  |
-| `--max_time`   | :material-close: | 40.h | Max amount of time a job can take to run            | 
+| `--start_asm_cpus` | :material-close: | 6     | How many cpus an assembly job should request in the very first attempt. This is essential for bigger genomes in order to avoid failing the first try due to lack of resources and then running again (automatically) using all the max values allowed with the `--max_cpus` parameter. |
+| `--start_asm_mem`  | :material-close: | 20.GB | How much memory an assembly job should request in the very first attempt. This is essential for bigger genomes in order to avoid failing the first try due to lack of memory and then running again (automatically) using all the max values allowed with the `--max_memory` parameter. |
+| `--max_cpus`   | :material-close: | 10    | Max number of threads a job can use across attempts. After one failed attempt this is maxed out. |
+| `--max_memory` | :material-close: | 40.GB | Max amount of memory a job can use across attempts. After one failed attempt this is maxed out.  |
+| `--max_time`   | :material-close: | 40.h  | Max amount of time a job can take to run | 
 
 ## Assemblies configuration
 

diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md
@@ -10,6 +10,7 @@ The tracking for changes started in v2.
 * Increase default `--max_memory` value to 20.GB.
 * Add a directory called `final_assemblies` in the main output directory holding all the assemblies generated in the pipeline execution.
 * Updated documentation as discussed in [[#58](https://github.com/fmalmeida/MpGAP/issues/58)] and [[#57](https://github.com/fmalmeida/MpGAP/issues/57)].
+* [[#61](https://github.com/fmalmeida/MpGAP/issues/61)] - Add a simple parameter to adjust how many cpus and how much memory should the assembly jobs request in the first attempt to avoid lack of resources errors.
 
 ## v3.1.4 -- [2022-Sep-03]
 

diff --git a/modules/QualityAssessment/multiqc.nf b/modules/QualityAssessment/multiqc.nf
diff --git a/nextflow.config b/nextflow.config
@@ -4,11 +4,10 @@
                     Maintained by Felipe Marques de Almeida
                     Contact: [email protected]
  */
-// Load base.config (contains some label resources configuration)
-includeConfig 'conf/base.config'
-
 // loading required / default pipeline parameters
 includeConfig 'conf/defaults.config'
+// Load base.config (contains some label resources configuration)
+includeConfig 'conf/base.config'
 // fix type of variable expected
 params.hybrid_strategy = params.hybrid_strategy.toString()
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -41,14 +41,24 @@
             "description": "Set the top limit of resources for pipeline",
             "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.",
             "properties": {
-                "max_cpus": {
+                "start_asm_cpus": {
                     "type": "integer",
                     "default": 6,
+                    "description": "Starting (1st try) amount of cpus that assembly jobs should use. Essential for avoiding 1st-try errors due to lack of resources for big genomes."
+                },
+                "start_asm_mem": {
+                    "type": "string",
+                    "default": "20.GB",
+                    "description": "Starting (1st try) amount of memory that assembly jobs should use. Essential for avoiding 1st-try errors due to lack of resources for big genomes."
+                },
+                "max_cpus": {
+                    "type": "integer",
+                    "default": 10,
                     "description": "Max amount of threads to use"
                 },
                 "max_memory": {
                     "type": "string",
-                    "default": "20.GB",
+                    "default": "40.GB",
                     "description": "Max amount of memory to use"
                 },
                 "max_time": {