
Commit

Merge branch 'CW-3427' into 'dev'
Bring up to date and tag [CW-3427]

Closes CW-3427

See merge request epi2melabs/workflows/wf-artic!157
mattdmem committed Feb 16, 2024
2 parents 54451c7 + 4aad9dc commit d9d84c2
Showing 14 changed files with 130 additions and 32 deletions.
19 changes: 19 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -122,3 +122,22 @@ body:
render: shell
validations:
required: false
- type: dropdown
id: run-demo
attributes:
label: Were you able to successfully run the latest version of the workflow with the demo data?
description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button?
options:
- 'yes'
- 'no'
- other (please describe below)
validations:
required: true
- type: textarea
id: demo-other
attributes:
label: Other demo data information
render: shell
validations:
required: false

2 changes: 0 additions & 2 deletions .gitlab-ci.yml
@@ -37,5 +37,3 @@ docker-run:
- if: $MATRIX_NAME == "NEB-VarSkip/v2"
variables:
NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version NEB-VarSkip/v2"


4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,12 +3,12 @@ repos:
hooks:
- id: docs_readme
name: docs_readme
entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json
entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json
language: python
always_run: true
pass_filenames: false
additional_dependencies:
- epi2melabs>=0.0.48
- epi2melabs>=0.0.52
- id: build_models
name: build_models
entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.0.0]
## Changes
- Documentation
- Defined resource requirements

## [v0.3.33]
## Changes
- Updates for cloud readiness
41 changes: 36 additions & 5 deletions README.md
@@ -27,12 +27,12 @@ parameter.
Recommended requirements:

+ CPUs = 4
+ memory = 8GB
+ Memory = 8GB

Minimum requirement:
Minimum requirements:

+ CPUs = 2
+ memory = 4GB
+ Memory = 4GB

Approximate run time: 5 minutes per sample

@@ -85,7 +85,30 @@ The Midnight protocol for sample preparation and sequencing can be found in the



## Inputs
## Input example

<!---Example of input directory structure, delete and edit as appropriate per workflow.--->
This workflow accepts FASTQ files as input.

The FASTQ input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`.

```
(i) (ii) (iii)
input_reads.fastq ─── input_directory ─── input_directory
├── reads0.fastq ├── barcode01
└── reads1.fastq │ ├── reads0.fastq
│ └── reads1.fastq
├── barcode02
│ ├── reads0.fastq
│ ├── reads1.fastq
│ └── reads2.fastq
└── barcode03
└── reads0.fastq
```
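
As a minimal illustrative sketch of how these cases map to command-line options (the repository handle, release, and file names below are placeholders; only `--fastq`, `--sample`, and `--sample_sheet` are taken from the parameter descriptions in this document):

```
# Case (ii): a single directory of FASTQ files for one sample
nextflow run epi2me-labs/wf-artic \
    --fastq input_directory \
    --sample sample01

# Case (iii): barcoded sub-directories described by a sample sheet
nextflow run epi2me-labs/wf-artic \
    --fastq input_directory \
    --sample_sheet sample_sheet.csv
```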



## Input parameters

### Input Options

@@ -113,6 +136,13 @@ The Midnight protocol for sample preparation and sequencing can be found in the
| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | |


### Output Options

| Nextflow parameter name | Type | Description | Help | Default |
|--------------------------|------|-------------|------|---------|
| out_dir | string | Directory for output of all workflow results. | | output |


### Reporting Options

| Nextflow parameter name | Type | Description | Help | Default |
@@ -137,6 +167,7 @@ The Midnight protocol for sample preparation and sequencing can be found in the
| max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | |
| update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True |
| pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". | | |
| nextclade_data_tag | string | The tag of the nextclade data packet | | |
| normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 |
| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. | |

@@ -156,7 +187,7 @@ The Midnight protocol for sample preparation and sequencing can be found in the

## Outputs

Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}.
Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}.

| Title | File path | Description | Per sample or aggregated |
|-------|-----------|-------------|--------------------------|
2 changes: 1 addition & 1 deletion bin/workflow_glue/check_sample_sheet.py
@@ -43,7 +43,7 @@ def main(args):
]

if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet):
sys.stdout.write(f"Could not open sample sheet '{args.sample_sheet}'.")
sys.stdout.write("Could not open sample sheet file.")
sys.exit()

try:
6 changes: 3 additions & 3 deletions docs/03_compute_requirements.md
@@ -1,12 +1,12 @@
Recommended requirements:

+ CPUs = 4
+ memory = 8GB
+ Memory = 8GB

Minimum requirement:
Minimum requirements:

+ CPUs = 2
+ memory = 4GB
+ Memory = 4GB

Approximate run time: 5 minutes per sample

18 changes: 18 additions & 0 deletions docs/06_input_example.md
@@ -0,0 +1,18 @@
<!---Example of input directory structure, delete and edit as appropriate per workflow.--->
This workflow accepts FASTQ files as input.

The FASTQ input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`.

```
(i) (ii) (iii)
input_reads.fastq ─── input_directory ─── input_directory
├── reads0.fastq ├── barcode01
└── reads1.fastq │ ├── reads0.fastq
│ └── reads1.fastq
├── barcode02
│ ├── reads0.fastq
│ ├── reads1.fastq
│ └── reads2.fastq
└── barcode03
└── reads0.fastq
```
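
For the multiplexed case (iii), the sample sheet passed with `--sample_sheet` could be sketched as below; the `barcode`, `alias`, and `type` column names are an assumption based on the common EPI2ME Labs sample sheet convention, and all values are purely illustrative:

```
# Hypothetical sample sheet for case (iii); column names and values are illustrative only.
cat > sample_sheet.csv << EOF
barcode,alias,type
barcode01,patient_A,test_sample
barcode02,patient_B,test_sample
barcode03,negative_ctrl,negative_control
EOF
```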
8 changes: 8 additions & 0 deletions docs/06_inputs.md → docs/06_input_parameters.md
@@ -24,6 +24,13 @@
| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | |


### Output Options

| Nextflow parameter name | Type | Description | Help | Default |
|--------------------------|------|-------------|------|---------|
| out_dir | string | Directory for output of all workflow results. | | output |


### Reporting Options

| Nextflow parameter name | Type | Description | Help | Default |
@@ -48,6 +55,7 @@
| max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | |
| update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True |
| pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". | | |
| nextclade_data_tag | string | The tag of the nextclade data packet | | |
| normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 |
| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. | |

2 changes: 1 addition & 1 deletion docs/07_outputs.md
@@ -1,4 +1,4 @@
Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}.
Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}.

| Title | File path | Description | Per sample or aggregated |
|-------|-----------|-------------|--------------------------|
4 changes: 2 additions & 2 deletions lib/NfcoreSchema.groovy
@@ -141,7 +141,7 @@ class NfcoreSchema {
for (specifiedParam in params.keySet()) {
// nextflow params
if (nf_params.contains(specifiedParam)) {
log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'"
log.error "You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'"
has_error = true
}
// unexpected params
@@ -180,7 +180,7 @@ class NfcoreSchema {
schema.validate(params_json)
} catch (ValidationException e) {
println ''
log.error 'ERROR: Validation of pipeline parameters failed!'
log.error 'Validation of pipeline parameters failed!'
JSONObject exceptionJSON = e.toJSON()
HashSet<String> observed_exceptions = []
printExceptions(exceptionJSON, params_json, log, enums, raw_schema, observed_exceptions)
43 changes: 30 additions & 13 deletions lib/ingress.nf
@@ -149,28 +149,24 @@ def xam_ingress(Map arguments)

def input = get_valid_inputs(margs, xam_extensions)

// check BAM headers to see if any samples are uBAM
ch_result = input.dirs
| map { meta, path -> [meta, get_target_files_in_dir(path, xam_extensions)] }
| mix(input.files)

ch_is_unaligned = ch_result
| checkBamHeaders
| map { meta, is_unaligned_env, mixed_headers_env ->
| map { meta, paths, is_unaligned_env, mixed_headers_env ->
// convert the env. variables from strings ('0' or '1') into bools
boolean is_unaligned = is_unaligned_env as int as boolean
boolean mixed_headers = mixed_headers_env as int as boolean
// throw an error if there was a sample with mixed headers
if (mixed_headers) {
error "Found mixed headers in (u)BAM files of sample '${meta.alias}'."
}
[meta, is_unaligned]
// add `is_unaligned` to the metamap (note the use of `+` to create a copy of
// `meta` to avoid modifying every item in the channel;
// https://github.com/nextflow-io/nextflow/issues/2660)
[meta + [is_unaligned: is_unaligned], paths]
}

ch_result = ch_result | join(ch_is_unaligned)
// add `is_unaligned` to the metamap (note the use of `+` to create a copy of `meta`
// to avoid modifying every item in the channel;
// https://github.com/nextflow-io/nextflow/issues/2660)
| map { meta, paths, is_unaligned -> [meta + [is_unaligned: is_unaligned], paths] }
| branch { meta, paths ->
// set `paths` to `null` for uBAM samples if unallowed (they will be added to
// the results channel in shape of `[meta, null]` at the end of the function
@@ -240,11 +236,17 @@ process checkBamHeaders {
label "ingress"
label "wf_common"
cpus 1
memory "2 GB"
input: tuple val(meta), path("input_dir/reads*.bam")
output:
// set the two env variables by `eval`-ing the output of the python script
// checking the XAM headers
tuple val(meta), env(IS_UNALIGNED), env(MIXED_HEADERS)
tuple(
val(meta),
path("input_dir/reads*.bam", includeInputs: true),
env(IS_UNALIGNED),
env(MIXED_HEADERS),
)
script:
"""
workflow-glue check_bam_headers_in_dir input_dir > env.vars
@@ -257,6 +259,7 @@ process mergeBams {
label "ingress"
label "wf_common"
cpus 3
memory "4 GB"
input: tuple val(meta), path("input_bams/reads*.bam")
output: tuple val(meta), path("reads.bam")
shell:
@@ -271,6 +274,7 @@ process catSortBams {
label "ingress"
label "wf_common"
cpus 4
memory "4 GB"
input: tuple val(meta), path("input_bams/reads*.bam")
output: tuple val(meta), path("reads.bam")
script:
@@ -285,6 +289,7 @@ process sortBam {
label "ingress"
label "wf_common"
cpus 3
memory "4 GB"
input: tuple val(meta), path("reads.bam")
output: tuple val(meta), path("reads.sorted.bam")
script:
@@ -298,17 +303,22 @@ process bamstats {
label "ingress"
label "wf_common"
cpus 3
memory "4 GB"
input:
tuple val(meta), path("reads.bam")
output:
tuple val(meta), path("reads.bam"), path("bamstats_results")
tuple val(meta),
path("reads.bam"),
path("bamstats_results")
script:
def bamstats_threads = Math.max(1, task.cpus - 1)
"""
mkdir bamstats_results
bamstats reads.bam -s $meta.alias -u \
-f bamstats_results/bamstats.flagstat.tsv -t $bamstats_threads \
--histograms histograms \
| bgzip > bamstats_results/bamstats.readstats.tsv.gz
mv histograms/* bamstats_results/
# extract the run IDs from the per-read stats
csvtk cut -tf runid bamstats_results/bamstats.readstats.tsv.gz \
@@ -414,6 +424,7 @@ process move_or_compress_fq_file {
label "ingress"
label "wf_common"
cpus 1
memory "2 GB"
input:
// don't stage `input` with a literal because we check the file extension
tuple val(meta), path(input)
@@ -439,11 +450,14 @@ process fastcat {
label "ingress"
label "wf_common"
cpus 3
memory "2 GB"
input:
tuple val(meta), path("input")
val extra_args
output:
tuple val(meta), path("seqs.fastq.gz"), path("fastcat_stats")
tuple val(meta),
path("seqs.fastq.gz"),
path("fastcat_stats")
script:
String out = "seqs.fastq.gz"
String fastcat_stats_outdir = "fastcat_stats"
@@ -453,10 +467,12 @@ process fastcat {
-s ${meta["alias"]} \
-r >(bgzip -c > $fastcat_stats_outdir/per-read-stats.tsv.gz) \
-f $fastcat_stats_outdir/per-file-stats.tsv \
--histograms histograms \
$extra_args \
input \
| bgzip > $out
mv histograms/* $fastcat_stats_outdir
# extract the run IDs from the per-read stats
csvtk cut -tf runid $fastcat_stats_outdir/per-read-stats.tsv.gz \
| csvtk del-header | sort | uniq > $fastcat_stats_outdir/run_ids
@@ -737,6 +753,7 @@ process validate_sample_sheet {
cpus 1
label "ingress"
label "wf_common"
memory "2 GB"
input:
path "sample_sheet.csv"
val required_sample_types
6 changes: 3 additions & 3 deletions nextflow.config
@@ -58,7 +58,7 @@ params {
"--scheme_name 'SARS-CoV-2'",
"--scheme_version 'Midnight-ONT/V3'"
]
common_sha = 'sha91452ece4f647f62b32dac3a614635a6f0d7f8b5'
common_sha = 'sha1c5febff9f75143710826498b093d9769a5edbb9'
container_sha = 'sha6e8c02f120faf92b4e61e1d0797d71210aaec20b'
nextclade_sha = 'shae56aff3b5b498b8cb950993692f914033397f8da'
pangolin_sha = 'shae304dd3bc308a519f26908eb9d5ffa7686131d17'
@@ -72,7 +72,7 @@ manifest {
description = 'Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data.'
mainScript = 'main.nf'
nextflowVersion = '>=23.04.2'
version = 'v0.3.33'
version = 'v1.0.0'
}

epi2melabs {
@@ -137,7 +137,7 @@ profiles {
executor = 'awsbatch'
queue = "${params.aws_queue}"
withLabel:wf_common {
container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root"
container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}"
memory = '1G'
}
withLabel:artic {