diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bff474c0..e19196a3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,6 +64,7 @@ jobs:
             test_virus_identification,
             test_single_end,
             test_concoct,
+            test_generatesamplesheet
           ]
     steps:
       - name: Free some space
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2bb58625..d6a49234 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#543](https://github.com/nf-core/mag/pull/543) - Automatic samplesheet generation for nf-core/phageannotator (@CarsonJM)
+
 ### `Changed`
 
 ### `Fixed`
diff --git a/README.md b/README.md
index 7e33c988..cec2c6bc 100644
--- a/README.md
+++ b/README.md
@@ -36,9 +36,10 @@ The pipeline then:
 - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html)
 - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk)
 - performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/).
-- Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
+- performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
 - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool)
 - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara)
+- generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported.
 
 Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions.
 
diff --git a/conf/test_generatesamplesheet.config b/conf/test_generatesamplesheet.config
new file mode 100644
index 00000000..d36c87fe
--- /dev/null
+++ b/conf/test_generatesamplesheet.config
@@ -0,0 +1,41 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/mag -profile test_generatesamplesheet,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                           = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv'
+    generate_downstream_samplesheet = 'phageannotator'
+    skip_clipping                   = true
+    skip_adapter_trimming           = true
+    keep_phix                       = true
+    centrifuge_db                   = null
+    kraken2_db                      = null
+    skip_krona                      = true
+    coassemble_group                = true
+    megahit_fix_cpu_1               = true
+    skip_spadeshybrid               = true
+    skip_spades                     = true
+    skip_quast                      = true
+    skip_prodigal                   = true
+    skip_binning                    = true
+    skip_binqc                      = true
+    skip_gtdbtk                     = true
+    skip_prokka                     = true
+}
diff --git a/docs/output.md b/docs/output.md
index 88aba227..34c2b1e8 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -21,6 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
 - [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
 - [Ancient DNA](#ancient-dna)
+- [Samplesheet generation](#samplesheet-generation)
 - [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -706,6 +707,20 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc
+### Samplesheet generation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `downstream_samplesheets/`
+  - `[pipeline_name]_samplesheet.csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
+  - `fastq/`: stable copies of the quality-controlled reads referenced by the samplesheet
+  - `fasta/`: stable copies of the assemblies referenced by the samplesheet
+
+</details>
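+
+For reference, a generated samplesheet contains one row per sample, with an empty `fastq_2` field for single-end data. The sample names and paths below are illustrative only:
+
+```csv
+sample,group,fastq_1,fastq_2,fasta
+sample1,0,results/downstream_samplesheets/fastq/sample1_1.fastq.gz,results/downstream_samplesheets/fastq/sample1_2.fastq.gz,results/downstream_samplesheets/fasta/sample1.contigs.fa.gz
+sample2,0,results/downstream_samplesheets/fastq/sample2.fastq.gz,,results/downstream_samplesheets/fasta/sample2.contigs.fa.gz
+```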
+
+Currently, samplesheets for the following nf-core pipelines can be automatically generated:
+
+- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)genomic data. It uses the quality-controlled reads and assembled contigs generated by nf-core/mag.
+
 ### MultiQC
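For local testing, the new profile wired into `.github/workflows/ci.yml` above and `nextflow.config` below can be run directly; a minimal sketch, assuming a working Docker installation:

```bash
# Exercise the samplesheet-generation test dataset end to end;
# the generated sheets land in ./results/downstream_samplesheets/
nextflow run nf-core/mag \
    -profile test_generatesamplesheet,docker \
    --outdir ./results
```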
diff --git a/modules.json b/modules.json
index c62a1a2c..88dd4f09 100644
--- a/modules.json
+++ b/modules.json
@@ -36,6 +36,11 @@
                     "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                     "installed_by": ["modules"]
                 },
+                "cat/cat": {
+                    "branch": "master",
+                    "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                    "installed_by": ["modules"]
+                },
                 "cat/fastq": {
                     "branch": "master",
                     "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e",
diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml
new file mode 100644
index 00000000..17a04ef2
--- /dev/null
+++ b/modules/nf-core/cat/cat/environment.yml
@@ -0,0 +1,7 @@
+name: cat_cat
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - conda-forge::pigz=2.3.4
diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf
new file mode 100644
index 00000000..4264a92c
--- /dev/null
+++ b/modules/nf-core/cat/cat/main.nf
@@ -0,0 +1,62 @@
+process CAT_CAT {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pigz:2.3.4' :
+        'biocontainers/pigz:2.3.4' }"
+
+    input:
+    tuple val(meta), path(files_in)
+
+    output:
+    tuple val(meta), path("${prefix}"), emit: file_out
+    path "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args      = task.ext.args ?: ''
+    def args2     = task.ext.args2 ?: ''
+    def file_list = files_in.collect { it.toString() }
+
+    // | input     | output     | command1 | command2 |
+    // |-----------|------------|----------|----------|
+    // | gzipped   | gzipped    | cat      |          |
+    // | ungzipped | ungzipped  | cat      |          |
+    // | gzipped   | ungzipped  | zcat     |          |
+    // | ungzipped | gzipped    | cat      | pigz     |
+
+    // Use input file ending as default
+    prefix   = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}"
+    out_zip  = prefix.endsWith('.gz')
+    in_zip   = file_list[0].endsWith('.gz')
+    command1 = (in_zip && !out_zip) ? 'zcat' : 'cat'
+    command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : ''
+    """
+    $command1 \\
+        $args \\
+        ${file_list.join(' ')} \\
+        $command2 \\
+        > ${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+    END_VERSIONS
+    """
+
+    stub:
+    def file_list = files_in.collect { it.toString() }
+    prefix        = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}"
+    """
+    touch $prefix
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml
new file mode 100644
index 00000000..00a8db0b
--- /dev/null
+++ b/modules/nf-core/cat/cat/meta.yml
@@ -0,0 +1,36 @@
+name: cat_cat
+description: A module for concatenation of gzipped or uncompressed files
+keywords:
+  - concatenate
+  - gzip
+  - cat
+tools:
+  - cat:
+      description: Just concatenation
+      documentation: https://man7.org/linux/man-pages/man1/cat.1.html
+      licence: ["GPL-3.0-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - files_in:
+      type: file
+      description: List of compressed / uncompressed files
+      pattern: "*"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - file_out:
+      type: file
+      description: Concatenated file. Will be gzipped if file_out ends with ".gz"
+      pattern: "${file_out}"
+authors:
+  - "@erikrikarddaniel"
+  - "@FriederikeHanssen"
+maintainers:
+  - "@erikrikarddaniel"
+  - "@FriederikeHanssen"
diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test
new file mode 100644
index 00000000..5766daaf
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/main.nf.test
@@ -0,0 +1,153 @@
+nextflow_process {
+
+    name "Test Process CAT_CAT"
+    script "../main.nf"
+    process "CAT_CAT"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "cat"
+    tag "cat/cat"
+
+    test("test_cat_unzipped_unzipped") {
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] =
+                    [
+                        [ id:'test', single_end:true ],
+                        [
+                            file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true)
+                        ]
+                    ]
+                """
+            }
+        }
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+
+    test("test_cat_zipped_zipped") {
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] =
+                    [
+                        [ id:'test', single_end:true ],
+                        [
+                            file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true)
+                        ]
+                    ]
+                """
+            }
+        }
+        then {
+            def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+            assertAll(
+                { assert process.success },
+                { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") },
+                { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")}
+            )
+        }
+    }
+
+    test("test_cat_zipped_unzipped") {
+        config './nextflow_zipped_unzipped.config'
+
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] =
+                    [
+                        [ id:'test', single_end:true ],
+                        [
+                            file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true)
+                        ]
+                    ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("test_cat_unzipped_zipped") {
+        config './nextflow_unzipped_zipped.config'
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] =
+                    [
+                        [ id:'test', single_end:true ],
+                        [
+                            file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true),
+                            file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true)
+                        ]
+                    ]
+                """
+            }
+        }
+        then {
+            def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+            assertAll(
+                { assert process.success },
+                { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") },
+                { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")}
+            )
+        }
+    }
+
+    test("test_cat_one_file_unzipped_zipped") {
+        config './nextflow_unzipped_zipped.config'
+        when {
+            params {
+                outdir = "${outputDir}"
+            }
+            process {
+                """
+                input[0] =
+                    [
+                        [ id:'test', single_end:true ],
+                        [
+                            file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)
+                        ]
+                    ]
+                """
+            }
+        }
+        then {
+            def lines = path(process.out.file_out.get(0).get(1)).linesGzip
+            assertAll(
+                { assert process.success },
+                { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") },
+                { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")}
+            )
+        }
+    }
+}
+
diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap
new file mode 100644
index 00000000..423571ba
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap
@@ -0,0 +1,121 @@
+{
+    "test_cat_unzipped_zipped_size": {
+        "content": [
+            375
+        ],
+        "timestamp": "2023-10-16T14:33:08.049445686"
+    },
+    "test_cat_unzipped_unzipped": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": true
+                        },
+                        "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+                ],
+                "file_out": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": true
+                        },
+                        "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+                ]
+            }
+        ],
+        "timestamp": "2023-10-16T14:32:18.500464399"
+    },
+    "test_cat_zipped_unzipped": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": true
+                        },
+                        "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+                ],
+                "file_out": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": true
+                        },
+                        "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894"
+                ]
+            }
+        ],
+        "timestamp": "2023-10-16T14:32:49.642741302"
+    },
+    "test_cat_zipped_zipped_lines": {
+        "content": [
+            [
+                "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab",
+                "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1",
+                "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1",
+                "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1",
+                "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1",
+                "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1"
+            ]
+        ],
+        "timestamp": "2023-10-16T14:32:33.629048645"
+    },
+    "test_cat_unzipped_zipped_lines": {
+        "content": [
+            [
+                ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome",
+                "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT",
+                "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG",
+                "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG",
+                "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT",
+                "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG"
+            ]
+        ],
+        "timestamp": "2023-10-16T14:33:08.038830506"
+    },
+    "test_cat_one_file_unzipped_zipped_lines": {
+        "content": [
+            [
+                ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome",
+                "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT",
+                "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG",
+                "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG",
+                "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT",
+                "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG"
+            ]
+        ],
+        "timestamp": "2023-10-16T14:33:21.39642399"
+    },
+    "test_cat_zipped_zipped_size": {
+        "content": [
+            78
+        ],
+        "timestamp": "2023-10-16T14:32:33.641869244"
+    },
+    "test_cat_one_file_unzipped_zipped_size": {
+        "content": [
+            374
+        ],
+        "timestamp": "2023-10-16T14:33:21.4094373"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config
new file mode 100644
index 00000000..ec26b0fd
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config
@@ -0,0 +1,6 @@
+
+process {
+    withName: CAT_CAT {
+        ext.prefix = 'cat.txt.gz'
+    }
+}
diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config
new file mode 100644
index 00000000..fbc79783
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config
@@ -0,0 +1,8 @@
+
+process {
+
+    withName: CAT_CAT {
+        ext.prefix = 'cat.txt'
+    }
+
+}
diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml
new file mode 100644
index 00000000..37b578f5
--- /dev/null
+++ b/modules/nf-core/cat/cat/tests/tags.yml
@@ -0,0 +1,2 @@
+cat/cat:
+  - modules/nf-core/cat/cat/**
diff --git a/nextflow.config b/nextflow.config
index 35944b81..da9b83d7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -146,6 +146,10 @@ params {
     metaeuk_db                      = null
     save_mmseqs_db                  = false
 
+    // Generate downstream samplesheet options
+    generate_downstream_samplesheet = null
+    samplesheet_combine_assemblers  = false
+
     // References
     genome                          = null
     igenomes_base                   = 's3://ngi-igenomes/igenomes/'
@@ -318,6 +322,7 @@ profiles {
     test_virus_identification { includeConfig 'conf/test_virus_identification.config' }
     test_single_end           { includeConfig 'conf/test_single_end.config' }
     test_concoct              { includeConfig 'conf/test_concoct.config' }
+    test_generatesamplesheet  { includeConfig 'conf/test_generatesamplesheet.config' }
 }
 
 // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 33f16acd..f100eac9 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -530,7 +530,7 @@
         },
         "gtdbtk_min_completeness": {
             "type": "number",
-            "default": 50.0,
+            "default": 50,
            "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.",
            "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!",
            "minimum": 0.01,
@@ -538,7 +538,7 @@
         },
         "gtdbtk_max_contamination": {
             "type": "number",
-            "default": 10.0,
+            "default": 10,
            "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.",
            "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!",
            "minimum": 0,
@@ -546,7 +546,7 @@
         },
         "gtdbtk_min_perc_aa": {
             "type": "number",
-            "default": 10.0,
+            "default": 10,
            "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.",
            "minimum": 0,
            "maximum": 100
@@ -560,7 +560,7 @@
         },
         "gtdbtk_pplacer_cpus": {
             "type": "number",
-            "default": 1.0,
+            "default": 1,
            "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.",
            "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)."
         },
@@ -884,6 +884,24 @@
                     "description": "minimum number of bases supporting the alternative allele"
                 }
             }
+        },
+        "downstream_sample_sheet_generation_options": {
+            "title": "Downstream sample sheet generation options",
+            "type": "object",
+            "description": "Generate sample sheets for downstream nf-core pipelines",
+            "default": "",
+            "properties": {
+                "generate_downstream_samplesheet": {
+                    "type": "string",
+                    "description": "Create a samplesheet for the specified nf-core pipeline",
+                    "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.",
+                    "enum": ["phageannotator"]
+                },
+                "samplesheet_combine_assemblers": {
+                    "type": "boolean",
+                    "description": "Combine all contigs from all assemblies of a given sample into a single FASTA file"
+                }
+            }
         }
     },
@@ -931,6 +949,9 @@
         },
         {
             "$ref": "#/definitions/ancient_dna_assembly"
+        },
+        {
+            "$ref": "#/definitions/downstream_sample_sheet_generation_options"
         }
     ]
 }
diff --git a/subworkflows/local/generate_downstream_samplesheet.nf b/subworkflows/local/generate_downstream_samplesheet.nf
new file mode 100644
index 00000000..5c21682e
--- /dev/null
+++ b/subworkflows/local/generate_downstream_samplesheet.nf
@@ -0,0 +1,143 @@
+include { CAT_CAT } from '../../modules/nf-core/cat/cat/main'
+
+workflow GENERATE_DOWNSTREAM_SAMPLESHEET {
+    take:
+    downstream_nfcore_pipelines // val: [ nf-core-pipeline, OPTIONAL: other-nf-core-pipelines ]
+    short_reads                 // channel: [ val(meta), path(fastq_1), path(fastq_2) ]
+    assemblies                  // channel: [ val(meta), path(fasta) ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    //
+    // Create a samplesheet for nf-core/phageannotator
+    //
+    if ( 'phageannotator' in downstream_nfcore_pipelines ) {
+
+        if ( params.samplesheet_combine_assemblers ) {
+            // combine assemblies by sample/group if multiple assembly methods were used
+            ch_assemblies = assemblies
+                .map {
+                    meta, fasta ->
+                        def meta_new = meta.subMap('id')
+                        [ meta_new, fasta ]
+                }
+                .groupTuple()
+
+            //
+            // MODULE: Combine all assemblies from a sample into one FastA file
+            //
+            ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out
+            ch_versions            = ch_versions.mix( CAT_CAT.out.versions )
+        } else {
+            ch_combined_assemblies = assemblies
+        }
+
+        // if no coassembly, join FastQ and FastA by ID
+        if ( !params.coassemble_group ){
+            ch_combined_assemblies_remap = ch_combined_assemblies
+                .map {
+                    meta, fasta ->
+                        def id = meta.id
+
+                        return [ id, fasta ]
+                }
+
+            short_reads
+                .map {
+                    meta, fastq ->
+                        def id         = meta.id
+                        def group      = meta.group
+                        def single_end = meta.single_end
+
+                        return [ id, group, single_end, fastq ]
+                }.join ( ch_combined_assemblies_remap )
+                .map {
+                    id, group, single_end, fastq, fasta ->
+                        def reads = fastq instanceof List ? fastq.flatten() : [ fastq ]
+                        def meta  = [:]
+
+                        meta.id         = id
+                        meta.group      = group
+                        meta.single_end = single_end
+                        meta.fastq_1    = reads[0]
+                        meta.fastq_2    = !meta.single_end ? reads[1] : ''
+                        meta.fasta      = fasta ? fasta : ''
+
+                        return meta
+                }
+                .set { ch_mag_metadata }
+        } else {
+            // if coassembly was used, join FastQ and FastA by group
+            ch_combined_assemblies_remap = ch_combined_assemblies
+                .map {
+                    meta, fasta ->
+                        def group = meta.id.split('group-')
+
+                        return [ group[1], fasta ]
+                }
+            short_reads
+                .map {
+                    meta, fastq ->
+                        def id         = meta.id
+                        def group      = meta.group
+                        def single_end = meta.single_end
+
+                        return [ group, id, single_end, fastq ]
+                }
+                .combine ( ch_combined_assemblies_remap, by:0 )
+                .map {
+                    group, id, single_end, fastq, fasta ->
+                        def reads = fastq instanceof List ? fastq.flatten() : [ fastq ]
+                        def meta  = [:]
+
+                        meta.id         = id
+                        meta.group      = group
+                        meta.single_end = single_end
+                        meta.fastq_1    = reads[0]
+                        meta.fastq_2    = !meta.single_end ? reads[1] : ''
+                        meta.fasta      = fasta ? fasta : ''
+
+                        return meta
+                }
+                .set { ch_mag_metadata }
+        }
+
+        // Create samplesheet for each sample using meta information
+        ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta ->
+            // Save reads and assemblies to outdir so that they are in a stable location
+            file(meta.fastq_1.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}")
+            file(meta.fasta, checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}")
+            if ( !meta.single_end ){
+                file(meta.fastq_2.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}")
+                [ "${meta.id}_phageannotator_samplesheet.csv",
+                    "sample,group,fastq_1,fastq_2,fasta" +
+                    '\n' +
+                    "${meta.id},${meta.group}," +
+                    file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," +
+                    file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}").toString() + "," +
+                    file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() +
+                    '\n'
+                ]
+            } else {
+                [ "${meta.id}_phageannotator_samplesheet.csv",
+                    "sample,group,fastq_1,fastq_2,fasta" +
+                    '\n' +
+                    "${meta.id},${meta.group}," +
+                    file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," +
+                    "," +
+                    file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() +
+                    '\n'
+                ]
+            }
+        }
+
+        // Merge samplesheet across all samples for the pipeline
+        ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/")
+    }
+
+    emit:
+    versions = ch_versions // channel: [ versions.yml ]
+}
diff --git a/workflows/mag.nf b/workflows/mag.nf
index 6ec7b132..941899e4 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -86,18 +86,19 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules
 //
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
-include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation'
-include { BINNING } from '../subworkflows/local/binning'
-include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement'
-include { BUSCO_QC } from '../subworkflows/local/busco_qc'
-include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification'
-include { CHECKM_QC } from '../subworkflows/local/checkm_qc'
-include { GUNC_QC } from '../subworkflows/local/gunc_qc'
-include { GTDBTK } from '../subworkflows/local/gtdbtk'
-include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna'
-include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification'
-include { DEPTHS } from '../subworkflows/local/depths'
+include { INPUT_CHECK                     } from '../subworkflows/local/input_check'
+include { BINNING_PREPARATION             } from '../subworkflows/local/binning_preparation'
+include { BINNING                         } from '../subworkflows/local/binning'
+include { BINNING_REFINEMENT              } from '../subworkflows/local/binning_refinement'
+include { BUSCO_QC                        } from '../subworkflows/local/busco_qc'
+include { VIRUS_IDENTIFICATION            } from '../subworkflows/local/virus_identification'
+include { CHECKM_QC                       } from '../subworkflows/local/checkm_qc'
+include { GUNC_QC                         } from '../subworkflows/local/gunc_qc'
+include { GTDBTK                          } from '../subworkflows/local/gtdbtk'
+include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna'
+include { DOMAIN_CLASSIFICATION           } from '../subworkflows/local/domain_classification'
+include { DEPTHS                          } from '../subworkflows/local/depths'
+include { GENERATE_DOWNSTREAM_SAMPLESHEET } from '../subworkflows/local/generate_downstream_samplesheet'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1041,6 +1042,14 @@ workflow MAG {
         }
     }
 
+    //
+    // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines
+    //
+    if ( params.generate_downstream_samplesheet ) {
+        GENERATE_DOWNSTREAM_SAMPLESHEET ( params.generate_downstream_samplesheet, ch_short_reads, ch_assemblies )
+        ch_versions = ch_versions.mix( GENERATE_DOWNSTREAM_SAMPLESHEET.out.versions )
+    }
+
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')
     )
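Putting the new options together, a typical invocation that enables the subworkflow on real data might look like the following; the input samplesheet name and output directory are placeholders:

```bash
# Run nf-core/mag and additionally emit a samplesheet for nf-core/phageannotator,
# combining the contigs of all assemblers per sample into one FASTA file
nextflow run nf-core/mag \
    -profile docker \
    --input samplesheet.csv \
    --outdir ./results \
    --generate_downstream_samplesheet phageannotator \
    --samplesheet_combine_assemblers
```

The merged `./results/downstream_samplesheets/phageannotator_samplesheet.csv` can then be passed as the `--input` of an nf-core/phageannotator run.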