diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c06428..e697dec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,4 +52,7 @@ jobs: -params-file \ ./tests/${{ matrix.TEST_PARAMS }}/params.json \ ${{ matrix.OPTION_STUB }} \ - --outdir ./results + --outdir ./results \ + --max_cpus 2 \ + --max_memory '6.GB' \ + --max_time '2.h' diff --git a/CHANGELOG.md b/CHANGELOG.md index e41bf31..119b331 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 20. Added a check for input assemblies. If an assembly is smaller than 1 MB (or 300KB in zipped format), the pipeline errors out before starting the downstream processes [#47](https://github.com/plant-food-research-open/genepal/issues/47) 21. Now `REPEATMASKER` GFF output is saved via `CUSTOM_RMOUTTOGFF3` [#54](https://github.com/plant-food-research-open/genepal/issues/54) 22. Added `benchmark` column to the input sheet and used `GFFCOMPARE` to perform benchmarking [#63](https://github.com/plant-food-research-open/genepal/issues/63) -23. Updated modules and sub-workflows +23. Added `SEQKIT_RMDUP` to detect duplicate sequences and wrap the FASTA lines to 80 characters +24. Updated modules and sub-workflows ### `Fixed` diff --git a/conf/modules.config b/conf/modules.config index 4de605a..83688e5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,6 +11,12 @@ */ process { // SUBWORKFLOW: PREPARE_ASSEMBLY + + withName: '.*:PREPARE_ASSEMBLY:SEQKIT_RMDUP' { + ext.args = '--by-seq --ignore-case -w 80' + ext.prefix = { "${meta.id}.seqkit.rmdup" } + } + withName: '.*:PREPARE_ASSEMBLY:FASTA_EDTA_LAI:EDTA_EDTA' { ext.args = [ params.edta_is_sensitive ? 
"--sensitive 1" : "--sensitive 0", diff --git a/modules.json b/modules.json index 691c939..682ed80 100644 --- a/modules.json +++ b/modules.json @@ -185,6 +185,11 @@ "git_sha": "b13f07be4c508d6ff6312d354d09f2493243e208", "installed_by": ["modules"] }, + "seqkit/rmdup": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "sortmerna": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/seqkit/rmdup/environment.yml b/modules/nf-core/seqkit/rmdup/environment.yml new file mode 100644 index 0000000..4f8058a --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::seqkit=2.8.1" diff --git a/modules/nf-core/seqkit/rmdup/main.nf b/modules/nf-core/seqkit/rmdup/main.nf new file mode 100644 index 0000000..410bb83 --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/main.nf @@ -0,0 +1,66 @@ +process SEQKIT_RMDUP { + tag "$meta.id" + label 'process_low' + // File IO can be a bottleneck. See: https://bioinf.shenwei.me/seqkit/usage/#parallelization-of-cpu-intensive-jobs + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqkit:2.8.1--h9ee0642_0': + 'biocontainers/seqkit:2.8.1--h9ee0642_0' }" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("${prefix}.${extension}") , emit: fastx + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + extension = "fastq" + if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension + // SeqKit/rmdup takes care of compressing the output: https://bioinf.shenwei.me/seqkit/usage/#rmdup + if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + seqkit \\ + rmdup \\ + --threads $task.cpus \\ + $args \\ + $fastx \\ + -o ${prefix}.${extension} \\ + 2> >(tee ${prefix}.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + extension = "fastq" + if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension + if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.${extension} + echo \\ + '[INFO] 0 duplicated records removed' \\ + > ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/rmdup/meta.yml b/modules/nf-core/seqkit/rmdup/meta.yml new file mode 100644 index 0000000..22e29c1 --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/meta.yml @@ -0,0 +1,59 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "seqkit_rmdup" +description: Transforms sequences (extract ID, filter by length, remove gaps, reverse + complement...) +keywords: + - genomics + - fasta + - fastq + - remove + - duplicates +tools: + - "seqkit": + description: "A cross-platform and ultrafast toolkit for FASTA/Q file manipulation" + homepage: "https://bioinf.shenwei.me/seqkit/" + documentation: "https://bioinf.shenwei.me/seqkit/usage/" + tool_dev_url: "https://github.com/shenwei356/seqkit" + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fastx: + type: file + description: Input fasta/fastq file + pattern: "*.{fsa,fas,fa,fasta,fastq,fq,fsa.gz,fas.gz,fa.gz,fasta.gz,fastq.gz,fq.gz}" +output: + - fastx: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}.${extension}: + type: file + description: Output fasta/fastq file + pattern: "*.{fasta,fasta.gz,fastq,fastq.gz}" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - "*.log": + type: file + description: Log containing information regarding removed duplicates + pattern: "*.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/nf-core/seqkit/rmdup/tests/main.nf.test b/modules/nf-core/seqkit/rmdup/tests/main.nf.test new file mode 100644 index 0000000..e990443 --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/tests/main.nf.test @@ -0,0 +1,173 @@ +nextflow_process { + + name "Test Process SEQKIT_RMDUP" + script "../main.nf" + process "SEQKIT_RMDUP" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/rmdup" + + test("sarscov2-genome_fasta") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("repeated-fasta") { + when { + process { + """ + def repeated_fasta = file('repeated.fasta') + repeated_fasta.text = '>A\\nAGCTAGCTAGCT\\n>B\\nAGCTAGCTAGCT\\n>A\\nAGCTAGCTAGCT' + + input[0] = [ + [ id:'test' ], // meta map + repeated_fasta + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('1 duplicated records removed') } + ) + } + + } + + test("sarscov2-genome_fasta_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert 
path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("sarscov2-test_1_fastq_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("file_name_conflict-fail_with_error") { + when { + process { + """ + input[0] = [ + [ id:'test_1' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + + test("sarscov2-genome_fasta-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("file_name_conflict-fail_with_error-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap b/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap new file mode 100644 index 0000000..68c415c 
--- /dev/null +++ b/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap @@ -0,0 +1,247 @@ +{ + "sarscov2-genome_fasta-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,cf833211befdf890bb6b2a3cd0b91853" + ] + ], + "2": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,cf833211befdf890bb6b2a3cd0b91853" + ] + ], + "versions": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.1" + }, + "timestamp": "2024-05-27T19:40:01.6034" + }, + "sarscov2-test_1_fastq_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.1" + }, + "timestamp": "2024-05-27T19:37:48.551195" + }, + "sarscov2-genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "log": [ + [ + { + "id": "test" + }, + 
"test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.1" + }, + "timestamp": "2024-05-27T19:37:38.821528" + }, + "sarscov2-genome_fasta_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.1" + }, + "timestamp": "2024-05-27T19:37:43.723054" + }, + "repeated-fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,7510a742291241e7d7556bf720caf65c" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,314c0aaef0f832a217a3f6ce3f8bc117" + ] + ], + "2": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,7510a742291241e7d7556bf720caf65c" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,314c0aaef0f832a217a3f6ce3f8bc117" + ] + ], + "versions": [ + "versions.yml:md5,d2b8da3c114c2bd1c6606030df55b6aa" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.1" + }, + "timestamp": "2024-05-27T19:52:34.545807" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/rmdup/tests/tags.yml b/modules/nf-core/seqkit/rmdup/tests/tags.yml new file mode 100644 index 0000000..e732db3 --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/tests/tags.yml @@ -0,0 +1,2 @@ +seqkit/rmdup: + - "modules/nf-core/seqkit/rmdup/**" diff --git 
a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index 2469e6e..cb92113 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -1,5 +1,6 @@ include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_TE_LIBRARY } from '../../modules/nf-core/gunzip' +include { SEQKIT_RMDUP } from '../../modules/nf-core/seqkit/rmdup/main.nf' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator' include { REPEATMODELER_BUILDDATABASE } from '../../modules/nf-core/repeatmodeler/builddatabase' include { REPEATMODELER_REPEATMODELER } from '../../modules/nf-core/repeatmodeler/repeatmodeler' @@ -36,18 +37,35 @@ workflow PREPARE_ASSEMBLY { ) ch_versions = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) + // MODULE: SEQKIT_RMDUP + SEQKIT_RMDUP ( ch_gunzip_assembly ) + + ch_nondup_fw_assembly = SEQKIT_RMDUP.out.log + | join(SEQKIT_RMDUP.out.fastx) + | map { meta, error_log, fasta -> + if ( error_log.text =~ /(?<!\d)0 duplicated records removed/ ) { // digit lookbehind: a log reporting '10 duplicated records removed' must not pass as zero + return [ meta, fasta ] + } + + log.warn "FASTA validation failed for ${meta.id} due to presence of duplicate sequences.\n" + + "${meta.id} is excluded from further analysis." + + return null + } // Fixed width assembly fasta without duplicates + + ch_versions = ch_versions.mix(SEQKIT_RMDUP.out.versions.first()) // MODULE: FASTAVALIDATOR - FASTAVALIDATOR ( ch_gunzip_assembly ) + FASTAVALIDATOR ( ch_nondup_fw_assembly ) - ch_validated_assembly = ch_gunzip_assembly + ch_validated_assembly = ch_nondup_fw_assembly | join(FASTAVALIDATOR.out.success_log) | map { meta, fasta, log -> [ meta, fasta ] } ch_versions = ch_versions.mix(FASTAVALIDATOR.out.versions.first()) FASTAVALIDATOR.out.error_log - | map { meta, log -> - System.err.println("WARNING: FASTAVALIDATOR failed for ${meta.id} with error: ${log}. 
${meta.id} is excluded from further analysis.") + | map { meta, error_log -> // must not be named 'log': that would shadow the Nextflow logger called on the next line + log.warn "FASTAVALIDATOR failed for ${meta.id} with error: ${error_log}. ${meta.id} is excluded from further analysis." } // MODULE: GUNZIP_TE_LIBRARY diff --git a/subworkflows/local/preprocess_rnaseq.nf b/subworkflows/local/preprocess_rnaseq.nf index 72fa176..b13a3c9 100644 --- a/subworkflows/local/preprocess_rnaseq.nf +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -77,7 +77,7 @@ workflow PREPROCESS_RNASEQ { | join(ch_trim_reads, remainder:true) | map { meta, reads, trimmed -> if (!trimmed) { - System.err.println("WARNING: Dropping ${reads.collect { it.getName() }} as read count after trimming is less than $min_trimmed_reads") + log.warn "Dropping ${reads.collect { it.getName() }} as read count after trimming is less than $min_trimmed_reads" } }