diff --git a/CHANGELOG.md b/CHANGELOG.md index e4d0ca6..33813b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 1. Added cDNA and CDS outputs to /annotations/ directory [#118](https://github.com/Plant-Food-Research-Open/genepal/issues/118) 2. Added parameter `add_attrs_to_proteins_cds_fastas` +3. Added parameter `filter_genes_by_aa_length` with default set to `24`, which allows removal of genes with ORFs shorter than 24 amino acids [#125](https://github.com/Plant-Food-Research-Open/genepal/issues/125) ### `Fixed` diff --git a/README.md b/README.md index 51f3a3e..177e8f5 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ - Optionally, allow or remove iso-forms - Remove BRAKER models from Liftoff loci - Merge Liftoff and BRAKER models + - Optionally, remove models with ORFs shorter than `N` amino acids - Optionally, remove models without any EggNOG-mapper hits - [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff - [GenomeTools](https://github.com/genometools/genometools): GFF format validation diff --git a/conf/modules.config b/conf/modules.config index 44e6123..fbb5f52 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -240,6 +240,10 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP ext.prefix = { "${meta.id}.liftoff.braker" } } + withName: '.*:GFF_MERGE_CLEANUP:AGAT_SPFILTERBYORFSIZE' { + ext.args = params.filter_genes_by_aa_length ? 
"-s ${params.filter_genes_by_aa_length}" : '' + } + withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { ext.args = '-tidy -retainids -sort' } diff --git a/docs/output.md b/docs/output.md index f4793b5..40b546c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -169,8 +169,8 @@ If more than one genome is included in the pipeline, [ORTHOFINDER](https://githu - `Y/` - `Y.gt.gff3`: Final annotation file for genome `Y` which contains gene models and their functional annotations - `Y.pep.fasta`: Protein sequences for the gene models - - 'Y.cdna.fasta': cDNA sequences for the gene models - - 'Y.cds.fasta': Coding sequences for the gene models + - `Y.cdna.fasta`: cDNA sequences for the gene models + - `Y.cds.fasta`: Coding sequences for the gene models diff --git a/docs/parameters.md b/docs/parameters.md index 9297c4a..0c2bb09 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -59,12 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation. ## Post-annotation filtering options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | -| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | -| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | -| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| Parameter | Description | Type | Default | Required | Hidden | +| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True 
| | | +| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | +| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped. | `integer` | 24 | | | ## Annotation output options diff --git a/modules.json b/modules.json index da05f16..6b9d74a 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,11 @@ "git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4", "installed_by": ["gxf_fasta_agat_spaddintrons_spextractsequences"] }, + "agat/spfilterbyorfsize": { + "branch": "main", + "git_sha": "a0054cdffbd84f002fb6582b28575b699e01098e", + "installed_by": ["modules"] + }, "agat/spflagshortintrons": { "branch": "main", "git_sha": "d8f08700c82a3bd14811a3dfe7e7d63838130693", diff --git a/modules/gallvp/agat/spfilterbyorfsize/environment.yml b/modules/gallvp/agat/spfilterbyorfsize/environment.yml new file mode 100644 index 0000000..2c3daab --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::agat=1.4.2" diff --git a/modules/gallvp/agat/spfilterbyorfsize/main.nf b/modules/gallvp/agat/spfilterbyorfsize/main.nf new file mode 100644 index 0000000..502a9cd --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/main.nf @@ -0,0 +1,60 @@ +process AGAT_SPFILTERBYORFSIZE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.4.2--pl5321hdfd78af_0': + 'biocontainers/agat:1.4.2--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gxf) + path config + + output: + tuple val(meta), path("*.passed.gff") , emit: passed_gff + tuple val(meta), path("*.failed.gff") , emit: failed_gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def config_arg = config ? "-c $config" : '' + if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + agat_sp_filter_by_ORF_size.pl \\ + -g $gxf \\ + $args \\ + $config_arg \\ + -o $prefix + + mv \\ + ${prefix}_NOT* \\ + "${prefix}.failed.gff" + + mv \\ + ${prefix}_* \\ + "${prefix}.passed.gff" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.passed.gff + touch ${prefix}.failed.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/gallvp/agat/spfilterbyorfsize/meta.yml b/modules/gallvp/agat/spfilterbyorfsize/meta.yml new file mode 100644 index 0000000..cf399da --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/meta.yml @@ -0,0 +1,67 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "agat_spfilterbyorfsize" +description: The script reads a gff annotation file, and creates two output files, + one contains the gene models with ORF passing the test, the other contains the rest. + By default the test is "> 100", which means that all gene models that have an ORF longer than + 100 amino acids will pass the test. +keywords: + - genomics + - GFF/GTF + - filter + - annotation +tools: + - "agat": + description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene + annotations in any GTF/GFF format." + homepage: "https://agat.readthedocs.io/en/latest/" + documentation: "https://agat.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] + identifier: biotools:AGAT + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gxf: + type: file + description: Input GFF3/GTF file + pattern: "*.{gff,gff3,gtf}" + - - config: + type: file + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, + otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". + The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ pattern: "*.yaml" +output: + - passed_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.passed.gff": + type: file + description: GFF file with gene models which pass the filter test + - failed_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.failed.gff": + type: file + description: GFF file with remaining gene models + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test new file mode 100644 index 0000000..4a6e1fc --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process AGAT_SPFILTERBYORFSIZE" + script "../main.nf" + process "AGAT_SPFILTERBYORFSIZE" + + tag "modules" + tag "modules_gallvp" + tag "agat" + tag "agat/spfilterbyorfsize" + + test("actinidia_chinensis - genome - gtf") { + + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/actinidia_chinensis/genome/chr1/genome.gtf.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - genome - gtf - stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap 
b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap new file mode 100644 index 0000000..22b26fe --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap @@ -0,0 +1,100 @@ +{ + "homo_sapiens - genome - gtf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ], + "failed_gff": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "passed_gff": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-10T17:07:11.619928" + }, + "actinidia_chinensis - genome - gtf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" + ] + ], + "2": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ], + "failed_gff": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" + ] + ], + "passed_gff": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" + ] + ], + "versions": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-10T17:07:06.829402" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 363f0c5..c3ce861 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { enforce_full_intron_support = true filter_liftoff_by_hints = true eggnogmapper_purge_nohits = false + 
filter_genes_by_aa_length = 24 // Annotation output options braker_save_outputs = false diff --git a/nextflow_schema.json b/nextflow_schema.json index b7b5cc4..abe26a9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -272,6 +272,13 @@ "type": "boolean", "description": "Purge transcripts which do not have a hit against eggnog", "fa_icon": "fas fa-question-circle" + }, + "filter_genes_by_aa_length": { + "type": "integer", + "default": 24, + "fa_icon": "fas fa-hashtag", + "description": "Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped.", + "minimum": 3 } } }, diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf index fc6c75e..fbdea37 100644 --- a/subworkflows/local/gff_merge_cleanup.nf +++ b/subworkflows/local/gff_merge_cleanup.nf @@ -1,18 +1,20 @@ include { AGAT_SPMERGEANNOTATIONS } from '../../modules/nf-core/agat/spmergeannotations/main' include { GT_GFF3 } from '../../modules/nf-core/gt/gff3/main' +include { AGAT_SPFILTERBYORFSIZE } from '../../modules/gallvp/agat/spfilterbyorfsize/main' include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' workflow GFF_MERGE_CLEANUP { take: ch_braker_gff // Channel: [ meta, gff ] ch_liftoff_gff // Channel: [ meta, gff ] + val_filter_by_aa_length // val(null|Integer) main: ch_versions = Channel.empty() ch_gff_branch = ch_braker_gff | join(ch_liftoff_gff, remainder:true) - | branch { meta, braker_gff, liftoff_gff -> + | branch { _meta, braker_gff, liftoff_gff -> both : ( braker_gff && liftoff_gff ) braker_only : ( braker_gff && ( ! liftoff_gff ) ) liftoff_only: ( ( ! 
braker_gff ) && liftoff_gff ) @@ -25,12 +27,25 @@ workflow GFF_MERGE_CLEANUP { ) ch_merged_gff = AGAT_SPMERGEANNOTATIONS.out.gff - | mix ( ch_gff_branch.liftoff_only.map { meta, braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) - | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, liftoff_gff -> [ meta, braker_gff ] } ) + | mix ( ch_gff_branch.liftoff_only.map { meta, _braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) + | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, _liftoff_gff -> [ meta, braker_gff ] } ) ch_versions = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first()) + // MODULE: AGAT_SPFILTERBYORFSIZE + ch_filter_input = ch_merged_gff + | branch { + filter: val_filter_by_aa_length != null + pass: val_filter_by_aa_length == null + } + + AGAT_SPFILTERBYORFSIZE ( ch_filter_input.filter, [] ) + + ch_filtered_gff = AGAT_SPFILTERBYORFSIZE.out.passed_gff + | mix ( ch_filter_input.pass ) + ch_versions = ch_versions.mix(AGAT_SPFILTERBYORFSIZE.out.versions.first()) + // MODULE: GT_GFF3 - GT_GFF3 ( ch_merged_gff ) + GT_GFF3 ( ch_filtered_gff ) ch_gt_gff = GT_GFF3.out.gt_gff3 ch_versions = ch_versions.mix(GT_GFF3.out.versions.first()) diff --git a/workflows/genepal.nf b/workflows/genepal.nf index 538fcfe..6ee525b 100644 --- a/workflows/genepal.nf +++ b/workflows/genepal.nf @@ -178,7 +178,8 @@ workflow GENEPAL { // SUBWORKFLOW: GFF_MERGE_CLEANUP GFF_MERGE_CLEANUP( ch_braker_purged_gff, - ch_liftoff_gff3 + ch_liftoff_gff3, + params.filter_genes_by_aa_length ) ch_merged_gff = GFF_MERGE_CLEANUP.out.gff