Plant-Food-Research-Open · GallVp · Dec 15, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 11, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,12 +3,13 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v0.6.0 - [10-Dec-2024]
+## v0.6.0 - [16-Dec-2024]
 
 ### `Added`
 
 1. Added cDNA and CDS outputs to <OUTPUT_DIR>/annotations/<SAMPLE> directory [#118](https://github.com/Plant-Food-Research-Open/genepal/issues/118)
 2. Added parameter `add_attrs_to_proteins_cds_fastas`
+3. Added parameter `filter_genes_by_aa_length` with default set to `24`, which allows removal of genes with ORFs shorter than 24 amino acids [#125](https://github.com/Plant-Food-Research-Open/genepal/issues/125)
 
 ### `Fixed`
 

diff --git a/README.md b/README.md
@@ -42,7 +42,9 @@
   - Optionally, remove models without any EggNOG-mapper hits
 - [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff
 - [GenomeTools](https://github.com/genometools/genometools): GFF format validation
-- [GffRead](https://github.com/gpertea/gffread): Extraction of protein sequences
+- [GffRead](https://github.com/gpertea/gffread)
+  - Extraction of protein sequences
+  - Optionally, remove models with ORFs shorter than `N` amino acids
 - [OrthoFinder](https://github.com/davidemms/OrthoFinder): Perform phylogenetic orthology inference across genomes
 - [GffCompare](https://github.com/gpertea/gffcompare): Compare and benchmark against an existing annotation
 - [BUSCO](https://gitlab.com/ezlab/busco): Completeness statistics for genome and annotation through proteins

diff --git a/conf/modules.config b/conf/modules.config
@@ -240,6 +240,10 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP
         ext.prefix = { "${meta.id}.liftoff.braker" }
     }
 
+    withName: '.*:GFF_MERGE_CLEANUP:FILTER_BY_ORF_SIZE' {
+        ext.args = params.filter_genes_by_aa_length ? "--no-pseudo --keep-genes -C -l ${ ( params.filter_genes_by_aa_length + 1 ) * 3 }" : ''
+    }
+
     withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' {
         ext.args = '-tidy -retainids -sort'
     }

diff --git a/docs/output.md b/docs/output.md
@@ -169,8 +169,8 @@ If more than one genome is included in the pipeline, [ORTHOFINDER](https://githu
   - `Y/`
     - `Y.gt.gff3`: Final annotation file for genome `Y` which contains gene models and their functional annotations
     - `Y.pep.fasta`: Protein sequences for the gene models
-    - 'Y.cdna.fasta': cDNA sequences for the gene models
-    - 'Y.cds.fasta': Coding sequences for the gene models
+    - `Y.cdna.fasta`: cDNA sequences for the gene models
+    - `Y.cds.fasta`: Coding sequences for the gene models
 
 </details>
 

diff --git a/docs/parameters.md b/docs/parameters.md
@@ -59,12 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation.
 
 ## Post-annotation filtering options
 
-| Parameter                     | Description                                                       | Type      | Default | Required | Hidden |
-| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ |
-| `allow_isoforms`              | Allow multiple isoforms for gene models                           | `boolean` | True    |          |        |
-| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True    |          |        |
-| `filter_liftoff_by_hints`     | Use BRAKER hints to filter Liftoff models                         | `boolean` | True    |          |        |
-| `eggnogmapper_purge_nohits`   | Purge transcripts which do not have a hit against eggnog          | `boolean` |         |          |        |
+| Parameter                     | Description                                                                                                                                                     | Type      | Default | Required | Hidden |
+| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
+| `allow_isoforms`              | Allow multiple isoforms for gene models                                                                                                                         | `boolean` | True    |          |        |
+| `enforce_full_intron_support` | Require every model to have external evidence for all its introns                                                                                               | `boolean` | True    |          |        |
+| `filter_liftoff_by_hints`     | Use BRAKER hints to filter Liftoff models                                                                                                                       | `boolean` | True    |          |        |
+| `eggnogmapper_purge_nohits`   | Purge transcripts which do not have a hit against eggnog                                                                                                        | `boolean` |         |          |        |
+| `filter_genes_by_aa_length`   | Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. If set to `null`, this filter step is skipped. | `integer` | 24      |          |        |
 
 ## Annotation output options
 

diff --git a/modules/local/tests/gffread/main.nf.test b/modules/local/tests/gffread/main.nf.test
@@ -0,0 +1,38 @@
+nextflow_process {
+
+    name "Test Process GFFREAD"
+    script "../../../nf-core/gffread/main.nf"
+    config "./nextflow.config"
+    process "GFFREAD"
+
+    tag "gffread"
+    tag "modules_nfcore"
+    tag "modules"
+
+    test("filter by length") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [id: 'test'],
+                    file("$baseDir" + '/modules/local/tests/gffread/testdata/t.gff', checkIfExists: true)
+                ]
+                input[1] = []
+                """
+            }
+        }
+
+        then {
+            assertAll (
+                { assert process.success },
+                { assert snapshot(process.out).match() },
+                { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851') },
+                { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851.t1') },
+                { assert ! file(process.out.gffread_gff[0][1]).text.contains('gene19851.t2') } // This is the only transcript which is being knocked out
+            )
+        }
+
+    }
+
+}
diff --git a/modules/local/tests/gffread/main.nf.test.snap b/modules/local/tests/gffread/main.nf.test.snap
@@ -0,0 +1,47 @@
+{
+    "filter by length": {
+        "content": [
+            {
+                "0": [
+
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c"
+                    ]
+                ],
+                "2": [
+
+                ],
+                "3": [
+                    "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd"
+                ],
+                "gffread_fasta": [
+
+                ],
+                "gffread_gff": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c"
+                    ]
+                ],
+                "gtf": [
+
+                ],
+                "versions": [
+                    "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-12-11T21:11:59.953464"
+    }
+}
diff --git a/modules/local/tests/gffread/nextflow.config b/modules/local/tests/gffread/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: GFFREAD {
+        ext.args = '--no-pseudo --keep-genes -C -l 72'
+    }
+}
diff --git a/modules/local/tests/gffread/testdata/t.gff b/modules/local/tests/gffread/testdata/t.gff
@@ -0,0 +1,47 @@
+##gff-version 3
+###
+chr23	AUGUSTUS	gene	16515075	16516672	.	-	.	ID=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29
+chr23	AUGUSTUS	mRNA	16515075	16516597	1	-	.	ID=gene19849.t1;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29
+chr23	AUGUSTUS	exon	16515075	16515794	.	-	.	ID=gene19849.t1.exon1;Parent=gene19849.t1
+chr23	AUGUSTUS	CDS	16515075	16515794	1	-	0	ID=gene19849.t1.cds1;Parent=gene19849.t1
+chr23	AUGUSTUS	exon	16516562	16516597	.	-	.	ID=gene19849.t1.exon2;Parent=gene19849.t1
+chr23	AUGUSTUS	CDS	16516562	16516597	1	-	0	ID=gene19849.t1.cds2;Parent=gene19849.t1
+chr23	gmst	mRNA	16515075	16516672	.	-	.	ID=gene19849.t2;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29
+chr23	gmst	exon	16515075	16515794	50.2	-	0	ID=gene19849.t2.exon1;Parent=gene19849.t2
+chr23	gmst	CDS	16515075	16515794	50.2	-	0	ID=gene19849.t2.cds1;Parent=gene19849.t2
+chr23	gmst	exon	16516562	16516672	50.2	-	0	ID=gene19849.t2.exon2;Parent=gene19849.t2
+chr23	gmst	CDS	16516562	16516672	50.2	-	0	ID=gene19849.t2.cds2;Parent=gene19849.t2
+###
+chr23	gmst	gene	16530414	16531453	.	-	.	ID=gene19850;description=Myb-like%20DNA-binding%20domain
+chr23	gmst	mRNA	16530414	16531453	.	-	.	ID=gene19850.t1;Parent=gene19850;description=Myb-like%20DNA-binding%20domain
+chr23	gmst	exon	16530414	16531041	42.7	-	1	ID=gene19850.t1.exon1;Parent=gene19850.t1
+chr23	gmst	CDS	16530414	16531041	42.7	-	1	ID=gene19850.t1.cds1;Parent=gene19850.t1
+chr23	gmst	exon	16531197	16531453	42.7	-	0	ID=gene19850.t1.exon2;Parent=gene19850.t1
+chr23	gmst	CDS	16531197	16531453	42.7	-	0	ID=gene19850.t1.cds2;Parent=gene19850.t1
+###
+chr23	AUGUSTUS	gene	16530414	16531542	.	-	.	ID=gene19851;description=Differing%20isoform%20descriptions
+chr23	AUGUSTUS	mRNA	16530414	16531542	1	-	.	ID=gene19851.t1;Parent=gene19851;description=Myb-like%20DNA-binding%20domain
+chr23	AUGUSTUS	exon	16530414	16530721	.	-	.	ID=gene19851.t1.exon1;Parent=gene19851.t1
+chr23	AUGUSTUS	CDS	16530414	16530721	1	-	2	ID=gene19851.t1.cds1;Parent=gene19851.t1
+chr23	AUGUSTUS	exon	16530824	16531041	.	-	.	ID=gene19851.t1.exon2;Parent=gene19851.t1
+chr23	AUGUSTUS	CDS	16530824	16531041	1	-	1	ID=gene19851.t1.cds2;Parent=gene19851.t1
+chr23	AUGUSTUS	exon	16531197	16531326	.	-	.	ID=gene19851.t1.exon3;Parent=gene19851.t1
+chr23	AUGUSTUS	CDS	16531197	16531326	1	-	2	ID=gene19851.t1.cds3;Parent=gene19851.t1
+chr23	AUGUSTUS	exon	16531428	16531542	.	-	.	ID=gene19851.t1.exon4;Parent=gene19851.t1
+chr23	AUGUSTUS	CDS	16531428	16531542	1	-	0	ID=gene19851.t1.cds4;Parent=gene19851.t1
+chr23	GeneMark.hmm3	mRNA	16531514	16531542	.	-	.	ID=gene19851.t2;Parent=gene19851;description=Hypothetical%20protein%20%7C%20no%20eggnog%20hit
+chr23	GeneMark.hmm3	exon	16531514	16531542	.	-	0	ID=gene19851.t2.exon1;Parent=gene19851.t2
+chr23	GeneMark.hmm3	CDS	16531514	16531542	.	-	0	ID=gene19851.t2.cds1;Parent=gene19851.t2
+###
+chr23	AUGUSTUS	gene	16539401	16545431	.	+	.	ID=gene19852;description=nuclease%20HARBI1
+chr23	AUGUSTUS	mRNA	16539401	16545431	1	+	.	ID=gene19852.t1;Parent=gene19852;description=nuclease%20HARBI1
+chr23	AUGUSTUS	exon	16539401	16539509	.	+	.	ID=gene19852.t1.exon1;Parent=gene19852.t1
+chr23	AUGUSTUS	CDS	16539401	16539509	1	+	0	ID=gene19852.t1.cds1;Parent=gene19852.t1
+chr23	AUGUSTUS	exon	16544386	16545431	.	+	.	ID=gene19852.t1.exon2;Parent=gene19852.t1
+chr23	AUGUSTUS	CDS	16544386	16545431	1	+	2	ID=gene19852.t1.cds2;Parent=gene19852.t1
+###
+chr23	AUGUSTUS	gene	16556338	16556796	.	+	.	ID=gene19853;description=Zinc%20finger%20protein
+chr23	AUGUSTUS	mRNA	16556338	16556796	1	+	.	ID=gene19853.t1;Parent=gene19853;description=Zinc%20finger%20protein
+chr23	AUGUSTUS	exon	16556338	16556796	.	+	.	ID=gene19853.t1.exon1;Parent=gene19853.t1
+chr23	AUGUSTUS	CDS	16556338	16556796	1	+	0	ID=gene19853.t1.cds1;Parent=gene19853.t1
+###
diff --git a/nextflow.config b/nextflow.config
@@ -54,6 +54,7 @@ params {
     enforce_full_intron_support         = true
     filter_liftoff_by_hints             = true
     eggnogmapper_purge_nohits           = false
+    filter_genes_by_aa_length           = 24
 
     // Annotation output options
     braker_save_outputs                 = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -272,6 +272,13 @@
                     "type": "boolean",
                     "description": "Purge transcripts which do not have a hit against eggnog",
                     "fa_icon": "fas fa-question-circle"
+                },
+                "filter_genes_by_aa_length": {
+                    "type": "integer",
+                    "default": 24,
+                    "fa_icon": "fas fa-hashtag",
+                    "description": "Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. If set to `null`, this filter step is skipped.",
+                    "minimum": 3
                 }
             }
         },

diff --git a/pfr/params.json b/pfr/params.json
@@ -32,8 +32,9 @@
     "enforce_full_intron_support": true,
     "filter_liftoff_by_hints": true,
     "eggnogmapper_purge_nohits": false,
+    "filter_genes_by_aa_length": 24,
     "braker_save_outputs": false,
-    "add_attrs_to_proteins_fasta": false,
+    "add_attrs_to_proteins_cds_fastas": false,
     "busco_skip": false,
     "busco_lineage_datasets": "embryophyta_odb10"
 }
diff --git a/subworkflows/local/gff_eggnogmapper.nf b/subworkflows/local/gff_eggnogmapper.nf
@@ -16,8 +16,8 @@ workflow GFF_EGGNOGMAPPER {
                                 | join(ch_fasta)
 
     GFF2FASTA_FOR_EGGNOGMAPPER(
-        ch_gffread_inputs.map { meta, gff, fasta -> [ meta, gff ] },
-        ch_gffread_inputs.map { meta, gff, fasta -> fasta }
+        ch_gffread_inputs.map { meta, gff, _fasta -> [ meta, gff ] },
+        ch_gffread_inputs.map { _meta, _gff, fasta -> fasta }
     )
 
     ch_gffread_fasta            = GFF2FASTA_FOR_EGGNOGMAPPER.out.gffread_fasta
@@ -30,9 +30,9 @@ workflow GFF_EGGNOGMAPPER {
                                 | combine(Channel.fromPath(db_folder))
 
     EGGNOGMAPPER(
-        ch_eggnogmapper_inputs.map { meta, fasta, db -> [ meta, fasta ] },
+        ch_eggnogmapper_inputs.map { meta, fasta, _db -> [ meta, fasta ] },
         [],
-        ch_eggnogmapper_inputs.map { meta, fasta, db -> db },
+        ch_eggnogmapper_inputs.map { _meta, _fasta, db -> db },
         [ [], [] ]
     )
 

diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf
@@ -1,18 +1,20 @@
 include { AGAT_SPMERGEANNOTATIONS               } from '../../modules/nf-core/agat/spmergeannotations/main'
 include { GT_GFF3                               } from '../../modules/nf-core/gt/gff3/main'
+include { GFFREAD as FILTER_BY_ORF_SIZE         } from '../../modules/nf-core/gffread/main'
 include { AGAT_CONVERTSPGXF2GXF                 } from '../../modules/nf-core/agat/convertspgxf2gxf/main'
 
 workflow GFF_MERGE_CLEANUP {
     take:
     ch_braker_gff               // Channel: [ meta, gff ]
     ch_liftoff_gff              // Channel: [ meta, gff ]
+    val_filter_by_aa_length     // val(null|Integer)
 
     main:
     ch_versions                 = Channel.empty()
 
     ch_gff_branch               = ch_braker_gff
                                 | join(ch_liftoff_gff, remainder:true)
-                                | branch { meta, braker_gff, liftoff_gff ->
+                                | branch { _meta, braker_gff, liftoff_gff ->
                                     both        : (     braker_gff      &&      liftoff_gff )
                                     braker_only : (     braker_gff      && ( !  liftoff_gff ) )
                                     liftoff_only: ( ( ! braker_gff )    &&      liftoff_gff )
@@ -25,12 +27,25 @@ workflow GFF_MERGE_CLEANUP {
     )
 
     ch_merged_gff               = AGAT_SPMERGEANNOTATIONS.out.gff
-                                | mix ( ch_gff_branch.liftoff_only.map { meta, braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } )
-                                | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, liftoff_gff -> [ meta, braker_gff ] } )
+                                | mix ( ch_gff_branch.liftoff_only.map { meta, _braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } )
+                                | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, _liftoff_gff -> [ meta, braker_gff ] } )
     ch_versions                 = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first())
 
+    // MODULE: GFFREAD as FILTER_BY_ORF_SIZE
+    ch_filter_input             = ch_merged_gff
+                                | branch {
+                                    filter: val_filter_by_aa_length != null
+                                    pass: val_filter_by_aa_length == null
+                                }
+
+    FILTER_BY_ORF_SIZE ( ch_filter_input.filter, [] )
+
+    ch_filtered_gff             = FILTER_BY_ORF_SIZE.out.gffread_gff
+                                | mix ( ch_filter_input.pass )
+    ch_versions                 = ch_versions.mix(FILTER_BY_ORF_SIZE.out.versions.first())
+
     // MODULE: GT_GFF3
-    GT_GFF3 ( ch_merged_gff )
+    GT_GFF3 ( ch_filtered_gff )
 
     ch_gt_gff                   = GT_GFF3.out.gt_gff3
     ch_versions                 = ch_versions.mix(GT_GFF3.out.versions.first())

diff --git a/tests/minimal/main.nf.test b/tests/minimal/main.nf.test
@@ -38,6 +38,8 @@ nextflow_pipeline {
                 ['**']
             )
 
+            def summary_stats = (Map) new groovy.json.JsonSlurper().parseText(file("$outputDir/genepal_data/summary_stats.json").text)
+
             assertAll(
                 { assert workflow.success},
                 { assert snapshot(
@@ -46,6 +48,7 @@ nextflow_pipeline {
                         'versions': removeNextflowVersion("$outputDir/pipeline_info/genepal_software_mqc_versions.yml"),
                         'stable paths': stable_path,
                         'stable names': getRelativePath(stable_name, outputDir),
+                        'summary_stats': summary_stats
                     ]
                 ).match() }
             )

diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap
@@ -2,7 +2,7 @@
     "profile - test": {
         "content": [
             {
-                "successful tasks": 20,
+                "successful tasks": 21,
                 "versions": {
                     "AGAT_CONVERTSPGFF2GTF": {
                         "agat": "v1.4.0"
@@ -37,6 +37,9 @@
                     "FASTAVALIDATOR": {
                         "py_fasta_validator": 0.6
                     },
+                    "FILTER_BY_ORF_SIZE": {
+                        "gffread": "0.12.7"
+                    },
                     "FINAL_GFF_CHECK": {
                         "genometools": "1.6.5"
                     },
@@ -67,9 +70,9 @@
                 "stable paths": [
                     "a_thaliana.cdna.fasta:md5,12b9bef973e488640aec8c04ba3882fe",
                     "a_thaliana.cds.fasta:md5,b81060419355a590560f92aec8536281",
-                    "a_thaliana.gt.gff3:md5,8ab16549095f605ff8715ac4a3de58ed",
+                    "a_thaliana.gt.gff3:md5,528459cf9596523bf66de99d24c37e20",
                     "a_thaliana.pep.fasta:md5,4994c0393ca0245a1c57966d846d101e",
-                    "a_thaliana.gff3:md5,d23d16cd86499d48a30ffb981ed27891",
+                    "a_thaliana.gff3:md5,30adac1b21d7aaed6ca7fb71ab33f32d",
                     "summary_stats.json:md5,007ba5cf2b7a2fd395a27d9458ca2d2e"
                 ],
                 "stable names": [
@@ -87,13 +90,26 @@
                     "genepal_report.html",
                     "multiqc_report.html",
                     "pipeline_info"
-                ]
+                ],
+                "summary_stats": {
+                    "stats": [
+                        {
+                            "ID": "a_thaliana",
+                            "Genes": 252,
+                            "mRNA": 265,
+                            "CDS": 1340,
+                            "Exons": 1340,
+                            "Intron": 1075,
+                            "Non canon splice sites": 18
+                        }
+                    ]
+                }
             }
         ],
         "meta": {
             "nf-test": "0.9.2",
-            "nextflow": "24.04.2"
+            "nextflow": "24.04.4"
         },
-        "timestamp": "2024-12-05T07:51:43.818374"
+        "timestamp": "2024-12-12T09:36:52.952048"
     }
-}
+}