Skip to content

Commit

Permalink
Added parameter filter_genes_by_aa_length
Browse files Browse the repository at this point in the history
  • Loading branch information
GallVp committed Dec 10, 2024
1 parent 496ff62 commit 3114adb
Show file tree
Hide file tree
Showing 15 changed files with 345 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

1. Added cDNA and CDS outputs to <OUTPUT_DIR>/annotations/<SAMPLE> directory [#118](https://github.com/Plant-Food-Research-Open/genepal/issues/118)
2. Added parameter `add_attrs_to_proteins_cds_fastas`
3. Added parameter `filter_genes_by_aa_length` (default: `24`), which removes genes with ORFs shorter than the specified number of amino acids [#125](https://github.com/Plant-Food-Research-Open/genepal/issues/125)

### `Fixed`

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
- Optionally, allow or remove iso-forms
- Remove BRAKER models from Liftoff loci
- Merge Liftoff and BRAKER models
- Optionally, remove models with ORFs shorter than `N` amino acids
- Optionally, remove models without any EggNOG-mapper hits
- [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff
- [GenomeTools](https://github.com/genometools/genometools): GFF format validation
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP
ext.prefix = { "${meta.id}.liftoff.braker" }
}

// Forward the user-supplied minimum ORF length to AGAT through its `-s` option.
// When `params.filter_genes_by_aa_length` is unset (null), no extra arguments are passed.
// NOTE(review): with empty args the tool would apply its own default threshold
// (the module's meta.yml states "> 100"); presumably the subworkflow skips this
// process entirely when the parameter is null — confirm against GFF_MERGE_CLEANUP.
withName: '.*:GFF_MERGE_CLEANUP:AGAT_SPFILTERBYORFSIZE' {
ext.args = params.filter_genes_by_aa_length ? "-s ${params.filter_genes_by_aa_length}" : ''
}

withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' {
ext.args = '-tidy -retainids -sort'
}
Expand Down
4 changes: 2 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ If more than one genome is included in the pipeline, [ORTHOFINDER](https://githu
- `Y/`
- `Y.gt.gff3`: Final annotation file for genome `Y` which contains gene models and their functional annotations
- `Y.pep.fasta`: Protein sequences for the gene models
- 'Y.cdna.fasta': cDNA sequences for the gene models
- 'Y.cds.fasta': Coding sequences for the gene models
- `Y.cdna.fasta`: cDNA sequences for the gene models
- `Y.cds.fasta`: Coding sequences for the gene models

</details>

Expand Down
13 changes: 7 additions & 6 deletions docs/parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation.

## Post-annotation filtering options

| Parameter | Description | Type | Default | Required | Hidden |
| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ |
| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | |
| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | |
| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | |
| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | |
| Parameter | Description | Type | Default | Required | Hidden |
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ |
| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | |
| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | |
| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | |
| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | |
| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped. | `integer` | 24 | | |

## Annotation output options

Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
"git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4",
"installed_by": ["gxf_fasta_agat_spaddintrons_spextractsequences"]
},
"agat/spfilterbyorfsize": {
"branch": "main",
"git_sha": "a0054cdffbd84f002fb6582b28575b699e01098e",
"installed_by": ["modules"]
},
"agat/spflagshortintrons": {
"branch": "main",
"git_sha": "d8f08700c82a3bd14811a3dfe7e7d63838130693",
Expand Down
7 changes: 7 additions & 0 deletions modules/gallvp/agat/spfilterbyorfsize/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::agat=1.4.2"
60 changes: 60 additions & 0 deletions modules/gallvp/agat/spfilterbyorfsize/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
 * AGAT_SPFILTERBYORFSIZE
 *
 * Runs `agat_sp_filter_by_ORF_size.pl` on a GFF/GTF file and splits the gene
 * models into those that pass the ORF-size test and those that do not.
 *
 * Inputs:
 *   tuple(meta, gxf) - sample meta map (only `meta.id` is read here) and the annotation file
 *   config           - optional AGAT config file; pass `[]` to omit the `-c` option
 *
 * Outputs:
 *   passed_gff - tuple(meta, "<prefix>.passed.gff")
 *   failed_gff - tuple(meta, "<prefix>.failed.gff")
 *   versions   - versions.yml with the AGAT version
 *
 * Extra command-line options (for example the size threshold) are taken from
 * `task.ext.args`; the output prefix from `task.ext.prefix` (default: meta.id).
 */
process AGAT_SPFILTERBYORFSIZE {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/agat:1.4.2--pl5321hdfd78af_0':
        'biocontainers/agat:1.4.2--pl5321hdfd78af_0' }"

    input:
    tuple val(meta), path(gxf)
    path config

    output:
    tuple val(meta), path("*.passed.gff") , emit: passed_gff
    tuple val(meta), path("*.failed.gff") , emit: failed_gff
    path "versions.yml"                   , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args        = task.ext.args ?: ''
    def prefix      = task.ext.prefix ?: "${meta.id}"
    // Only add `-c` when a config file was actually staged (an empty list is falsy).
    def config_arg  = config ? "-c $config" : ''
    // Guard against the staged input being overwritten by one of the renamed outputs.
    if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
    // Order of the two `mv` commands matters: the `${prefix}_*` glob also matches the
    // `${prefix}_NOT*` file, so the failed set has to be renamed first.
    // The version is read from the same script this process runs.
    """
    agat_sp_filter_by_ORF_size.pl \\
        -g $gxf \\
        $args \\
        $config_arg \\
        -o $prefix

    mv \\
        ${prefix}_NOT* \\
        "${prefix}.failed.gff"

    mv \\
        ${prefix}_* \\
        "${prefix}.passed.gff"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        agat: \$(agat_sp_filter_by_ORF_size.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
    END_VERSIONS
    """

    stub:
    def prefix      = task.ext.prefix ?: "${meta.id}"
    if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
    // The stub creates empty outputs but still queries the real tool for its version.
    """
    touch ${prefix}.passed.gff
    touch ${prefix}.failed.gff

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        agat: \$(agat_sp_filter_by_ORF_size.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
    END_VERSIONS
    """
}
67 changes: 67 additions & 0 deletions modules/gallvp/agat/spfilterbyorfsize/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "agat_spfilterbyorfsize"
description: The script reads a gff annotation file, and creates two output files,
one contains the gene models with ORF passing the test, the other contains the rest.
By default the test is "> 100" that means all gene models that have ORF longer than
100 Amino acids, will pass the test.
keywords:
- genomics
- GFF/GTF
- filter
- annotation
tools:
- "agat":
description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene
annotations in any GTF/GFF format."
homepage: "https://agat.readthedocs.io/en/latest/"
documentation: "https://agat.readthedocs.io/en/latest/"
tool_dev_url: "https://github.com/NBISweden/AGAT"
doi: "10.5281/zenodo.3552717"
licence: ["GPL v3"]
identifier: biotools:AGAT

input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- gxf:
type: file
description: Input GFF3/GTF file
pattern: "*.{gff,gff3,gtf}"
- - config:
type: file
description: |
Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any,
otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose".
The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
pattern: "*.yaml"
output:
- passed_gff:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- "*.passed.gff":
type: file
description: GFF file with gene models which pass the filter test
- failed_gff:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
- "*.failed.gff":
type: file
description: GFF file with remaining gene models
- versions:
- versions.yml:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@GallVp"
maintainers:
- "@GallVp"
62 changes: 62 additions & 0 deletions modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// nf-test suite for the AGAT_SPFILTERBYORFSIZE process defined in ../main.nf.
// Contains one real run and one stub run; both compare every output channel
// against the stored snapshot.
nextflow_process {

name "Test Process AGAT_SPFILTERBYORFSIZE"
script "../main.nf"
process "AGAT_SPFILTERBYORFSIZE"

// Tags used to select this suite when running a subset of tests.
tag "modules"
tag "modules_gallvp"
tag "agat"
tag "agat/spfilterbyorfsize"

// Real execution on a gzipped GTF, without an AGAT config file
// (input[1] is an empty list).
test("actinidia_chinensis - genome - gtf") {


when {
process {
"""
input[0] = [
[ id:'test' ], // meta map
file(params.modules_testdata_base_path + 'genomics/eukaryotes/actinidia_chinensis/genome/chr1/genome.gtf.gz', checkIfExists: true)
]
input[1] = []
"""
}
}

then {
// The process must succeed and its outputs must match the snapshot.
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}

// Stub execution: the `-stub` option makes Nextflow run the process's stub block.
test("homo_sapiens - genome - gtf - stub") {

options '-stub'

when {
process {
"""
input[0] = [
[ id:'test' ], // meta map
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true)
]
input[1] = []
"""
}
}

then {
// The process must succeed and its outputs must match the snapshot.
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}


}
100 changes: 100 additions & 0 deletions modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"homo_sapiens - genome - gtf - stub": {
"content": [
{
"0": [
[
{
"id": "test"
},
"test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"1": [
[
{
"id": "test"
},
"test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29"
],
"failed_gff": [
[
{
"id": "test"
},
"test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"passed_gff": [
[
{
"id": "test"
},
"test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.04.4"
},
"timestamp": "2024-12-10T17:07:11.619928"
},
"actinidia_chinensis - genome - gtf": {
"content": [
{
"0": [
[
{
"id": "test"
},
"test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3"
]
],
"1": [
[
{
"id": "test"
},
"test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb"
]
],
"2": [
"versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29"
],
"failed_gff": [
[
{
"id": "test"
},
"test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb"
]
],
"passed_gff": [
[
{
"id": "test"
},
"test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3"
]
],
"versions": [
"versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.04.4"
},
"timestamp": "2024-12-10T17:07:06.829402"
}
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ params {
enforce_full_intron_support = true
filter_liftoff_by_hints = true
eggnogmapper_purge_nohits = false
filter_genes_by_aa_length = 24

// Annotation output options
braker_save_outputs = false
Expand Down
7 changes: 7 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,13 @@
"type": "boolean",
"description": "Purge transcripts which do not have a hit against eggnog",
"fa_icon": "fas fa-question-circle"
},
"filter_genes_by_aa_length": {
"type": "integer",
"default": 24,
"fa_icon": "fas fa-hashtag",
"description": "Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped.",
"minimum": 3
}
}
},
Expand Down
Loading

0 comments on commit 3114adb

Please sign in to comment.