diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 04497a0..66e6122 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,11 +1,11 @@ # Adopted from https://github.com/nf-core/modules/blob/master/.github/workflows/test.yml -name: Lint and -stub on Linux/Docker +name: CI tests on: push: - branches: [main] + branches: + - dev pull_request: - branches: [main] # Cancel if a newer run is started concurrency: @@ -30,7 +30,7 @@ jobs: - name: Run pre-commit run: pre-commit run --all-files - stub-test: + test: runs-on: ubuntu-latest name: Run stub test with docker env: @@ -44,17 +44,20 @@ jobs: with: version: "23.04.4" + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + - name: Run stub-test run: | nextflow run \ main.nf \ - -profile local,docker \ + -profile docker \ -stub \ -params-file tests/stub/params.json confirm-pass: runs-on: ubuntu-latest - needs: [pre-commit, stub-test] + needs: [pre-commit, test] if: always() steps: - name: All tests ok diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c94585..13c2478 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## 0.4.0 - [07-Aug-2024] +## 0.4.0+dev - [19-Aug-2024] ### `Added` @@ -24,6 +24,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 15. Reduced `BRAKER3` threads to 8 [#55](https://github.com/PlantandFoodResearch/pangene/issues/55) 16. Now the final annotations are stored in the `annotations` folder [#53](https://github.com/PlantandFoodResearch/pangene/issues/53) 17. Added `-gff` flag to `REPEATMASKER` to save the gff file [#54](https://github.com/PlantandFoodResearch/pangene/issues/54) +18. Now a single `fasta` file can be directly specified for `protein_evidence` +19. 
`eggnogmapper_db_dir` is not a required parameter anymore +20. `eggnogmapper_tax_scope` is now set to 1 (root div) by default +21. Added a `test` profile based on public data ### `Fixed` @@ -46,7 +50,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 6. Removed dependency on for `BRAKER3` and `REPEATMASKER` modules which are now installed from 7. Removed dependency on 8. Now the final annotations are not stored in the `final` folder -9. Now BRAKER3 outputs are not saved by default [#53](https://github.com/PlantandFoodResearch/pangene/issues/53) +9. Now BRAKER3 outputs are not saved by default [#53](https://github.com/PlantandFoodResearch/pangene/issues/53) and are saved under the `etc` folder when enabled +10. Removed `local` profile. Local executor is the default when no executor is specified. Therefore, the `local` profile was not needed. ## 0.3.3 - [18-Jun-2024] diff --git a/conf/base.config b/conf/base.config index c5fe54c..bfc2d88 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,34 +1,3 @@ -profiles { - pfr { - process { - executor = 'slurm' - } - - apptainer { - envWhitelist = 'APPTAINER_BINDPATH,APPTAINER_BIND' - cacheDir = "/workspace/pangene/singularity" - } - } - - local { - process { - executor = 'local' - } - } - - apptainer { - apptainer.enabled = true - apptainer.autoMounts= true - apptainer.registry = 'quay.io' - } - - docker { - docker.enabled = true - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' - docker.registry = 'quay.io' - } -} - process { cpus = { check_max( 1 * task.attempt, 'cpus' ) } diff --git a/conf/modules.config b/conf/modules.config index a44a293..1f9f08b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -146,7 +146,7 @@ process { // SUBWORKFLOW: FASTA_BRAKER3 ].flatten().unique(false).join(' ').trim() ext.prefix = { "${meta.id}" } publishDir = [ - path: { "${params.outdir}/braker/" }, + path: { "${params.outdir}/etc/braker/" }, mode: "copy", saveAs: { filename -> 
filename.equals('versions.yml') ? null : filename }, enabled: params.braker_save_outputs @@ -335,7 +335,7 @@ process { // Universal withName: SAVE_MARKED_GFF3 { publishDir = [ - path: { "${params.outdir}/splicing_marked" }, + path: { "${params.outdir}/etc/splicing_marked" }, mode: "copy", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] diff --git a/conf/test.config b/conf/test.config new file mode 100644 index 0000000..98293d9 --- /dev/null +++ b/conf/test.config @@ -0,0 +1,7 @@ +params { + input = "${projectDir}/tests/minimal/assemblysheet.csv" + protein_evidence = 'https://raw.githubusercontent.com/Gaius-Augustus/BRAKER/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471/example/proteins.fa' + + braker_extra_args = '--gm_max_intergenic 10000 --skipOptimize' // Added for faster test execution! Do not use with actual data! + busco_lineage_datasets = 'eudicots_odb10' +} diff --git a/docs/parameters.md b/docs/parameters.md index b53c98a..ee94077 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -7,9 +7,9 @@ A NextFlow pipeline for pan-genome annotation | Parameter | Description | Type | Default | Required | Hidden | | ------------------------- | -------------------------------------------------------------------------------------------------------- | --------- | --------- | -------- | ------ | | `input` | Target assemblies listed in a CSV sheet | `string` | | True | | -| `protein_evidence` | Protein evidence provided as fasta files listed in a text sheet | `string` | | True | | -| `eggnogmapper_db_dir` | Eggnogmapper database directory | `string` | | True | | -| `eggnogmapper_tax_scope` | Eggnogmapper taxonomy scopre. 
Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1 | `integer` | | True | | +| `protein_evidence` | Protein evidence provided as a fasta file or multiple fasta files listed in a plain txt file | `string` | | True | | +| `eggnogmapper_db_dir` | Eggnogmapper database directory | `string` | | | | +| `eggnogmapper_tax_scope` | Eggnogmapper taxonomy scope. Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1 | `integer` | 1 | | | | `rna_evidence` | FASTQ/BAM samples listed in a CSV sheet | `string` | | | | | `liftoff_annotations` | Reference annotations listed in a CSV sheet | `string` | | | | | `orthofinder_annotations` | Additional annotations for orthology listed in a CSV sheet | `string` | | | | diff --git a/local_pangene b/local_pangene index 8a1aa8a..6f287b1 100755 --- a/local_pangene +++ b/local_pangene @@ -14,8 +14,10 @@ F_BOLD="\033[1m" nextflow run \ main.nf \ - -profile local,docker \ + -profile docker,test \ -resume \ $stub \ - -params-file pangene-test/params.json \ + --max_cpus 8 \ + --max_memory '32.GB' \ + --eggnogmapper_tax_scope 33090 \ --eggnogmapper_db_dir ../dbs/emapperdb/5.0.2 diff --git a/modules/local/utils.nf b/modules/local/utils.nf index 2878afc..f6fc82e 100644 --- a/modules/local/utils.nf +++ b/modules/local/utils.nf @@ -4,7 +4,7 @@ def idFromFileName(fileName) { ).replaceFirst( /\.f(ast)?q$/, '' ).replaceFirst( - /\.f(asta|sa|a|as|aa)?$/, '' + /\.f(asta|sa|a|as|aa|na)?$/, '' ).replaceFirst( /\.gff(3)?$/, '' ).replaceFirst( diff --git a/nextflow.config b/nextflow.config index 84459c7..df016c3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,11 +1,9 @@ -includeConfig './conf/base.config' - params { // Input/output options input = null protein_evidence = null eggnogmapper_db_dir = null - eggnogmapper_tax_scope = null + eggnogmapper_tax_scope = 1 rna_evidence = null liftoff_annotations = null orthofinder_annotations = null @@ -21,7 +19,7 @@ params { skip_fastqc = false skip_fastp = false 
min_trimmed_reads = 10000 - extra_fastp_args = "" + extra_fastp_args = null save_trimmed = false remove_ribo_rna = false save_non_ribo_reads = false @@ -29,12 +27,12 @@ params { // RNAseq alignment options star_max_intron_length = 16000 - star_align_extra_args = "" + star_align_extra_args = null star_save_outputs = false save_cat_bam = false // Annotation options - braker_extra_args = "" + braker_extra_args = null braker_save_outputs = false liftoff_coverage = 0.9 liftoff_identity = 0.9 @@ -59,15 +57,26 @@ params { validationS3PathCheck = true } -manifest { - name = 'pangene' - author = """Usman Rashid, Jason Shiller""" - homePage = 'https://github.com/PlantandFoodResearch/pangene' - description = """A NextFlow pipeline for pan-genome annotation""" - mainScript = 'main.nf' - nextflowVersion = '!>=23.04.4' - version = '0.4.0' - doi = '' +includeConfig './conf/base.config' + +profiles { + apptainer { + apptainer.enabled = true + apptainer.autoMounts = true + apptainer.registry = 'quay.io' + } + + docker { + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.registry = 'quay.io' + } + + test { includeConfig 'conf/test.config' } +} + +plugins { + id 'nf-validation@1.1.3' } def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') @@ -84,8 +93,15 @@ trace { file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } -plugins { - id 'nf-validation@1.1.3' +manifest { + name = 'pangene' + author = """Usman Rashid, Jason Shiller""" + homePage = 'https://github.com/PlantandFoodResearch/pangene' + description = """A NextFlow pipeline for pan-genome annotation""" + mainScript = 'main.nf' + nextflowVersion = '!>=23.04.4' + version = '0.4.0+dev' + doi = '' } includeConfig './conf/modules.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 17f9c04..584ff6e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas 
fa-terminal", "description": "", - "required": ["input", "protein_evidence", "eggnogmapper_db_dir", "eggnogmapper_tax_scope", "outdir"], + "required": ["input", "protein_evidence", "outdir"], "properties": { "input": { "type": "string", @@ -23,9 +23,9 @@ }, "protein_evidence": { "type": "string", - "description": "Protein evidence provided as fasta files listed in a text sheet", + "description": "Protein evidence provided as a fasta file or multiple fasta files listed in a plain txt file", "format": "file-path", - "mimetype": "text/txt", + "pattern": "^\\S+\\.(txt|fa|faa|fna|fsa|fas|fasta)(\\.gz)?$", "fa_icon": "far fa-file-alt" }, "eggnogmapper_db_dir": { @@ -36,7 +36,8 @@ "eggnogmapper_tax_scope": { "type": "integer", - "description": "Eggnogmapper taxonomy scopre. Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1", - "minimum": 0 + "description": "Eggnogmapper taxonomy scope. Eukaryota: 2759, Viridiplantae: 33090, Archaea: 2157, Bacteria: 2, root: 1", + "minimum": 1, + "default": 1 }, "rna_evidence": { "type": "string", diff --git a/subworkflows/local/gff_eggnogmapper.nf b/subworkflows/local/gff_eggnogmapper.nf index 7ea0d19..841a243 100644 --- a/subworkflows/local/gff_eggnogmapper.nf +++ b/subworkflows/local/gff_eggnogmapper.nf @@ -24,7 +24,9 @@ workflow GFF_EGGNOGMAPPER { ch_versions = ch_versions.mix(GFF2FASTA_FOR_EGGNOGMAPPER.out.versions.first()) - ch_eggnogmapper_inputs = ch_gffread_fasta + ch_eggnogmapper_inputs = ! db_folder + ? Channel.empty() + : ch_gffread_fasta | combine(Channel.fromPath(db_folder)) EGGNOGMAPPER( diff --git a/subworkflows/local/gff_store.nf b/subworkflows/local/gff_store.nf index 217cb6d..00ca00b 100644 --- a/subworkflows/local/gff_store.nf +++ b/subworkflows/local/gff_store.nf @@ -8,12 +8,15 @@ workflow GFF_STORE { ch_target_gff // [ meta, gff ] ch_eggnogmapper_annotations // [ meta, annotations ] ch_fasta // [ meta, fasta ] + val_describe_gff // val(true|false) main: ch_versions = Channel.empty() // COLLECTFILE: Add eggnogmapper hits to gff - ch_described_gff = ch_target_gff + ch_described_gff = ! val_describe_gff + ? 
Channel.empty() + : ch_target_gff | join(ch_eggnogmapper_annotations) | map { meta, gff, annotations -> def tx_annotations = annotations.readLines() @@ -109,7 +112,11 @@ workflow GFF_STORE { } // MODULE: GT_GFF3 as FINAL_GFF_CHECK - FINAL_GFF_CHECK ( ch_described_gff ) + ch_final_check_input = val_describe_gff + ? ch_described_gff + : ch_target_gff + + FINAL_GFF_CHECK ( ch_final_check_input ) ch_final_gff = FINAL_GFF_CHECK.out.gt_gff3 ch_versions = ch_versions.mix(FINAL_GFF_CHECK.out.versions.first()) diff --git a/subworkflows/local/purge_nohit_models.nf b/subworkflows/local/purge_nohit_models.nf index e9f8fbb..55b970e 100644 --- a/subworkflows/local/purge_nohit_models.nf +++ b/subworkflows/local/purge_nohit_models.nf @@ -60,6 +60,11 @@ workflow PURGE_NOHIT_MODELS { ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first()) emit: - purged_gff = ch_target_purged_gff.mix(val_purge_nohits ? Channel.empty() : ch_target_gff) + purged_gff = ch_target_purged_gff + | mix( + val_purge_nohits + ? 
Channel.empty() + : ch_target_gff + ) versions = ch_versions // [ versions.yml ] } diff --git a/tests/minimal/assemblysheet.csv b/tests/minimal/assemblysheet.csv new file mode 100644 index 0000000..a77d36a --- /dev/null +++ b/tests/minimal/assemblysheet.csv @@ -0,0 +1,2 @@ +tag,fasta,is_masked +a_thaliana,https://raw.githubusercontent.com/Gaius-Augustus/BRAKER/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471/example/genome.fa,yes diff --git a/tests/minimal/params.json b/tests/minimal/params.json new file mode 100644 index 0000000..c3e9566 --- /dev/null +++ b/tests/minimal/params.json @@ -0,0 +1,6 @@ +{ + "input": "tests/minimal/assemblysheet.csv", + "protein_evidence": "https://raw.githubusercontent.com/Gaius-Augustus/BRAKER/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471/example/proteins.fa", + "braker_extra_args": "--gm_max_intergenic 10000 --skipOptimize", + "busco_lineage_datasets": "eudicots_odb10" +} diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 01e4d6a..010bc14 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -103,7 +103,9 @@ workflow PANGENE { bam: files.first().extension == 'bam' } - ch_rna_fq = ch_rna_branch.fq + ch_rna_fq = ! params.rna_evidence + ? Channel.empty() + : ch_rna_branch.fq | map { meta, files -> [ meta.id, meta, files ] } | groupTuple | combine(ch_tar_assm_str) @@ -111,7 +113,9 @@ workflow PANGENE { validateFastqMetadata(metas, files, tar_assm_str) } - ch_rna_bam = ch_rna_branch.bam + ch_rna_bam = ! params.rna_evidence + ? Channel.empty() + : ch_rna_branch.bam | map { meta, files -> [ meta.id, meta, files ] } | groupTuple | combine(ch_tar_assm_str) @@ -149,15 +153,21 @@ workflow PANGENE { | collect : Channel.empty() - ch_ext_prot_fastas = ! params.protein_evidence - ? Channel.empty() - : Channel.fromPath(params.protein_evidence) - | splitText + ch_ext_prot_fastas = ( params.protein_evidence.endsWith('txt') + ? 
Channel.fromPath(params.protein_evidence) + | splitText + : Channel.fromPath(params.protein_evidence) + ) | map { file_path -> - def file_handle = file(file_path.strip(), checkIfExists: true) + + def file_handle = ( file_path instanceof String ) + ? file(file_path.strip(), checkIfExists: true) + : file_path + [ [ id: idFromFileName( file_handle.baseName ) ], file_handle ] } + ch_liftoff_mm = ! params.liftoff_annotations ? Channel.empty() : Channel.fromSamplesheet('liftoff_annotations') @@ -319,7 +329,7 @@ workflow PANGENE { PURGE_NOHIT_MODELS( ch_merged_gff, ch_eggnogmapper_hits, - params.eggnogmapper_purge_nohits + params.eggnogmapper_purge_nohits && params.eggnogmapper_db_dir ) ch_purged_gff = PURGE_NOHIT_MODELS.out.purged_gff @@ -329,7 +339,8 @@ workflow PANGENE { GFF_STORE( ch_purged_gff, ch_eggnogmapper_annotations, - ch_valid_target_assembly + ch_valid_target_assembly, + params.eggnogmapper_db_dir ) ch_final_gff = GFF_STORE.out.final_gff