From ee9d0b9c56883697807ff1b4e8fb847ae0bab419 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 7 Nov 2023 12:51:46 +1300 Subject: [PATCH 01/59] Added updated info about nf-core modules --- modules/nf-core/CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/CHANGELOG.md b/modules/nf-core/CHANGELOG.md index 1b886a1..2721570 100644 --- a/modules/nf-core/CHANGELOG.md +++ b/modules/nf-core/CHANGELOG.md @@ -23,4 +23,6 @@ ### sortmerna 1. Added stub -2. Added author in meta.yml \ No newline at end of file +2. Added author in meta.yml + +- Repo: https://github.com/nf-core/modules/tree/4e2cbac1db88f544711e488e552175368ca14588 \ No newline at end of file From 7f03697dfe00ae5dd7d18ead8c293117e638b6c8 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 7 Nov 2023 13:00:55 +1300 Subject: [PATCH 02/59] Turned off SortMeRNA by default --- TODO.md | 3 ++- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index a9be74b..d243565 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,5 @@ - [ ] Rename perform_edta_annotation to FASTA_PERFORM_EDTA - [ ] Extract subworkflows - [ ] STAR ignores softmasking and, thus, should be fed the unmasked genome so that masking and mapping can run in parallel. -- [ ] Add --eval=reference.gtf \ No newline at end of file +- [ ] Add --eval=reference.gtf +- [ ] Replace quay containers with galaxyproject cache containers. 
\ No newline at end of file diff --git a/nextflow.config b/nextflow.config index a624175..3a60fdb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,7 +48,7 @@ params { // toggling this parameter results in rerun of FASTP and FASTQC_TRIM save_trimmed = false - remove_ribo_rna = true + remove_ribo_rna = false save_non_ribo_reads = false ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" } From 232493a7fc52aab9862b1825a0447e561d921f03 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 8 Nov 2023 13:55:31 +1300 Subject: [PATCH 03/59] Decouple target assemblies and read qc/align --- .gitignore | 15 -- README.md | 5 +- conf/modules.config | 18 +- modules/nf-core/fastp/main.nf | 8 +- nextflow.config | 20 ++- subworkflows/local/extract_samples.nf | 23 ++- workflows/pan_gene.nf | 233 ++++++++++---------------- 7 files changed, 128 insertions(+), 194 deletions(-) diff --git a/.gitignore b/.gitignore index cc3b658..6e9d9d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,24 +1,9 @@ .DS_Store - *.pyc __pycahce__ - -nextflow .nextflow* work/ -*.dot - -Results/ results/ -report/ -Report/ - -*.log -.nfs* - -*.sif - -pan_gene_slurm.sh *.stdout *.stderr diff --git a/README.md b/README.md index 2eaf965..7dbf45e 100644 --- a/README.md +++ b/README.md @@ -112,5 +112,6 @@ Some software components of this pipeline have been adopted from following third > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). -2. rewarewaannotation [MIT](https://github.com/kherronism/rewarewaannotation/blob/master/LICENSE): https://github.com/kherronism/rewarewaannotation -3. assembly_qc [GPL-3.0](https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE): https://github.com/Plant-Food-Research-Open/assembly_qc \ No newline at end of file +2. nf-core/rnaseq [MIT](https://github.com/nf-core/rnaseq/blob/master/LICENSE): https://github.com/nf-core/rnaseq +3. 
rewarewaannotation [MIT](https://github.com/kherronism/rewarewaannotation/blob/master/LICENSE): https://github.com/kherronism/rewarewaannotation +4. assembly_qc [GPL-3.0](https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE): https://github.com/Plant-Food-Research-Open/assembly_qc \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 66205cb..0cd7bb5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -31,8 +31,6 @@ process { } } -// https://github.com/nf-core/rnaseq -// MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE if(!params.sample_prep.skip_fastqc) { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW' { @@ -50,17 +48,20 @@ if(!params.sample_prep.skip_fastqc) { } } -// https://github.com/nf-core/rnaseq -// MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE if(!params.sample_prep.skip_fastp) { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTP' { ext.args = params.sample_prep.extra_fastp_args ?: '' publishDir = [ [ - path: { "${params.outdir}/fastp" }, + path: { "${params.outdir}/fastp/html" }, mode: "copy", - pattern: "*.{json,html}" + pattern: "*.{html}" + ], + [ + path: { "${params.outdir}/fastp/json" }, + mode: "copy", + pattern: "*.{json}" ], [ path: { "${params.outdir}/fastp/log" }, @@ -99,12 +100,7 @@ if (params.sample_prep.remove_ribo_rna) { } } -// https://github.com/kherronism/rewarewaannotation -// MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE process { - - // Changes: - // Introduced additional defaults withName: STAR_ALIGN { ext.args = [ "--outSAMstrandField intronMotif", diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index ee38e1d..9c747d3 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -101,12 +101,16 @@ process FASTP { } stub: - def prefix = task.ext.prefix ?: "${meta.id}" + def prefix = task.ext.prefix ?: "${meta.id}" + def isSingleOutput = 
task.ext.args?.contains('--interleaved_in') || meta.single_end + def outputFiles = isSingleOutput ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" + def mergedFileCommand = (!isSingleOutput && save_merged) ? "touch ${prefix}.merged.fastq.gz" : "" """ - touch "${prefix}.fastp.fastq.gz" + touch $outputFiles touch "${prefix}.json" touch "${prefix}.html" touch "${prefix}.log" + $mergedFileCommand cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 3a60fdb..ec56979 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,10 @@ includeConfig './conf/base.config' params { + target_assemblies = [ + ["red5_v2p1", "/workspace/hrauxr/pan-gene/.test/red5_v2p1_chr1.fasta"], + ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.chr1.fsa.gz"] + ] // FASTA files (fasta, fasta.gz) for the assemblies to annotate // // Pattern: [["tag", "file path"]] @@ -14,10 +18,10 @@ params { // ["tag2", "./a/relative/path/to/the/fasta/file2.fasta"], // ["tag3", "https://ftp.ncbi.nlm.nih.gov/genomes/test_genome.fna"], ...] // target_assemblies = [["tair10", "/an/absolute/path/to/the/fasta/file.fasta"]] - target_assemblies = [ - ["red5_v2p1", "/workspace/hrauxr/pan-gene/.test/red5_v2p1_chr1.fasta"] + + te_libraries = [ + ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.TElib.fa.gz"] ] - // TE libs (fasta, fasta.gz) for target_assemblies // // Optional Set to [] if libraries are not available, te_libraries = [] @@ -26,8 +30,6 @@ params { // Not all target_assemblies need to have an associated (by tag) TE library. // When the TE lib is not available for a traget assembly, EDTA is used to create one. 
- te_libraries = [] - edta { is_sensitive = false save_outputs = true @@ -36,8 +38,8 @@ params { save_outputs = true } - // Optional: Set to null if not available samplesheet = "./.test/samplesheet.csv" + // Optional: Set to null if not available sample_prep { skip_fastqc = false @@ -45,8 +47,8 @@ params { min_trimmed_reads = 10000 extra_fastp_args = "" - // toggling this parameter results in rerun of FASTP and FASTQC_TRIM save_trimmed = false + // toggling this parameter results in rerun of FASTP and FASTQC_TRIM remove_ribo_rna = false save_non_ribo_reads = false @@ -59,17 +61,17 @@ params { save_outputs = false } - // Optional: Set to null if not available external_protein_seqs = [ "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.pep.fasta" ] + // Optional: Set to null if not available braker { extra_braker_args = "" } - outdir = "./results" + outdir = "./results" max_cpus = 12 max_memory = 200.GB diff --git a/subworkflows/local/extract_samples.nf b/subworkflows/local/extract_samples.nf index f3c1a15..d05de4c 100644 --- a/subworkflows/local/extract_samples.nf +++ b/subworkflows/local/extract_samples.nf @@ -7,6 +7,8 @@ nextflow.enable.dsl=2 // Added channel permissible_target_assemblies // Changed file name from input_check.nf to extract_samples.nf // Removed strandedness +// Nowing emitting an extra channel 'assemblies' which indicates the +// assemblies targeted by each read // // Check input samplesheet and get read channels // @@ -20,14 +22,23 @@ workflow EXTRACT_SAMPLES { main: SAMPLESHEET_CHECK ( samplesheet, permissible_target_assemblies ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + .csv + | splitCsv ( header:true, sep:',' ) + | map { create_fastq_channel(it) } + | set { ch_reads } + reads = ch_reads.map { meta, fastq -> [[id:meta.id, single_end:meta.single_end], 
fastq]} + + ch_reads + | flatMap { meta, fastq -> + meta.target_assemblies.collect { assembly -> [[id:meta.id, single_end:meta.single_end], assembly] } + } + | set { assemblies } + emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + reads // channel: [ val(meta), [ reads ] ] + assemblies // channel: [ val(meta), val(assembly) ] + versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index fccbe2e..46dd112 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -31,10 +31,9 @@ if (params.sample_prep.remove_ribo_rna) { workflow PAN_GENE { // Versions - Channel.empty() - | set { ch_versions } + ch_versions = Channel.empty() - // GUNZIP: target_assemblies + // MODULE: GUNZIP_TARGET_ASSEMBLY Channel.fromList(params.target_assemblies) | map { tag, filePath -> [[id:tag], file(filePath, checkIfExists: true)] @@ -54,20 +53,16 @@ workflow PAN_GENE { ) | set { ch_gunzip_target_assemblies } - ch_versions - | mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) - // FASTA_VALIDATE + // MODULE: FASTA_VALIDATE FASTA_VALIDATE(ch_gunzip_target_assemblies) .valid_fasta | set { ch_validated_target_assemblies } - ch_versions - | mix(FASTA_VALIDATE.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(FASTA_VALIDATE.out.versions.first()) - // GUNZIP: te_libraries + // MODULE: GUNZIP_TE_LIBRARY Channel.fromList(params.te_libraries) | map { tag, filePath -> [[id:tag], file(filePath, checkIfExists: true)] @@ -87,11 +82,9 @@ workflow PAN_GENE { ) | set { ch_gunzip_te_libraries } - ch_versions - | mix(GUNZIP_TE_LIBRARY.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(GUNZIP_TE_LIBRARY.out.versions.first()) - // 
PERFORM_EDTA_ANNOTATION + // SUBWORKFLOW: PERFORM_EDTA_ANNOTATION ch_validated_target_assemblies | join( ch_gunzip_te_libraries, remainder: true @@ -102,11 +95,9 @@ workflow PAN_GENE { | map {meta, assembly, teLib -> [meta, assembly]} | PERFORM_EDTA_ANNOTATION - ch_versions - | mix(PERFORM_EDTA_ANNOTATION.out.versions) - | set { ch_versions } + ch_versions = ch_versions.mix(PERFORM_EDTA_ANNOTATION.out.versions) - // REPEATMASKER + // MODULE: REPEATMASKER ch_validated_target_assemblies | join( PERFORM_EDTA_ANNOTATION.out.te_lib_fasta.mix(ch_gunzip_te_libraries) @@ -118,11 +109,9 @@ workflow PAN_GENE { ch_assemblies_n_te_libs.map {meta, assembly, teLib -> teLib}, ) - ch_versions - | mix(REPEATMASKER.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) - // STAR_GENOMEGENERATE + // MODULE: STAR_GENOMEGENERATE def star_ignore_sjdbgtf = true STAR_GENOMEGENERATE( REPEATMASKER.out.fasta_masked, @@ -132,15 +121,9 @@ workflow PAN_GENE { .index | set { ch_assembly_index } - ch_versions - | mix(STAR_GENOMEGENERATE.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions.first()) - // EXTRACT_SAMPLES - // https://github.com/nf-core/rnaseq - // MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE - // Changes - // Use meta.id as key for groupTuple as groupTuple does not work when there is a sublist in the key list + // SUBWORKFLOW: EXTRACT_SAMPLES ch_samplesheet_path = Channel.empty() if(params.samplesheet != null) { ch_samplesheet_path = Channel.fromPath(params.samplesheet) @@ -152,25 +135,22 @@ workflow PAN_GENE { ) .reads | map { meta, fastq -> - new_id = meta.id - ~/_T\d+/ - [ new_id, meta + [id: new_id], fastq ] + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], fastq ] } | groupTuple() - | branch { meta_id, meta, fastq -> + | branch { meta, fastq -> single : fastq.size() == 1 - return [ meta.first(), fastq.flatten() ] + return [ meta, 
fastq.flatten() ] multiple: fastq.size() > 1 - return [ meta.first(), fastq.flatten() ] + return [ meta, fastq.flatten() ] } | set { ch_fastq } - ch_versions - | mix(EXTRACT_SAMPLES.out.versions) - | set { ch_versions } + ch_read_target_assemblies = EXTRACT_SAMPLES.out.assemblies + ch_versions = ch_versions.mix(EXTRACT_SAMPLES.out.versions) - // CAT_FASTQ - // https://github.com/nf-core/rnaseq - // MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE + // MODULES: CAT_FASTQ CAT_FASTQ ( ch_fastq.multiple ) @@ -178,13 +158,9 @@ workflow PAN_GENE { | mix(ch_fastq.single) | set { ch_cat_fastq } - ch_versions - | mix(CAT_FASTQ.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first()) - // FASTQ_FASTQC_UMITOOLS_FASTP - // https://github.com/nf-core/rnaseq - // MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE + // SUBWORKFLOW: FASTQ_FASTQC_UMITOOLS_FASTP def with_umi = false def skip_umi_extract = true def umi_discard_read = false @@ -203,7 +179,9 @@ workflow PAN_GENE { .reads | set { ch_trim_reads } - // SORTMERNA + ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) + + // MODULE: SORTMERNA if (params.sample_prep.remove_ribo_rna) { Channel.from(ch_ribo_db.readLines()) | map { row -> file(row, checkIfExists: true) } @@ -217,110 +195,75 @@ workflow PAN_GENE { .reads | set { ch_trim_reads } - ch_versions - | mix(SORTMERNA.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) } - ch_trim_reads - | flatMap { meta, reads -> - def targetAssemblies = meta["target_assemblies"] - - readsByAssembly = [] - - for(assembly in targetAssemblies) { - readsByAssembly += [[[id: "${meta.id}.on.${assembly}", single_end: meta.single_end, target_assembly: assembly], reads]] - } - - return readsByAssembly + // MODULE: STAR_ALIGN + ch_read_target_assemblies + | map { meta, assembly -> + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], assembly ] } 
- | set { ch_trim_reads_by_assembly } - - ch_versions - | mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) - | set { ch_versions } - - // STAR_ALIGN - ch_assembly_index - | map { meta, index -> - [meta.id, index] + | unique + | combine(ch_trim_reads, by:0) + | map { meta, assembly, fastq -> + [assembly, [id:"${meta.id}.on.${assembly}", single_end:meta.single_end, target_assembly:assembly], fastq] } - | cross( - ch_trim_reads_by_assembly.map{meta, reads -> [meta.target_assembly, meta, reads]} + | combine( + ch_assembly_index.map { meta, index -> [meta.id, index] }, + by:0 ) - | map { indexWithExt, readsWithExt -> - def index = indexWithExt[1] - - def readsMeta = readsWithExt[1] - def reads = readsWithExt[2] - - [ - readsMeta, - reads, - index - ] - } - | set { ch_trim_reads_by_assembly_with_index } + | map { assembly, meta, fastq, index -> [meta, fastq, index] } + | set { ch_star_inputs } def seq_platform = false def seq_center = false STAR_ALIGN( - ch_trim_reads_by_assembly_with_index.map{meta, reads, index -> [meta, reads]}, - ch_trim_reads_by_assembly_with_index.map{meta, reads, index -> [[id: meta.target_assembly], index]}, - ch_trim_reads_by_assembly_with_index.map{meta, reads, index -> [[id: meta.target_assembly], []]}, + ch_star_inputs.map{meta, fastq, index -> [meta, fastq]}, + ch_star_inputs.map{meta, fastq, index -> [[id: meta.target_assembly], index]}, + ch_star_inputs.map{meta, fastq, index -> [[id: meta.target_assembly], []]}, star_ignore_sjdbgtf, seq_platform, seq_center ) .bam_sorted - .tap { ch_mapped_reads } - .map { meta, bam -> + | set { ch_star_bam } + + ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) + + // MODULE: SAMTOOLS_CAT + ch_star_bam + | map { meta, bam -> [ [id: meta.target_assembly], bam instanceof List ? 
bam.find {it =~ /Aligned/} : bam ] } | groupTuple - | set { ch_mapped_reads_by_assembly } - - ch_versions - | mix(STAR_ALIGN.out.versions.first()) - | set { ch_versions } - - // SAMTOOLS_CAT - ch_mapped_reads_by_assembly | branch { meta, bamList -> bams: bamList.size() > 1 bam: bamList.size() <= 1 } - | set { ch_samtools_cat_inputs_branches } + | set { ch_star_bam_branch } SAMTOOLS_CAT( - ch_samtools_cat_inputs_branches.bams + ch_star_bam_branch.bams ) - .bam - | map { meta, bam -> - [ - meta, - [bam] - ] - } + .bam.map { meta, bam -> [meta, [bam]] } | mix( - ch_samtools_cat_inputs_branches.bam + ch_star_bam_branch.bam ) - | set { ch_cat_bam_by_assembly } + | set { ch_samtools_bam } - ch_versions - | mix(SAMTOOLS_CAT.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(SAMTOOLS_CAT.out.versions.first()) - // GUNZIP: external_protein_seqs - ch_external_protein_seqs = Channel.empty() + // MODULE: GUNZIP_EXTERNAL_PROTEIN_SEQ + ch_ext_prot_seqs = Channel.empty() if(params.external_protein_seqs != null) { - ch_external_protein_seqs = Channel.fromList(params.external_protein_seqs) + ch_ext_prot_seqs = Channel.fromList(params.external_protein_seqs) } - ch_external_protein_seqs + ch_ext_prot_seqs | map { filePath -> def fileHandle = file(filePath, checkIfExists: true) [[id:fileHandle.getSimpleName()], fileHandle] @@ -329,38 +272,32 @@ workflow PAN_GENE { gz: "$file".endsWith(".gz") rest: !"$file".endsWith(".gz") } - | set { ch_external_protein_seqs_branch } + | set { ch_ext_prot_seqs_branch } GUNZIP_EXTERNAL_PROTEIN_SEQ( - ch_external_protein_seqs_branch.gz + ch_ext_prot_seqs_branch.gz ) .gunzip | mix( - ch_external_protein_seqs_branch.rest + ch_ext_prot_seqs_branch.rest ) - | set { ch_gunzip_external_protein_seqs } + | set { ch_ext_prot_seqs } - ch_versions - | mix(GUNZIP_EXTERNAL_PROTEIN_SEQ.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(GUNZIP_EXTERNAL_PROTEIN_SEQ.out.versions.first()) - // CAT_PROTEIN_SEQS - 
ch_gunzip_external_protein_seqs - | map{meta, filePath -> filePath} + // MODULE: CAT_PROTEIN_SEQS + ch_ext_prot_seqs + | map{ meta, filePath -> filePath } | collect - | map{fileList -> [[id:"protein_seqs"], fileList]} + | map{ fileList -> [[id:"protein_seqs"], fileList] } | CAT_PROTEIN_SEQS - CAT_PROTEIN_SEQS.out.file_out - | set { ch_protein_seq } - - ch_versions - | mix(CAT_PROTEIN_SEQS.out.versions) - | set { ch_versions } + ch_ext_prot_seqs = CAT_PROTEIN_SEQS.out.file_out + ch_versions = ch_versions.mix(CAT_PROTEIN_SEQS.out.versions) - // BRAKER3 + // MODULE: BRAKER3 REPEATMASKER.out.fasta_masked - | mix(ch_cat_bam_by_assembly) + | mix(ch_samtools_bam) | groupTuple(size: 2, remainder: true) | map { meta, groupedItems -> def maskedFasta = groupedItems[0] @@ -376,20 +313,20 @@ workflow PAN_GENE { if(params.external_protein_seqs) { ch_braker_inputs - | combine(ch_protein_seq.map{meta, filePath -> filePath}) + | combine(ch_ext_prot_seqs.map{meta, filePath -> filePath}) | set { ch_braker_inputs } } else { ch_braker_inputs - | map{meta, assembly, bams -> [meta, assembly, bams, []]} + | map{meta, assembly, bam -> [meta, assembly, bam, []]} | set { ch_braker_inputs } } - ch_fasta = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> [meta, assembly]} - ch_bam = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> bams} - ch_rnaseq_sets_dirs = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> []} - ch_rnaseq_sets_ids = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> []} - ch_proteins = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> proteinSeq} - ch_hintsfile = ch_braker_inputs.map{meta, assembly, bams, proteinSeq -> []} + ch_fasta = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> [meta, assembly] } + ch_bam = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> bam } + ch_proteins = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> proteinSeq } + ch_rnaseq_sets_dirs = [] + ch_rnaseq_sets_ids = [] + ch_hintsfile = [] 
BRAKER3( ch_fasta, @@ -400,7 +337,5 @@ workflow PAN_GENE { ch_hintsfile ) - ch_versions - | mix(BRAKER3.out.versions.first()) - | set { ch_versions } + ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) } \ No newline at end of file From 4f2ed8e7d6b996c2ea5e78c40638d6d7545a0f44 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 9 Nov 2023 15:21:59 +1300 Subject: [PATCH 04/59] A bit of reformatiing --- nextflow.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index ec56979..be33522 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,9 +73,9 @@ params { outdir = "./results" - max_cpus = 12 - max_memory = 200.GB - max_time = 1.days + max_cpus = 12 + max_memory = 200.GB + max_time = 1.days } includeConfig './conf/modules.config' From 8b83c61aaf748efa5b93a8d82a68f7143abdb7c5 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 9 Nov 2023 15:58:46 +1300 Subject: [PATCH 05/59] Started implementing liftoff --- conf/modules.config | 13 ++++++++++ modules/local/liftoff/main.nf | 48 +++++++++++++++++++++++++++++++++++ modules/nf-core/CHANGELOG.md | 2 +- nextflow.config | 14 ++++++++++ 4 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 modules/local/liftoff/main.nf diff --git a/conf/modules.config b/conf/modules.config index 0cd7bb5..f5dc1c6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -130,4 +130,17 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } +} + +if(params.liftoff.xref_annotations) { + process { + withName: LIFTOFF { + ext.args = '-exclude_partial', + publishDir = [ + path: { "${params.outdir}/liftoff/${meta.id}" }, + mode: "copy", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + } } \ No newline at end of file diff --git a/modules/local/liftoff/main.nf b/modules/local/liftoff/main.nf new file mode 100644 index 0000000..cec7bd1 --- /dev/null +++ b/modules/local/liftoff/main.nf @@ -0,0 +1,48 @@ +process LIFTOFF { + tag "$meta.id" + label "process_high" + + container "https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0" + + input: + tuple val(meta), path(target_fa) + path ref_fa + path ref_gff + + output: + tuple val(meta), path("*.liftoff.gff3") , emit: gff3 + tuple val(meta), path("unmapped_features.txt") , emit: unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + liftoff \\ + -g $ref_gff \\ + -p $task.cpus \\ + $args \\ + $target_fa \\ + $ref_fa \\ + > "${prefix}.liftoff.gff3" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + liftoff: \$(liftoff --version) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}.liftoff.gff3" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + liftoff: \$(liftoff --version) + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/nf-core/CHANGELOG.md b/modules/nf-core/CHANGELOG.md index 2721570..f7e0034 100644 --- a/modules/nf-core/CHANGELOG.md +++ b/modules/nf-core/CHANGELOG.md @@ -25,4 +25,4 @@ 1. Added stub 2. 
Added author in meta.yml -- Repo: https://github.com/nf-core/modules/tree/4e2cbac1db88f544711e488e552175368ca14588 \ No newline at end of file +- Repo: https://github.com/nf-core/modules/tree/18cd2206622dc606bbceea533c7823feb2a251db \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index be33522..6980e1c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,20 @@ params { extra_braker_args = "" } + liftoff { + xref_annotations = [ + [ + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" + ], + [ + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" + ] + ] + // Optional: Set to null if not available + } + outdir = "./results" max_cpus = 12 From 37ce74edb234d9792d2b59368855c9210fd7e865 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Fri, 10 Nov 2023 13:10:22 +1300 Subject: [PATCH 06/59] Checkpoint before major reshuffle --- conf/modules.config | 2 +- nextflow.config | 6 +-- workflows/pan_gene.nf | 89 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f5dc1c6..132ffe9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -135,7 +135,7 @@ process { if(params.liftoff.xref_annotations) { process { withName: LIFTOFF { - ext.args = '-exclude_partial', + ext.args = '-exclude_partial -copies' publishDir = [ path: { "${params.outdir}/liftoff/${meta.id}" }, mode: "copy", diff --git a/nextflow.config b/nextflow.config index 6980e1c..daf1eef 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,7 +50,7 @@ 
params { save_trimmed = false // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - remove_ribo_rna = false + remove_ribo_rna = true save_non_ribo_reads = false ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" } @@ -87,8 +87,8 @@ params { outdir = "./results" - max_cpus = 12 - max_memory = 200.GB + max_cpus = 1 + max_memory = 4.GB max_time = 1.days } diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index 46dd112..2198765 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -12,6 +12,8 @@ include { STAR_ALIGN } from '../modules/nf-core/star/ include { SAMTOOLS_CAT } from '../modules/nf-core/samtools/cat' include { CAT_CAT as CAT_PROTEIN_SEQS } from '../modules/nf-core/cat/cat' include { BRAKER3 } from '../modules/kherronism/braker3' +include { GUNZIP as GUNZIP_XREF_FASTA } from '../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_XREF_GFF } from '../modules/nf-core/gunzip' include { PERFORM_EDTA_ANNOTATION } from '../subworkflows/local/perform_edta_annotation' include { EXTRACT_SAMPLES } from '../subworkflows/local/extract_samples' @@ -105,8 +107,8 @@ workflow PAN_GENE { | set { ch_assemblies_n_te_libs } REPEATMASKER( - ch_assemblies_n_te_libs.map {meta, assembly, teLib -> [meta, assembly]}, - ch_assemblies_n_te_libs.map {meta, assembly, teLib -> teLib}, + ch_assemblies_n_te_libs.map { meta, assembly, teLib -> [meta, assembly] }, + ch_assemblies_n_te_libs.map { meta, assembly, teLib -> teLib }, ) ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) @@ -115,7 +117,7 @@ workflow PAN_GENE { def star_ignore_sjdbgtf = true STAR_GENOMEGENERATE( REPEATMASKER.out.fasta_masked, - REPEATMASKER.out.fasta_masked.map{meta, maskedFasta -> [meta, []]}, + REPEATMASKER.out.fasta_masked.map { meta, maskedFasta -> [meta, []] }, star_ignore_sjdbgtf ) .index @@ -131,7 +133,7 @@ workflow PAN_GENE { EXTRACT_SAMPLES( ch_samplesheet_path, - Channel.of(params.target_assemblies.collect{tag, fastaPath -> 
tag.strip()}.join(",")) + Channel.of(params.target_assemblies.collect { tag, fastaPath -> tag.strip() }.join(",")) ) .reads | map { meta, fastq -> @@ -219,9 +221,9 @@ workflow PAN_GENE { def seq_platform = false def seq_center = false STAR_ALIGN( - ch_star_inputs.map{meta, fastq, index -> [meta, fastq]}, - ch_star_inputs.map{meta, fastq, index -> [[id: meta.target_assembly], index]}, - ch_star_inputs.map{meta, fastq, index -> [[id: meta.target_assembly], []]}, + ch_star_inputs.map { meta, fastq, index -> [meta, fastq] }, + ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], index] }, + ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], []] }, star_ignore_sjdbgtf, seq_platform, seq_center @@ -259,7 +261,7 @@ workflow PAN_GENE { // MODULE: GUNZIP_EXTERNAL_PROTEIN_SEQ ch_ext_prot_seqs = Channel.empty() - if(params.external_protein_seqs != null) { + if(params.external_protein_seqs) { ch_ext_prot_seqs = Channel.fromList(params.external_protein_seqs) } @@ -287,9 +289,9 @@ workflow PAN_GENE { // MODULE: CAT_PROTEIN_SEQS ch_ext_prot_seqs - | map{ meta, filePath -> filePath } + | map { meta, filePath -> filePath } | collect - | map{ fileList -> [[id:"protein_seqs"], fileList] } + | map { fileList -> [[id:"protein_seqs"], fileList] } | CAT_PROTEIN_SEQS ch_ext_prot_seqs = CAT_PROTEIN_SEQS.out.file_out @@ -321,9 +323,9 @@ workflow PAN_GENE { | set { ch_braker_inputs } } - ch_fasta = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> [meta, assembly] } - ch_bam = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> bam } - ch_proteins = ch_braker_inputs.map{ meta, assembly, bam, proteinSeq -> proteinSeq } + ch_fasta = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> [meta, assembly] } + ch_bam = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> bam } + ch_proteins = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> proteinSeq } ch_rnaseq_sets_dirs = [] ch_rnaseq_sets_ids = [] ch_hintsfile = [] @@ -338,4 
+340,65 @@ workflow PAN_GENE { ) ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + + // MODULE: GUNZIP_XREF_FASTA + ch_xref_annotations = Channel.empty() + if(params.liftoff.xref_annotations) { + Channel.fromList(params.liftoff.xref_annotations) + | multiMap { fasta, gff -> + def fastaFile = file(fasta, checkIfExists:true) + def meta = [id:fastaFile.getSimpleName()] + + fasta: [meta, fastaFile] + gff: [meta, file(gff, checkIfExists:true)] + } + | set { ch_xref_annotations } + } + + ch_xref_annotations.fasta + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { ch_xref_annotations_branch } + + GUNZIP_XREF_FASTA( + ch_xref_annotations_branch.gz + ) + .gunzip + | mix( + ch_xref_annotations_branch.rest + ) + | set { ch_xref_annotations_fasta } + + // MODULE: GUNZIP_XREF_GFF + ch_xref_annotations.gff + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { ch_xref_annotations_gff_branch } + + GUNZIP_XREF_GFF( + ch_xref_annotations_gff_branch.gff.map { meta, fasta, gff -> [meta, gff] } + ) + .gunzip + | mix( + ch_xref_annotations_gff_branch.rest.map { meta, fasta, gff -> [meta, gff] } + ) + | set { ch_xref_annotations_gff } + + ch_xref_annotations_fasta + | join( + ch_xref_annotations_gff + ) + | set { ch_xref_annotations } + + // // MODULE: LIFTOFF + // ch_xref_annotations + // | combine( + // ch_validated_target_assemblies + // ) + // | map { meta, ref_fasta, refGFF, targetMeta, targetFasta -> [[id:"${targetMeta.id}.from.${meta.id}"], ref_fasta, refGFF, targetFasta] } + // | set { ch_liftoff_inputs } } \ No newline at end of file From d148f1824ee07d653d05239c76496c62a94c16e5 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Fri, 10 Nov 2023 13:59:21 +1300 Subject: [PATCH 07/59] Reformatted local modules --- TODO.md | 1 - modules/local/edta/edta/main.nf | 107 +++++++++--------- modules/local/edta/restore_edta_ids/main.nf | 97 ++++++++-------- ...ming_f1b7bce.py 
=> reverse_edta_naming.py} | 2 +- modules/local/edta/shorten_edta_ids/main.nf | 55 +++++---- ...ta_ids_c97537f.py => shorten_fasta_ids.py} | 9 +- modules/local/fasta_validate/main.nf | 55 +++++---- modules/local/samplesheet_check/main.nf | 3 +- modules/local/validate_params/main.nf | 4 - subworkflows/local/extract_samples.nf | 8 +- subworkflows/local/fasta_edta.nf | 43 +++++++ subworkflows/local/perform_edta_annotation.nf | 48 -------- 12 files changed, 203 insertions(+), 229 deletions(-) rename modules/local/edta/restore_edta_ids/resources/usr/bin/{reverse_edta_naming_f1b7bce.py => reverse_edta_naming.py} (98%) rename modules/local/edta/shorten_edta_ids/resources/usr/bin/{shorten_fasta_ids_c97537f.py => shorten_fasta_ids.py} (96%) create mode 100644 subworkflows/local/fasta_edta.nf delete mode 100644 subworkflows/local/perform_edta_annotation.nf diff --git a/TODO.md b/TODO.md index d243565..6f0a836 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,3 @@ -- [ ] Rename perform_edta_annotation to FASTA_PERFORM_EDTA - [ ] Extract subworkflows - [ ] STAR ignores softmasking and, thus, should be fed the unmasked genome so that masking and mapping can run in parallel. 
- [ ] Add --eval=reference.gtf diff --git a/modules/local/edta/edta/main.nf b/modules/local/edta/edta/main.nf index 99b6811..56fd196 100644 --- a/modules/local/edta/edta/main.nf +++ b/modules/local/edta/edta/main.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE process EDTA { tag "$meta.id" label "process_high" @@ -11,64 +7,67 @@ process EDTA { containerOptions "-B $TMPDIR:$TMPDIR" input: - tuple val(meta), path(fasta_file) + tuple val(meta), path(fasta_file) output: - tuple val(meta), path('*.EDTA.TElib.fa'), emit: te_lib_fasta - tuple val(meta), path('*.EDTA.intact.gff3'), emit: intact_gff3 - tuple val(meta), path('*.EDTA.pass.list'), emit: pass_list - tuple val(meta), path('*.EDTA.out'), emit: out_file - tuple val(meta), path('*.EDTA.TEanno.gff3'), emit: te_anno_gff3 - path "versions.yml", emit: versions + tuple val(meta), path('*.EDTA.TElib.fa') , emit: te_lib_fasta + tuple val(meta), path('*.EDTA.intact.gff3') , emit: intact_gff3 + tuple val(meta), path('*.EDTA.pass.list') , emit: pass_list + tuple val(meta), path('*.EDTA.out') , emit: out_file + tuple val(meta), path('*.EDTA.TEanno.gff3') , emit: te_anno_gff3 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def modFileName = "${fasta_file}.mod" - """ - EDTA.pl \\ - --genome $fasta_file \\ - --threads $task.cpus \\ - $args - - if [ -f "${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" ]; then - cat "${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" \\ - > "${modFileName}.EDTA.pass.list" - else - echo "EDTA PASS LIST IS EMPTY" \\ - > "${modFileName}.EDTA.pass.list" - fi + def args = task.ext.args ?: '' + def modFileName = "${fasta_file}.mod" + """ + EDTA.pl \\ + --genome $fasta_file \\ + --threads $task.cpus \\ + $args + + if [ -f 
"${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" ]; then + cat "${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" \\ + > "${modFileName}.EDTA.pass.list" + else + echo "EDTA PASS LIST IS EMPTY" \\ + > "${modFileName}.EDTA.pass.list" + fi - if [ -f "${modFileName}.EDTA.anno/${modFileName}.out" ]; then - cat "${modFileName}.EDTA.anno/${modFileName}.out" \\ - > "${modFileName}.EDTA.out" - else - echo "EDTA DID NOT PRODUCE AN OUT FILE" \\ - > "${modFileName}.EDTA.out" - fi + if [ -f "${modFileName}.EDTA.anno/${modFileName}.out" ]; then + cat "${modFileName}.EDTA.anno/${modFileName}.out" \\ + > "${modFileName}.EDTA.out" + else + echo "EDTA DID NOT PRODUCE AN OUT FILE" \\ + > "${modFileName}.EDTA.out" + fi - if [ ! -f "${modFileName}.EDTA.TEanno.gff3" ]; then - echo "##EDTA DID NOT PRODUCE A TEANNO GFF3" \\ - > "${modFileName}.EDTA.TEanno.gff3" - fi + if [ ! -f "${modFileName}.EDTA.TEanno.gff3" ]; then + echo "##EDTA DID NOT PRODUCE A TEANNO GFF3" \\ + > "${modFileName}.EDTA.TEanno.gff3" + fi - cat <<-END_VERSIONS > versions.yml - "${task.process}": - EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') + END_VERSIONS + """ stub: - def modFileName = "${fasta_file}.mod" - """ - touch "${modFileName}.EDTA.TElib.fa" - touch "${modFileName}.EDTA.intact.gff3" - touch "${modFileName}.EDTA.pass.list" - touch "${modFileName}.EDTA.out" - touch "${modFileName}.EDTA.TEanno.gff3" + def modFileName = "${fasta_file}.mod" + """ + touch "${modFileName}.EDTA.TElib.fa" + touch "${modFileName}.EDTA.intact.gff3" + touch "${modFileName}.EDTA.pass.list" + touch "${modFileName}.EDTA.out" + touch "${modFileName}.EDTA.TEanno.gff3" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + EDTA: 
\$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') + END_VERSIONS + """ } \ No newline at end of file diff --git a/modules/local/edta/restore_edta_ids/main.nf b/modules/local/edta/restore_edta_ids/main.nf index 606848c..4da8a34 100644 --- a/modules/local/edta/restore_edta_ids/main.nf +++ b/modules/local/edta/restore_edta_ids/main.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE process RESTORE_EDTA_IDS { tag "$meta.id" label "process_single" @@ -9,58 +5,59 @@ process RESTORE_EDTA_IDS { container "docker://gallvp/python3npkgs:v0.4" input: - tuple val(meta), path(te_lib_fa) - path(intact_gff3) - path(pass_list) - path(out_file) - path(te_anno_gff3) - path(renamed_ids_tsv) + tuple val(meta), path(te_lib_fa) + path(intact_gff3) + path(pass_list) + path(out_file) + path(te_anno_gff3) + path(renamed_ids_tsv) output: - tuple val(meta), path("${meta.id}.EDTA.TElib.fa"), emit: te_lib_fasta - tuple val(meta), path("${meta.id}.EDTA.intact.gff3"), emit: intact_gff3 - tuple val(meta), path("${meta.id}.renamed.ids.EDTA.pass.list"), emit: pass_list - tuple val(meta), path("${meta.id}.renamed.ids.EDTA.out"), emit: out_file - tuple val(meta), path("${meta.id}.EDTA.TEanno.gff3"), emit: te_anno_gff3 - tuple val(meta), path("${meta.id}.renamed.ids.tsv"), emit: renamed_ids_tsv - path "versions.yml", emit: versions + tuple val(meta), path("${meta.id}.EDTA.TElib.fa") , emit: te_lib_fasta + tuple val(meta), path("${meta.id}.EDTA.intact.gff3") , emit: intact_gff3 + tuple val(meta), path("${meta.id}.renamed.ids.EDTA.pass.list") , emit: pass_list + tuple val(meta), path("${meta.id}.renamed.ids.EDTA.out") , emit: out_file + tuple val(meta), path("${meta.id}.EDTA.TEanno.gff3") , emit: te_anno_gff3 + tuple val(meta), path("${meta.id}.renamed.ids.tsv") , emit: renamed_ids_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || 
task.ext.when script: - def VERSION = "f1b7bce" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - """ - cat $pass_list > "${meta.id}.renamed.ids.EDTA.pass.list" - cat $out_file > "${meta.id}.renamed.ids.EDTA.out" - cat $te_lib_fa > "${meta.id}.EDTA.TElib.fa" - cat $renamed_ids_tsv > "${meta.id}.renamed.ids.tsv" - - renamed_ids_head=\$(head -n 1 "$renamed_ids_tsv") - - if [[ \$renamed_ids_head == "IDs have acceptable length and character. No change required." ]]; then - cat $te_anno_gff3 > "${meta.id}.EDTA.TEanno.gff3" - cat $intact_gff3 > "${meta.id}.EDTA.intact.gff3" - else - reverse_edta_naming_f1b7bce.py "$renamed_ids_tsv" "$te_anno_gff3" "$intact_gff3" "$meta" - fi + """ + cat $pass_list > "${meta.id}.renamed.ids.EDTA.pass.list" + cat $out_file > "${meta.id}.renamed.ids.EDTA.out" + cat $te_lib_fa > "${meta.id}.EDTA.TElib.fa" + cat $renamed_ids_tsv > "${meta.id}.renamed.ids.tsv" + + renamed_ids_head=\$(head -n 1 "$renamed_ids_tsv") + + if [[ \$renamed_ids_head == "IDs have acceptable length and character. No change required." ]]; then + cat $te_anno_gff3 > "${meta.id}.EDTA.TEanno.gff3" + cat $intact_gff3 > "${meta.id}.EDTA.intact.gff3" + else + reverse_edta_naming.py "$renamed_ids_tsv" "$te_anno_gff3" "$intact_gff3" "$meta" + fi - cat <<-END_VERSIONS > versions.yml - "${task.process}": - reverse_edta_naming: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + reverse_edta_naming: \$(md5sum \$(which reverse_edta_naming.py) | cut -d' ' -f1) + END_VERSIONS + """ stub: - def VERSION = "f1b7bce" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
- """ - touch "${meta.id}.EDTA.TElib.fa" - touch "${meta.id}.EDTA.intact.gff3" - touch "${meta.id}.renamed.ids.EDTA.pass.list" - touch "${meta.id}.renamed.ids.EDTA.out" - touch "${meta.id}.EDTA.TEanno.gff3" - touch "${meta.id}.renamed.ids.tsv" + """ + touch "${meta.id}.EDTA.TElib.fa" + touch "${meta.id}.EDTA.intact.gff3" + touch "${meta.id}.renamed.ids.EDTA.pass.list" + touch "${meta.id}.renamed.ids.EDTA.out" + touch "${meta.id}.EDTA.TEanno.gff3" + touch "${meta.id}.renamed.ids.tsv" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - reverse_edta_naming: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + reverse_edta_naming: \$(md5sum \$(which reverse_edta_naming.py) | cut -d' ' -f1) + END_VERSIONS + """ } \ No newline at end of file diff --git a/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming_f1b7bce.py b/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py similarity index 98% rename from modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming_f1b7bce.py rename to modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py index c047100..7e8522c 100755 --- a/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming_f1b7bce.py +++ b/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys diff --git a/modules/local/edta/shorten_edta_ids/main.nf b/modules/local/edta/shorten_edta_ids/main.nf index 829667b..e216ce4 100644 --- a/modules/local/edta/shorten_edta_ids/main.nf +++ b/modules/local/edta/shorten_edta_ids/main.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE process SHORTEN_EDTA_IDS { tag "$meta.id" label "process_single" @@ -9,39 +5,40 @@ process SHORTEN_EDTA_IDS { container 
"docker://gallvp/python3npkgs:v0.4" input: - tuple val(meta), path(fasta_file) + tuple val(meta), path(fasta_file) output: - tuple val(meta), path("*.renamed.ids.fa"), emit: renamed_ids_fasta - tuple val(meta), path("*.renamed.ids.tsv"), emit: renamed_ids_tsv - path "versions.yml", emit: versions + tuple val(meta), path("*.renamed.ids.fa") , emit: renamed_ids_fasta + tuple val(meta), path("*.renamed.ids.tsv") , emit: renamed_ids_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def VERSION = "c97537f" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - """ - FILE="$fasta_file" - output_prefix="\${FILE%.*}" + """ + FILE="$fasta_file" + output_prefix="\${FILE%.*}" - shorten_fasta_ids_c97537f.py "$fasta_file" "\$output_prefix" + shorten_fasta_ids.py "$fasta_file" "\$output_prefix" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - shorten_fasta_ids: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + shorten_fasta_ids: \$(md5sum \$(which shorten_fasta_ids.py) | cut -d' ' -f1) + END_VERSIONS + """ stub: - def VERSION = "c97537f" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
- """ - FILE="$fasta_file" - output_prefix="\${FILE%.*}" + """ + FILE="$fasta_file" + output_prefix="\${FILE%.*}" - touch "\${output_prefix}.renamed.ids.fa" - touch "\${output_prefix}.renamed.ids.tsv" + touch "\${output_prefix}.renamed.ids.fa" + touch "\${output_prefix}.renamed.ids.tsv" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - shorten_fasta_ids: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + shorten_fasta_ids: \$(md5sum \$(which shorten_fasta_ids.py) | cut -d' ' -f1) + END_VERSIONS + """ } \ No newline at end of file diff --git a/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids_c97537f.py b/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py similarity index 96% rename from modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids_c97537f.py rename to modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py index e5b62b3..0b6e6d2 100755 --- a/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids_c97537f.py +++ b/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py @@ -1,13 +1,10 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import re import sys from Bio import SeqIO -# https://github.com/Plant-Food-Research-Open/assembly_qc -# GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE - # The input fasta file path fasta_file_path = sys.argv[1] @@ -153,10 +150,10 @@ def fail_if_new_ids_not_valid(ids): if not do_ids_need_to_change(input_ids): print("IDs have acceptable length and character. No change required.") - + with open(f"{output_files_prefix}.renamed.ids.tsv", "w") as f: f.write("IDs have acceptable length and character. 
No change required.") - + write_fasta_without_comments(fasta_file_path, output_files_prefix) exit(0) diff --git a/modules/local/fasta_validate/main.nf b/modules/local/fasta_validate/main.nf index 7f8370c..7c37c39 100644 --- a/modules/local/fasta_validate/main.nf +++ b/modules/local/fasta_validate/main.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE process FASTA_VALIDATE { tag "$meta.id" label "process_single" @@ -9,38 +5,39 @@ process FASTA_VALIDATE { container "docker://gallvp/fasta_validator:a6a2ec1_ps" input: - tuple val(meta), path(fasta_file) + tuple val(meta), path(fasta_file) output: - tuple val(meta), path("$validFasta"), emit: valid_fasta - path "versions.yml", emit: versions + tuple val(meta), path("$validFasta") , emit: valid_fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" - def VERSION = "a6a2ec1_ps" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - """ - fasta_validate -v $fasta_file >/dev/null + validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" + """ + fasta_validate -v $fasta_file >/dev/null - # If invalid, the above command will fail and - # the NXF error startegy will kick in. - - cat $fasta_file > $validFasta + # If invalid, the above command will fail and + # the NXF error startegy will kick in. 
+ + cat $fasta_file > $validFasta - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) + END_VERSIONS + """ stub: - validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" - def VERSION = "a6a2ec1_ps" // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - """ - touch $validFasta + validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" + """ + touch $validFasta - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: $VERSION - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) + END_VERSIONS + """ } \ No newline at end of file diff --git a/modules/local/samplesheet_check/main.nf b/modules/local/samplesheet_check/main.nf index adb8a92..f0437a6 100644 --- a/modules/local/samplesheet_check/main.nf +++ b/modules/local/samplesheet_check/main.nf @@ -1,5 +1,4 @@ -nextflow.enable.dsl=2 - +// Source: // https://github.com/nf-core/rnaseq // MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE // diff --git a/modules/local/validate_params/main.nf b/modules/local/validate_params/main.nf index f40d2ac..7b5697f 100644 --- a/modules/local/validate_params/main.nf +++ b/modules/local/validate_params/main.nf @@ -1,7 +1,3 @@ -nextflow.enable.dsl=2 - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE def validateParams(params) { validateFastaTags(params) validateTETags(params) diff --git a/subworkflows/local/extract_samples.nf b/subworkflows/local/extract_samples.nf index d05de4c..e63bbe0 100644 --- a/subworkflows/local/extract_samples.nf +++ b/subworkflows/local/extract_samples.nf @@ -1,17 
+1,15 @@ -nextflow.enable.dsl=2 - +// Source: // https://github.com/nf-core/rnaseq // MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE // +// Check input samplesheet and get read channels +// // Changes: // Added channel permissible_target_assemblies // Changed file name from input_check.nf to extract_samples.nf // Removed strandedness // Nowing emitting an extra channel 'assemblies' which indicates the // assemblies targeted by each read -// -// Check input samplesheet and get read channels -// include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' diff --git a/subworkflows/local/fasta_edta.nf b/subworkflows/local/fasta_edta.nf new file mode 100644 index 0000000..f55a958 --- /dev/null +++ b/subworkflows/local/fasta_edta.nf @@ -0,0 +1,43 @@ +include { SHORTEN_EDTA_IDS } from '../../modules/local/edta/shorten_edta_ids' +include { EDTA } from '../../modules/local/edta/edta' +include { RESTORE_EDTA_IDS } from '../../modules/local/edta/restore_edta_ids' + +workflow FASTA_EDTA { + take: + genome_fasta // channel: [ meta, /path/fasta ] + + main: + SHORTEN_EDTA_IDS(genome_fasta) + .renamed_ids_fasta + | EDTA + + RESTORE_EDTA_IDS( + EDTA.out.te_lib_fasta, + EDTA.out.intact_gff3.map { it[1] }, + EDTA.out.pass_list.map { it[1] }, + EDTA.out.out_file.map { it[1] }, + EDTA.out.te_anno_gff3.map { it[1] }, + SHORTEN_EDTA_IDS.out.renamed_ids_tsv.map { it[1] } + ) + + Channel.empty() + | mix( + SHORTEN_EDTA_IDS.out.versions.first() + ) + | mix( + EDTA.out.versions.first() + ) + | mix( + RESTORE_EDTA_IDS.out.versions.first() + ) + | set { ch_versions } + + emit: + te_lib_fasta = RESTORE_EDTA_IDS.out.te_lib_fasta // channel: [ meta, /path/fasta ] + intact_gff3 = RESTORE_EDTA_IDS.out.intact_gff3 // channel: [ meta, /path/gff3 ] + pass_list = RESTORE_EDTA_IDS.out.pass_list // channel: [ meta, /path/pass.list ] + out_file = RESTORE_EDTA_IDS.out.out_file // channel: [ meta, /path/out.file ] + te_anno_gff3 = RESTORE_EDTA_IDS.out.te_anno_gff3 // channel: [ 
meta, /path/gff3 ] + renamed_ids_tsv = RESTORE_EDTA_IDS.out.renamed_ids_tsv // channel: [ meta, /path/tsv ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/perform_edta_annotation.nf b/subworkflows/local/perform_edta_annotation.nf deleted file mode 100644 index d362934..0000000 --- a/subworkflows/local/perform_edta_annotation.nf +++ /dev/null @@ -1,48 +0,0 @@ -nextflow.enable.dsl=2 - -include { SHORTEN_EDTA_IDS } from '../../modules/local/edta/shorten_edta_ids' -include { EDTA } from '../../modules/local/edta/edta' -include { RESTORE_EDTA_IDS } from '../../modules/local/edta/restore_edta_ids' - -// https://github.com/Plant-Food-Research-Open/assembly_qc -// GPL-3.0: https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE -workflow PERFORM_EDTA_ANNOTATION { - take: - genome_fasta // [meta, /path/to/genome/fasta] - - main: - - SHORTEN_EDTA_IDS(genome_fasta) - .renamed_ids_fasta - | EDTA - - RESTORE_EDTA_IDS( - EDTA.out.te_lib_fasta, - EDTA.out.intact_gff3.map { it[1] }, - EDTA.out.pass_list.map { it[1] }, - EDTA.out.out_file.map { it[1] }, - EDTA.out.te_anno_gff3.map { it[1] }, - SHORTEN_EDTA_IDS.out.renamed_ids_tsv.map { it[1] } - ) - - Channel.empty() - | mix( - SHORTEN_EDTA_IDS.out.versions.first() - ) - | mix( - EDTA.out.versions.first() - ) - | mix( - RESTORE_EDTA_IDS.out.versions.first() - ) - | set { ch_versions } - - emit: - te_lib_fasta = RESTORE_EDTA_IDS.out.te_lib_fasta - intact_gff3 = RESTORE_EDTA_IDS.out.intact_gff3 - pass_list = RESTORE_EDTA_IDS.out.pass_list - out_file = RESTORE_EDTA_IDS.out.out_file - te_anno_gff3 = RESTORE_EDTA_IDS.out.te_anno_gff3 - renamed_ids_tsv = RESTORE_EDTA_IDS.out.renamed_ids_tsv - versions = ch_versions -} \ No newline at end of file From e63e22fea1ec2c240c5299869ed30b058a781827 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Fri, 10 Nov 2023 14:01:48 +1300 Subject: [PATCH 08/59] Now using galaxy containers --- TODO.md | 3 +-- 
modules/local/edta/edta/main.nf | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/TODO.md b/TODO.md index 6f0a836..41d8b8c 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,3 @@ - [ ] Extract subworkflows - [ ] STAR ignores softmasking and, thus, should be fed the unmasked genome so that masking and mapping can run in parallel. -- [ ] Add --eval=reference.gtf -- [ ] Replace quay containers with galaxyproject cache containers. \ No newline at end of file +- [ ] Add --eval=reference.gtf \ No newline at end of file diff --git a/modules/local/edta/edta/main.nf b/modules/local/edta/edta/main.nf index 56fd196..2e6d759 100644 --- a/modules/local/edta/edta/main.nf +++ b/modules/local/edta/edta/main.nf @@ -3,7 +3,7 @@ process EDTA { label "process_high" label "process_week_long" - container 'quay.io/biocontainers/edta:2.1.0--hdfd78af_1' + container 'https://depot.galaxyproject.org/singularity/edta:2.1.0--hdfd78af_1' containerOptions "-B $TMPDIR:$TMPDIR" input: From 19783086736688fe3349c264ac1b83f8f9e67e71 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Fri, 10 Nov 2023 15:44:53 +1300 Subject: [PATCH 09/59] Extracted some subworkflows --- subworkflows/local/align_rnaseq.nf | 70 +++++++ subworkflows/local/fasta_edta.nf | 14 +- subworkflows/local/prepare_assembly.nf | 101 +++++++++ subworkflows/local/preprocess_rnaseq.nf | 95 +++++++++ workflows/pan_gene.nf | 261 ++++-------------------- 5 files changed, 317 insertions(+), 224 deletions(-) create mode 100644 subworkflows/local/align_rnaseq.nf create mode 100644 subworkflows/local/prepare_assembly.nf create mode 100644 subworkflows/local/preprocess_rnaseq.nf diff --git a/subworkflows/local/align_rnaseq.nf b/subworkflows/local/align_rnaseq.nf new file mode 100644 index 0000000..be7f026 --- /dev/null +++ b/subworkflows/local/align_rnaseq.nf @@ -0,0 +1,70 @@ +include { STAR_ALIGN } from '../../modules/nf-core/star/align' +include { SAMTOOLS_CAT } from '../../modules/nf-core/samtools/cat' + +workflow ALIGN_RNASEQ { 
+ take: + reads_target // channel: [ meta, assembly_id ] + trim_reads // channel: [ meta, [ fq ] ] + assembly_index // channel: [ meta2, star_index ] + + main: + // MODULE: STAR_ALIGN + reads_target + | combine(trim_reads, by:0) + | map { meta, assembly, fastq -> + [assembly, [id:"${meta.id}.on.${assembly}", single_end:meta.single_end, target_assembly:assembly], fastq] + } + | combine( + assembly_index.map { meta, index -> [meta.id, index] }, + by:0 + ) + | map { assembly, meta, fastq, index -> [meta, fastq, index] } + | set { ch_star_inputs } + + def seq_platform = false + def seq_center = false + STAR_ALIGN( + ch_star_inputs.map { meta, fastq, index -> [meta, fastq] }, + ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], index] }, + ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], []] }, + star_ignore_sjdbgtf, + seq_platform, + seq_center + ) + .bam_sorted + | set { ch_star_bam } + + // MODULE: SAMTOOLS_CAT + ch_star_bam + | map { meta, bam -> + [ + [id: meta.target_assembly], + bam instanceof List ? 
bam.find {it =~ /Aligned/} : bam + ] + } + | groupTuple + | branch { meta, bamList -> + bams: bamList.size() > 1 + bam: bamList.size() <= 1 + } + | set { ch_star_bam_branch } + + SAMTOOLS_CAT( + ch_star_bam_branch.bams + ) + .bam + | map { meta, bam -> [meta, [bam]] } + | mix( + ch_star_bam_branch.bam + ) + | set { ch_samtools_bam } + + Channel.empty() + | mix(STAR_ALIGN.out.versions.first()) + | mix(SAMTOOLS_CAT.out.versions.first()) + | set { ch_versions } + + emit: + bam = ch_samtools_bam // channel: [ [ id, single_end, target_assembly ], [ bam ] ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/fasta_edta.nf b/subworkflows/local/fasta_edta.nf index f55a958..c47e557 100644 --- a/subworkflows/local/fasta_edta.nf +++ b/subworkflows/local/fasta_edta.nf @@ -4,7 +4,7 @@ include { RESTORE_EDTA_IDS } from '../../modules/local/edta/restore_edta_ids' workflow FASTA_EDTA { take: - genome_fasta // channel: [ meta, /path/fasta ] + genome_fasta // channel: [ meta, fasta ] main: SHORTEN_EDTA_IDS(genome_fasta) @@ -33,11 +33,11 @@ workflow FASTA_EDTA { | set { ch_versions } emit: - te_lib_fasta = RESTORE_EDTA_IDS.out.te_lib_fasta // channel: [ meta, /path/fasta ] - intact_gff3 = RESTORE_EDTA_IDS.out.intact_gff3 // channel: [ meta, /path/gff3 ] - pass_list = RESTORE_EDTA_IDS.out.pass_list // channel: [ meta, /path/pass.list ] - out_file = RESTORE_EDTA_IDS.out.out_file // channel: [ meta, /path/out.file ] - te_anno_gff3 = RESTORE_EDTA_IDS.out.te_anno_gff3 // channel: [ meta, /path/gff3 ] - renamed_ids_tsv = RESTORE_EDTA_IDS.out.renamed_ids_tsv // channel: [ meta, /path/tsv ] + te_lib_fasta = RESTORE_EDTA_IDS.out.te_lib_fasta // channel: [ meta, fasta ] + intact_gff3 = RESTORE_EDTA_IDS.out.intact_gff3 // channel: [ meta, gff3 ] + pass_list = RESTORE_EDTA_IDS.out.pass_list // channel: [ meta, pass.list ] + out_file = RESTORE_EDTA_IDS.out.out_file // channel: [ meta, out.file ] + te_anno_gff3 = 
RESTORE_EDTA_IDS.out.te_anno_gff3 // channel: [ meta, gff3 ] + renamed_ids_tsv = RESTORE_EDTA_IDS.out.renamed_ids_tsv // channel: [ meta, tsv ] versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf new file mode 100644 index 0000000..1cecb72 --- /dev/null +++ b/subworkflows/local/prepare_assembly.nf @@ -0,0 +1,101 @@ +include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_TE_LIBRARY } from '../../modules/nf-core/gunzip' +include { FASTA_VALIDATE } from '../../modules/local/fasta_validate' +include { REPEATMASKER } from '../../modules/kherronism/repeatmasker' +include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' + +include { FASTA_EDTA } from '../subworkflows/local/fasta_edta' + +workflow PREPARE_ASSEMBLY { + take: + target_assembly // channel: [ meta, fasta ] + te_library // channel: [ meta, fasta ] + + main: + // MODULE: GUNZIP_TARGET_ASSEMBLY + target_assembly + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { tech_target_assembly_branch } + + GUNZIP_TARGET_ASSEMBLY( + tech_target_assembly_branch.gz + ) + .gunzip + | mix( + tech_target_assembly_branch.rest + ) + | set { ch_gunzip_target_assembly } + + // MODULE: FASTA_VALIDATE + FASTA_VALIDATE(ch_gunzip_target_assembly) + .valid_fasta + | set { ch_validated_target_assembly } + + // MODULE: GUNZIP_TE_LIBRARY + te_library + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { ch_te_library_branch } + + GUNZIP_TE_LIBRARY( + ch_te_library_branch.gz + ) + .gunzip + | mix( + ch_te_library_branch.rest + ) + | set { ch_gunzip_te_library } + + // SUBWORKFLOW: FASTA_EDTA + ch_validated_target_assembly + | join( + ch_gunzip_te_library, remainder: true + ) + | filter { meta, assembly, teLib -> + teLib == null + } + | map { 
meta, assembly, teLib -> [meta, assembly] } + | FASTA_EDTA + + // MODULE: REPEATMASKER + ch_validated_target_assembly + | join( + FASTA_EDTA.out.te_lib_fasta.mix(ch_gunzip_te_library) + ) + | set { ch_assembly_n_te_lib } + + REPEATMASKER( + ch_assembly_n_te_lib.map { meta, assembly, teLib -> [meta, assembly] }, + ch_assembly_n_te_lib.map { meta, assembly, teLib -> teLib }, + ) + + // MODULE: STAR_GENOMEGENERATE + def star_ignore_sjdbgtf = true + STAR_GENOMEGENERATE( + REPEATMASKER.out.fasta_masked, + REPEATMASKER.out.fasta_masked.map { meta, maskedFasta -> [meta, []] }, + star_ignore_sjdbgtf + ) + .index + | set { ch_assembly_index } + + Channel.empty() + | mix(FASTA_VALIDATE.out.versions.first()) + | mix(GUNZIP_TE_LIBRARY.out.versions.first()) + | mix(FASTA_EDTA.out.versions) + | mix(REPEATMASKER.out.versions.first()) + | mix(STAR_GENOMEGENERATE.out.versions.first()) + | mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) + | set { ch_versions } + + emit: + target_assemby // channel: [ meta, fasta ] + masked_target_assembly = REPEATMASKER.out.fasta_masked // channel: [ meta, fasta ] + target_assemby_index = ch_assembly_index // channel: [ meta, star_index ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/preprocess_rnaseq.nf b/subworkflows/local/preprocess_rnaseq.nf new file mode 100644 index 0000000..8223910 --- /dev/null +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -0,0 +1,95 @@ +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq' +include { SORTMERNA } from '../../modules/nf-core/sortmerna' +include { EXTRACT_SAMPLES } from '../../subworkflows/local/extract_samples' +include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../../subworkflows/nf-core/fastq_fastqc_umitools_fastp' + +workflow PREPROCESS_RNASEQ { + take: + samplesheet // path: csv + permissible_target_assemblies // val: assembly_a,assembly_b + skip_fastqc // val: true|false + skip_fastp // val: true|false + save_trimmed // val: 
true|false + min_trimmed_reads // val: Integer + remove_ribo_rna // val: true|false + sortmerna_fastas // channel: [ [ fasta ] ] + + main: + ch_versions = Channel.empty() + // SUBWORKFLOW: EXTRACT_SAMPLES + EXTRACT_SAMPLES( + samplesheet, + ch_permissible_target_assemblies + ) + .reads + | map { meta, fastq -> + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], fastq ] + } + | groupTuple() + | branch { meta, fastq -> + single : fastq.size() == 1 + return [ meta, fastq.flatten() ] + multiple: fastq.size() > 1 + return [ meta, fastq.flatten() ] + } + | set { ch_fastq } + + EXTRACT_SAMPLES.out.assemblies + | map { meta, assembly -> + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], assembly ] + } + | unique + | set { ch_reads_target } + + // MODULES: CAT_FASTQ + CAT_FASTQ ( + ch_fastq.multiple + ) + .reads + | mix(ch_fastq.single) + | set { ch_cat_fastq } + + // SUBWORKFLOW: FASTQ_FASTQC_UMITOOLS_FASTP + def with_umi = false + def skip_umi_extract = true + def umi_discard_read = false + FASTQ_FASTQC_UMITOOLS_FASTP ( + ch_cat_fastq, + skip_fastqc, + with_umi, + skip_umi_extract, + umi_discard_read, + skip_fastp, + [], + save_trimmed, + save_trimmed, + min_trimmed_reads + ) + .reads + | set { ch_trim_reads } + + // MODULE: SORTMERNA + if (remove_ribo_rna) { + SORTMERNA ( + ch_trim_reads, + sortmerna_fastas + ) + .reads + | set { ch_sortmerna_reads } + + ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) + } + + ch_versions + | mix(EXTRACT_SAMPLES.out.versions) + | mix(CAT_FASTQ.out.versions.first()) + | mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) + | set { ch_versions } + + emit: + trim_reads = remove_ribo_rna ? 
ch_sortmerna_reads : ch_trim_reads // channel: [ meta, [ fq ] ] + reads_target = ch_reads_target // channel: [ meta, assembly_id ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index 2198765..a0dc6e8 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -1,26 +1,17 @@ nextflow.enable.dsl=2 -include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../modules/nf-core/gunzip' -include { GUNZIP as GUNZIP_TE_LIBRARY } from '../modules/nf-core/gunzip' + include { GUNZIP as GUNZIP_EXTERNAL_PROTEIN_SEQ } from '../modules/nf-core/gunzip' -include { FASTA_VALIDATE } from '../modules/local/fasta_validate' -include { REPEATMASKER } from '../modules/kherronism/repeatmasker' -include { STAR_GENOMEGENERATE } from '../modules/nf-core/star/genomegenerate' -include { CAT_FASTQ } from '../modules/nf-core/cat/fastq' -include { SORTMERNA } from '../modules/nf-core/sortmerna' -include { STAR_ALIGN } from '../modules/nf-core/star/align' -include { SAMTOOLS_CAT } from '../modules/nf-core/samtools/cat' include { CAT_CAT as CAT_PROTEIN_SEQS } from '../modules/nf-core/cat/cat' include { BRAKER3 } from '../modules/kherronism/braker3' include { GUNZIP as GUNZIP_XREF_FASTA } from '../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_XREF_GFF } from '../modules/nf-core/gunzip' - -include { PERFORM_EDTA_ANNOTATION } from '../subworkflows/local/perform_edta_annotation' -include { EXTRACT_SAMPLES } from '../subworkflows/local/extract_samples' -include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../subworkflows/nf-core/fastq_fastqc_umitools_fastp' - include { validateParams } from '../modules/local/validate_params' +include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' +include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' +include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' + validateParams(params) // Additional validation @@ -34,230 +25,66 @@ 
workflow PAN_GENE { // Versions ch_versions = Channel.empty() - - // MODULE: GUNZIP_TARGET_ASSEMBLY + + // Input channels Channel.fromList(params.target_assemblies) | map { tag, filePath -> [[id:tag], file(filePath, checkIfExists: true)] } - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_target_assemblies } - - GUNZIP_TARGET_ASSEMBLY( - ch_target_assemblies.gz - ) - .gunzip - | mix( - ch_target_assemblies.rest - ) - | set { ch_gunzip_target_assemblies } - - ch_versions = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) - - // MODULE: FASTA_VALIDATE - FASTA_VALIDATE(ch_gunzip_target_assemblies) - .valid_fasta - | set { ch_validated_target_assemblies } - - ch_versions = ch_versions.mix(FASTA_VALIDATE.out.versions.first()) + | set { ch_target_assembly } - // MODULE: GUNZIP_TE_LIBRARY Channel.fromList(params.te_libraries) | map { tag, filePath -> [[id:tag], file(filePath, checkIfExists: true)] } - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_te_libraries } - - GUNZIP_TE_LIBRARY( - ch_te_libraries.gz - ) - .gunzip - | mix( - ch_te_libraries.rest - ) - | set { ch_gunzip_te_libraries } - - ch_versions = ch_versions.mix(GUNZIP_TE_LIBRARY.out.versions.first()) + | set { ch_te_library } - // SUBWORKFLOW: PERFORM_EDTA_ANNOTATION - ch_validated_target_assemblies - | join( - ch_gunzip_te_libraries, remainder: true - ) - | filter { meta, assembly, teLib -> - teLib == null + ch_samplesheet = Channel.empty() + if(params.samplesheet) { + ch_samplesheet = Channel.fromPath(params.samplesheet) } - | map {meta, assembly, teLib -> [meta, assembly]} - | PERFORM_EDTA_ANNOTATION - - ch_versions = ch_versions.mix(PERFORM_EDTA_ANNOTATION.out.versions) - // MODULE: REPEATMASKER - ch_validated_target_assemblies - | join( - PERFORM_EDTA_ANNOTATION.out.te_lib_fasta.mix(ch_gunzip_te_libraries) - ) - | set { ch_assemblies_n_te_libs } - - REPEATMASKER( - 
ch_assemblies_n_te_libs.map { meta, assembly, teLib -> [meta, assembly] }, - ch_assemblies_n_te_libs.map { meta, assembly, teLib -> teLib }, - ) + Channel.of(params.target_assemblies.collect { tag, fastaPath -> tag.strip() }.join(",")) + | set { ch_permissible_target_assemblies } - ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) + Channel.from(ch_ribo_db.readLines()) + | map { row -> file(row, checkIfExists: true) } + | collect + | set { ch_sortmerna_fastas } - // MODULE: STAR_GENOMEGENERATE - def star_ignore_sjdbgtf = true - STAR_GENOMEGENERATE( - REPEATMASKER.out.fasta_masked, - REPEATMASKER.out.fasta_masked.map { meta, maskedFasta -> [meta, []] }, - star_ignore_sjdbgtf + // SUBWORKFLOW: PREPARE_ASSEMBLY + PREPARE_ASSEMBLY( + ch_target_assembly, + ch_te_library ) - .index - | set { ch_assembly_index } - ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions.first()) + ch_valid_target_assembly = PREPARE_ASSEMBLY.out.target_assemby + ch_masked_target_assembly = PREPARE_ASSEMBLY.out.masked_target_assembly + ch_target_assemby_index = PREPARE_ASSEMBLY.out.target_assemby_index + ch_versions = ch_versions.mix(PREPARE_ASSEMBLY.out.versions) - // SUBWORKFLOW: EXTRACT_SAMPLES - ch_samplesheet_path = Channel.empty() - if(params.samplesheet != null) { - ch_samplesheet_path = Channel.fromPath(params.samplesheet) - } - - EXTRACT_SAMPLES( - ch_samplesheet_path, - Channel.of(params.target_assemblies.collect { tag, fastaPath -> tag.strip() }.join(",")) + // SUBWORKFLOW: PREPROCESS_RNASEQ + PREPROCESS_RNASEQ( + ch_samplesheet, + ch_permissible_target_assemblies, + params.skip_fastqc, + params.skip_fastp, + params.save_trimmed, + params.min_trimmed_reads, + params.remove_ribo_rna, + ch_sortmerna_fastas ) - .reads - | map { meta, fastq -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], fastq ] - } - | groupTuple() - | branch { meta, fastq -> - single : fastq.size() == 1 - return [ meta, fastq.flatten() ] - multiple: fastq.size() > 1 - return [ meta, 
fastq.flatten() ] - } - | set { ch_fastq } - ch_read_target_assemblies = EXTRACT_SAMPLES.out.assemblies - ch_versions = ch_versions.mix(EXTRACT_SAMPLES.out.versions) + ch_trim_reads = PREPROCESS_RNASEQ.out.trim_reads + ch_reads_target = PREPROCESS_RNASEQ.out.reads_target + ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) - // MODULES: CAT_FASTQ - CAT_FASTQ ( - ch_fastq.multiple - ) - .reads - | mix(ch_fastq.single) - | set { ch_cat_fastq } - - ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first()) - - // SUBWORKFLOW: FASTQ_FASTQC_UMITOOLS_FASTP - def with_umi = false - def skip_umi_extract = true - def umi_discard_read = false - FASTQ_FASTQC_UMITOOLS_FASTP ( - ch_cat_fastq, - params.sample_prep.skip_fastqc, - with_umi, - skip_umi_extract, - umi_discard_read, - params.sample_prep.skip_fastp, - [], - params.sample_prep.save_trimmed, - params.sample_prep.save_trimmed, - params.sample_prep.min_trimmed_reads + // SUBWORKFLOW: STAR_ALIGN + ALIGN_RNASEQ( + ch_reads_target, + ch_trim_reads, + ch_target_assemby_index ) - .reads - | set { ch_trim_reads } - - ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) - - // MODULE: SORTMERNA - if (params.sample_prep.remove_ribo_rna) { - Channel.from(ch_ribo_db.readLines()) - | map { row -> file(row, checkIfExists: true) } - | collect - | set { ch_sortmerna_fastas } - - SORTMERNA ( - ch_trim_reads, - ch_sortmerna_fastas - ) - .reads - | set { ch_trim_reads } - - ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) - } - - // MODULE: STAR_ALIGN - ch_read_target_assemblies - | map { meta, assembly -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], assembly ] - } - | unique - | combine(ch_trim_reads, by:0) - | map { meta, assembly, fastq -> - [assembly, [id:"${meta.id}.on.${assembly}", single_end:meta.single_end, target_assembly:assembly], fastq] - } - | combine( - ch_assembly_index.map { meta, index -> [meta.id, index] }, - by:0 - ) - | map { assembly, meta, fastq, index -> 
[meta, fastq, index] } - | set { ch_star_inputs } - - def seq_platform = false - def seq_center = false - STAR_ALIGN( - ch_star_inputs.map { meta, fastq, index -> [meta, fastq] }, - ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], index] }, - ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], []] }, - star_ignore_sjdbgtf, - seq_platform, - seq_center - ) - .bam_sorted - | set { ch_star_bam } - - ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) - - // MODULE: SAMTOOLS_CAT - ch_star_bam - | map { meta, bam -> - [ - [id: meta.target_assembly], - bam instanceof List ? bam.find {it =~ /Aligned/} : bam - ] - } - | groupTuple - | branch { meta, bamList -> - bams: bamList.size() > 1 - bam: bamList.size() <= 1 - } - | set { ch_star_bam_branch } - - SAMTOOLS_CAT( - ch_star_bam_branch.bams - ) - .bam.map { meta, bam -> [meta, [bam]] } - | mix( - ch_star_bam_branch.bam - ) - | set { ch_samtools_bam } - - ch_versions = ch_versions.mix(SAMTOOLS_CAT.out.versions.first()) // MODULE: GUNZIP_EXTERNAL_PROTEIN_SEQ ch_ext_prot_seqs = Channel.empty() From 10a015815b4839823af3571a66bfb5e9aa95b631 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 13 Nov 2023 12:20:37 +1300 Subject: [PATCH 10/59] Extracted a few subworkflows --- TODO.md | 2 - conf/modules.config | 28 +- modules/local/edta/shorten_edta_ids/main.nf | 14 - modules/local/validate_params/main.nf | 6 + nextflow.config | 90 +++--- subworkflows/local/align_rnaseq.nf | 5 +- subworkflows/local/prepare_assembly.nf | 8 +- subworkflows/local/prepare_ext_prots.nf | 40 +++ subworkflows/local/preprocess_rnaseq.nf | 2 +- workflows/pan_gene.nf | 297 +++++++++----------- 10 files changed, 235 insertions(+), 257 deletions(-) create mode 100644 subworkflows/local/prepare_ext_prots.nf diff --git a/TODO.md b/TODO.md index 41d8b8c..6e1e66c 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1 @@ -- [ ] Extract subworkflows -- [ ] STAR ignores softmasking and, thus, should be fed the unmasked 
genome so that masking and mapping can run in parallel. - [ ] Add --eval=reference.gtf \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 132ffe9..6683fcc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,7 +1,7 @@ process { withName: 'EDTA' { ext.args = [ - params.edta.is_sensitive ? "--sensitive 1" : "--sensitive 0", + params.edta_is_sensitive ? "--sensitive 1" : "--sensitive 0", "--anno 0", "--force 1" ].join(' ').trim() @@ -12,7 +12,7 @@ process { path: { "${params.outdir}/edta/${meta.id}" }, mode: "copy", saveAs: { filename -> filename.equals("versions.yml") ? null : filename }, - enabled: params.edta.save_outputs + enabled: params.edta_save_outputs ] } @@ -26,12 +26,12 @@ process { path: { "${params.outdir}/repeatmasker" }, mode: "copy", saveAs: { filename -> filename.equals("versions.yml") ? null : filename }, - enabled: params.repeatmasker.save_outputs + enabled: params.repeatmasker_save_outputs ] } } -if(!params.sample_prep.skip_fastqc) { +if(!params.skip_fastqc) { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW' { ext.args = '--quiet' @@ -48,10 +48,10 @@ if(!params.sample_prep.skip_fastqc) { } } -if(!params.sample_prep.skip_fastp) { +if(!params.skip_fastp) { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTP' { - ext.args = params.sample_prep.extra_fastp_args ?: '' + ext.args = params.extra_fastp_args ?: '' publishDir = [ [ path: { "${params.outdir}/fastp/html" }, @@ -72,14 +72,14 @@ if(!params.sample_prep.skip_fastp) { path: { "${params.outdir}/fastp" }, mode: "copy", pattern: "*.fastq.gz", - enabled: params.sample_prep.save_trimmed + enabled: params.save_trimmed ] ] } } } -if (params.sample_prep.remove_ribo_rna) { +if (params.remove_ribo_rna) { process { withName: 'SORTMERNA' { ext.args = '--num_alignments 1 -v' @@ -93,7 +93,7 @@ if (params.sample_prep.remove_ribo_rna) { path: { "${params.outdir}/sortmerna" }, mode: "copy", pattern: "*.fastq.gz", - enabled: 
params.sample_prep.save_non_ribo_reads + enabled: params.save_non_ribo_reads ] ] } @@ -106,22 +106,22 @@ process { "--outSAMstrandField intronMotif", "--outSAMtype BAM SortedByCoordinate", "--readFilesCommand gunzip -c", - "--alignIntronMax ${params.star_align.max_intron_length}", - params.star_align.extra_star_align_args ? params.star_align.extra_star_align_args.split("\\s(?=--)") : '' + "--alignIntronMax ${params.star_max_intron_length}", + params.star_align_extra_args ? params.star_align_extra_args.split("\\s(?=--)") : '' ].flatten().unique(false).join(' ').trim() ext.prefix = { "${meta.id}" } publishDir = [ path: { "${params.outdir}/star/alignment" }, mode: "copy", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.star_align.save_outputs + enabled: params.star_save_outputs ] } withName: BRAKER3 { ext.args = [ "--gff3", - params.braker.extra_braker_args ? params.braker.extra_braker_args.split("\\s(?=--)") : '' + params.braker_extra_args ? params.braker_extra_args.split("\\s(?=--)") : '' ].flatten().unique(false).join(' ').trim() ext.prefix = { "${meta.id}" } publishDir = [ @@ -132,7 +132,7 @@ process { } } -if(params.liftoff.xref_annotations) { +if(params.liftoff_xref_annotations) { process { withName: LIFTOFF { ext.args = '-exclude_partial -copies' diff --git a/modules/local/edta/shorten_edta_ids/main.nf b/modules/local/edta/shorten_edta_ids/main.nf index e216ce4..43b94f0 100644 --- a/modules/local/edta/shorten_edta_ids/main.nf +++ b/modules/local/edta/shorten_edta_ids/main.nf @@ -27,18 +27,4 @@ process SHORTEN_EDTA_IDS { shorten_fasta_ids: \$(md5sum \$(which shorten_fasta_ids.py) | cut -d' ' -f1) END_VERSIONS """ - - stub: - """ - FILE="$fasta_file" - output_prefix="\${FILE%.*}" - - touch "\${output_prefix}.renamed.ids.fa" - touch "\${output_prefix}.renamed.ids.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - shorten_fasta_ids: \$(md5sum \$(which shorten_fasta_ids.py) | cut -d' ' -f1) - END_VERSIONS 
- """ } \ No newline at end of file diff --git a/modules/local/validate_params/main.nf b/modules/local/validate_params/main.nf index 7b5697f..e86302e 100644 --- a/modules/local/validate_params/main.nf +++ b/modules/local/validate_params/main.nf @@ -3,6 +3,12 @@ def validateParams(params) { validateTETags(params) validateTEFastaCorrespondence(params) + + if (params.remove_ribo_rna) { + ch_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) + + if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"} + } } def validateFastaTags(params) { diff --git a/nextflow.config b/nextflow.config index daf1eef..3c630e9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,85 +5,65 @@ params { ["red5_v2p1", "/workspace/hrauxr/pan-gene/.test/red5_v2p1_chr1.fasta"], ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.chr1.fsa.gz"] ] - // FASTA files (fasta, fasta.gz) for the assemblies to annotate - // - // Pattern: [["tag", "file path"]] - // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; - // Any name with alphanumeric characters including "_". - // "." is not allowed in the tag name - // Unique, short tags are recommended. - // Otherwise, some of the plots in the report may not display correctly. - // Examples: - // target_assemblies = [["tag1", "./a/relative/path/to/the/fasta/file.fasta"], - // ["tag2", "./a/relative/path/to/the/fasta/file2.fasta"], - // ["tag3", "https://ftp.ncbi.nlm.nih.gov/genomes/test_genome.fna"], ...] - // target_assemblies = [["tair10", "/an/absolute/path/to/the/fasta/file.fasta"]] + // Pattern: [ [tag, fasta(.gz) ] ] + // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; + // Any name with alphanumeric characters including "_". + // "." 
is not allowed in the tag name te_libraries = [ ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.TElib.fa.gz"] ] - // TE libs (fasta, fasta.gz) for target_assemblies - // - // Optional Set to [] if libraries are not available, te_libraries = [] + // Pattern: [ [tag, fasta(.gz) ] ] + // Optional Set to null if libraries are not available. // // Each TE library should have an associated (by tag) assembly in target_assemblies. // Not all target_assemblies need to have an associated (by tag) TE library. // When the TE lib is not available for a traget assembly, EDTA is used to create one. - edta { - is_sensitive = false - save_outputs = true - } - repeatmasker { - save_outputs = true - } + edta_is_sensitive = false + edta_save_outputs = false + + repeatmasker_save_outputs = true samplesheet = "./.test/samplesheet.csv" // Optional: Set to null if not available - sample_prep { - skip_fastqc = false - skip_fastp = false - min_trimmed_reads = 10000 - extra_fastp_args = "" + skip_fastqc = false + skip_fastp = false + min_trimmed_reads = 10000 + extra_fastp_args = "" - save_trimmed = false - // toggling this parameter results in rerun of FASTP and FASTQC_TRIM + save_trimmed = false + // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - remove_ribo_rna = true - save_non_ribo_reads = false - ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" - } + remove_ribo_rna = true + save_non_ribo_reads = false + ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" - star_align { - max_intron_length = 16000 - extra_star_align_args = "" - save_outputs = false - } + star_max_intron_length = 16000 + star_align_extra_args = "" + star_save_outputs = false - external_protein_seqs = [ + external_protein_fastas = [ "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.pep.fasta" ] // Optional: Set to null if 
not available - braker { - extra_braker_args = "" - } + braker_extra_args = "" - liftoff { - xref_annotations = [ - [ - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" - ], - [ - "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", - "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" - ] + liftoff_xref_annotations = [ + [ + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" + ], + [ + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" ] - // Optional: Set to null if not available - } + ] + // Format: [ [ fasta(.gz), gff3(.gz) ] ] + // Optional: Set to null if not available outdir = "./results" diff --git a/subworkflows/local/align_rnaseq.nf b/subworkflows/local/align_rnaseq.nf index be7f026..c0a9039 100644 --- a/subworkflows/local/align_rnaseq.nf +++ b/subworkflows/local/align_rnaseq.nf @@ -21,8 +21,9 @@ workflow ALIGN_RNASEQ { | map { assembly, meta, fastq, index -> [meta, fastq, index] } | set { ch_star_inputs } - def seq_platform = false - def seq_center = false + def star_ignore_sjdbgtf = true + def seq_platform = false + def seq_center = false STAR_ALIGN( ch_star_inputs.map { meta, fastq, index -> [meta, fastq] }, ch_star_inputs.map { meta, fastq, index -> [[id: 
meta.target_assembly], index] }, diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index 1cecb72..7469afc 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -4,7 +4,7 @@ include { FASTA_VALIDATE } from '../../modules/local/fast include { REPEATMASKER } from '../../modules/kherronism/repeatmasker' include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' -include { FASTA_EDTA } from '../subworkflows/local/fasta_edta' +include { FASTA_EDTA } from '../../subworkflows/local/fasta_edta' workflow PREPARE_ASSEMBLY { take: @@ -77,8 +77,8 @@ workflow PREPARE_ASSEMBLY { // MODULE: STAR_GENOMEGENERATE def star_ignore_sjdbgtf = true STAR_GENOMEGENERATE( - REPEATMASKER.out.fasta_masked, - REPEATMASKER.out.fasta_masked.map { meta, maskedFasta -> [meta, []] }, + ch_validated_target_assembly, + ch_validated_target_assembly.map { meta, maskedFasta -> [meta, []] }, star_ignore_sjdbgtf ) .index @@ -94,7 +94,7 @@ workflow PREPARE_ASSEMBLY { | set { ch_versions } emit: - target_assemby // channel: [ meta, fasta ] + target_assemby = ch_validated_target_assembly // channel: [ meta, fasta ] masked_target_assembly = REPEATMASKER.out.fasta_masked // channel: [ meta, fasta ] target_assemby_index = ch_assembly_index // channel: [ meta, star_index ] versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/prepare_ext_prots.nf b/subworkflows/local/prepare_ext_prots.nf new file mode 100644 index 0000000..5109064 --- /dev/null +++ b/subworkflows/local/prepare_ext_prots.nf @@ -0,0 +1,40 @@ +include { GUNZIP } from '../../modules/nf-core/gunzip' +include { CAT_CAT as CAT_PROTEIN_FASTAS } from '../../modules/nf-core/cat/cat' + +workflow PREPARE_EXT_PROTS { + take: + ch_ext_prot_fastas // Channel: [ meta, fasta ] + + main: + ch_ext_prot_fastas + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { 
ch_ext_prot_seqs_branch } + + // MODULE: GUNZIP + GUNZIP( + ch_ext_prot_seqs_branch.gz + ) + .gunzip + | mix( + ch_ext_prot_seqs_branch.rest + ) + | set { ch_ext_prot_gunzip_fastas } + + // MODULE: CAT_PROTEIN_FASTAS + ch_ext_prot_gunzip_fastas + | map { meta, filePath -> filePath } + | collect + | map { fileList -> [[id:"ext_protein_seqs"], fileList] } + | CAT_PROTEIN_FASTAS + + GUNZIP.out.versions.first() + | mix(CAT_PROTEIN_FASTAS.out.versions) + | set { ch_versions } + + emit: + ext_prots_fasta = CAT_PROTEIN_FASTAS.out.file_out // Channel: [ meta, fasta ] + versions = ch_versions // Channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/preprocess_rnaseq.nf b/subworkflows/local/preprocess_rnaseq.nf index 8223910..7a82786 100644 --- a/subworkflows/local/preprocess_rnaseq.nf +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -19,7 +19,7 @@ workflow PREPROCESS_RNASEQ { // SUBWORKFLOW: EXTRACT_SAMPLES EXTRACT_SAMPLES( samplesheet, - ch_permissible_target_assemblies + permissible_target_assemblies ) .reads | map { meta, fastq -> diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index a0dc6e8..37e5920 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -1,8 +1,5 @@ nextflow.enable.dsl=2 - -include { GUNZIP as GUNZIP_EXTERNAL_PROTEIN_SEQ } from '../modules/nf-core/gunzip' -include { CAT_CAT as CAT_PROTEIN_SEQS } from '../modules/nf-core/cat/cat' include { BRAKER3 } from '../modules/kherronism/braker3' include { GUNZIP as GUNZIP_XREF_FASTA } from '../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_XREF_GFF } from '../modules/nf-core/gunzip' @@ -11,46 +8,48 @@ include { validateParams } from '../modules/local/validat include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' +include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' 
validateParams(params) -// Additional validation -// Check rRNA databases for sortmerna -if (params.sample_prep.remove_ribo_rna) { - ch_ribo_db = file(params.sample_prep.ribo_database_manifest, checkIfExists: true) - if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${ch_ribo_db.getName()}!"} -} - workflow PAN_GENE { - // Versions - ch_versions = Channel.empty() - - // Input channels - Channel.fromList(params.target_assemblies) - | map { tag, filePath -> - [[id:tag], file(filePath, checkIfExists: true)] - } - | set { ch_target_assembly } - - Channel.fromList(params.te_libraries) - | map { tag, filePath -> - [[id:tag], file(filePath, checkIfExists: true)] - } - | set { ch_te_library } - - ch_samplesheet = Channel.empty() - if(params.samplesheet) { - ch_samplesheet = Channel.fromPath(params.samplesheet) - } - - Channel.of(params.target_assemblies.collect { tag, fastaPath -> tag.strip() }.join(",")) - | set { ch_permissible_target_assemblies } + ch_versions = Channel.empty() - Channel.from(ch_ribo_db.readLines()) - | map { row -> file(row, checkIfExists: true) } - | collect - | set { ch_sortmerna_fastas } + ch_target_assembly = Channel.fromList(params.target_assemblies) + | map { tag, filePath -> + [[id:tag], file(filePath, checkIfExists: true)] + } + + ch_te_library = Channel.fromList(params.te_libraries) + | map { tag, filePath -> + [[id:tag], file(filePath, checkIfExists: true)] + } + + ch_samplesheet = params.samplesheet + ? Channel.fromPath(params.samplesheet, checkIfExists: true) + : Channel.empty() + + ch_tar_assm_str = Channel.of( + params.target_assemblies + .collect { tag, fastaPath -> tag.strip() }.join(",") + ) + + ch_ribo_db = params.remove_ribo_rna + ? file(params.ribo_database_manifest, checkIfExists: true) + : Channel.empty() + + ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines()) + | map { row -> file(row, checkIfExists: true) } + | collect + + ch_ext_prot_fastas = (params.external_protein_fastas + ? 
Channel.fromList(params.external_protein_fastas) + : Channel.empty()) + | map { filePath -> + def fileHandle = file(filePath, checkIfExists: true) + [[id:fileHandle.getSimpleName()], fileHandle] + } // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( @@ -66,7 +65,7 @@ workflow PAN_GENE { // SUBWORKFLOW: PREPROCESS_RNASEQ PREPROCESS_RNASEQ( ch_samplesheet, - ch_permissible_target_assemblies, + ch_tar_assm_str, params.skip_fastqc, params.skip_fastp, params.save_trimmed, @@ -79,147 +78,115 @@ workflow PAN_GENE { ch_reads_target = PREPROCESS_RNASEQ.out.reads_target ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) - // SUBWORKFLOW: STAR_ALIGN + // SUBWORKFLOW: ALIGN_RNASEQ ALIGN_RNASEQ( ch_reads_target, ch_trim_reads, ch_target_assemby_index ) - // MODULE: GUNZIP_EXTERNAL_PROTEIN_SEQ - ch_ext_prot_seqs = Channel.empty() - if(params.external_protein_seqs) { - ch_ext_prot_seqs = Channel.fromList(params.external_protein_seqs) - } - - ch_ext_prot_seqs - | map { filePath -> - def fileHandle = file(filePath, checkIfExists: true) - [[id:fileHandle.getSimpleName()], fileHandle] - } - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_ext_prot_seqs_branch } - - GUNZIP_EXTERNAL_PROTEIN_SEQ( - ch_ext_prot_seqs_branch.gz - ) - .gunzip - | mix( - ch_ext_prot_seqs_branch.rest - ) - | set { ch_ext_prot_seqs } + ch_rnaseq_bam = ALIGN_RNASEQ.out.bam + ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) - ch_versions = ch_versions.mix(GUNZIP_EXTERNAL_PROTEIN_SEQ.out.versions.first()) + // MODULE: PREPARE_EXT_PROTS + PREPARE_EXT_PROTS( + ch_ext_prot_fastas + ) - // MODULE: CAT_PROTEIN_SEQS - ch_ext_prot_seqs - | map { meta, filePath -> filePath } - | collect - | map { fileList -> [[id:"protein_seqs"], fileList] } - | CAT_PROTEIN_SEQS - - ch_ext_prot_seqs = CAT_PROTEIN_SEQS.out.file_out - ch_versions = ch_versions.mix(CAT_PROTEIN_SEQS.out.versions) + ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta + 
ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) // MODULE: BRAKER3 - REPEATMASKER.out.fasta_masked - | mix(ch_samtools_bam) - | groupTuple(size: 2, remainder: true) - | map { meta, groupedItems -> - def maskedFasta = groupedItems[0] - - if(groupedItems.size() == 2) { - def bam = groupedItems[1] - return [meta, maskedFasta, bam] - } else { - return [meta, maskedFasta, []] - } - } - | set { ch_braker_inputs } + // ch_braker_inputs = REPEATMASKER.out.fasta_masked + // | mix(ch_rnaseq_bam) + // | groupTuple(size: 2, remainder: true) + // | map { meta, groupedItems -> + // def maskedFasta = groupedItems[0] + // def bam = (groupedItems.size() == 2) ? groupedItems[1] : [] + + // [meta, maskedFasta, bam] + // } - if(params.external_protein_seqs) { - ch_braker_inputs - | combine(ch_ext_prot_seqs.map{meta, filePath -> filePath}) - | set { ch_braker_inputs } - } else { - ch_braker_inputs - | map{meta, assembly, bam -> [meta, assembly, bam, []]} - | set { ch_braker_inputs } - } + // if(params.external_protein_fastas) { + // ch_braker_inputs + // | combine(ch_ext_prot_seqs.map{meta, filePath -> filePath}) + // | set { ch_braker_inputs } + // } else { + // ch_braker_inputs + // | map{meta, assembly, bam -> [meta, assembly, bam, []]} + // | set { ch_braker_inputs } + // } - ch_fasta = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> [meta, assembly] } - ch_bam = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> bam } - ch_proteins = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> proteinSeq } - ch_rnaseq_sets_dirs = [] - ch_rnaseq_sets_ids = [] - ch_hintsfile = [] - - BRAKER3( - ch_fasta, - ch_bam, - ch_rnaseq_sets_dirs, - ch_rnaseq_sets_ids, - ch_proteins, - ch_hintsfile - ) + // ch_fasta = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> [meta, assembly] } + // ch_bam = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> bam } + // ch_proteins = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> proteinSeq } + // 
ch_rnaseq_sets_dirs = [] + // ch_rnaseq_sets_ids = [] + // ch_hintsfile = [] + + // BRAKER3( + // ch_fasta, + // ch_bam, + // ch_rnaseq_sets_dirs, + // ch_rnaseq_sets_ids, + // ch_proteins, + // ch_hintsfile + // ) - ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) - - // MODULE: GUNZIP_XREF_FASTA - ch_xref_annotations = Channel.empty() - if(params.liftoff.xref_annotations) { - Channel.fromList(params.liftoff.xref_annotations) - | multiMap { fasta, gff -> - def fastaFile = file(fasta, checkIfExists:true) - def meta = [id:fastaFile.getSimpleName()] - - fasta: [meta, fastaFile] - gff: [meta, file(gff, checkIfExists:true)] - } - | set { ch_xref_annotations } - } - - ch_xref_annotations.fasta - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_xref_annotations_branch } - - GUNZIP_XREF_FASTA( - ch_xref_annotations_branch.gz - ) - .gunzip - | mix( - ch_xref_annotations_branch.rest - ) - | set { ch_xref_annotations_fasta } - - // MODULE: GUNZIP_XREF_GFF - ch_xref_annotations.gff - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_xref_annotations_gff_branch } - - GUNZIP_XREF_GFF( - ch_xref_annotations_gff_branch.gff.map { meta, fasta, gff -> [meta, gff] } - ) - .gunzip - | mix( - ch_xref_annotations_gff_branch.rest.map { meta, fasta, gff -> [meta, gff] } - ) - | set { ch_xref_annotations_gff } + // ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + + // // MODULE: GUNZIP_XREF_FASTA + // ch_xref_annotations = Channel.empty() + // if(params.liftoff_xref_annotations) { + // Channel.fromList(params.liftoff_xref_annotations) + // | multiMap { fasta, gff -> + // def fastaFile = file(fasta, checkIfExists:true) + // def meta = [id:fastaFile.getSimpleName()] + + // fasta: [meta, fastaFile] + // gff: [meta, file(gff, checkIfExists:true)] + // } + // | set { ch_xref_annotations } + // } + + // ch_xref_annotations.fasta + // | branch { meta, file -> + // 
gz: "$file".endsWith(".gz") + // rest: !"$file".endsWith(".gz") + // } + // | set { ch_xref_annotations_branch } + + // GUNZIP_XREF_FASTA( + // ch_xref_annotations_branch.gz + // ) + // .gunzip + // | mix( + // ch_xref_annotations_branch.rest + // ) + // | set { ch_xref_annotations_fasta } + + // // MODULE: GUNZIP_XREF_GFF + // ch_xref_annotations.gff + // | branch { meta, file -> + // gz: "$file".endsWith(".gz") + // rest: !"$file".endsWith(".gz") + // } + // | set { ch_xref_annotations_gff_branch } + + // GUNZIP_XREF_GFF( + // ch_xref_annotations_gff_branch.gff.map { meta, fasta, gff -> [meta, gff] } + // ) + // .gunzip + // | mix( + // ch_xref_annotations_gff_branch.rest.map { meta, fasta, gff -> [meta, gff] } + // ) + // | set { ch_xref_annotations_gff } - ch_xref_annotations_fasta - | join( - ch_xref_annotations_gff - ) - | set { ch_xref_annotations } + // ch_xref_annotations_fasta + // | join( + // ch_xref_annotations_gff + // ) + // | set { ch_xref_annotations } // // MODULE: LIFTOFF // ch_xref_annotations From 20317902969f86650d491fad1d3b916cb7677e2c Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 13 Nov 2023 13:08:23 +1300 Subject: [PATCH 11/59] Extracted subworkflows uptill BRAKER3 --- modules/local/validate_params/main.nf | 13 +++++- nextflow.config | 2 +- subworkflows/local/prepare_ext_prots.nf | 3 +- workflows/pan_gene.nf | 59 +++++++++---------------- 4 files changed, 36 insertions(+), 41 deletions(-) diff --git a/modules/local/validate_params/main.nf b/modules/local/validate_params/main.nf index e86302e..5933dfe 100644 --- a/modules/local/validate_params/main.nf +++ b/modules/local/validate_params/main.nf @@ -5,9 +5,9 @@ def validateParams(params) { validateTEFastaCorrespondence(params) if (params.remove_ribo_rna) { - ch_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) + file_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) - if (ch_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is 
empty: ${ch_ribo_db.getName()}!"} + if (file_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${file_ribo_db.getName()}!"} } } @@ -32,6 +32,11 @@ def validateFastaTags(params) { } def validateTETags(params) { + + if(!params["te_libraries"]) { + return + } + def listOfTETuples = params["te_libraries"] if (listOfTETuples.isEmpty()) { @@ -52,6 +57,10 @@ def validateTETags(params) { } def validateTEFastaCorrespondence(params) { + + if(!params["te_libraries"]) { + return + } def listOfTETuples = params["te_libraries"] def listOfFastaTuples = params["target_assemblies"] diff --git a/nextflow.config b/nextflow.config index 3c630e9..002f73a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -36,7 +36,7 @@ params { save_trimmed = false // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - remove_ribo_rna = true + remove_ribo_rna = false save_non_ribo_reads = false ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" diff --git a/subworkflows/local/prepare_ext_prots.nf b/subworkflows/local/prepare_ext_prots.nf index 5109064..d14c60b 100644 --- a/subworkflows/local/prepare_ext_prots.nf +++ b/subworkflows/local/prepare_ext_prots.nf @@ -30,7 +30,8 @@ workflow PREPARE_EXT_PROTS { | map { fileList -> [[id:"ext_protein_seqs"], fileList] } | CAT_PROTEIN_FASTAS - GUNZIP.out.versions.first() + Channel.empty() + | mix(GUNZIP.out.versions.first()) | mix(CAT_PROTEIN_FASTAS.out.versions) | set { ch_versions } diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index 37e5920..acfc77a 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -37,9 +37,9 @@ workflow PAN_GENE { ch_ribo_db = params.remove_ribo_rna ? file(params.ribo_database_manifest, checkIfExists: true) - : Channel.empty() + : null - ch_sortmerna_fastas = Channel.from(ch_ribo_db.readLines()) + ch_sortmerna_fastas = Channel.from(ch_ribo_db ? 
ch_ribo_db.readLines() : null) | map { row -> file(row, checkIfExists: true) } | collect @@ -97,43 +97,28 @@ workflow PAN_GENE { ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) // MODULE: BRAKER3 - // ch_braker_inputs = REPEATMASKER.out.fasta_masked - // | mix(ch_rnaseq_bam) - // | groupTuple(size: 2, remainder: true) - // | map { meta, groupedItems -> - // def maskedFasta = groupedItems[0] - // def bam = (groupedItems.size() == 2) ? groupedItems[1] : [] - - // [meta, maskedFasta, bam] - // } - - // if(params.external_protein_fastas) { - // ch_braker_inputs - // | combine(ch_ext_prot_seqs.map{meta, filePath -> filePath}) - // | set { ch_braker_inputs } - // } else { - // ch_braker_inputs - // | map{meta, assembly, bam -> [meta, assembly, bam, []]} - // | set { ch_braker_inputs } - // } + ch_braker_inputs = ch_masked_target_assembly + | join(ch_rnaseq_bam, remainder: true) + | combine( + ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) + ) + | map { meta, fasta, bam, prots -> [meta, fasta, bam ?: [], prots ?: []] } - // ch_fasta = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> [meta, assembly] } - // ch_bam = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> bam } - // ch_proteins = ch_braker_inputs.map { meta, assembly, bam, proteinSeq -> proteinSeq } - // ch_rnaseq_sets_dirs = [] - // ch_rnaseq_sets_ids = [] - // ch_hintsfile = [] - - // BRAKER3( - // ch_fasta, - // ch_bam, - // ch_rnaseq_sets_dirs, - // ch_rnaseq_sets_ids, - // ch_proteins, - // ch_hintsfile - // ) + def rnaseq_sets_dirs = [] + def rnaseq_sets_ids = [] + def hintsfile = [] + + BRAKER3( + ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, + ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, + rnaseq_sets_dirs, + rnaseq_sets_ids, + ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, + hintsfile + ) - // ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + ch_braker_gff3 = BRAKER3.out.gff3 + ch_versions = 
ch_versions.mix(BRAKER3.out.versions.first()) // // MODULE: GUNZIP_XREF_FASTA // ch_xref_annotations = Channel.empty() From f3154677339b8bff09646220772dda71a39587fd Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 14 Nov 2023 09:30:31 +1300 Subject: [PATCH 12/59] Inc liftoff --- conf/manifest.config | 10 ++ conf/modules.config | 13 +- modules/local/liftoff/main.nf | 17 +-- modules/local/validate_params/main.nf | 28 ++++- modules/nf-core/CHANGELOG.md | 2 +- .../dumpsoftwareversions/environment.yml | 6 + .../custom/dumpsoftwareversions/main.nf | 24 ++++ .../custom/dumpsoftwareversions/meta.yml | 37 ++++++ .../templates/dumpsoftwareversions.py | 101 ++++++++++++++++ .../dumpsoftwareversions/tests/main.nf.test | 38 ++++++ .../tests/main.nf.test.snap | 27 +++++ .../dumpsoftwareversions/tests/tags.yml | 2 + nextflow.config | 5 +- pan_gene_pfr.sh | 2 +- subworkflows/local/fasta_liftoff.nf | 79 ++++++++++++ workflows/pan_gene.nf | 114 +++++++----------- 16 files changed, 415 insertions(+), 90 deletions(-) create mode 100644 conf/manifest.config create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 modules/nf-core/custom/dumpsoftwareversions/main.nf create mode 100644 modules/nf-core/custom/dumpsoftwareversions/meta.yml create mode 100755 modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml create mode 100644 subworkflows/local/fasta_liftoff.nf diff --git a/conf/manifest.config b/conf/manifest.config new file mode 100644 index 0000000..7bf1f6b --- /dev/null +++ b/conf/manifest.config @@ -0,0 +1,10 @@ +manifest { + name = 'pan-gene' + author = """Usman Rashid""" + homePage = 'https://github.com/PlantandFoodResearch/pan-gene' + description = """A NextFlow 
pipeline for pan-genome annotation""" + mainScript = 'main.nf' + nextflowVersion = '!>=22.10.4' + version = '0.1' + doi = '' +} \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 6683fcc..8c99be5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -135,7 +135,7 @@ process { if(params.liftoff_xref_annotations) { process { withName: LIFTOFF { - ext.args = '-exclude_partial -copies' + ext.args = '-exclude_partial -copies -polish' publishDir = [ path: { "${params.outdir}/liftoff/${meta.id}" }, mode: "copy", @@ -143,4 +143,15 @@ if(params.liftoff_xref_annotations) { ] } } +} + +process { + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { + publishDir = [ + path: params.outdir, + pattern: "software_versions.yml", + mode: "copy", + enabled: true + ] + } } \ No newline at end of file diff --git a/modules/local/liftoff/main.nf b/modules/local/liftoff/main.nf index cec7bd1..5bfb6f2 100644 --- a/modules/local/liftoff/main.nf +++ b/modules/local/liftoff/main.nf @@ -2,7 +2,7 @@ process LIFTOFF { tag "$meta.id" label "process_high" - container "https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0" + container 'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0' input: tuple val(meta), path(target_fa) @@ -10,8 +10,8 @@ process LIFTOFF { path ref_gff output: - tuple val(meta), path("*.liftoff.gff3") , emit: gff3 - tuple val(meta), path("unmapped_features.txt") , emit: unmapped + tuple val(meta), path("*.gff3") , emit: gff3 + tuple val(meta), path("*.unmapped.txt") , emit: unmapped path "versions.yml" , emit: versions when: @@ -24,25 +24,28 @@ process LIFTOFF { liftoff \\ -g $ref_gff \\ -p $task.cpus \\ + -o "${prefix}.gff3" \\ + -u "${prefix}.unmapped.txt" \\ $args \\ $target_fa \\ $ref_fa \\ - > "${prefix}.liftoff.gff3" + 2> liftoff.stderr cat <<-END_VERSIONS > versions.yml "${task.process}": - liftoff: \$(liftoff --version) + liftoff: \$(liftoff --version 2> /dev/null) END_VERSIONS """ stub: def 
prefix = task.ext.prefix ?: "${meta.id}" """ - touch "${prefix}.liftoff.gff3" + touch "${prefix}.gff3" + touch "${prefix}.unmapped.txt" cat <<-END_VERSIONS > versions.yml "${task.process}": - liftoff: \$(liftoff --version) + liftoff: \$(liftoff --version 2> /dev/null) END_VERSIONS """ } \ No newline at end of file diff --git a/modules/local/validate_params/main.nf b/modules/local/validate_params/main.nf index 5933dfe..5eb6207 100644 --- a/modules/local/validate_params/main.nf +++ b/modules/local/validate_params/main.nf @@ -1,14 +1,12 @@ def validateParams(params) { validateFastaTags(params) + validateTETags(params) - validateTEFastaCorrespondence(params) - if (params.remove_ribo_rna) { - file_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) - - if (file_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${file_ribo_db.getName()}!"} - } + validateRiboDBManifest(params) + + validateLiftoffXrefs(params) } def validateFastaTags(params) { @@ -75,6 +73,24 @@ def validateTEFastaCorrespondence(params) { } } +def validateRiboDBManifest(params) { + if (params.remove_ribo_rna) { + file_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) + + if (file_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${file_ribo_db.getName()}!"} + } +} + +def validateLiftoffXrefs(params) { + if(!params["liftoff_xref_annotations"]) { + return + } + + if(isNotListOfLists(params["liftoff_xref_annotations"]), 2) { + error "Error: liftoff_xref_annotations must be a list of sublists, with each sublist containing 2 elements" + } +} + def isNotListOfLists(thisOne, subListSize) { return (!(thisOne instanceof List) || thisOne.isEmpty() || thisOne.any { !(it instanceof List) || it.size() != subListSize }) } \ No newline at end of file diff --git a/modules/nf-core/CHANGELOG.md b/modules/nf-core/CHANGELOG.md index f7e0034..280bc90 100644 --- a/modules/nf-core/CHANGELOG.md +++ b/modules/nf-core/CHANGELOG.md @@ 
-25,4 +25,4 @@ 1. Added stub 2. Added author in meta.yml -- Repo: https://github.com/nf-core/modules/tree/18cd2206622dc606bbceea533c7823feb2a251db \ No newline at end of file +- Repo: https://github.com/nf-core/modules/tree/71dbe24bee9ad6c013d4dd400d92612f6bf01ab8 \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 0000000..9d0e6b2 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf new file mode 100644 index 0000000..7685b33 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -0,0 +1,24 @@ +process CUSTOM_DUMPSOFTWAREVERSIONS { + label 'process_single' + + // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + + input: + path versions + + output: + path "software_versions.yml" , emit: yml + path "software_versions_mqc.yml", emit: mqc_yml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + template 'dumpsoftwareversions.py' +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml new file mode 100644 index 0000000..9414c32 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -0,0 +1,37 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: custom_dumpsoftwareversions +description: Custom module used to dump software versions within the nf-core pipeline template +keywords: + - custom + - dump + - version +tools: + - custom: + description: Custom module used to dump software versions within the nf-core pipeline template + homepage: https://github.com/nf-core/tools + documentation: https://github.com/nf-core/tools + licence: ["MIT"] +input: + - versions: + type: file + description: YML file containing software versions + pattern: "*.yml" +output: + - yml: + type: file + description: Standard YML file containing software versions + pattern: "software_versions.yml" + - mqc_yml: + type: file + description: MultiQC custom content YML file containing software versions + pattern: "software_versions_mqc.yml" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 0000000..da03340 --- /dev/null +++ 
b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import yaml +import platform +from textwrap import dedent + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
Process Name Software Version
{process if (i == 0) else ''}{tool}{version}
") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 0000000..eec1db1 --- /dev/null +++ 
b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 0000000..4274ed5 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } + ], + "timestamp": "2023-11-03T14:43:22.157011" + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 0000000..405aa24 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git 
a/nextflow.config b/nextflow.config index 002f73a..574bf39 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,10 +67,11 @@ params { outdir = "./results" - max_cpus = 1 - max_memory = 4.GB + max_cpus = 12 + max_memory = 200.GB max_time = 1.days } +includeConfig './conf/manifest.config' includeConfig './conf/modules.config' includeConfig './conf/reporting_defaults.config' \ No newline at end of file diff --git a/pan_gene_pfr.sh b/pan_gene_pfr.sh index a1b1ced..c45623e 100644 --- a/pan_gene_pfr.sh +++ b/pan_gene_pfr.sh @@ -5,7 +5,7 @@ #SBATCH --time=1-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks=1 -#SBATCH --cpus-per-task=2 +#SBATCH --cpus-per-task=1 #SBATCH --output pan_gene_pfr.stdout #SBATCH --error pan_gene_pfr.stderr #SBATCH --mem=4G diff --git a/subworkflows/local/fasta_liftoff.nf b/subworkflows/local/fasta_liftoff.nf new file mode 100644 index 0000000..9a3cba8 --- /dev/null +++ b/subworkflows/local/fasta_liftoff.nf @@ -0,0 +1,79 @@ +include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip' +include { LIFTOFF } from '../../modules/local/liftoff' + + +workflow FASTA_LIFTOFF { + take: + target_assemby // Channel: [ meta, fasta ] + xref_annotations_fasta // Channel: [ meta2, fasta ] + xref_annotations_gff // Channel: [ meta2, gff3 ] + + main: + // MODULE: GUNZIP_FASTA + xref_annotations_fasta + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { xref_annotations_fasta_branch } + + GUNZIP_FASTA( + xref_annotations_fasta_branch.gz + ) + .gunzip + | mix( + xref_annotations_fasta_branch.rest + ) + | set { ch_xref_annotations_gunzip_fasta } + + // MODULE: GUNZIP_GFF + xref_annotations_gff + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + | set { xref_annotations_gff_branch } + + GUNZIP_GFF( + xref_annotations_gff_branch.gz + ) + .gunzip + | mix( + xref_annotations_gff_branch.rest + ) + | 
set { ch_xref_annotations_gunzip_gff } + + // MODULE: LIFTOFF + target_assemby + | combine( + ch_xref_annotations_gunzip_fasta + | join( + ch_xref_annotations_gunzip_gff + ) + ) + | map { meta, targetFasta, refMeta, refFasta, refGFF -> + [[id:"${meta.id}.from.${refMeta.id}", target_assemby: meta.id], targetFasta, refFasta, refGFF] + } + | set { ch_liftoff_inputs } + + LIFTOFF( + ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> [meta, targetFasta] }, + ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refFasta }, + ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refGFF } + ) + .gff3 + | map { meta, gff -> [[id: meta.target_assemby], gff] } + | groupTuple + | set { ch_liftoff_gff3 } + + Channel.empty() + | mix(GUNZIP_FASTA.out.versions.first()) + | mix(GUNZIP_GFF.out.versions.first()) + | mix(LIFTOFF.out.versions.first()) + | set { ch_versions } + + emit: + gff3 = ch_liftoff_gff3 // [ meta, [ gff3 ] ] + versions = ch_versions // [ versions.yml ] +} \ No newline at end of file diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index acfc77a..11699f5 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -1,14 +1,15 @@ -nextflow.enable.dsl=2 +include { validateParams } from '../modules/local/validate_params' -include { BRAKER3 } from '../modules/kherronism/braker3' -include { GUNZIP as GUNZIP_XREF_FASTA } from '../modules/nf-core/gunzip' -include { GUNZIP as GUNZIP_XREF_GFF } from '../modules/nf-core/gunzip' -include { validateParams } from '../modules/local/validate_params' +include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' +include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' +include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' +include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' -include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' -include { PREPROCESS_RNASEQ } from 
'../subworkflows/local/preprocess_rnaseq' -include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' -include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' +include { BRAKER3 } from '../modules/kherronism/braker3' + +include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' + +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' validateParams(params) @@ -39,17 +40,32 @@ workflow PAN_GENE { ? file(params.ribo_database_manifest, checkIfExists: true) : null - ch_sortmerna_fastas = Channel.from(ch_ribo_db ? ch_ribo_db.readLines() : null) + ch_sortmerna_fastas = ch_ribo_db + ? Channel.from(ch_ribo_db ? ch_ribo_db.readLines() : null) | map { row -> file(row, checkIfExists: true) } | collect + : Channel.empty() - ch_ext_prot_fastas = (params.external_protein_fastas + ch_ext_prot_fastas = params.external_protein_fastas ? Channel.fromList(params.external_protein_fastas) - : Channel.empty()) | map { filePath -> def fileHandle = file(filePath, checkIfExists: true) [[id:fileHandle.getSimpleName()], fileHandle] } + : Channel.empty() + + ch_xref_annotations_mm = params.liftoff_xref_annotations + ? 
Channel.fromList(params.liftoff_xref_annotations) + | multiMap { fasta, gff -> + def fastaFile = file(fasta, checkIfExists:true) + + fasta: [[id:fastaFile.getSimpleName()], fastaFile] + gff: [[id:fastaFile.getSimpleName()], file(gff, checkIfExists:true)] + } + : Channel.empty() + + ch_xref_annotations_fasta = ch_xref_annotations_mm.fasta + ch_xref_annotations_gff = ch_xref_annotations_mm.gff // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( @@ -120,64 +136,18 @@ workflow PAN_GENE { ch_braker_gff3 = BRAKER3.out.gff3 ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) - // // MODULE: GUNZIP_XREF_FASTA - // ch_xref_annotations = Channel.empty() - // if(params.liftoff_xref_annotations) { - // Channel.fromList(params.liftoff_xref_annotations) - // | multiMap { fasta, gff -> - // def fastaFile = file(fasta, checkIfExists:true) - // def meta = [id:fastaFile.getSimpleName()] - - // fasta: [meta, fastaFile] - // gff: [meta, file(gff, checkIfExists:true)] - // } - // | set { ch_xref_annotations } - // } - - // ch_xref_annotations.fasta - // | branch { meta, file -> - // gz: "$file".endsWith(".gz") - // rest: !"$file".endsWith(".gz") - // } - // | set { ch_xref_annotations_branch } - - // GUNZIP_XREF_FASTA( - // ch_xref_annotations_branch.gz - // ) - // .gunzip - // | mix( - // ch_xref_annotations_branch.rest - // ) - // | set { ch_xref_annotations_fasta } - - // // MODULE: GUNZIP_XREF_GFF - // ch_xref_annotations.gff - // | branch { meta, file -> - // gz: "$file".endsWith(".gz") - // rest: !"$file".endsWith(".gz") - // } - // | set { ch_xref_annotations_gff_branch } - - // GUNZIP_XREF_GFF( - // ch_xref_annotations_gff_branch.gff.map { meta, fasta, gff -> [meta, gff] } - // ) - // .gunzip - // | mix( - // ch_xref_annotations_gff_branch.rest.map { meta, fasta, gff -> [meta, gff] } - // ) - // | set { ch_xref_annotations_gff } - - // ch_xref_annotations_fasta - // | join( - // ch_xref_annotations_gff - // ) - // | set { ch_xref_annotations } - - // // MODULE: LIFTOFF - 
// ch_xref_annotations - // | combine( - // ch_validated_target_assemblies - // ) - // | map { meta, ref_fasta, refGFF, targetMeta, targetFasta -> [[id:"${targetMeta.id}.from.${meta.id}"], ref_fasta, refGFF, targetFasta] } - // | set { ch_liftoff_inputs } + // SUBWORKFLOW: FASTA_LIFTOFF + FASTA_LIFTOFF( + ch_valid_target_assembly, + ch_xref_annotations_fasta, + ch_xref_annotations_gff + ) + + ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 + ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) + + // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) } \ No newline at end of file From f10ae9425689924f1133e5ed957858351a2a9fe4 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 14 Nov 2023 10:24:42 +1300 Subject: [PATCH 13/59] Added polished out channel to liftoff --- modules/local/liftoff/main.nf | 11 ++++++++--- workflows/pan_gene.nf | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/local/liftoff/main.nf b/modules/local/liftoff/main.nf index 5bfb6f2..e10374d 100644 --- a/modules/local/liftoff/main.nf +++ b/modules/local/liftoff/main.nf @@ -10,9 +10,10 @@ process LIFTOFF { path ref_gff output: - tuple val(meta), path("*.gff3") , emit: gff3 - tuple val(meta), path("*.unmapped.txt") , emit: unmapped - path "versions.yml" , emit: versions + tuple val(meta), path("*.gff3") , emit: gff3 + tuple val(meta), path("*.polished.gff3") , emit: polished_gff3, optional: true + tuple val(meta), path("*.unmapped.txt") , emit: unmapped_txt + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -31,6 +32,10 @@ process LIFTOFF { $ref_fa \\ 2> liftoff.stderr + [ -f "${prefix}.gff3_polished" ] \\ + && mv "${prefix}.gff3_polished" "${prefix}.polished.gff3" \\ + || echo "-polish is absent" + cat <<-END_VERSIONS > versions.yml "${task.process}": liftoff: \$(liftoff --version 2> /dev/null) diff --git a/workflows/pan_gene.nf 
b/workflows/pan_gene.nf index 11699f5..c442354 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -143,7 +143,7 @@ workflow PAN_GENE { ch_xref_annotations_gff ) - ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 + ch_liftoff_gff3 = FASTA_LIFTOFF.out.polished_gff3 ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS From 80db270eae1073dabac28a34ffbb78bd2e5965ac Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 14 Nov 2023 10:38:56 +1300 Subject: [PATCH 14/59] Added gffread before liftoff --- conf/modules.config | 4 +++ modules/nf-core/CHANGELOG.md | 8 ++++- modules/nf-core/gffread/environment.yml | 6 ++++ modules/nf-core/gffread/main.nf | 35 +++++++++++++++++++ modules/nf-core/gffread/meta.yml | 33 +++++++++++++++++ modules/nf-core/gffread/tests/main.nf.test | 32 +++++++++++++++++ .../nf-core/gffread/tests/main.nf.test.snap | 21 +++++++++++ modules/nf-core/gffread/tests/tags.yml | 2 ++ subworkflows/local/fasta_liftoff.nf | 12 +++++-- 9 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 modules/nf-core/gffread/environment.yml create mode 100644 modules/nf-core/gffread/main.nf create mode 100644 modules/nf-core/gffread/meta.yml create mode 100644 modules/nf-core/gffread/tests/main.nf.test create mode 100644 modules/nf-core/gffread/tests/main.nf.test.snap create mode 100644 modules/nf-core/gffread/tests/tags.yml diff --git a/conf/modules.config b/conf/modules.config index 8c99be5..5448813 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -142,6 +142,10 @@ if(params.liftoff_xref_annotations) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } + + withName: GFFREAD { + ext.args = '--no-pseudo --keep-genes' + } } } diff --git a/modules/nf-core/CHANGELOG.md b/modules/nf-core/CHANGELOG.md index 280bc90..b2b47c7 100644 --- a/modules/nf-core/CHANGELOG.md +++ b/modules/nf-core/CHANGELOG.md @@ -25,4 +25,10 @@ 1. Added stub 2. 
Added author in meta.yml -- Repo: https://github.com/nf-core/modules/tree/71dbe24bee9ad6c013d4dd400d92612f6bf01ab8 \ No newline at end of file +- Repo: https://github.com/nf-core/modules/tree/71dbe24bee9ad6c013d4dd400d92612f6bf01ab8 + +### gffread + +1. Added gff3 channel +2. Made output channels optional +3. Added author in meta.yml \ No newline at end of file diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml new file mode 100644 index 0000000..d127cae --- /dev/null +++ b/modules/nf-core/gffread/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gffread=0.12.1 diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf new file mode 100644 index 0000000..d1477ab --- /dev/null +++ b/modules/nf-core/gffread/main.nf @@ -0,0 +1,35 @@ +process GFFREAD { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' : + 'biocontainers/gffread:0.12.1--h8b12597_0' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.gtf") , emit: gtf, optional: true + tuple val(meta), path("*.gff3") , emit: gff, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gff.baseName}" + def extension = args.contains("-T") ? 
'.gtf' : '.gff3' + """ + gffread \\ + $gff \\ + $args \\ + -o ${prefix}.${extension} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml new file mode 100644 index 0000000..8a09a20 --- /dev/null +++ b/modules/nf-core/gffread/meta.yml @@ -0,0 +1,33 @@ +name: gffread +description: Validate, filter, convert and perform various other operations on GFF files +keywords: + - gff + - conversion + - validation +tools: + - gffread: + description: GFF/GTF utility providing format conversions, region filtering, FASTA sequence extraction and more. + homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + tool_dev_url: https://github.com/gpertea/gffread + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] +input: + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. 
+ pattern: "*.{gff, gtf}" +output: + - gtf: + type: file + description: GTF file resulting from the conversion of the GFF input file + pattern: "*.{gtf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@emiller88" + - "@gallvp" +maintainers: + - "@emiller88" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test new file mode 100644 index 0000000..67d47ec --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../main.nf" + process "GFFREAD" + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap new file mode 100644 index 0000000..fb5460c --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -0,0 +1,21 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "genome.gtf:md5,f184f856b7fe3e159d21b052b5dd3954" + ], + "1": [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ], + "gtf": [ + "genome.gtf:md5,f184f856b7fe3e159d21b052b5dd3954" + ], + "versions": [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + } + ], + "timestamp": "2023-10-17T10:00:08.542490523" + } +} \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/tags.yml b/modules/nf-core/gffread/tests/tags.yml new file mode 100644 index 0000000..0557606 --- /dev/null +++ b/modules/nf-core/gffread/tests/tags.yml @@ -0,0 +1,2 @@ +gffread: + - modules/nf-core/gffread/** diff --git 
a/subworkflows/local/fasta_liftoff.nf b/subworkflows/local/fasta_liftoff.nf index 9a3cba8..8ac820e 100644 --- a/subworkflows/local/fasta_liftoff.nf +++ b/subworkflows/local/fasta_liftoff.nf @@ -1,8 +1,8 @@ include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip' +include { GFFREAD } from '../../modules/nf-core/gffread' include { LIFTOFF } from '../../modules/local/liftoff' - workflow FASTA_LIFTOFF { take: target_assemby // Channel: [ meta, fasta ] @@ -44,12 +44,19 @@ workflow FASTA_LIFTOFF { ) | set { ch_xref_annotations_gunzip_gff } + // MODULE: GFFREAD + GFFREAD( + ch_xref_annotations_gunzip_gff + ) + .gff + | set { ch_gffread_gff } + // MODULE: LIFTOFF target_assemby | combine( ch_xref_annotations_gunzip_fasta | join( - ch_xref_annotations_gunzip_gff + ch_gffread_gff ) ) | map { meta, targetFasta, refMeta, refFasta, refGFF -> @@ -70,6 +77,7 @@ workflow FASTA_LIFTOFF { Channel.empty() | mix(GUNZIP_FASTA.out.versions.first()) | mix(GUNZIP_GFF.out.versions.first()) + | mix(GFFREAD.out.versions.first()) | mix(LIFTOFF.out.versions.first()) | set { ch_versions } From bb9b8b06973c19a34c98f7ce59deaffa942f6d1d Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 14 Nov 2023 12:00:47 +1300 Subject: [PATCH 15/59] Updated flowchart --- README.md | 124 +++++++++++++++++++++++------------------------------- 1 file changed, 52 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 7dbf45e..237657f 100644 --- a/README.md +++ b/README.md @@ -5,90 +5,70 @@ A NextFlow pipeline for pan-genome annotation. 
```mermaid flowchart TD - ribo_db((ribo_db)) - SAMPLESHEET((samples)) - TE_LIBRARIES(("[te_libs]")) - TARGET_ASSEMBLIES(("[assemblies]")) - EXTERNAL_PROTEIN_SEQS(("[ext_prots]")) - - GUNZIP_PROT[GUNZIP] - GUNZIP_TE[GUNZIP] - SKIP_EDTA{Skip EDTA} - pend((dev)) - - TE_LIBRARIES --> GUNZIP_TE - GUNZIP_TE --> SKIP_EDTA - - TARGET_ASSEMBLIES --> GUNZIP - GUNZIP --> FASTA_VALIDATE - FASTA_VALIDATE --> FASTA_PERFORM_EDTA - FASTA_VALIDATE --> SKIP_EDTA - - SKIP_EDTA --> REPEATMASKER - FASTA_PERFORM_EDTA --> REPEATMASKER - REPEATMASKER --> STAR_GENOMEGENERATE - - SAMPLESHEET --> SAMPLESHEET_CHECK - SAMPLESHEET_CHECK --> |Technical replicates|CAT_FASTQ - CAT_FASTQ --> FASTQC - SAMPLESHEET_CHECK --> FASTQC - FASTQC --> FASTP - - ribo_db --> SORTMERNA - FASTP --> SORTMERNA - SORTMERNA --> STAR_ALIGN - STAR_GENOMEGENERATE --> STAR_ALIGN - STAR_ALIGN --> GROUP_BY_ASSEMBLY([Group by assembly]) - GROUP_BY_ASSEMBLY --> SAMTOOLS_CAT - SAMTOOLS_CAT --> |RNASeq bam|BRAKER3 - - REPEATMASKER --> BRAKER3 - - EXTERNAL_PROTEIN_SEQS --> GUNZIP_PROT - GUNZIP_PROT --> CAT - CAT --> BRAKER3 - - BRAKER3 --> pend - - subgraph Params + subgraph PrepareAssembly [ ] TARGET_ASSEMBLIES TE_LIBRARIES - SAMPLESHEET - ribo_db - EXTERNAL_PROTEIN_SEQS - end - - subgraph GenomePrep - GUNZIP FASTA_VALIDATE - GUNZIP_TE - FASTA_PERFORM_EDTA - SKIP_EDTA + EDTA REPEATMASKER - STAR_GENOMEGENERATE end - - subgraph Braker - CAT - GUNZIP_PROT - BRAKER3 - end - - subgraph SamplePrep - SAMPLESHEET_CHECK + + TARGET_ASSEMBLIES(["[target_assemblies]"]) + TE_LIBRARIES(["[te_libs]"]) + TARGET_ASSEMBLIES --> FASTA_VALIDATE + FASTA_VALIDATE --> EDTA + TE_LIBRARIES --> REPEATMASKER + EDTA --> |te_lib absent|REPEATMASKER + + subgraph Samplesheet [ ] + SAMPLESHEET CAT_FASTQ FASTQC FASTP + FASTP_FASTQC SORTMERNA - STAR_ALIGN - GROUP_BY_ASSEMBLY + STAR SAMTOOLS_CAT end + + SAMPLESHEET([samplesheet]) + SAMPLESHEET --> |Tech. 
reps|CAT_FASTQ + CAT_FASTQ --> FASTQC + SAMPLESHEET --> FASTQC + FASTQC --> FASTP + FASTP --> FASTP_FASTQC[FASTQC] + FASTP_FASTQC --> SORTMERNA + SORTMERNA --> STAR + STAR --> SAMTOOLS_CAT + + subgraph Annotation [ ] + anno_fasta(( )) + anno_masked_fasta(( )) + anno_bam(( )) + EXTERNAL_PROTEIN_SEQS(["[ext_prots]"]) + XREF_ANNOTATIONS(["[xref_annotations]"]) + CAT + BRAKER3 + GFFREAD + LIFTOFF + end + + PrepareAssembly --> |Fasta, Masked fasta|Annotation + Samplesheet --> |RNASeq bam|Annotation + + XREF_ANNOTATIONS --> |xref_gff|GFFREAD + XREF_ANNOTATIONS --> |xref_fasta|LIFTOFF + GFFREAD --> LIFTOFF + anno_fasta --> |Fasta|LIFTOFF + + EXTERNAL_PROTEIN_SEQS --> CAT + anno_masked_fasta --> |Masked fasta|BRAKER3 + anno_bam --> |RNASeq bam|BRAKER3 + CAT --> BRAKER3 - style Params fill:#00FFFF21,stroke:#00FFFF21 - style GenomePrep fill:#00FFFF21,stroke:#00FFFF21 - style SamplePrep fill:#00FFFF21,stroke:#00FFFF21 - style Braker fill:#00FFFF21,stroke:#00FFFF21 + style Samplesheet fill:#00FFFF21,stroke:#00FFFF21 + style PrepareAssembly fill:#00FFFF21,stroke:#00FFFF21 + style Annotation fill:#00FFFF21,stroke:#00FFFF21 ``` ## Plant&Food Users From c982946527e4776eb60504e8a126ce5605fa5f26 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 16 Nov 2023 08:13:09 +1300 Subject: [PATCH 16/59] Added liftoff options --- conf/modules.config | 9 ++++++++- nextflow.config | 3 +++ subworkflows/local/fasta_liftoff.nf | 2 +- workflows/pan_gene.nf | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5448813..58830e9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -135,7 +135,14 @@ process { if(params.liftoff_xref_annotations) { process { withName: LIFTOFF { - ext.args = '-exclude_partial -copies -polish' + ext.args = ' ' + ext.args = [ + '-exclude_partial', + '-copies', + '-polish', + "-a $params.liftoff_coverage", + "-s $params.liftoff_identity" + ].join(' ').trim() publishDir = [ path: { 
"${params.outdir}/liftoff/${meta.id}" }, mode: "copy", diff --git a/nextflow.config b/nextflow.config index 574bf39..6180e35 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,6 +65,9 @@ params { // Format: [ [ fasta(.gz), gff3(.gz) ] ] // Optional: Set to null if not available + liftoff_coverage = 0.9 + liftoff_identity = 0.9 + outdir = "./results" max_cpus = 12 diff --git a/subworkflows/local/fasta_liftoff.nf b/subworkflows/local/fasta_liftoff.nf index 8ac820e..5e6fd22 100644 --- a/subworkflows/local/fasta_liftoff.nf +++ b/subworkflows/local/fasta_liftoff.nf @@ -69,7 +69,7 @@ workflow FASTA_LIFTOFF { ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refFasta }, ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refGFF } ) - .gff3 + .polished_gff3 | map { meta, gff -> [[id: meta.target_assemby], gff] } | groupTuple | set { ch_liftoff_gff3 } diff --git a/workflows/pan_gene.nf b/workflows/pan_gene.nf index c442354..11699f5 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pan_gene.nf @@ -143,7 +143,7 @@ workflow PAN_GENE { ch_xref_annotations_gff ) - ch_liftoff_gff3 = FASTA_LIFTOFF.out.polished_gff3 + ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS From c8ff8dac9f38702a41782f5893eb0466d70ee245 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 21 Nov 2023 15:14:49 +1300 Subject: [PATCH 17/59] Started moving to nf-core/tools --- .gitignore | 9 ++++++--- README.md | 2 +- TODO.md | 6 +++++- conf/base.config | 4 +++- main.nf | 8 ++++++-- modules/kherronism/braker3/main.nf | 2 -- modules/local/edta/edta/main.nf | 1 - nextflow.config | 6 +++--- pan_gene_pfr.sh | 18 ------------------ pangene_pfr.sh | 18 ++++++++++++++++++ workflows/{pan_gene.nf => pangene.nf} | 2 +- 11 files changed, 43 insertions(+), 33 deletions(-) delete mode 100644 pan_gene_pfr.sh create mode 100644 pangene_pfr.sh rename workflows/{pan_gene.nf => pangene.nf} (99%) diff --git 
a/.gitignore b/.gitignore index 6e9d9d4..8f984b0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,12 @@ -.DS_Store -*.pyc -__pycahce__ .nextflow* work/ +data/ results/ +.DS_Store +testing/ +testing* +*.pyc + *.stdout *.stderr diff --git a/README.md b/README.md index 237657f..ea8b609 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PAN-GENE +# PANGENE A NextFlow pipeline for pan-genome annotation. ## Pipeline Flowchart diff --git a/TODO.md b/TODO.md index 6e1e66c..8c90b99 100644 --- a/TODO.md +++ b/TODO.md @@ -1 +1,5 @@ -- [ ] Add --eval=reference.gtf \ No newline at end of file +- [ ] Add --eval=reference.gtf +- [ ] From Ross regarding post-processing: + +> [9:49 am] Ross Crowhurst +Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. 
\ No newline at end of file diff --git a/conf/base.config b/conf/base.config index 4cdec8d..54db554 100644 --- a/conf/base.config +++ b/conf/base.config @@ -61,9 +61,11 @@ process { } } -singularity { +apptainer { enabled = true autoMounts = true + envWhitelist = "APPTAINER_BINDPATH,APPTAINER_BIND" + registry = 'quay.io' } nextflow { diff --git a/main.nf b/main.nf index c8a54e2..7fe5247 100755 --- a/main.nf +++ b/main.nf @@ -2,8 +2,12 @@ nextflow.enable.dsl=2 -include { PAN_GENE } from './workflows/pan_gene.nf' +include { PANGENE } from './workflows/pangene.nf' workflow { - PAN_GENE() + PFR_PANGENE() +} + +workflow PFR_PANGENE { + PANGENE() } \ No newline at end of file diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index 82fa096..c9d915a 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -6,8 +6,6 @@ process BRAKER3 { container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'registry.hub.docker.com/teambraker/braker3:v.1.0.3': 'registry.hub.docker.com/teambraker/braker3:v.1.0.3' }" - - containerOptions "-B $TMPDIR:$TMPDIR" input: tuple val(meta), path(fasta) diff --git a/modules/local/edta/edta/main.nf b/modules/local/edta/edta/main.nf index 2e6d759..9c9b180 100644 --- a/modules/local/edta/edta/main.nf +++ b/modules/local/edta/edta/main.nf @@ -4,7 +4,6 @@ process EDTA { label "process_week_long" container 'https://depot.galaxyproject.org/singularity/edta:2.1.0--hdfd78af_1' - containerOptions "-B $TMPDIR:$TMPDIR" input: tuple val(meta), path(fasta_file) diff --git a/nextflow.config b/nextflow.config index 6180e35..135bf29 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,8 +2,8 @@ includeConfig './conf/base.config' params { target_assemblies = [ - ["red5_v2p1", "/workspace/hrauxr/pan-gene/.test/red5_v2p1_chr1.fasta"], - ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.chr1.fsa.gz"] + ["red5_v2p1", "/workspace/hrauxr/pangene/.test/red5_v2p1_chr1.fasta"], + ["donghong", "/workspace/hrauxr/pangene/.test/donghong.chr1.fsa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; @@ -11,7 +11,7 @@ params { // "." is not allowed in the tag name te_libraries = [ - ["donghong", "/workspace/hrauxr/pan-gene/.test/donghong.TElib.fa.gz"] + ["donghong", "/workspace/hrauxr/pangene/.test/donghong.TElib.fa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Optional Set to null if libraries are not available. 
diff --git a/pan_gene_pfr.sh b/pan_gene_pfr.sh deleted file mode 100644 index c45623e..0000000 --- a/pan_gene_pfr.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -e - - -#SBATCH --job-name PAN_GENE -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 -#SBATCH --output pan_gene_pfr.stdout -#SBATCH --error pan_gene_pfr.stderr -#SBATCH --mem=4G - -ml apptainer/1.1 -ml nextflow/22.10.4 - -export TMPDIR="/workspace/$USER/tmp" - -nextflow main.nf -profile slurm -resume \ No newline at end of file diff --git a/pangene_pfr.sh b/pangene_pfr.sh new file mode 100644 index 0000000..785199e --- /dev/null +++ b/pangene_pfr.sh @@ -0,0 +1,18 @@ +#!/bin/bash -e + + +#SBATCH --job-name PANGENE +#SBATCH --time=1-00:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --output pangene_pfr.stdout +#SBATCH --error pangene_pfr.stderr +#SBATCH --mem=4G + +ml apptainer/1.1 +ml nextflow/23.04.4 + +export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,/workspace/$USER/tmp:/tmp" + +nextflow main.nf -profile slurm -resume \ No newline at end of file diff --git a/workflows/pan_gene.nf b/workflows/pangene.nf similarity index 99% rename from workflows/pan_gene.nf rename to workflows/pangene.nf index 11699f5..9110688 100644 --- a/workflows/pan_gene.nf +++ b/workflows/pangene.nf @@ -13,7 +13,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpso validateParams(params) -workflow PAN_GENE { +workflow PANGENE { ch_versions = Channel.empty() From f28546183bb88402cf5c62f00e2f549f16490b6c Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 22 Nov 2023 12:51:01 +1300 Subject: [PATCH 18/59] Reimported modules using nf-core/tools --- .nf-core.yml | 1 + modules.json | 108 ++++ modules/nf-core/CHANGELOG.md | 34 -- modules/nf-core/LICENSE | 21 - modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 2 +- modules/nf-core/cat/cat/meta.yml | 7 +- modules/nf-core/cat/cat/tests/main.nf.test | 153 
++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 +++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + modules/nf-core/cat/fastq/environment.yml | 7 + modules/nf-core/cat/fastq/main.nf | 2 +- modules/nf-core/cat/fastq/meta.yml | 4 +- modules/nf-core/cat/fastq/tests/main.nf.test | 143 ++++++ .../nf-core/cat/fastq/tests/main.nf.test.snap | 78 +++ modules/nf-core/cat/fastq/tests/tags.yml | 2 + .../dumpsoftwareversions/environment.yml | 1 + .../custom/dumpsoftwareversions/meta.yml | 2 +- modules/nf-core/fastp/environment.yml | 7 + modules/nf-core/fastp/main.nf | 20 +- modules/nf-core/fastp/meta.yml | 5 +- modules/nf-core/fastp/tests/main.nf.test | 485 ++++++++++++++++++ modules/nf-core/fastp/tests/main.nf.test.snap | 52 ++ modules/nf-core/fastp/tests/nextflow.config | 6 + modules/nf-core/fastp/tests/tags.yml | 2 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/fastqc/main.nf | 6 +- modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/fastqc/tests/main.nf.test | 23 +- .../nf-core/fastqc/tests/main.nf.test.snap | 10 + modules/nf-core/fastqc/tests/tags.yml | 2 + modules/nf-core/gffread/environment.yml | 1 + modules/nf-core/gffread/main.nf | 16 +- modules/nf-core/gffread/meta.yml | 1 - modules/nf-core/gunzip/environment.yml | 7 + modules/nf-core/gunzip/main.nf | 2 +- modules/nf-core/gunzip/meta.yml | 4 + modules/nf-core/gunzip/tests/main.nf.test | 35 ++ .../nf-core/gunzip/tests/main.nf.test.snap | 31 ++ modules/nf-core/gunzip/tests/tags.yml | 2 + modules/nf-core/samtools/cat/environment.yml | 7 + modules/nf-core/samtools/cat/main.nf | 2 +- modules/nf-core/samtools/cat/meta.yml | 2 + modules/nf-core/sortmerna/environment.yml | 7 + modules/nf-core/sortmerna/main.nf | 28 +- modules/nf-core/sortmerna/meta.yml | 4 +- modules/nf-core/star/align/environment.yml | 9 + modules/nf-core/star/align/main.nf | 2 +- modules/nf-core/star/align/meta.yml | 6 +- 
.../star/genomegenerate/environment.yml | 9 + modules/nf-core/star/genomegenerate/main.nf | 8 +- modules/nf-core/star/genomegenerate/meta.yml | 6 +- modules/nf-core/star/starsolo/main.nf | 94 ---- modules/nf-core/star/starsolo/meta.yml | 79 --- modules/nf-core/trinity/main.nf | 74 --- modules/nf-core/trinity/meta.yml | 45 -- modules/nf-core/umitools/dedup/main.nf | 62 --- modules/nf-core/umitools/dedup/meta.yml | 68 --- .../nf-core/umitools/extract/environment.yml | 7 + modules/nf-core/umitools/extract/main.nf | 2 +- modules/nf-core/umitools/extract/meta.yml | 17 +- .../umitools/extract/tests/main.nf.test | 35 ++ .../umitools/extract/tests/main.nf.test.snap | 10 + .../umitools/extract/tests/nextflow.config | 9 + .../nf-core/umitools/extract/tests/tags.yml | 2 + modules/nf-core/umitools/group/main.nf | 62 --- modules/nf-core/umitools/group/meta.yml | 62 --- .../fastq_fastqc_umitools_fastp/main.nf | 3 +- .../fastq_fastqc_umitools_fastp/meta.yml | 9 +- 71 files changed, 1463 insertions(+), 705 deletions(-) create mode 100644 .nf-core.yml create mode 100644 modules.json delete mode 100644 modules/nf-core/CHANGELOG.md delete mode 100644 modules/nf-core/LICENSE create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml create mode 100644 modules/nf-core/cat/fastq/environment.yml create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/fastq/tests/tags.yml create mode 100644 modules/nf-core/fastp/environment.yml create mode 100644 modules/nf-core/fastp/tests/main.nf.test create mode 100644 
modules/nf-core/fastp/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastp/tests/nextflow.config create mode 100644 modules/nf-core/fastp/tests/tags.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/fastqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastqc/tests/tags.yml create mode 100644 modules/nf-core/gunzip/environment.yml create mode 100644 modules/nf-core/gunzip/tests/main.nf.test create mode 100644 modules/nf-core/gunzip/tests/main.nf.test.snap create mode 100644 modules/nf-core/gunzip/tests/tags.yml create mode 100644 modules/nf-core/samtools/cat/environment.yml create mode 100644 modules/nf-core/sortmerna/environment.yml create mode 100644 modules/nf-core/star/align/environment.yml create mode 100644 modules/nf-core/star/genomegenerate/environment.yml delete mode 100644 modules/nf-core/star/starsolo/main.nf delete mode 100644 modules/nf-core/star/starsolo/meta.yml delete mode 100644 modules/nf-core/trinity/main.nf delete mode 100644 modules/nf-core/trinity/meta.yml delete mode 100644 modules/nf-core/umitools/dedup/main.nf delete mode 100644 modules/nf-core/umitools/dedup/meta.yml create mode 100644 modules/nf-core/umitools/extract/environment.yml create mode 100644 modules/nf-core/umitools/extract/tests/main.nf.test create mode 100644 modules/nf-core/umitools/extract/tests/main.nf.test.snap create mode 100644 modules/nf-core/umitools/extract/tests/nextflow.config create mode 100644 modules/nf-core/umitools/extract/tests/tags.yml delete mode 100644 modules/nf-core/umitools/group/main.nf delete mode 100644 modules/nf-core/umitools/group/meta.yml diff --git a/.nf-core.yml b/.nf-core.yml new file mode 100644 index 0000000..b1a7f0e --- /dev/null +++ b/.nf-core.yml @@ -0,0 +1 @@ +repository_type: pipeline \ No newline at end of file diff --git a/modules.json b/modules.json new file mode 100644 index 0000000..14b25cd --- /dev/null +++ b/modules.json @@ -0,0 +1,108 @@ +{ + "name": 
"PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": [ + "modules" + ] + }, + "fastp": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "fastq_fastqc_umitools_fastp" + ] + }, + "fastqc": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules", + "fastq_fastqc_umitools_fastp" + ] + }, + "gffread": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "sortmerna": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "star/align": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "fastq_fastqc_umitools_fastp" + ] + } + } + }, + "subworkflows": { + "nf-core": { + 
"fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": [ + "subworkflows" + ] + } + } + } + } + } +} \ No newline at end of file diff --git a/modules/nf-core/CHANGELOG.md b/modules/nf-core/CHANGELOG.md deleted file mode 100644 index b2b47c7..0000000 --- a/modules/nf-core/CHANGELOG.md +++ /dev/null @@ -1,34 +0,0 @@ -## Source - -- Repo: https://github.com/nf-core/modules/tree/3fbcafe2543dabcc7b2be0f3b24507002b3e4b0d -- License: See LICENSE file - -## Changes - -### trinity - -1. Added stub -2. Added author in meta.yml - -### fastp - -1. Added stub -2. Added author in meta.yml - -### star/genomegenerate - -1. Added star_ignore_sjdbgtf -2. Added author in meta.yml - -### sortmerna - -1. Added stub -2. Added author in meta.yml - -- Repo: https://github.com/nf-core/modules/tree/71dbe24bee9ad6c013d4dd400d92612f6bf01ab8 - -### gffread - -1. Added gff3 channel -2. Made output channels optional -3. Added author in meta.yml \ No newline at end of file diff --git a/modules/nf-core/LICENSE b/modules/nf-core/LICENSE deleted file mode 100644 index d2e2384..0000000 --- a/modules/nf-core/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) Philip Ewels - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 0000000..17a04ef --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 9f06221..4264a92 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -2,7 +2,7 @@ process CAT_CAT { tag "$meta.id" label 'process_low' - conda "conda-forge::pigz=2.3.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : 'biocontainers/pigz:2.3.4' }" diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml index 8acc0bf..00a8db0 100644 --- a/modules/nf-core/cat/cat/meta.yml +++ b/modules/nf-core/cat/cat/meta.yml @@ -7,9 +7,7 @@ keywords: tools: - cat: description: Just concatenation - documentation: https://man7.org/linux/man-pages/man1/cat.1.html - licence: ["GPL-3.0-or-later"] input: - meta: @@ -21,7 +19,6 @@ input: type: file description: List of compressed / uncompressed files pattern: "*" - output: - versions: type: file @@ -31,7 +28,9 @@ output: type: file description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" pattern: "${file_out}" - authors: - "@erikrikarddaniel" - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 0000000..5766daa --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + 
file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 0000000..423571b --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ 
+ ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 0000000..ec26b0f --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config 
b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 0000000..fbc7978 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 0000000..37b578f --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 0000000..bff93ad --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 5021e6f..3d96378 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -2,7 +2,7 @@ process CAT_FASTQ { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml index 8a39e30..db4ac3c 100644 --- a/modules/nf-core/cat/fastq/meta.yml +++ b/modules/nf-core/cat/fastq/meta.yml @@ -34,7 +34,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 0000000..f5f9418 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + 
+ then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git 
a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 0000000..ec2342e --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 0000000..6ac4361 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml 
index 9d0e6b2..f0c63f6 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/environment.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -1,3 +1,4 @@ +name: custom_dumpsoftwareversions channels: - conda-forge - bioconda diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 9414c32..5f15a5f 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: diff --git a/modules/nf-core/fastp/environment.yml b/modules/nf-core/fastp/environment.yml new file mode 100644 index 0000000..70389e6 --- /dev/null +++ b/modules/nf-core/fastp/environment.yml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastp=0.23.4 diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 9c747d3..c8e815a 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -2,7 +2,7 @@ process FASTP { tag "$meta.id" label 'process_medium' - conda "bioconda::fastp=0.23.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : 'biocontainers/fastp:0.23.4--h5f740d0_0' }" @@ -99,22 +99,4 @@ process FASTP { END_VERSIONS """ } - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def isSingleOutput = task.ext.args?.contains('--interleaved_in') || meta.single_end - def outputFiles = isSingleOutput ? 
"${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" - def mergedFileCommand = (!isSingleOutput && save_merged) ? "touch ${prefix}.merged.fastq.gz" : "" - """ - touch $outputFiles - touch "${prefix}.json" - touch "${prefix}.html" - touch "${prefix}.log" - $mergedFileCommand - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") - END_VERSIONS - """ } diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index 1c34ff9..c22a16a 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -33,7 +33,6 @@ input: - save_merged: type: boolean description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` - output: - meta: type: map @@ -71,4 +70,6 @@ output: authors: - "@drpatelh" - "@kevinmenden" - - "@gallvp" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test new file mode 100644 index 0000000..f610b73 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -0,0 +1,485 @@ +nextflow_process { + + name "Test Process FASTP" + script "../main.nf" + process "FASTP" + tag "modules" + tag "modules_nfcore" + tag "fastp" + + test("test_fastp_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)" ] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + 
"TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert 
path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved") { + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "paired end (151 cycles + 151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 198"] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("fastp test_fastp_interleaved_json") }, + { 
log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { failed_read_lines.each { failed_read_line -> + { assert path(process.out.reads_fail.get(0).get(1)).linesGzip.contains(failed_read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_trim_fail_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + 
input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { failed_read2_lines.each { failed_read2_line -> + { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + 
save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683'] + def read1_lines = [ "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged_adapterlist") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta", checkIfExists: true) + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = 
save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683',"--adapter_fasta"] + def read1_lines = ["@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap new file mode 100644 index 0000000..0fa68c7 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -0,0 +1,52 @@ +{ + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + ] + ] + ], + "timestamp": "2023-10-17T11:04:45.794175881" + }, + "test_fastp_single_end_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" + ] + ] + ], + "timestamp": "2023-10-17T11:04:10.566343705" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "timestamp": "2023-10-17T11:04:10.582076024" + }, + "test_fastp_single_end_trim_fail_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,9a7ee180f000e8d00c7fb67f06293eb5" + ] + ] + ], + "timestamp": "2023-10-17T11:05:00.379878948" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.config new file mode 100644 index 0000000..0f7849a --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: FASTP { + ext.args = "--interleaved_in" + } +} diff --git a/modules/nf-core/fastp/tests/tags.yml b/modules/nf-core/fastp/tests/tags.yml new file mode 100644 index 0000000..c1afcce --- /dev/null +++ b/modules/nf-core/fastp/tests/tags.yml @@ -0,0 +1,2 @@ +fastp: + - modules/nf-core/fastp/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 0000000..1787b38 --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 249f906..50e59f2 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5..ee5507e 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index 3961de6..6437a14 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -1,13 +1,18 @@ nextflow_process { name "Test Process FASTQC" - script "modules/nf-core/fastqc/main.nf" + script "../main.nf" process "FASTQC" + tag "modules" + tag "modules_nfcore" tag "fastqc" test("Single-Read") { when { + params { + outdir = "$outputDir" + } process { """ input[0] = [ @@ -21,12 +26,16 @@ nextflow_process { } then { - assert process.success - assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" - assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") - assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
Mon 2 Oct 2023
test.gz
+ // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) } - } - } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 0000000..636a32c --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 0000000..7834294 --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml index d127cae..5398f71 100644 --- a/modules/nf-core/gffread/environment.yml +++ b/modules/nf-core/gffread/environment.yml @@ -1,3 +1,4 @@ +name: gffread channels: - conda-forge - bioconda diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf index d1477ab..68f8045 100644 --- a/modules/nf-core/gffread/main.nf +++ b/modules/nf-core/gffread/main.nf @@ -1,5 +1,5 @@ process GFFREAD { - tag "$meta.id" + tag "$gff" label 'process_low' conda "${moduleDir}/environment.yml" @@ -8,25 +8,23 @@ process GFFREAD { 'biocontainers/gffread:0.12.1--h8b12597_0' }" input: - tuple val(meta), path(gff) + path gff output: - tuple val(meta), path("*.gtf") , emit: gtf, optional: true - tuple val(meta), path("*.gff3") , emit: gff, optional: true - path "versions.yml" , emit: versions + path "*.gtf" , emit: gtf + path "versions.yml" , 
emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${gff.baseName}" - def extension = args.contains("-T") ? '.gtf' : '.gff3' + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gff.baseName}" """ gffread \\ $gff \\ $args \\ - -o ${prefix}.${extension} + -o ${prefix}.gtf cat <<-END_VERSIONS > versions.yml "${task.process}": gffread: \$(gffread --version 2>&1) diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml index 8a09a20..f486f8b 100644 --- a/modules/nf-core/gffread/meta.yml +++ b/modules/nf-core/gffread/meta.yml @@ -28,6 +28,5 @@ output: pattern: "versions.yml" authors: - "@emiller88" - - "@gallvp" maintainers: - "@emiller88" diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 0000000..25910b3 --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,7 @@ +name: gunzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index 73bf08c..468a6f2 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -2,7 +2,7 @@ process GUNZIP { tag "$archive" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml index 4cdcdf4..231034f 100644 --- a/modules/nf-core/gunzip/meta.yml +++ b/modules/nf-core/gunzip/meta.yml @@ -33,3 +33,7 @@ authors: - "@joseespinosa" - "@drpatelh" - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 0000000..d031792 --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [], + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 0000000..720fd9f --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ] + } + ], + "timestamp": "2023-10-17T15:35:37.690477896" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml new file mode 100644 index 0000000..fd3f691 --- /dev/null +++ 
b/modules/nf-core/gunzip/tests/tags.yml @@ -0,0 +1,2 @@ +gunzip: + - modules/nf-core/gunzip/** diff --git a/modules/nf-core/samtools/cat/environment.yml b/modules/nf-core/samtools/cat/environment.yml new file mode 100644 index 0000000..80da1cf --- /dev/null +++ b/modules/nf-core/samtools/cat/environment.yml @@ -0,0 +1,7 @@ +name: samtools_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/cat/main.nf b/modules/nf-core/samtools/cat/main.nf index 22a63e2..5d939aa 100644 --- a/modules/nf-core/samtools/cat/main.nf +++ b/modules/nf-core/samtools/cat/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_CAT { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/cat/meta.yml b/modules/nf-core/samtools/cat/meta.yml index 42632e7..3541e0c 100644 --- a/modules/nf-core/samtools/cat/meta.yml +++ b/modules/nf-core/samtools/cat/meta.yml @@ -47,3 +47,5 @@ output: pattern: "versions.yml" authors: - "@matthdsm" +maintainers: + - "@matthdsm" diff --git a/modules/nf-core/sortmerna/environment.yml b/modules/nf-core/sortmerna/environment.yml new file mode 100644 index 0000000..3dae00a --- /dev/null +++ b/modules/nf-core/sortmerna/environment.yml @@ -0,0 +1,7 @@ +name: sortmerna +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sortmerna=4.3.4 diff --git a/modules/nf-core/sortmerna/main.nf b/modules/nf-core/sortmerna/main.nf index 5b4fbca..53ccb97 100644 --- a/modules/nf-core/sortmerna/main.nf +++ b/modules/nf-core/sortmerna/main.nf @@ -2,7 +2,7 @@ process SORTMERNA { tag "$meta.id" label "process_high" - conda "bioconda::sortmerna=4.3.4" + conda 
"${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sortmerna:4.3.4--h9ee0642_0' : 'biocontainers/sortmerna:4.3.4--h9ee0642_0' }" @@ -67,30 +67,4 @@ process SORTMERNA { END_VERSIONS """ } - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - touch ${prefix}.non_rRNA.fastq.gz - touch ${prefix}.sortmerna.log - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ - } else { - """ - touch ${prefix}_1.non_rRNA.fastq.gz - touch ${prefix}_2.non_rRNA.fastq.gz - touch ${prefix}.sortmerna.log - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ - } } diff --git a/modules/nf-core/sortmerna/meta.yml b/modules/nf-core/sortmerna/meta.yml index 66f00de..de0b18e 100644 --- a/modules/nf-core/sortmerna/meta.yml +++ b/modules/nf-core/sortmerna/meta.yml @@ -48,4 +48,6 @@ output: authors: - "@drpatelh" - "@mashehu" - - "@gallvp" +maintainers: + - "@drpatelh" + - "@mashehu" diff --git a/modules/nf-core/star/align/environment.yml b/modules/nf-core/star/align/environment.yml new file mode 100644 index 0000000..6db2098 --- /dev/null +++ b/modules/nf-core/star/align/environment.yml @@ -0,0 +1,9 @@ +name: star_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::star=2.7.10a + - bioconda::samtools=1.16.1 + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index d0e2038..fa645a6 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -2,7 +2,7 @@ process STAR_ALIGN { tag "$meta.id" label 'process_high' - 
conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml index 3d8fed0..e80dbb7 100644 --- a/modules/nf-core/star/align/meta.yml +++ b/modules/nf-core/star/align/meta.yml @@ -52,7 +52,6 @@ input: - seq_center: type: string description: Sequencing center - output: - bam: type: file @@ -106,8 +105,11 @@ output: type: file description: STAR output bedGraph format file(s) (optional) pattern: "*.bg" - authors: - "@kevinmenden" - "@drpatelh" - "@praveenraj2018" +maintainers: + - "@kevinmenden" + - "@drpatelh" + - "@praveenraj2018" diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml new file mode 100644 index 0000000..0b35ff5 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -0,0 +1,9 @@ +name: star_genomegenerate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::star=2.7.10a + - bioconda::samtools=1.16.1 + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index ed32d7c..473e62a 100644 --- a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -2,7 +2,7 @@ process STAR_GENOMEGENERATE { tag "$fasta" label 'process_high' - conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" @@ -10,7 +10,6 @@ process STAR_GENOMEGENERATE { input: tuple val(meta), path(fasta) tuple val(meta2), path(gtf) - val star_ignore_sjdbgtf output: tuple val(meta), path("star") , emit: index @@ -23,7 +22,6 @@ process STAR_GENOMEGENERATE { def args = task.ext.args ?: '' def args_list = args.tokenize() def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' - def ignore_gtf = star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf" if (args_list.contains('--genomeSAindexNbases')) { """ mkdir star @@ -31,7 +29,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - $ignore_gtf \\ + --sjdbGTFfile $gtf \\ --runThreadN $task.cpus \\ $memory \\ $args @@ -53,7 +51,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - $ignore_gtf \\ + --sjdbGTFfile $gtf \\ --runThreadN $task.cpus \\ --genomeSAindexNbases \$NUM_BASES \\ $memory \\ diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml index e824dbf..1061e1b 100644 --- a/modules/nf-core/star/genomegenerate/meta.yml +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -31,7 +31,6 @@ input: - gtf: type: file description: GTF file of the reference genome - output: - meta: type: map @@ -46,8 +45,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@kevinmenden" - "@drpatelh" - - "@gallvp" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/star/starsolo/main.nf b/modules/nf-core/star/starsolo/main.nf deleted file mode 100644 index 07499b6..0000000 --- 
a/modules/nf-core/star/starsolo/main.nf +++ /dev/null @@ -1,94 +0,0 @@ -process STARSOLO { - tag "$meta.id" - label 'process_high' - - conda "bioconda::star=2.7.10b" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/star:2.7.10b--h9ee0642_0': - 'biocontainers/star:2.7.10b--h9ee0642_0' }" - - input: - tuple val(meta), val(solotype), path(reads) - tuple val(meta2), path(index) - - output: - tuple val(meta), path('*.Solo.out') , emit: counts - tuple val(meta), path('*Log.final.out') , emit: log_final - tuple val(meta), path('*Log.out') , emit: log_out - tuple val(meta), path('*Log.progress.out') , emit: log_progress - tuple val(meta), path('*/Gene/Summary.csv') , emit: summary - path "versions.yml" , emit: versions - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def (forward, reverse) = reads.collate(2).transpose() - def zcat = reads[0].getExtension() == "gz" ? "--readFilesCommand zcat": "" - - // Handle solotype argument logic - switch(solotype) { - case "CB_UMI_Simple": - solotype_args = meta.umi_len ? "--soloUMIlen ${meta.umi_len} " : ""; - solotype_args = solotype_args + (meta.whitelist ? "--soloCBwhitelist ${meta.whitelist} " : "--soloCBwhitelist None "); - solotype_args = solotype_args + (meta.umi_start ? "--soloUMIstart ${meta.umi_start} " : ""); - solotype_args = solotype_args + (meta.cb_len ? "--soloCBlen ${meta.cb_len} " : ""); - solotype_args = solotype_args + (meta.cb_start ? "--soloCBstart ${meta.cb_start} " : ""); - solotype_args = solotype_args + (meta.barcode_len ? "--soloBarcodeReadLength ${meta.barcode_len} " : ""); - solotype_args = solotype_args + (meta.barcode_mate ? "--soloBarcodeMate ${meta.barcode_mate} " : ""); - break - case "CB_UMI_Complex": - solotype_args = meta.cb_position ? 
"--soloCBposition ${meta.cb_position}" : ""; - solotype_args = solotype_args + (meta.whitelist ? "--soloCBwhitelist ${meta.whitelist} " : "--soloCBwhitelist None "); - solotype_args = solotype_args + (meta.umi_position ? "--soloUMIposition ${meta.umi_position} " : ""); - solotype_args = solotype_args + (meta.adapter_seq ? "--soloAdapterSequence ${meta.adapter_seq} " : ""); - solotype_args = solotype_args + (meta.max_mismatch_adapter ? "--soloAdapterMismatchesNmax ${meta.max_mismatch_adapter} " : ""); - break - case "SmartSeq": - solotype_args = "--soloUMIdedup Exact "; - solotype_args = solotype_args + (meta.strandedness ? "--soloStrand ${meta.strandedness} " : ""); - solotype_args = solotype_args + "--outSAMattrRGline ID:${prefix} "; - break - default: - log.warn("Unknown output solotype (${solotype})"); - break - } - - """ - STAR \\ - --genomeDir $index \\ - --readFilesIn ${reverse.join( "," )} ${forward.join( "," )} \\ - --runThreadN $task.cpus \\ - --outFileNamePrefix $prefix. \\ - --soloType $solotype \\ - $zcat \\ - $solotype_args \\ - $args - - if [ -d ${prefix}.Solo.out ]; then - find ${prefix}.Solo.out \\( -name "*.tsv" -o -name "*.mtx" \\) -exec gzip {} \\; - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - star: \$(STAR --version | sed -e "s/STAR_//g") - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - mkdir ${prefix}.Solo.out/ - touch ${prefix}.Solo.out/Log.final.out - touch ${prefix}.Solo.out/Log.out - touch ${prefix}.Solo.out/Log.progress.out - touch ${prefix}.Solo.out/Summary.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - star: \$(STAR --version | sed -e "s/STAR_//g") - END_VERSIONS - """ -} diff --git a/modules/nf-core/star/starsolo/meta.yml b/modules/nf-core/star/starsolo/meta.yml deleted file mode 100644 index 4fce56c..0000000 --- a/modules/nf-core/star/starsolo/meta.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: "starsolo" -description: Create a 
counts matrix for single-cell data using STARSolo, handling cell barcodes and UMI information. -keywords: - - align - - count - - genome - - reference -tools: - - "starsolo": - description: "Mapping, demultiplexing and quantification for single cell RNA-seq." - homepage: "https://github.com/alexdobin/STAR/" - documentation: "https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md" - doi: "10.1101/2021.05.05.442755" - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information. - Here, you should add all the specific barcode/umi - information for each sample. - e.g. `[ id:'test_starsolo', umi_len:'12', cb_start:1 ]` - - solotype: - type: string - description: | - Type of single-cell library. - It can be CB_UMI_Simple for most common ones such as 10xv2 and 10xv3, - CB_UMI_Complex for method such as inDrop and SmartSeq for SMART-Seq. - - meta2: - type: map - description: Groovy Map containing the STAR index information. - - index: - type: directory - description: STAR genome index - pattern: "star" - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - meta: - type: map - description: | - Groovy Map containing sample information. - Here, you should add all the specific barcode/umi - information for each sample. - e.g. `[ id:'test_starsolo', umi_len:'12', cb_start:1 ]` - - log_final: - type: file - description: STAR final log file - pattern: "*Log.final.out" - - log_out: - type: file - description: STAR lot out file - pattern: "*Log.out" - - log_progress: - type: file - description: STAR log progress file - pattern: "*Log.progress.out" - - summary: - type: file - description: STARSolo metrics summary CSV file. 
- pattern: "*/Gene/Summary.csv" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@kevinmenden" - - "@ggabernet" - - "@grst" - - "@fmalmeida" - - "@rhreynolds" - - "@apeltzer" - - "@vivian-chen16" - - "@maxulysse" - - "@joaodemeirelles" diff --git a/modules/nf-core/trinity/main.nf b/modules/nf-core/trinity/main.nf deleted file mode 100644 index 3960a35..0000000 --- a/modules/nf-core/trinity/main.nf +++ /dev/null @@ -1,74 +0,0 @@ -process TRINITY { - tag "$meta.id" - label 'process_high_memory' - - conda "bioconda::trinity=2.13.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/trinity:2.13.2--h00214ad_1': - 'biocontainers/trinity:2.13.2--h00214ad_1' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.fa.gz") , emit: transcript_fasta - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - if (meta.single_end) { - reads_args = "--single ${reads}" - } else { - reads_args = "--left ${reads[0]} --right ${reads[1]}" - } - - // --seqType argument, fasta or fastq. Exact pattern match .fasta or .fa suffix with optional .gz (gzip) suffix - seqType_args = reads[0] ==~ /(.*fasta(.gz)?$)|(.*fa(.gz)?$)/ ? "fa" : "fq" - - // Define the memory requirements. Trinity needs this as an option. - def avail_mem = 7 - if (!task.memory) { - log.info '[Trinity] Available memory not known - defaulting to 7GB. Specify process memory requirements to change this.' 
- } else { - avail_mem = (task.memory.giga*0.8).intValue() - } - - """ - # Note that Trinity needs the word 'trinity' in the outdir - - Trinity \\ - --seqType ${seqType_args} \\ - --max_memory ${avail_mem}G \\ - ${reads_args} \\ - --output ${prefix}_trinity \\ - --CPU $task.cpus \\ - $args - - gzip -cf ${prefix}_trinity.Trinity.fasta > ${prefix}.fa.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - trinity: \$(echo \$(Trinity --version | head -n 1 2>&1) | sed 's/^Trinity version: Trinity-v//' )) - END_VERSIONS - - # Need to only take the first line of --version since it will warn about not being up-to-date and this messes up the version.yaml. - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.fa.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - trinity: \$(echo \$(Trinity --version | head -n 1 2>&1) | sed 's/^Trinity version: Trinity-v//' )) - END_VERSIONS - - # Need to only take the first line of --version since it will warn about not being up-to-date and this messes up the version.yaml. - """ -} diff --git a/modules/nf-core/trinity/meta.yml b/modules/nf-core/trinity/meta.yml deleted file mode 100644 index 26e8c10..0000000 --- a/modules/nf-core/trinity/meta.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: "trinity" -description: Assembles a de novo transcriptome from RNAseq reads -keywords: - - assembly - - de novo assembler - - fasta - - fastq -tools: - - "trinity": - description: "Trinity assembles transcript sequences from Illumina RNA-Seq data." - homepage: "https://github.com/trinityrnaseq/trinityrnaseq/wiki" - documentation: "https://github.com/trinityrnaseq/trinityrnaseq/wiki" - tool_dev_url: "https://github.com/trinityrnaseq/trinityrnaseq/" - doi: "10.1038/nbt.1883" - licence: "['BSD-3-clause']" - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - reads: - type: file - description: fasta/fastq file of reads to be assembled into a transcriptome - pattern: "*.{fa|fasta|fq|fastq}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - transcript_fasta: - type: file - description: de novo assembled transcripts fasta file compressed - pattern: "*.fa.gz" - -authors: - - "@timslittle" - - "@gallvp" diff --git a/modules/nf-core/umitools/dedup/main.nf b/modules/nf-core/umitools/dedup/main.nf deleted file mode 100644 index 56ea046..0000000 --- a/modules/nf-core/umitools/dedup/main.nf +++ /dev/null @@ -1,62 +0,0 @@ -process UMITOOLS_DEDUP { - tag "$meta.id" - label "process_medium" - - conda "bioconda::umi_tools=1.1.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : - 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" - - input: - tuple val(meta), path(bam), path(bai) - val get_output_stats - - output: - tuple val(meta), path("${prefix}.bam") , emit: bam - tuple val(meta), path("*.log") , emit: log - tuple val(meta), path("*edit_distance.tsv"), optional:true, emit: tsv_edit_distance - tuple val(meta), path("*per_umi.tsv") , optional:true, emit: tsv_per_umi - tuple val(meta), path("*per_position.tsv") , optional:true, emit: tsv_umi_per_position - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "" : "--paired" - stats = get_output_stats ? "--output-stats ${prefix}" : "" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
- - if (!(args ==~ /.*--random-seed.*/)) {args += " --random-seed=100"} - """ - PYTHONHASHSEED=0 umi_tools \\ - dedup \\ - -I $bam \\ - -S ${prefix}.bam \\ - -L ${prefix}.log \\ - $stats \\ - $paired \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') - END_VERSIONS - """ - - stub: - """ - touch ${prefix}.bam - touch ${prefix}.log - touch ${prefix}_edit_distance.tsv - touch ${prefix}_per_umi.tsv - touch ${prefix}_per_position.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/umitools/dedup/meta.yml b/modules/nf-core/umitools/dedup/meta.yml deleted file mode 100644 index 534d4c6..0000000 --- a/modules/nf-core/umitools/dedup/meta.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: umitools_dedup -description: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: - - umitools - - deduplication - - dedup -tools: - - umi_tools: - description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes - documentation: https://umi-tools.readthedocs.io/en/latest/ - license: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: | - BAM file containing reads to be deduplicated via UMIs. - pattern: "*.{bam}" - - bai: - type: file - description: | - BAM index files corresponding to the input BAM file. - pattern: "*.{bai}" - - get_output_stats: - type: boolean - description: | - Whether or not to generate output stats. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: BAM file with deduplicated UMIs. - pattern: "*.{bam}" - - log: - type: file - description: File with logging information - pattern: "*.{log}" - - tsv_edit_distance: - type: file - description: Reports the (binned) average edit distance between the UMIs at each position. - pattern: "*edit_distance.tsv" - - tsv_per_umi: - type: file - description: UMI-level summary statistics. - pattern: "*per_umi.tsv" - - tsv_umi_per_position: - type: file - description: Tabulates the counts for unique combinations of UMI and position. - pattern: "*per_position.tsv" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@drpatelh" - - "@grst" - - "@klkeys" diff --git a/modules/nf-core/umitools/extract/environment.yml b/modules/nf-core/umitools/extract/environment.yml new file mode 100644 index 0000000..7d08ac0 --- /dev/null +++ b/modules/nf-core/umitools/extract/environment.yml @@ -0,0 +1,7 @@ +name: umitools_extract +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::umi_tools=1.1.4 diff --git a/modules/nf-core/umitools/extract/main.nf b/modules/nf-core/umitools/extract/main.nf index 2f94fa9..a01ef73 100644 --- a/modules/nf-core/umitools/extract/main.nf +++ b/modules/nf-core/umitools/extract/main.nf @@ -3,7 +3,7 @@ process UMITOOLS_EXTRACT { label "process_single" label "process_long" - conda "bioconda::umi_tools=1.1.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" diff --git a/modules/nf-core/umitools/extract/meta.yml b/modules/nf-core/umitools/extract/meta.yml index db64a0f..7695b27 100644 --- a/modules/nf-core/umitools/extract/meta.yml +++ b/modules/nf-core/umitools/extract/meta.yml @@ -1,15 +1,16 @@ name: umitools_extract description: Extracts UMI barcode from a read and add it to the read name, leaving any sample barcode in place keywords: - - umitools + - UMI + - barcode - extract + - umitools tools: - umi_tools: description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes + UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) and single cell RNA-Seq cell barcodes documentation: https://umi-tools.readthedocs.io/en/latest/ - license: ["MIT"] + license: "MIT" input: - meta: type: map @@ -29,9 +30,7 @@ output: - reads: type: file description: > - Extracted FASTQ files. | - For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | - For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. 
pattern: "*.{fastq.gz}" - log: type: file @@ -41,7 +40,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test b/modules/nf-core/umitools/extract/tests/main.nf.test new file mode 100644 index 0000000..22242d1 --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process UMITOOLS_EXTRACT" + script "../main.nf" + process "UMITOOLS_EXTRACT" + config "./nextflow.config" + tag "modules_nfcore" + tag "modules" + tag "umitools" + tag "umitools/extract" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test.snap b/modules/nf-core/umitools/extract/tests/main.nf.test.snap new file mode 100644 index 0000000..54e77fb --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,866a2da05ce1af35cc07261ffe6bc31a" + ] + ], + "timestamp": "2023-10-17T08:25:55.427194" + } +} \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/nextflow.config b/modules/nf-core/umitools/extract/tests/nextflow.config new file mode 100644 index 0000000..c866f5a --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: UMITOOLS_EXTRACT { 
+ ext.args = '--bc-pattern="NNNN"' + } + +} diff --git a/modules/nf-core/umitools/extract/tests/tags.yml b/modules/nf-core/umitools/extract/tests/tags.yml new file mode 100644 index 0000000..c3fb23d --- /dev/null +++ b/modules/nf-core/umitools/extract/tests/tags.yml @@ -0,0 +1,2 @@ +umitools/extract: + - modules/nf-core/umitools/extract/** diff --git a/modules/nf-core/umitools/group/main.nf b/modules/nf-core/umitools/group/main.nf deleted file mode 100644 index 9a6370b..0000000 --- a/modules/nf-core/umitools/group/main.nf +++ /dev/null @@ -1,62 +0,0 @@ -process UMITOOLS_GROUP { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::umi_tools=1.1.4" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : - 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" - - input: - tuple val(meta), path(bam), path(bai) - val create_bam - val get_group_info - - output: - tuple val(meta), path("*.log") , emit: log - tuple val(meta), path("${prefix}.bam"), optional: true, emit: bam - tuple val(meta), path("*.tsv") , optional: true, emit: tsv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - def paired = meta.single_end ? "" : "--paired" - output_bam = create_bam ? "--output-bam -S ${prefix}.bam" : "" - group_info = get_group_info ? "--group-out ${prefix}.tsv" : "" - - if (create_bam && "$bam" == "${prefix}.bam") { error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
} - - if (!(args ==~ /.*--random-seed.*/)) {args += " --random-seed=100"} - """ - PYTHONHASHSEED=0 umi_tools \\ - group \\ - -I $bam \\ - $output_bam \\ - -L ${prefix}.log \\ - $group_info \\ - $paired \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') - END_VERSIONS - """ - - stub: - prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.bam - touch ${prefix}.log - touch ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/umitools/group/meta.yml b/modules/nf-core/umitools/group/meta.yml deleted file mode 100644 index 1fa826d..0000000 --- a/modules/nf-core/umitools/group/meta.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: umitools_group -description: Group reads based on their UMI and mapping coordinates -keywords: - - umitools - - umi - - deduplication - - dedup - - clustering -tools: - - umi_tools: - description: > - UMI-tools contains tools for dealing with Unique Molecular Identifiers (UMIs)/Random Molecular Tags (RMTs) - and single cell RNA-Seq cell barcodes - documentation: https://umi-tools.readthedocs.io/en/latest/ - license: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: | - BAM file containing reads to be deduplicated via UMIs. - pattern: "*.{bam}" - - bai: - type: file - description: | - BAM index files corresponding to the input BAM file. - pattern: "*.{bai}" - - create_bam: - type: boolean - description: | - Whether or not to create a read group tagged BAM file. 
- - get_group_info: - type: boolean - description: | - Whether or not to generate the flatfile describing the read groups, see docs for complete info of all columns - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: a read group tagged BAM file. - pattern: "${prefix}.{bam}" - - log: - type: file - description: File with logging information - pattern: "*.{log}" - - tsv: - type: file - description: Flatfile describing the read groups, see docs for complete info of all columns - pattern: "*.{tsv}" - -authors: - - "@Joon-Klaps" diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf index 63a6592..3dbb27e 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -96,8 +96,7 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { .out .reads .join(trim_json) - // Change: Bypassing getFastpReadsAfterFiltering when FASTP stub returns empty json - .map { meta, reads, json -> [ meta, reads, json.text ? getFastpReadsAfterFiltering(json) : min_trimmed_reads.toLong()] } + .map { meta, reads, json -> [ meta, reads, getFastpReadsAfterFiltering(json) ] } .set { ch_num_trimmed_reads } ch_num_trimmed_reads diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml index eafb0dc..220e8db 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml @@ -69,8 +69,10 @@ output: - reads: type: file description: > - Extracted FASTQ files. | - For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | + + + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. 
pattern: "*.{fastq.gz}" - fastqc_html: @@ -122,4 +124,5 @@ output: pattern: "versions.yml" authors: - "@robsyme" - - "@gallvp" +maintainers: + - "@robsyme" From bea59ef3a81622b1b6f3d764650f0143c81eaa20 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 22 Nov 2023 13:41:41 +1300 Subject: [PATCH 19/59] Reimported kherronism modules with nf-core/tools --- modules.json | 20 ++++++++++ modules/kherronism/CHANGELOG.md | 21 ---------- modules/kherronism/LICENSE | 21 ---------- modules/kherronism/braker3/main.nf | 50 ++++++------------------ modules/kherronism/braker3/meta.yml | 24 ++++++++++-- modules/kherronism/repeatmasker/main.nf | 19 +-------- modules/kherronism/repeatmasker/meta.yml | 1 - 7 files changed, 53 insertions(+), 103 deletions(-) delete mode 100644 modules/kherronism/CHANGELOG.md delete mode 100644 modules/kherronism/LICENSE diff --git a/modules.json b/modules.json index 14b25cd..6c14afd 100644 --- a/modules.json +++ b/modules.json @@ -2,6 +2,26 @@ "name": "PlantandFoodResearch/pangene", "homePage": "https://github.com/PlantandFoodResearch/pangene", "repos": { + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": [ + "modules" + ] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": [ + "modules" + ] + } + } + } + }, "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { diff --git a/modules/kherronism/CHANGELOG.md b/modules/kherronism/CHANGELOG.md deleted file mode 100644 index 5665af2..0000000 --- a/modules/kherronism/CHANGELOG.md +++ /dev/null @@ -1,21 +0,0 @@ -## Source - -- Repo: https://github.com/kherronism/rewarewaannotation/tree/1a39a83e22fe2d8665a8c6dc49772cce6579983f -- License: See LICENSE file - -## Changes - -### repeatmasker - -1. Added stub -2. Added author in meta.yml -3. 
Changed input "tuple val(meta), path(lib)" to "path(lib)" - -### braker3 - -1. Added stub -2. Added author in meta.yml -3. Made output hintsfile optional as it is not produced for ab-initio annotation. -4. Directed `--AUGUSTUS_CONFIG_PATH` to work folder. This avoids "species already exists" error on subsequent runs with same species. -5. Updated version extractor. -6. Added `containerOptions "-B $TMPDIR:$TMPDIR"` \ No newline at end of file diff --git a/modules/kherronism/LICENSE b/modules/kherronism/LICENSE deleted file mode 100644 index 4b42925..0000000 --- a/modules/kherronism/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) Katie Herron - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
\ No newline at end of file diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index c9d915a..e5cc77c 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -4,22 +4,17 @@ process BRAKER3 { conda "bioconda::braker3=3.0.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'registry.hub.docker.com/teambraker/braker3:v.1.0.3': - 'registry.hub.docker.com/teambraker/braker3:v.1.0.3' }" + 'registry.hub.docker.com/teambraker/braker3:v.1.0.4': + 'registry.hub.docker.com/teambraker/braker3:v.1.0.4' }" input: - tuple val(meta), path(fasta) - path bam - path rnaseq_sets_dirs - path rnaseq_sets_ids - path proteins - path hintsfile + tuple val(meta), path(fasta), path(rnaseq_sets_ids), path(rnaseq_sets_dirs), path(bam), path(proteins), path(hintsfile) output: tuple val(meta), path("${prefix}/braker.gtf") , emit: gtf tuple val(meta), path("${prefix}/braker.codingseq"), emit: cds tuple val(meta), path("${prefix}/braker.aa") , emit: aa - tuple val(meta), path("${prefix}/hintsfile.gff") , emit: hintsfile, optional: true + tuple val(meta), path("${prefix}/hintsfile.gff") , emit: hintsfile tuple val(meta), path("${prefix}/braker.log") , emit: log tuple val(meta), path("${prefix}/what-to-cite.txt"), emit: citations tuple val(meta), path("${prefix}/braker.gff3") , emit: gff3 , optional: true @@ -32,48 +27,27 @@ process BRAKER3 { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def hints = hintsfile ? "--hints=${hintsfile}" : '' + def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' + def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' def bam = bam ? "--bam=${bam}" : '' def proteins = proteins ? "--prot_seq=${proteins}" : '' - def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' - def rna_ids = rnaseq_sets_ids ? 
"--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' + def hints = hintsfile ? "--hints=${hintsfile}" : '' """ - cp -r /usr/share/augustus/config augustus_config - braker.pl \\ --genome ${fasta} \\ --species ${prefix} \\ --workingdir ${prefix} \\ - --AUGUSTUS_CONFIG_PATH "\$(pwd)/augustus_config" \\ --threads ${task.cpus} \\ - ${hints} \\ + ${rna_ids} \\ + ${rna_dirs} \\ ${bam} \\ ${proteins} \\ - ${rna_dirs} \\ - ${rna_ids} \\ + ${hints} \\ ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - braker3: \$(braker.pl --version 2>&1 | grep "version" | sed 's/braker.pl version//; s/\\s*//') - END_VERSIONS - """ - - stub: - prefix = task.ext.prefix ?: "${meta.id}" - """ - mkdir "$prefix" - - touch "${prefix}/braker.gtf" - touch "${prefix}/braker.codingseq" - touch "${prefix}/braker.aa" - touch "${prefix}/hintsfile.gff" - touch "${prefix}/braker.log" - touch "${prefix}/what-to-cite.txt" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - braker3: \$(braker.pl --version 2>&1 | grep "version" | sed 's/braker.pl version//; s/\\s*//') + braker3: \$(braker.pl --version 2>&1 | sed 's/^.*BRAKER3 v//; s/ .*\$//') END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/kherronism/braker3/meta.yml b/modules/kherronism/braker3/meta.yml index b3506fd..9bc13a3 100644 --- a/modules/kherronism/braker3/meta.yml +++ b/modules/kherronism/braker3/meta.yml @@ -22,11 +22,28 @@ input: description: | Groovy Map containing sample information e.g. 
[ id:'test', single_end:false ] - - assembly: + - fasta: type: file - description: Genome assembly + description: Genome assembly fasta pattern: "*.{fasta,fa,fas,faa,fna}" - + - rnaseq_sets_ids: + type: file + description: IDs of RNA-seq data sets to be passed to --rnaseq_sets_ids + - rnaseq_sets_dirs: + type: file + description: Directories of RNA-seq data sets to be passed to --rnaseq_sets_dirs + - bam: + type: file + description: BAM file of RNA-seq data to be passed to --bam + pattern: "*.bam" + - proteins: + type: file + description: Protein evidence to be passed to --proteins + pattern: "*.{fasta,fa,fas,faa}" + - hintsfile: + type: file + description: Hintsfile to be passed to --hintsfile + pattern: "*.{gff, gtf, gff3}" output: - gtf: type: file @@ -55,4 +72,3 @@ output: authors: - "@kherronism" - - "@gallvp" diff --git a/modules/kherronism/repeatmasker/main.nf b/modules/kherronism/repeatmasker/main.nf index fdab29e..6abf0b9 100644 --- a/modules/kherronism/repeatmasker/main.nf +++ b/modules/kherronism/repeatmasker/main.nf @@ -8,8 +8,7 @@ process REPEATMASKER { 'biocontainers/repeatmasker:4.1.5--pl5321hdfd78af_0' }" input: - tuple val(meta), path(fasta) - path(lib) + tuple val(meta), path(fasta), path(lib) output: tuple val(meta), path("${meta.id}/*.f*a.masked") , emit: fasta_masked @@ -40,20 +39,4 @@ process REPEATMASKER { repeatmasker: ${VERSION} END_VERSIONS """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = '4.1.5' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ - mkdir "$meta.id" - - touch "${meta.id}/${meta.id}.fasta.masked" - touch "${meta.id}/${meta.id}.fasta.out" - touch "${meta.id}/${meta.id}.fasta.tbl" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - repeatmasker: ${VERSION} - END_VERSIONS - """ } diff --git a/modules/kherronism/repeatmasker/meta.yml b/modules/kherronism/repeatmasker/meta.yml index eb15048..8adeb55 100644 --- a/modules/kherronism/repeatmasker/meta.yml +++ b/modules/kherronism/repeatmasker/meta.yml @@ -44,4 +44,3 @@ output: authors: - "@kherronism" - - "@gallvp" From 2dda7529b9000a590d9d30f9fbf40d32c85d1a2d Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 22 Nov 2023 15:04:06 +1300 Subject: [PATCH 20/59] Updated braker3 --- modules/kherronism/braker3/main.nf | 37 ++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index e5cc77c..d44c986 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -4,17 +4,22 @@ process BRAKER3 { conda "bioconda::braker3=3.0.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'registry.hub.docker.com/teambraker/braker3:v.1.0.4': - 'registry.hub.docker.com/teambraker/braker3:v.1.0.4' }" + 'registry.hub.docker.com/teambraker/braker3:v.1.0.6': + 'registry.hub.docker.com/teambraker/braker3:v.1.0.6' }" input: - tuple val(meta), path(fasta), path(rnaseq_sets_ids), path(rnaseq_sets_dirs), path(bam), path(proteins), path(hintsfile) + tuple val(meta), path(fasta) + path bam + path rnaseq_sets_dirs + path rnaseq_sets_ids + path proteins + path hintsfile output: tuple val(meta), path("${prefix}/braker.gtf") , emit: gtf tuple val(meta), path("${prefix}/braker.codingseq"), emit: cds tuple val(meta), path("${prefix}/braker.aa") , emit: aa - tuple val(meta), path("${prefix}/hintsfile.gff") , emit: hintsfile + tuple val(meta), path("${prefix}/hintsfile.gff") , emit: hintsfile, optional: true tuple val(meta), path("${prefix}/braker.log") , emit: log tuple val(meta), path("${prefix}/what-to-cite.txt"), emit: citations tuple val(meta), path("${prefix}/braker.gff3") , emit: gff3 , optional: true @@ -33,10 +38,13 @@ process BRAKER3 { def proteins = proteins ? "--prot_seq=${proteins}" : '' def hints = hintsfile ? "--hints=${hintsfile}" : '' """ + cp -r /usr/share/augustus/config augustus_config + braker.pl \\ --genome ${fasta} \\ --species ${prefix} \\ --workingdir ${prefix} \\ + --AUGUSTUS_CONFIG_PATH "\$(pwd)/augustus_config" \\ --threads ${task.cpus} \\ ${rna_ids} \\ ${rna_dirs} \\ @@ -47,7 +55,26 @@ process BRAKER3 { cat <<-END_VERSIONS > versions.yml "${task.process}": - braker3: \$(braker.pl --version 2>&1 | sed 's/^.*BRAKER3 v//; s/ .*\$//') + braker3: \$(braker.pl --version 2>/dev/null | sed 's/braker.pl version//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + def createHints = (rna_ids || bam || proteins || hints) ? 
"touch ${prefix}/hintsfile.gff" : '' + """ + mkdir "$prefix" + + touch "${prefix}/braker.gtf" + touch "${prefix}/braker.codingseq" + touch "${prefix}/braker.aa" + $createHints + touch "${prefix}/braker.log" + touch "${prefix}/what-to-cite.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + braker3: \$(braker.pl --version 2>/dev/null | sed 's/braker.pl version//') END_VERSIONS """ } From 8efa34dcc99a5932cfb7e0ec2927217c6917c0af Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 22 Nov 2023 15:27:07 +1300 Subject: [PATCH 21/59] Updated repeatmasker --- modules/kherronism/repeatmasker/main.nf | 19 ++++++++++++++++++- pangene_pfr.sh | 1 + 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/modules/kherronism/repeatmasker/main.nf b/modules/kherronism/repeatmasker/main.nf index 6abf0b9..fdab29e 100644 --- a/modules/kherronism/repeatmasker/main.nf +++ b/modules/kherronism/repeatmasker/main.nf @@ -8,7 +8,8 @@ process REPEATMASKER { 'biocontainers/repeatmasker:4.1.5--pl5321hdfd78af_0' }" input: - tuple val(meta), path(fasta), path(lib) + tuple val(meta), path(fasta) + path(lib) output: tuple val(meta), path("${meta.id}/*.f*a.masked") , emit: fasta_masked @@ -39,4 +40,20 @@ process REPEATMASKER { repeatmasker: ${VERSION} END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '4.1.5' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
+ """ + mkdir "$meta.id" + + touch "${meta.id}/${meta.id}.fasta.masked" + touch "${meta.id}/${meta.id}.fasta.out" + touch "${meta.id}/${meta.id}.fasta.tbl" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + repeatmasker: ${VERSION} + END_VERSIONS + """ } diff --git a/pangene_pfr.sh b/pangene_pfr.sh index 785199e..3b048df 100644 --- a/pangene_pfr.sh +++ b/pangene_pfr.sh @@ -14,5 +14,6 @@ ml apptainer/1.1 ml nextflow/23.04.4 export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,/workspace/$USER/tmp:/tmp" +export TMPDIR="/workspace/$USER/tmp" nextflow main.nf -profile slurm -resume \ No newline at end of file From 8d31976d3315c9023617d02160693ecebf3bbed8 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 23 Nov 2023 11:08:28 +1300 Subject: [PATCH 22/59] Updated modules --- modules.json | 2 +- modules/nf-core/sortmerna/tests/main.nf.test | 59 +++++++++++++++++++ .../nf-core/sortmerna/tests/main.nf.test.snap | 49 +++++++++++++++ modules/nf-core/sortmerna/tests/tags.yml | 2 + 4 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/sortmerna/tests/main.nf.test create mode 100644 modules/nf-core/sortmerna/tests/main.nf.test.snap create mode 100644 modules/nf-core/sortmerna/tests/tags.yml diff --git a/modules.json b/modules.json index 6c14afd..0510115 100644 --- a/modules.json +++ b/modules.json @@ -84,7 +84,7 @@ }, "sortmerna": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", "installed_by": [ "modules" ] diff --git a/modules/nf-core/sortmerna/tests/main.nf.test b/modules/nf-core/sortmerna/tests/main.nf.test new file mode 100644 index 0000000..3ec2692 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/main.nf.test @@ -0,0 +1,59 @@ +nextflow_process { + + name "Test Process SORTMERNA" + script "../main.nf" + process "SORTMERNA" + tag "modules" + tag "modules_nfcore" + tag "sortmerna" + + test("sarscov2 single_end") { + + when { + process { + """ + 
input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match("se_reads") }, + { assert process.out.log }, + { assert snapshot(process.out.versions).match("se_versions") } + ) + } + + } + + test("sarscov2 paired_end") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match("pe_reads") }, + { assert process.out.log }, + { assert snapshot(process.out.versions).match("pe_versions") } + ) + } + + } + +} diff --git a/modules/nf-core/sortmerna/tests/main.nf.test.snap b/modules/nf-core/sortmerna/tests/main.nf.test.snap new file mode 100644 index 0000000..f1bedb7 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/main.nf.test.snap @@ -0,0 +1,49 @@ +{ + "se_versions": { + "content": [ + [ + "versions.yml:md5,96553a18cad5237fbf76d5a6c966360e" + ] + ], + "timestamp": "2023-11-22T14:25:07.95908694" + }, + "pe_reads": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0", + "test_2.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0" + ] + ] + ] + ], + "timestamp": "2023-11-22T14:25:19.098771475" + }, + "se_reads": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0" + ] + ] + ], + 
"timestamp": "2023-11-22T14:25:07.949212892" + }, + "pe_versions": { + "content": [ + [ + "versions.yml:md5,96553a18cad5237fbf76d5a6c966360e" + ] + ], + "timestamp": "2023-11-22T14:25:19.105098985" + } +} \ No newline at end of file diff --git a/modules/nf-core/sortmerna/tests/tags.yml b/modules/nf-core/sortmerna/tests/tags.yml new file mode 100644 index 0000000..e088480 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/tags.yml @@ -0,0 +1,2 @@ +sortmerna: + - modules/nf-core/sortmerna/** From 5eaa87b2ca3c0b824b48d5e694a57bb4aca568ca Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 23 Nov 2023 12:21:45 +1300 Subject: [PATCH 23/59] Imported fastavalidate and liftoff from pfr/nxf-modules --- modules.json | 20 +++++++ modules/local/fasta_validate/main.nf | 43 -------------- .../main.nf => validate_params.nf} | 0 modules/pfr/fastavalidate/main.nf | 56 ++++++++++++++++++ modules/pfr/fastavalidate/meta.yml | 49 +++++++++++++++ modules/{local => pfr}/liftoff/main.nf | 33 ++++++----- modules/pfr/liftoff/meta.yml | 59 +++++++++++++++++++ 7 files changed, 201 insertions(+), 59 deletions(-) delete mode 100644 modules/local/fasta_validate/main.nf rename modules/local/{validate_params/main.nf => validate_params.nf} (100%) create mode 100644 modules/pfr/fastavalidate/main.nf create mode 100644 modules/pfr/fastavalidate/meta.yml rename modules/{local => pfr}/liftoff/main.nf (56%) create mode 100644 modules/pfr/liftoff/meta.yml diff --git a/modules.json b/modules.json index 0510115..69df7af 100644 --- a/modules.json +++ b/modules.json @@ -2,6 +2,26 @@ "name": "PlantandFoodResearch/pangene", "homePage": "https://github.com/PlantandFoodResearch/pangene", "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "fastavalidate": { + "branch": "main", + "git_sha": "5189302ed5fcbb927689a89812c0f792622c35d2", + "installed_by": [ + "modules" + ] + }, + "liftoff": { + "branch": "main", + "git_sha": "14fd0a73898339bede7ae7bc14077a47c847c5b5", 
+ "installed_by": [ + "modules" + ] + } + } + } + }, "git@github.com:kherronism/nf-modules.git": { "modules": { "kherronism": { diff --git a/modules/local/fasta_validate/main.nf b/modules/local/fasta_validate/main.nf deleted file mode 100644 index 7c37c39..0000000 --- a/modules/local/fasta_validate/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process FASTA_VALIDATE { - tag "$meta.id" - label "process_single" - - container "docker://gallvp/fasta_validator:a6a2ec1_ps" - - input: - tuple val(meta), path(fasta_file) - - output: - tuple val(meta), path("$validFasta") , emit: valid_fasta - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" - """ - fasta_validate -v $fasta_file >/dev/null - - # If invalid, the above command will fail and - # the NXF error startegy will kick in. - - cat $fasta_file > $validFasta - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) - END_VERSIONS - """ - - stub: - validFasta = (fasta_file.toString() - ~/\.\w+$/) + ".validated.fasta" - """ - touch $validFasta - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) - END_VERSIONS - """ -} \ No newline at end of file diff --git a/modules/local/validate_params/main.nf b/modules/local/validate_params.nf similarity index 100% rename from modules/local/validate_params/main.nf rename to modules/local/validate_params.nf diff --git a/modules/pfr/fastavalidate/main.nf b/modules/pfr/fastavalidate/main.nf new file mode 100644 index 0000000..873983b --- /dev/null +++ b/modules/pfr/fastavalidate/main.nf @@ -0,0 +1,56 @@ +process FASTAVALIDATE { + tag "$meta.id" + label 'process_single' + + // conda "YOUR-TOOL-HERE" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': + // 'biocontainers/YOUR-TOOL-HERE' }" + container 'docker://gallvp/fasta_validator:a6a2ec1_ps' + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.validated.fasta') , emit: valid_fasta , optional: true + tuple val(meta), path('*.error.log') , emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fasta_validate \\ + -v $fasta \\ + 2> "${prefix}.error.log" \\ + || echo "Errors from fasta_validate printed to ${prefix}.error.log" + + if [ \$(cat "${prefix}.error.log" | wc -l) -gt 0 ]; then + echo "Validation failed..." + cat "${prefix}.error.log" + else + rm "${prefix}.error.log" + + cat $fasta \\ + > "${prefix}.validated.fasta" + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}.validated.fasta" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) + END_VERSIONS + """ +} diff --git a/modules/pfr/fastavalidate/meta.yml b/modules/pfr/fastavalidate/meta.yml new file mode 100644 index 0000000..ff63b88 --- /dev/null +++ b/modules/pfr/fastavalidate/meta.yml @@ -0,0 +1,49 @@ +name: "fastavalidate" +description: | + "A simple validator for fasta files. The module emits the validated file or an + error log upon validation failure." +keywords: + - fasta + - validation + - genome +tools: + - fasta_validate: + description: | + "A simple C code to validate a fasta file. It only checks a few things, + and by default only sets its response via the return code, + so you will need to check that!" 
+ homepage: "https://github.com/gallvp/fasta_validator" + documentation: "https://github.com/gallvp/fasta_validator" + tool_dev_url: "https://github.com/gallvp/fasta_validator" + doi: "10.5281/zenodo.2532044" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing file information + e.g. [ id:'test' ] + - fasta: + type: file + description: Input fasta file + pattern: "*.fasta" +output: + - meta: + type: map + description: | + Groovy Map containing file information + e.g. [ id:'test' ] + - valid_fasta: + type: file + description: Validated fasta file if the validation succeeds + pattern: "*.validated.fasta" + - error_log: + type: file + description: Error log if the validation fails + pattern: "*.error.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@gallvp" diff --git a/modules/local/liftoff/main.nf b/modules/pfr/liftoff/main.nf similarity index 56% rename from modules/local/liftoff/main.nf rename to modules/pfr/liftoff/main.nf index e10374d..5356728 100644 --- a/modules/local/liftoff/main.nf +++ b/modules/pfr/liftoff/main.nf @@ -1,13 +1,16 @@ process LIFTOFF { tag "$meta.id" - label "process_high" + label 'process_high' - container 'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0' + conda "bioconda::liftoff=1.6.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0': + 'biocontainers/liftoff:1.6.3--pyhdfd78af_0' }" input: tuple val(meta), path(target_fa) - path ref_fa - path ref_gff + path ref_fa, name: 'liftoff_reference_assembly.fa' // To avoid name collisions betwen target_fa and ref_fa + path ref_annotation output: tuple val(meta), path("*.gff3") , emit: gff3 @@ -23,18 +26,16 @@ process LIFTOFF { def prefix = task.ext.prefix ?: "${meta.id}" """ liftoff \\ - -g $ref_gff \\ - -p $task.cpus \\ - -o "${prefix}.gff3" \\ - -u "${prefix}.unmapped.txt" \\ - $args \\ - $target_fa \\ - $ref_fa \\ - 2> liftoff.stderr + -g $ref_annotation \\ + -p $task.cpus \\ + -o "${prefix}.gff3" \\ + -u "${prefix}.unmapped.txt" \\ + $args \\ + $target_fa \\ + liftoff_reference_assembly.fa - [ -f "${prefix}.gff3_polished" ] \\ - && mv "${prefix}.gff3_polished" "${prefix}.polished.gff3" \\ - || echo "-polish is absent" + mv "${prefix}.gff3_polished" "${prefix}.polished.gff3" \\ + || echo "-polish is absent" cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -53,4 +54,4 @@ process LIFTOFF { liftoff: \$(liftoff --version 2> /dev/null) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/modules/pfr/liftoff/meta.yml b/modules/pfr/liftoff/meta.yml new file mode 100644 index 0000000..e859282 --- /dev/null +++ b/modules/pfr/liftoff/meta.yml @@ -0,0 +1,59 @@ +name: "liftoff" +description: "Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the same, or closely-related species." +keywords: + - genome + - annotation + - gff3 + - gtf + - liftover +tools: + - "liftoff": + description: "Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the same, or closely-related species." 
+ homepage: "https://github.com/agshumate/Liftoff" + documentation: "https://github.com/agshumate/Liftoff" + tool_dev_url: "https://github.com/agshumate/Liftoff" + doi: "10.1093/bioinformatics/bty191" + licence: ["GPL v3 License"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - target_fa: + type: file + description: Target assembly in fasta format + pattern: "*.{fsa,fa,fasta}" + - ref_fa: + type: file + description: Reference assembly in fasta format + pattern: "*.{fsa,fa,fasta}" + - ref_annotation: + type: file + description: Reference assembly annotations in gtf or gff3 format + pattern: "*.{gtf,gff3}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gff3: + type: file + description: Lifted annotations for the target assembly in gff3 format + pattern: "*.gff3" + - polished_gff3: + type: file + description: Polished lifted annotations for the target assembly in gff3 format + pattern: "*.polished.gff3" + optional: true + - unmapped_txt: + type: file + description: List of unmapped reference annotations + pattern: "*.unmapped.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@gallvp" From 0a96c7f1de7091d674b0397ad6ddec1739feab49 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 13 Dec 2023 12:21:15 +1300 Subject: [PATCH 24/59] Updated modules and subworkflows --- modules.json | 255 +++--- modules/nf-core/cat/cat/main.nf | 12 +- modules/nf-core/cat/cat/tests/main.nf.test | 26 + .../nf-core/fastavalidator/environment.yml | 9 + modules/nf-core/fastavalidator/main.nf | 62 ++ .../fastavalidator}/meta.yml | 26 +- .../nf-core/fastavalidator/tests/main.nf.test | 60 ++ .../fastavalidator/tests/main.nf.test.snap | 76 ++ modules/nf-core/fastavalidator/tests/tags.yml | 2 + modules/nf-core/fastp/main.nf | 6 +- modules/nf-core/fastqc/main.nf | 4 +- 
modules/nf-core/fastqc/tests/main.nf.test | 68 ++ modules/nf-core/gffread/main.nf | 10 +- modules/nf-core/gffread/meta.yml | 6 +- modules/nf-core/gffread/tests/main.nf.test | 33 +- .../nf-core/gffread/tests/main.nf.test.snap | 39 +- modules/nf-core/gffread/tests/nextflow.config | 5 + modules/nf-core/samtools/cat/environment.yml | 2 +- modules/nf-core/samtools/cat/main.nf | 4 +- .../nf-core/samtools/cat/tests/main.nf.test | 72 ++ .../samtools/cat/tests/main.nf.test.snap | 26 + modules/nf-core/samtools/cat/tests/tags.yml | 2 + modules/nf-core/star/align/environment.yml | 2 +- modules/nf-core/star/align/main.nf | 4 +- modules/nf-core/star/align/tests/main.nf.test | 339 ++++++++ .../star/align/tests/main.nf.test.snap | 769 ++++++++++++++++++ .../star/align/tests/nextflow.arriba.config | 14 + .../nf-core/star/align/tests/nextflow.config | 14 + .../align/tests/nextflow.starfusion.config | 14 + modules/nf-core/star/align/tests/tags.yml | 2 + .../star/genomegenerate/environment.yml | 2 +- modules/nf-core/star/genomegenerate/main.nf | 4 +- .../star/genomegenerate/tests/main.nf.test | 38 + .../genomegenerate/tests/main.nf.test.snap | 16 + .../star/genomegenerate/tests/tags.yml | 2 + modules/nf-core/umitools/extract/main.nf | 4 +- .../umitools/extract/tests/main.nf.test.snap | 4 +- modules/pfr/fastavalidate/main.nf | 56 -- modules/pfr/liftoff/environment.yml | 9 + modules/pfr/liftoff/main.nf | 14 +- modules/pfr/liftoff/meta.yml | 7 +- modules/pfr/liftoff/tests/main.nf.test | 89 ++ modules/pfr/liftoff/tests/main.nf.test.snap | 23 + modules/pfr/liftoff/tests/nextflow.config | 5 + modules/pfr/liftoff/tests/tags.yml | 2 + .../tests/main.nf.test | 60 ++ .../tests/main.nf.test.snap | 81 ++ .../tests/tags.yml | 2 + 48 files changed, 2129 insertions(+), 252 deletions(-) create mode 100644 modules/nf-core/fastavalidator/environment.yml create mode 100644 modules/nf-core/fastavalidator/main.nf rename modules/{pfr/fastavalidate => nf-core/fastavalidator}/meta.yml (52%) create mode 
100644 modules/nf-core/fastavalidator/tests/main.nf.test create mode 100644 modules/nf-core/fastavalidator/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastavalidator/tests/tags.yml create mode 100644 modules/nf-core/gffread/tests/nextflow.config create mode 100644 modules/nf-core/samtools/cat/tests/main.nf.test create mode 100644 modules/nf-core/samtools/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/cat/tests/tags.yml create mode 100644 modules/nf-core/star/align/tests/main.nf.test create mode 100644 modules/nf-core/star/align/tests/main.nf.test.snap create mode 100644 modules/nf-core/star/align/tests/nextflow.arriba.config create mode 100644 modules/nf-core/star/align/tests/nextflow.config create mode 100644 modules/nf-core/star/align/tests/nextflow.starfusion.config create mode 100644 modules/nf-core/star/align/tests/tags.yml create mode 100644 modules/nf-core/star/genomegenerate/tests/main.nf.test create mode 100644 modules/nf-core/star/genomegenerate/tests/main.nf.test.snap create mode 100644 modules/nf-core/star/genomegenerate/tests/tags.yml delete mode 100644 modules/pfr/fastavalidate/main.nf create mode 100644 modules/pfr/liftoff/environment.yml create mode 100644 modules/pfr/liftoff/tests/main.nf.test create mode 100644 modules/pfr/liftoff/tests/main.nf.test.snap create mode 100644 modules/pfr/liftoff/tests/nextflow.config create mode 100644 modules/pfr/liftoff/tests/tags.yml create mode 100644 subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test create mode 100644 subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap create mode 100644 subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml diff --git a/modules.json b/modules.json index 69df7af..cde5635 100644 --- a/modules.json +++ b/modules.json @@ -1,148 +1,113 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - 
"git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "fastavalidate": { - "branch": "main", - "git_sha": "5189302ed5fcbb927689a89812c0f792622c35d2", - "installed_by": [ - "modules" - ] - }, - "liftoff": { - "branch": "main", - "git_sha": "14fd0a73898339bede7ae7bc14077a47c847c5b5", - "installed_by": [ - "modules" - ] - } - } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": [ - "modules" - ] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": [ - "modules" - ] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "cat/fastq": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": [ - "modules" - ] - }, - "fastp": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ] - }, - "fastqc": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules", - "fastq_fastqc_umitools_fastp" - ] - }, - "gffread": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "samtools/cat": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "sortmerna": { - "branch": "master", 
- "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": [ - "modules" - ] - }, - "star/align": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "umitools/extract": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": [ - "subworkflows" - ] - } - } - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "liftoff": { + "branch": "main", + "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "installed_by": ["modules"] + } } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": ["modules"] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": ["modules"] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": ["modules"] + }, + "fastavalidator": { + "branch": "master", + "git_sha": 
"89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["fastq_fastqc_umitools_fastp"] + }, + "fastqc": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "sortmerna": { + "branch": "master", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", + "installed_by": ["modules"] + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp"] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": ["subworkflows"] + } + } + } } -} \ No newline at end of file + } +} diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 4264a92..970ab76 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -35,6 +35,10 @@ process CAT_CAT { in_zip = file_list[0].endsWith('.gz') command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' command2 = (!in_zip && out_zip) ? 
"| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } """ $command1 \\ $args \\ @@ -49,8 +53,12 @@ process CAT_CAT { """ stub: - def file_list = files_in.collect { it.toString() } - prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } """ touch $prefix diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test index 5766daa..ed5a4f1 100644 --- a/modules/nf-core/cat/cat/tests/main.nf.test +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -8,6 +8,32 @@ nextflow_process { tag "cat" tag "cat/cat" + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") } + ) + } + } + test("test_cat_unzipped_unzipped") { when { params { diff --git a/modules/nf-core/fastavalidator/environment.yml b/modules/nf-core/fastavalidator/environment.yml new file mode 100644 index 0000000..70f346e --- /dev/null +++ b/modules/nf-core/fastavalidator/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "fastavalidator" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::py_fasta_validator=0.6" diff --git a/modules/nf-core/fastavalidator/main.nf b/modules/nf-core/fastavalidator/main.nf new file mode 100644 index 0000000..ac5470f --- /dev/null +++ b/modules/nf-core/fastavalidator/main.nf @@ -0,0 +1,62 @@ +process FASTAVALIDATOR { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/py_fasta_validator:0.6--py37h595c7a6_0': + 'biocontainers/py_fasta_validator:0.6--py37h595c7a6_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.success.log') , emit: success_log , optional: true + tuple val(meta), path('*.error.log') , emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + py_fasta_validator \\ + -f $fasta \\ + 2> "${prefix}.error.log" \\ + || echo "Errors from fasta_validate printed to ${prefix}.error.log" + + if [ \$(cat "${prefix}.error.log" | wc -l) -gt 0 ]; then + echo "Validation failed..." + + cat \\ + "${prefix}.error.log" + else + echo "Validation successful..." + + mv \\ + "${prefix}.error.log" \\ + fasta_validate.stderr + + echo "Validation successful..." \\ + > "${prefix}.success.log" + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "Validation successful..." 
\\ + > "${prefix}.success.log" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + py_fasta_validator: \$(py_fasta_validator -v | sed 's/.* version //') + END_VERSIONS + """ +} diff --git a/modules/pfr/fastavalidate/meta.yml b/modules/nf-core/fastavalidator/meta.yml similarity index 52% rename from modules/pfr/fastavalidate/meta.yml rename to modules/nf-core/fastavalidator/meta.yml index ff63b88..c5c4371 100644 --- a/modules/pfr/fastavalidate/meta.yml +++ b/modules/nf-core/fastavalidator/meta.yml @@ -1,6 +1,8 @@ -name: "fastavalidate" +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "fastavalidator" description: | - "A simple validator for fasta files. The module emits the validated file or an + "Python C-extension for a simple validator for fasta files. The module emits the validated file or an error log upon validation failure." keywords: - fasta @@ -9,13 +11,13 @@ keywords: tools: - fasta_validate: description: | - "A simple C code to validate a fasta file. It only checks a few things, + "Python C-extension for a simple C code to validate a fasta file. It only checks a few things, and by default only sets its response via the return code, so you will need to check that!" - homepage: "https://github.com/gallvp/fasta_validator" - documentation: "https://github.com/gallvp/fasta_validator" - tool_dev_url: "https://github.com/gallvp/fasta_validator" - doi: "10.5281/zenodo.2532044" + homepage: "https://github.com/linsalrob/py_fasta_validator" + documentation: "https://github.com/linsalrob/py_fasta_validator" + tool_dev_url: "https://github.com/linsalrob/py_fasta_validator" + doi: "10.5281/zenodo.5002710" licence: ["MIT"] input: - meta: @@ -33,13 +35,13 @@ output: description: | Groovy Map containing file information e.g. 
[ id:'test' ] - - valid_fasta: + - success_log: type: file - description: Validated fasta file if the validation succeeds - pattern: "*.validated.fasta" + description: Log file for successful validation + pattern: "*.success.log" - error_log: type: file - description: Error log if the validation fails + description: Log file for failed validation pattern: "*.error.log" - versions: type: file @@ -47,3 +49,5 @@ output: pattern: "versions.yml" authors: - "@gallvp" +maintainers: + - "@gallvp" diff --git a/modules/nf-core/fastavalidator/tests/main.nf.test b/modules/nf-core/fastavalidator/tests/main.nf.test new file mode 100644 index 0000000..bb8c22c --- /dev/null +++ b/modules/nf-core/fastavalidator/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process FASTAVALIDATOR" + script "../main.nf" + process "FASTAVALIDATOR" + + tag "modules" + tag "modules_nfcore" + tag "fastavalidator" + + test("sarscov2-fasta-valid") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.success_log != null }, + { assert process.out.error_log == [] }, + { assert path(process.out.success_log.get(0).get(1)).getText().contains("Validation successful...") } + ) + } + + } + + test("sarscov2-gff3-invalid") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.success_log == [] }, + { assert process.out.error_log != null }, + { assert path(process.out.error_log.get(0).get(1)).getText().contains("genome.gff3 does not start with a >") } + ) + } + + } +} diff --git 
a/modules/nf-core/fastavalidator/tests/main.nf.test.snap b/modules/nf-core/fastavalidator/tests/main.nf.test.snap new file mode 100644 index 0000000..382dee7 --- /dev/null +++ b/modules/nf-core/fastavalidator/tests/main.nf.test.snap @@ -0,0 +1,76 @@ +{ + "sarscov2-fasta-valid": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.success.log:md5,b0b859eda1db5cd43915846e00ebc22c" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,05aa059840b3b4dd6d88bc1e4bf976d7" + ], + "error_log": [ + + ], + "success_log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.success.log:md5,b0b859eda1db5cd43915846e00ebc22c" + ] + ], + "versions": [ + "versions.yml:md5,05aa059840b3b4dd6d88bc1e4bf976d7" + ] + } + ], + "timestamp": "2023-11-28T11:23:25.106872" + }, + "sarscov2-gff3-invalid": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.error.log:md5,531d520c0e7767176f743f197f1f87b3" + ] + ], + "2": [ + "versions.yml:md5,05aa059840b3b4dd6d88bc1e4bf976d7" + ], + "error_log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.error.log:md5,531d520c0e7767176f743f197f1f87b3" + ] + ], + "success_log": [ + + ], + "versions": [ + "versions.yml:md5,05aa059840b3b4dd6d88bc1e4bf976d7" + ] + } + ], + "timestamp": "2023-11-28T11:23:29.40324" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastavalidator/tests/tags.yml b/modules/nf-core/fastavalidator/tests/tags.yml new file mode 100644 index 0000000..c3c7757 --- /dev/null +++ b/modules/nf-core/fastavalidator/tests/tags.yml @@ -0,0 +1,2 @@ +fastavalidator: + - "modules/nf-core/fastavalidator/**" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index c8e815a..5fac3c1 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -45,7 +45,7 @@ process FASTP { $adapter_list \\ $fail_fastq \\ $args \\ - 2> ${prefix}.fastp.log \\ + 2> >(tee ${prefix}.fastp.log >&2) \\ | gzip -c 
> ${prefix}.fastp.fastq.gz cat <<-END_VERSIONS > versions.yml @@ -66,7 +66,7 @@ process FASTP { $adapter_list \\ $fail_fastq \\ $args \\ - 2> ${prefix}.fastp.log + 2> >(tee ${prefix}.fastp.log >&2) cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -91,7 +91,7 @@ process FASTP { --thread $task.cpus \\ --detect_adapter_for_pe \\ $args \\ - 2> ${prefix}.fastp.log + 2> >(tee ${prefix}.fastp.log >&2) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 50e59f2..9e19a74 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -37,7 +37,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ @@ -49,7 +49,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index 6437a14..b9e8f92 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -38,4 +38,72 @@ nextflow_process { ) } } +// TODO +// // +// // Test with paired-end data +// // +// workflow test_fastqc_paired_end { +// input = [ +// [id: 'test', single_end: false], // meta map +// [ +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) +// ] +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with interleaved data +// // +// workflow test_fastqc_interleaved { +// input = [ +// [id: 'test', single_end: false], // meta map +// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: 
true) +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with bam data +// // +// workflow test_fastqc_bam { +// input = [ +// [id: 'test', single_end: false], // meta map +// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with multiple samples +// // +// workflow test_fastqc_multiple { +// input = [ +// [id: 'test', single_end: false], // meta map +// [ +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) +// ] +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with custom prefix +// // +// workflow test_fastqc_custom_prefix { +// input = [ +// [ id:'mysample', single_end:true ], // meta map +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) +// ] + +// FASTQC ( input ) +// } } diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf index 68f8045..d8a473e 100644 --- a/modules/nf-core/gffread/main.nf +++ b/modules/nf-core/gffread/main.nf @@ -11,20 +11,22 @@ process GFFREAD { path gff output: - path "*.gtf" , emit: gtf + path "*.gtf" , emit: gtf , optional: true + path "*.gff3" , emit: gffread_gff , optional: true path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${gff.baseName}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gff.baseName}" + def extension = args.contains("-T") ? 
'gtf' : 'gffread.gff3' """ gffread \\ $gff \\ $args \\ - -o ${prefix}.gtf + -o ${prefix}.${extension} cat <<-END_VERSIONS > versions.yml "${task.process}": gffread: \$(gffread --version 2>&1) diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml index f486f8b..27ac310 100644 --- a/modules/nf-core/gffread/meta.yml +++ b/modules/nf-core/gffread/meta.yml @@ -20,8 +20,12 @@ input: output: - gtf: type: file - description: GTF file resulting from the conversion of the GFF input file + description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present pattern: "*.{gtf}" + - gffread_gff: + type: file + description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent + pattern: "*.{gff3}" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test index 67d47ec..3c064b3 100644 --- a/modules/nf-core/gffread/tests/main.nf.test +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -3,11 +3,38 @@ nextflow_process { name "Test Process GFFREAD" script "../main.nf" process "GFFREAD" + tag "gffread" tag "modules_nfcore" tag "modules" - test("Should run without failures") { + test("sarscov2-gff3-gtf") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf != null }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-gff3") { when { params { @@ -23,7 +50,9 @@ nextflow_process { then { assertAll ( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert 
process.out.gffread_gff != null }, ) } diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap index fb5460c..1f1342e 100644 --- a/modules/nf-core/gffread/tests/main.nf.test.snap +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -1,21 +1,52 @@ { - "Should run without failures": { + "sarscov2-gff3-gtf": { "content": [ { "0": [ - "genome.gtf:md5,f184f856b7fe3e159d21b052b5dd3954" + "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" ], "1": [ + + ], + "2": [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ], + "gffread_gff": [ + + ], + "gtf": [ + "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" + ], + "versions": [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + } + ], + "timestamp": "2023-11-29T15:39:30.006985" + }, + "sarscov2-gff3-gff3": { + "content": [ + { + "0": [ + + ], + "1": [ + "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + ], + "2": [ "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" ], + "gffread_gff": [ + "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + ], "gtf": [ - "genome.gtf:md5,f184f856b7fe3e159d21b052b5dd3954" + ], "versions": [ "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" ] } ], - "timestamp": "2023-10-17T10:00:08.542490523" + "timestamp": "2023-11-29T15:39:34.636061" } } \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow.config b/modules/nf-core/gffread/tests/nextflow.config new file mode 100644 index 0000000..74b2509 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-T' + } +} diff --git a/modules/nf-core/samtools/cat/environment.yml b/modules/nf-core/samtools/cat/environment.yml index 80da1cf..0455a7d 100644 --- a/modules/nf-core/samtools/cat/environment.yml +++ b/modules/nf-core/samtools/cat/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::samtools=1.17 + - bioconda::samtools=1.18 
diff --git a/modules/nf-core/samtools/cat/main.nf b/modules/nf-core/samtools/cat/main.nf index 5d939aa..b3b2508 100644 --- a/modules/nf-core/samtools/cat/main.nf +++ b/modules/nf-core/samtools/cat/main.nf @@ -4,8 +4,8 @@ process SAMTOOLS_CAT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input_files, stageAs: "?/*") diff --git a/modules/nf-core/samtools/cat/tests/main.nf.test b/modules/nf-core/samtools/cat/tests/main.nf.test new file mode 100644 index 0000000..49c633f --- /dev/null +++ b/modules/nf-core/samtools/cat/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process SAMTOOLS_CAT" + script "../main.nf" + process "SAMTOOLS_CAT" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/cat" + + test("sarscov2 - [bam1, bam2]") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_unaligned_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [bam1, bam2] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_unaligned_bam'], checkIfExists: true) + ] + ] + """ + } 
+ } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/cat/tests/main.nf.test.snap b/modules/nf-core/samtools/cat/tests/main.nf.test.snap new file mode 100644 index 0000000..298e25d --- /dev/null +++ b/modules/nf-core/samtools/cat/tests/main.nf.test.snap @@ -0,0 +1,26 @@ +{ + "sarscov2 - [bam1, bam2]": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,f10a4f6b2e0272bef2ceb4ca826a15a1" + ] + ], + "timestamp": "2023-12-04T14:00:18.264348819" + }, + "sarscov2 - [bam1, bam2] - stub": { + "content": [ + "test.bam", + [ + + ], + [ + "versions.yml:md5,f10a4f6b2e0272bef2ceb4ca826a15a1" + ] + ], + "timestamp": "2023-12-04T14:03:17.714482742" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/cat/tests/tags.yml b/modules/nf-core/samtools/cat/tests/tags.yml new file mode 100644 index 0000000..9760557 --- /dev/null +++ b/modules/nf-core/samtools/cat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/cat: + - "modules/nf-core/samtools/cat/**" diff --git a/modules/nf-core/star/align/environment.yml b/modules/nf-core/star/align/environment.yml index 6db2098..36fcd02 100644 --- a/modules/nf-core/star/align/environment.yml +++ b/modules/nf-core/star/align/environment.yml @@ -5,5 +5,5 @@ channels: - defaults dependencies: - bioconda::star=2.7.10a - - bioconda::samtools=1.16.1 + - bioconda::samtools=1.18 - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index fa645a6..8e9c48b 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -4,8 +4,8 @@ process STAR_ALIGN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : - 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" input: tuple val(meta), path(reads, stageAs: "input*/*") diff --git a/modules/nf-core/star/align/tests/main.nf.test b/modules/nf-core/star/align/tests/main.nf.test new file mode 100644 index 0000000..4c87847 --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test @@ -0,0 +1,339 @@ +nextflow_process { + + name "Test Process STAR_ALIGN" + script "../main.nf" + process "STAR_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/align" + + test("homo_sapiens - single_end") { + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true) ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - single_end - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - single_end - log_out") }, + { assert snapshot(process.out.bam).match("homo_sapiens - single_end - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - single_end - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - single_end - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - single_end - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - single_end - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - single_end - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - single_end - junction") }, + { assert snapshot(process.out.log_progress).match("homo_sapiens - single_end - log_progress") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - single_end - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - single_end - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - single_end - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - single_end - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - single_end - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - single_end - versions") } + ) + } + } + + test("homo_sapiens - paired_end") { + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + } + + when { + 
process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - log_out") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - junction") }, + { assert snapshot(process.out.log_progress).match("homo_sapiens - paired_end - log_progress") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - tab") }, + { assert 
snapshot(process.out.wig).match("homo_sapiens - paired_end - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - versions") } + ) + } + } + + test("homo_sapiens - paired_end - arriba") { + config "./nextflow.arriba.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - arriba - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - arriba - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - arriba - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - arriba - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - arriba - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - arriba - bam_transcript") }, + { assert 
snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - arriba - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - arriba - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - arriba - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - arriba - junction") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - arriba - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - arriba - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - arriba - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - arriba - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - arriba - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - arriba - versions") } + ) + } + } + + test("homo_sapiens - paired_end - starfusion") { + config "./nextflow.starfusion.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + 
]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - starfusion - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - starfusion - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - starfusion - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - starfusion - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - starfusion - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - starfusion - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - starfusion - junction") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - starfusion - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - starfusion - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - starfusion - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - starfusion - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - starfusion - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - starfusion - versions") } + ) + } + } + + test("homo_sapiens - paired_end - multiple") { + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + 
script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - multiple - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - multiple - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - multiple - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - multiple - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - multiple - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - multiple - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - multiple - bam_unsorted") }, + { assert 
snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - multiple - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - multiple - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - multiple - junction") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - multiple - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - multiple - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - multiple - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - multiple - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - multiple - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - multiple - versions") } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/star/align/tests/main.nf.test.snap b/modules/nf-core/star/align/tests/main.nf.test.snap new file mode 100644 index 0000000..08edb91 --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test.snap @@ -0,0 +1,769 @@ +{ + "homo_sapiens - paired_end - multiple - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ] + ], + "timestamp": "2023-12-04T18:01:19.968225733" + }, + "homo_sapiens - paired_end - multiple - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.857804" + }, + "homo_sapiens - paired_end - arriba - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.347549723" + }, + "homo_sapiens - single_end - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.24701" + }, + "homo_sapiens - paired_end - sam": { + "content": [ 
+ [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.383818" + }, + "homo_sapiens - paired_end - arriba - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:56:12.431212643" + }, + "homo_sapiens - paired_end - multiple - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.07119229" + }, + "homo_sapiens - paired_end - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.368841" + }, + "homo_sapiens - paired_end - arriba - bedgraph": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.102537" + }, + "homo_sapiens - single_end - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.185369" + }, + "homo_sapiens - paired_end - arriba - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.268388251" + }, + "homo_sapiens - single_end - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.216183" + }, + "homo_sapiens - paired_end - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.327236" + }, + "homo_sapiens - single_end - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:53:26.664210196" + }, + "homo_sapiens - paired_end - multiple - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:29:01.022176" + }, + "homo_sapiens - paired_end - arriba - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.15277" + }, + "homo_sapiens - paired_end - multiple - junction": { + "content": [ + [ + + 
] + ], + "timestamp": "2023-11-23T13:29:01.52923" + }, + "homo_sapiens - paired_end - multiple - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.189486201" + }, + "homo_sapiens - paired_end - starfusion - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:27:55.905883" + }, + "homo_sapiens - paired_end - starfusion - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.192302" + }, + "homo_sapiens - paired_end - multiple - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.661837" + }, + "homo_sapiens - paired_end - multiple - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:29:00.966417" + }, + "homo_sapiens - paired_end - starfusion - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,bcad07b838f6762fc01eea52b5cd3f84" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.53235164" + }, + "homo_sapiens - paired_end - arriba - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.202776" + }, + "homo_sapiens - single_end - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Signal.Unique.str1.out.bg:md5,c56fc1472776fb927eaf62d973da5f9a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,e93373cf6f2a2a9506e2efdb260cdd4f" + ] + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.394863748" + }, + "homo_sapiens - paired_end - arriba - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.251962" + }, + "homo_sapiens - paired_end - starfusion - bam_sorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.040843" + }, + "homo_sapiens - single_end - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.154172" + }, + 
"homo_sapiens - paired_end - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ] + ], + "timestamp": "2023-12-04T17:54:11.934832258" + }, + "homo_sapiens - paired_end - arriba - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:06.998817" + }, + "homo_sapiens - paired_end - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:23:33.259699" + }, + "homo_sapiens - paired_end - arriba - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:25:06.849451" + }, + "homo_sapiens - paired_end - multiple - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T18:01:20.393705142" + }, + "homo_sapiens - paired_end - starfusion - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.082408" + }, + "homo_sapiens - paired_end - starfusion - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.818041322" + }, + "homo_sapiens - single_end - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.175307" + }, + "homo_sapiens - paired_end - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.255481058" + }, + "homo_sapiens - paired_end - starfusion - bedgraph": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.155413" + }, + "homo_sapiens - single_end - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.144852" + }, + "homo_sapiens - paired_end - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": 
"2023-12-04T17:54:12.343840482" + }, + "homo_sapiens - paired_end - multiple - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.291692062" + }, + "homo_sapiens - single_end - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.265642675" + }, + "homo_sapiens - paired_end - arriba - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.444214" + }, + "homo_sapiens - paired_end - log_progress": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,b2bd061d6cbaaf3d6d3b1fed547f69b8" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.126063825" + }, + "homo_sapiens - paired_end - arriba - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:25:06.829799" + }, + "homo_sapiens - paired_end - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.300509" + }, + "homo_sapiens - paired_end - arriba - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.300383" + }, + "homo_sapiens - paired_end - multiple - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ] + ], + "timestamp": "2023-12-04T18:01:19.851247126" + }, + "homo_sapiens - paired_end - multiple - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.462257" + }, + "homo_sapiens - single_end - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.335457371" + }, + "homo_sapiens - paired_end - arriba - 
bam_sorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:06.94699" + }, + "homo_sapiens - paired_end - starfusion - junction": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,c10ef219f4a30e83711b995bc5e40dba" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.641115828" + }, + "homo_sapiens - single_end - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.580593434" + }, + "homo_sapiens - paired_end - starfusion - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:59:58.907317103" + }, + "homo_sapiens - paired_end - multiple - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.330463" + }, + "homo_sapiens - paired_end - arriba - log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:25:06.86866" + }, + "homo_sapiens - paired_end - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.064121304" + }, + "homo_sapiens - paired_end - starfusion - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.118974" + }, + "homo_sapiens - paired_end - starfusion - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.264699" + }, + "homo_sapiens - paired_end - multiple - log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:29:01.076947" + }, + "homo_sapiens - paired_end - arriba - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.050409" + }, + "homo_sapiens - 
paired_end - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.002180537" + }, + "homo_sapiens - single_end - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.50932751" + }, + "homo_sapiens - paired_end - starfusion - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.731699486" + }, + "homo_sapiens - single_end - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:22:55.126286" + }, + "homo_sapiens - paired_end - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:23:33.253884" + }, + "homo_sapiens - single_end - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:22:55.11799" + }, + "homo_sapiens - paired_end - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.287684" + }, + "homo_sapiens - paired_end - starfusion - log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:27:55.971484" + }, + "homo_sapiens - paired_end - multiple - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.264176" + }, + "homo_sapiens - paired_end - multiple - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.596406" + }, + "homo_sapiens - single_end - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.205936" + }, + "homo_sapiens - paired_end - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.340653" + }, + "homo_sapiens - paired_end - 
spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.185730856" + }, + "homo_sapiens - paired_end - starfusion - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.300637" + }, + "homo_sapiens - paired_end - arriba - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,c1b1747f5873f2d17762725636e891d5" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.190560178" + }, + "homo_sapiens - single_end - log_progress": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.progress.out:md5,b2bd061d6cbaaf3d6d3b1fed547f69b8" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.450352138" + }, + "homo_sapiens - paired_end - starfusion - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.422018" + }, + "homo_sapiens - paired_end - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.429457" + }, + "homo_sapiens - paired_end - starfusion - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:27:55.93945" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/align/tests/nextflow.arriba.config b/modules/nf-core/star/align/tests/nextflow.arriba.config new file mode 100644 index 0000000..2324b9e --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.arriba.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outSAMunmapped Within --outBAMcompression 0 --outFilterMultimapNmax 50 --peOverlapNbasesMin 10 --alignSplicedMateMapLminOverLmate 0.5 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentMin 10 --chimOutType WithinBAM HardClip --chimJunctionOverhangMin 10 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 
0 --chimScoreSeparation 1 --chimSegmentReadGapMax 3 --chimMultimapNmax 50' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.config b/modules/nf-core/star/align/tests/nextflow.config new file mode 100644 index 0000000..c4ac580 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --outWigType bedGraph --outWigStrand Unstranded' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.starfusion.config b/modules/nf-core/star/align/tests/nextflow.starfusion.config new file mode 100644 index 0000000..467b649 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.starfusion.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outReadsUnmapped None --twopassMode Basic --outSAMstrandField intronMotif --outSAMunmapped Within --chimSegmentMin 12 --chimJunctionOverhangMin 8 --chimOutJunctionFormat 1 --alignSJDBoverhangMin 10 --alignMatesGapMax 100000 --alignIntronMax 100000 --alignSJstitchMismatchNmax 5 -1 5 5 --chimMultimapScoreRange 3 --chimScoreJunctionNonGTAG -4 --chimMultimapNmax 20 --chimNonchimScoreDropMin 10 --peOverlapNbasesMin 12 --peOverlapMMp 0.1 --alignInsertionFlush Right --alignSplicedMateMapLminOverLmate 0 --alignSplicedMateMapLmin 30' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/tags.yml 
b/modules/nf-core/star/align/tests/tags.yml new file mode 100644 index 0000000..8beace1 --- /dev/null +++ b/modules/nf-core/star/align/tests/tags.yml @@ -0,0 +1,2 @@ +star/align: + - modules/nf-core/star/align/** diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml index 0b35ff5..350a459 100644 --- a/modules/nf-core/star/genomegenerate/environment.yml +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -5,5 +5,5 @@ channels: - defaults dependencies: - bioconda::star=2.7.10a - - bioconda::samtools=1.16.1 + - bioconda::samtools=1.18 - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index 473e62a..2bc3e29 100644 --- a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -4,8 +4,8 @@ process STAR_GENOMEGENERATE { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : - 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test new file mode 100644 index 0000000..eed8292 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process STAR_GENOMEGENERATE" + script "../main.nf" + process "STAR_GENOMEGENERATE" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/genomegenerate" + + test("homo_sapiens") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).name).match("index") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap new file mode 100644 index 0000000..e7bb6ee --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "timestamp": 
"2023-12-04T18:01:27.298248806" + }, + "index": { + "content": [ + "star" + ], + "timestamp": "2023-11-23T11:31:47.560528" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/tags.yml b/modules/nf-core/star/genomegenerate/tests/tags.yml new file mode 100644 index 0000000..79f619b --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/tags.yml @@ -0,0 +1,2 @@ +star/genomegenerate: + - modules/nf-core/star/genomegenerate/** diff --git a/modules/nf-core/umitools/extract/main.nf b/modules/nf-core/umitools/extract/main.nf index a01ef73..4bd79e7 100644 --- a/modules/nf-core/umitools/extract/main.nf +++ b/modules/nf-core/umitools/extract/main.nf @@ -33,7 +33,7 @@ process UMITOOLS_EXTRACT { cat <<-END_VERSIONS > versions.yml "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' ) END_VERSIONS """ } else { @@ -49,7 +49,7 @@ process UMITOOLS_EXTRACT { cat <<-END_VERSIONS > versions.yml "${task.process}": - umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//') + umitools: \$( umi_tools --version | sed '/version:/!d; s/.*: //' ) END_VERSIONS """ } diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test.snap b/modules/nf-core/umitools/extract/tests/main.nf.test.snap index 54e77fb..6d5944f 100644 --- a/modules/nf-core/umitools/extract/tests/main.nf.test.snap +++ b/modules/nf-core/umitools/extract/tests/main.nf.test.snap @@ -2,9 +2,9 @@ "versions": { "content": [ [ - "versions.yml:md5,866a2da05ce1af35cc07261ffe6bc31a" + "versions.yml:md5,5a18da2d3a5a4de15e7aaae9082d7abb" ] ], - "timestamp": "2023-10-17T08:25:55.427194" + "timestamp": "2023-12-08T09:41:43.540658352" } } \ No newline at end of file diff --git a/modules/pfr/fastavalidate/main.nf b/modules/pfr/fastavalidate/main.nf deleted file mode 100644 index 873983b..0000000 --- a/modules/pfr/fastavalidate/main.nf +++ /dev/null @@ 
-1,56 +0,0 @@ -process FASTAVALIDATE { - tag "$meta.id" - label 'process_single' - - // conda "YOUR-TOOL-HERE" - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE': - // 'biocontainers/YOUR-TOOL-HERE' }" - container 'docker://gallvp/fasta_validator:a6a2ec1_ps' - - input: - tuple val(meta), path(fasta) - - output: - tuple val(meta), path('*.validated.fasta') , emit: valid_fasta , optional: true - tuple val(meta), path('*.error.log') , emit: error_log , optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - fasta_validate \\ - -v $fasta \\ - 2> "${prefix}.error.log" \\ - || echo "Errors from fasta_validate printed to ${prefix}.error.log" - - if [ \$(cat "${prefix}.error.log" | wc -l) -gt 0 ]; then - echo "Validation failed..." - cat "${prefix}.error.log" - else - rm "${prefix}.error.log" - - cat $fasta \\ - > "${prefix}.validated.fasta" - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch "${prefix}.validated.fasta" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fasta_validate: \$(md5sum \$(which fasta_validate) | cut -d' ' -f1) - END_VERSIONS - """ -} diff --git a/modules/pfr/liftoff/environment.yml b/modules/pfr/liftoff/environment.yml new file mode 100644 index 0000000..8761c9b --- /dev/null +++ b/modules/pfr/liftoff/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "liftoff" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::liftoff=1.6.3" diff --git a/modules/pfr/liftoff/main.nf 
b/modules/pfr/liftoff/main.nf index 5356728..a382dab 100644 --- a/modules/pfr/liftoff/main.nf +++ b/modules/pfr/liftoff/main.nf @@ -2,7 +2,7 @@ process LIFTOFF { tag "$meta.id" label 'process_high' - conda "bioconda::liftoff=1.6.3" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0': 'biocontainers/liftoff:1.6.3--pyhdfd78af_0' }" @@ -11,9 +11,9 @@ process LIFTOFF { tuple val(meta), path(target_fa) path ref_fa, name: 'liftoff_reference_assembly.fa' // To avoid name collisions betwen target_fa and ref_fa path ref_annotation - + output: - tuple val(meta), path("*.gff3") , emit: gff3 + tuple val(meta), path("${prefix}.gff3") , emit: gff3 // To avoid pattern collision with '*.polished.gff3' tuple val(meta), path("*.polished.gff3") , emit: polished_gff3, optional: true tuple val(meta), path("*.unmapped.txt") , emit: unmapped_txt path "versions.yml" , emit: versions @@ -22,8 +22,8 @@ process LIFTOFF { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" """ liftoff \\ -g $ref_annotation \\ @@ -42,9 +42,9 @@ process LIFTOFF { liftoff: \$(liftoff --version 2> /dev/null) END_VERSIONS """ - + stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch "${prefix}.gff3" touch "${prefix}.unmapped.txt" diff --git a/modules/pfr/liftoff/meta.yml b/modules/pfr/liftoff/meta.yml index e859282..ad1c5b8 100644 --- a/modules/pfr/liftoff/meta.yml +++ b/modules/pfr/liftoff/meta.yml @@ -1,3 +1,5 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "liftoff" description: "Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the 
same, or closely-related species." keywords: @@ -46,7 +48,6 @@ output: type: file description: Polished lifted annotations for the target assembly in gff3 format pattern: "*.polished.gff3" - optional: true - unmapped_txt: type: file description: List of unmapped reference annotations @@ -56,4 +57,6 @@ output: description: File containing software versions pattern: "versions.yml" authors: - - "@gallvp" + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/liftoff/tests/main.nf.test b/modules/pfr/liftoff/tests/main.nf.test new file mode 100644 index 0000000..00d1d2a --- /dev/null +++ b/modules/pfr/liftoff/tests/main.nf.test @@ -0,0 +1,89 @@ +nextflow_process { + + name "Test Process LIFTOFF" + script "../main.nf" + process "LIFTOFF" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "liftoff" + + test("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf") { + + setup { + run("GUNZIP") { + script "../../../nf-core/gunzip" + + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_1_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + input[1] = GUNZIP.out.gunzip.map { meta, file -> file } + input[2] = [ + file(params.test_data['homo_sapiens']['genome']['genome_1_gtf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.unmapped_txt).match("unmapped_txt") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert path(process.out.gff3.get(0).get(1)).getText().contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, + { assert path(process.out.polished_gff3.get(0).get(1)).getText().contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") } + ) + } + + } + + test("stub") { + options "-stub" + + when { + process { + """ + 
input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + input[1] = [ + file(params.test_data['homo_sapiens']['genome']['genome_1_fasta'], checkIfExists: true) + ] + input[2] = [ + file(params.test_data['homo_sapiens']['genome']['genome_1_gtf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.gff3 != null }, + { assert process.out.polished_gff3 == [] }, + { assert process.out.unmapped_txt != null }, + { assert process.out.versions != null }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} diff --git a/modules/pfr/liftoff/tests/main.nf.test.snap b/modules/pfr/liftoff/tests/main.nf.test.snap new file mode 100644 index 0000000..36c39b6 --- /dev/null +++ b/modules/pfr/liftoff/tests/main.nf.test.snap @@ -0,0 +1,23 @@ +{ + "unmapped_txt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.unmapped.txt:md5,7391d10df6e15db356b084c9af5259e4" + ] + ] + ], + "timestamp": "2023-12-01T13:57:40.748507" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,205d9c609e7fe27d8199550d842bdce8" + ] + ], + "timestamp": "2023-12-01T13:57:40.752414" + } +} \ No newline at end of file diff --git a/modules/pfr/liftoff/tests/nextflow.config b/modules/pfr/liftoff/tests/nextflow.config new file mode 100644 index 0000000..06b9d76 --- /dev/null +++ b/modules/pfr/liftoff/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: LIFTOFF { + ext.args = '-exclude_partial -copies -polish -a 0.1 -s 0.1' + } +} diff --git a/modules/pfr/liftoff/tests/tags.yml b/modules/pfr/liftoff/tests/tags.yml new file mode 100644 index 0000000..4ae1fb0 --- /dev/null +++ b/modules/pfr/liftoff/tests/tags.yml @@ -0,0 +1,2 @@ +liftoff: + - "modules/pfr/liftoff/**" diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test new file mode 100644 
index 0000000..cdd7398 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_workflow { + + name "Test Workflow FASTQ_FASTQC_UMITOOLS_FASTP" + script "../main.nf" + workflow "FASTQ_FASTQC_UMITOOLS_FASTP" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_fastqc_umitools_fastp" + tag "fastq_fastqc_umitools_fastp" + tag "fastqc" + tag "umitools/extract" + tag "fastp" + + + test("sarscov2 paired-end [fastq]") { + + when { + workflow { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = false // skip_fastqc + input[2] = false // with_umi + input[3] = false // skip_umi_extract + input[4] = 1 // umi_discard_read + input[5] = false // skip_trimming + input[6] = [] // adapter_fasta + input[7] = false // save_trimmed_fail + input[8] = false // save_merged + input[9] = 1 // min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.reads).match("reads") }, + { assert snapshot(workflow.out.umi_log).match("umi_log") }, + { assert snapshot(workflow.out.trim_json).match("trim_json") }, + { assert snapshot(workflow.out.trim_reads_fail).match("trim_reads_fail") }, + { assert snapshot(workflow.out.trim_reads_merged).match("trim_reads_merged") }, + { assert snapshot(workflow.out.trim_read_count).match("trim_read_count") }, + { assert snapshot(workflow.out.versions).match("versions") }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } +} diff --git 
a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap new file mode 100644 index 0000000..38a65ae --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap @@ -0,0 +1,81 @@ +{ + "trim_reads_merged": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.26920982" + }, + "trim_reads_fail": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.25861515" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" + ] + ], + "timestamp": "2023-11-26T02:28:26.30891403" + }, + "trim_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" + ] + ] + ], + "timestamp": "2023-11-26T02:28:26.24768259" + }, + "reads": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ] + ], + "timestamp": "2023-12-04T11:30:32.061644815" + }, + "umi_log": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-26T02:28:26.238536" + }, + "trim_read_count": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + 198 + ] + ] + ], + "timestamp": "2023-11-26T02:28:26.27984169" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml new file mode 100644 index 0000000..84a4b56 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fastq_fastqc_umitools_fastp: + - subworkflows/nf-core/fastq_fastqc_umitools_fastp/** From 8a6c5fe9af6d5721dada1ac3efc5a7385c650df7 Mon Sep 17 
00:00:00 2001 From: Usman Rashid Date: Wed, 13 Dec 2023 12:25:18 +1300 Subject: [PATCH 25/59] Added EDTA from pfr/nxf-modules --- modules.json | 15 ++ modules/local/edta/edta/main.nf | 72 -------- modules/local/edta/restore_edta_ids/main.nf | 63 ------- modules/local/edta/shorten_edta_ids/main.nf | 30 ---- .../pfr/custom/restoregffids/environment.yml | 9 + modules/pfr/custom/restoregffids/main.nf | 35 ++++ modules/pfr/custom/restoregffids/meta.yml | 58 ++++++ .../templates/restore_gff_ids.py} | 33 ++-- .../custom/restoregffids/tests/main.nf.test | 63 +++++++ .../restoregffids/tests/main.nf.test.snap | 41 +++++ .../pfr/custom/restoregffids/tests/tags.yml | 2 + .../custom/shortenfastaids/environment.yml | 11 ++ modules/pfr/custom/shortenfastaids/main.nf | 34 ++++ modules/pfr/custom/shortenfastaids/meta.yml | 58 ++++++ .../templates}/shorten_fasta_ids.py | 91 +++++----- .../custom/shortenfastaids/tests/main.nf.test | 131 ++++++++++++++ .../shortenfastaids/tests/main.nf.test.snap | 170 ++++++++++++++++++ .../pfr/custom/shortenfastaids/tests/tags.yml | 2 + modules/pfr/edta/edta/environment.yml | 9 + modules/pfr/edta/edta/main.nf | 93 ++++++++++ modules/pfr/edta/edta/meta.yml | 61 +++++++ modules/pfr/edta/edta/tests/main.nf.test | 72 ++++++++ modules/pfr/edta/edta/tests/nextflow.config | 3 + modules/pfr/edta/edta/tests/tags.yml | 2 + 24 files changed, 936 insertions(+), 222 deletions(-) delete mode 100644 modules/local/edta/edta/main.nf delete mode 100644 modules/local/edta/restore_edta_ids/main.nf delete mode 100644 modules/local/edta/shorten_edta_ids/main.nf create mode 100644 modules/pfr/custom/restoregffids/environment.yml create mode 100644 modules/pfr/custom/restoregffids/main.nf create mode 100644 modules/pfr/custom/restoregffids/meta.yml rename modules/{local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py => pfr/custom/restoregffids/templates/restore_gff_ids.py} (50%) create mode 100644 modules/pfr/custom/restoregffids/tests/main.nf.test create 
mode 100644 modules/pfr/custom/restoregffids/tests/main.nf.test.snap create mode 100644 modules/pfr/custom/restoregffids/tests/tags.yml create mode 100644 modules/pfr/custom/shortenfastaids/environment.yml create mode 100644 modules/pfr/custom/shortenfastaids/main.nf create mode 100644 modules/pfr/custom/shortenfastaids/meta.yml rename modules/{local/edta/shorten_edta_ids/resources/usr/bin => pfr/custom/shortenfastaids/templates}/shorten_fasta_ids.py (55%) create mode 100644 modules/pfr/custom/shortenfastaids/tests/main.nf.test create mode 100644 modules/pfr/custom/shortenfastaids/tests/main.nf.test.snap create mode 100644 modules/pfr/custom/shortenfastaids/tests/tags.yml create mode 100644 modules/pfr/edta/edta/environment.yml create mode 100644 modules/pfr/edta/edta/main.nf create mode 100644 modules/pfr/edta/edta/meta.yml create mode 100644 modules/pfr/edta/edta/tests/main.nf.test create mode 100644 modules/pfr/edta/edta/tests/nextflow.config create mode 100644 modules/pfr/edta/edta/tests/tags.yml diff --git a/modules.json b/modules.json index cde5635..fa42bdf 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,21 @@ "git@github.com:PlantandFoodResearch/nxf-modules.git": { "modules": { "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": ["modules"] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": ["modules"] + }, + "edta/edta": { + "branch": "main", + "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "installed_by": ["modules"] + }, "liftoff": { "branch": "main", "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", diff --git a/modules/local/edta/edta/main.nf b/modules/local/edta/edta/main.nf deleted file mode 100644 index 9c9b180..0000000 --- a/modules/local/edta/edta/main.nf +++ /dev/null @@ -1,72 +0,0 @@ -process EDTA { - tag "$meta.id" - label "process_high" - label "process_week_long" - 
- container 'https://depot.galaxyproject.org/singularity/edta:2.1.0--hdfd78af_1' - - input: - tuple val(meta), path(fasta_file) - - output: - tuple val(meta), path('*.EDTA.TElib.fa') , emit: te_lib_fasta - tuple val(meta), path('*.EDTA.intact.gff3') , emit: intact_gff3 - tuple val(meta), path('*.EDTA.pass.list') , emit: pass_list - tuple val(meta), path('*.EDTA.out') , emit: out_file - tuple val(meta), path('*.EDTA.TEanno.gff3') , emit: te_anno_gff3 - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def modFileName = "${fasta_file}.mod" - """ - EDTA.pl \\ - --genome $fasta_file \\ - --threads $task.cpus \\ - $args - - if [ -f "${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" ]; then - cat "${modFileName}.EDTA.raw/LTR/${modFileName}.pass.list" \\ - > "${modFileName}.EDTA.pass.list" - else - echo "EDTA PASS LIST IS EMPTY" \\ - > "${modFileName}.EDTA.pass.list" - fi - - if [ -f "${modFileName}.EDTA.anno/${modFileName}.out" ]; then - cat "${modFileName}.EDTA.anno/${modFileName}.out" \\ - > "${modFileName}.EDTA.out" - else - echo "EDTA DID NOT PRODUCE AN OUT FILE" \\ - > "${modFileName}.EDTA.out" - fi - - if [ ! 
-f "${modFileName}.EDTA.TEanno.gff3" ]; then - echo "##EDTA DID NOT PRODUCE A TEANNO GFF3" \\ - > "${modFileName}.EDTA.TEanno.gff3" - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') - END_VERSIONS - """ - - stub: - def modFileName = "${fasta_file}.mod" - """ - touch "${modFileName}.EDTA.TElib.fa" - touch "${modFileName}.EDTA.intact.gff3" - touch "${modFileName}.EDTA.pass.list" - touch "${modFileName}.EDTA.out" - touch "${modFileName}.EDTA.TEanno.gff3" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') - END_VERSIONS - """ -} \ No newline at end of file diff --git a/modules/local/edta/restore_edta_ids/main.nf b/modules/local/edta/restore_edta_ids/main.nf deleted file mode 100644 index 4da8a34..0000000 --- a/modules/local/edta/restore_edta_ids/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process RESTORE_EDTA_IDS { - tag "$meta.id" - label "process_single" - - container "docker://gallvp/python3npkgs:v0.4" - - input: - tuple val(meta), path(te_lib_fa) - path(intact_gff3) - path(pass_list) - path(out_file) - path(te_anno_gff3) - path(renamed_ids_tsv) - - output: - tuple val(meta), path("${meta.id}.EDTA.TElib.fa") , emit: te_lib_fasta - tuple val(meta), path("${meta.id}.EDTA.intact.gff3") , emit: intact_gff3 - tuple val(meta), path("${meta.id}.renamed.ids.EDTA.pass.list") , emit: pass_list - tuple val(meta), path("${meta.id}.renamed.ids.EDTA.out") , emit: out_file - tuple val(meta), path("${meta.id}.EDTA.TEanno.gff3") , emit: te_anno_gff3 - tuple val(meta), path("${meta.id}.renamed.ids.tsv") , emit: renamed_ids_tsv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - cat $pass_list > "${meta.id}.renamed.ids.EDTA.pass.list" - cat $out_file > "${meta.id}.renamed.ids.EDTA.out" - cat $te_lib_fa > "${meta.id}.EDTA.TElib.fa" - cat $renamed_ids_tsv > "${meta.id}.renamed.ids.tsv" - 
- renamed_ids_head=\$(head -n 1 "$renamed_ids_tsv") - - if [[ \$renamed_ids_head == "IDs have acceptable length and character. No change required." ]]; then - cat $te_anno_gff3 > "${meta.id}.EDTA.TEanno.gff3" - cat $intact_gff3 > "${meta.id}.EDTA.intact.gff3" - else - reverse_edta_naming.py "$renamed_ids_tsv" "$te_anno_gff3" "$intact_gff3" "$meta" - fi - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - reverse_edta_naming: \$(md5sum \$(which reverse_edta_naming.py) | cut -d' ' -f1) - END_VERSIONS - """ - - stub: - """ - touch "${meta.id}.EDTA.TElib.fa" - touch "${meta.id}.EDTA.intact.gff3" - touch "${meta.id}.renamed.ids.EDTA.pass.list" - touch "${meta.id}.renamed.ids.EDTA.out" - touch "${meta.id}.EDTA.TEanno.gff3" - touch "${meta.id}.renamed.ids.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - reverse_edta_naming: \$(md5sum \$(which reverse_edta_naming.py) | cut -d' ' -f1) - END_VERSIONS - """ -} \ No newline at end of file diff --git a/modules/local/edta/shorten_edta_ids/main.nf b/modules/local/edta/shorten_edta_ids/main.nf deleted file mode 100644 index 43b94f0..0000000 --- a/modules/local/edta/shorten_edta_ids/main.nf +++ /dev/null @@ -1,30 +0,0 @@ -process SHORTEN_EDTA_IDS { - tag "$meta.id" - label "process_single" - - container "docker://gallvp/python3npkgs:v0.4" - - input: - tuple val(meta), path(fasta_file) - - output: - tuple val(meta), path("*.renamed.ids.fa") , emit: renamed_ids_fasta - tuple val(meta), path("*.renamed.ids.tsv") , emit: renamed_ids_tsv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - FILE="$fasta_file" - output_prefix="\${FILE%.*}" - - shorten_fasta_ids.py "$fasta_file" "\$output_prefix" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - shorten_fasta_ids: \$(md5sum \$(which shorten_fasta_ids.py) | cut -d' ' -f1) - END_VERSIONS - """ -} \ No newline at end of file diff --git a/modules/pfr/custom/restoregffids/environment.yml 
b/modules/pfr/custom/restoregffids/environment.yml new file mode 100644 index 0000000..2450c45 --- /dev/null +++ b/modules/pfr/custom/restoregffids/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_restoregffids" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "python=3.10.2" diff --git a/modules/pfr/custom/restoregffids/main.nf b/modules/pfr/custom/restoregffids/main.nf new file mode 100644 index 0000000..14e2c07 --- /dev/null +++ b/modules/pfr/custom/restoregffids/main.nf @@ -0,0 +1,35 @@ +process CUSTOM_RESTOREGFFIDS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.10.2': + 'biocontainers/python:3.10.2' }" + + input: + tuple val(meta), path(gff3) + path(ids_tsv) + + output: + tuple val(meta), path("*.restored.ids.gff3") , emit: restored_ids_gff3 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + template 'restore_gff_ids.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}.restored.ids.gff3" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/pfr/custom/restoregffids/meta.yml b/modules/pfr/custom/restoregffids/meta.yml new file mode 100644 index 0000000..4e42b82 --- /dev/null +++ b/modules/pfr/custom/restoregffids/meta.yml @@ -0,0 +1,58 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_restoregffids" +description: | + Restores IDs in a gff3 file based on a TSV table + consisting of original (first column) and 
new IDs (second column). + This module is helpful when some tools like EDTA implicitly shorten + the IDs without producing the ID map, leading to downstream mismatch + in IDs across files. +keywords: + - genome + - gff + - ID + - shorten + - restore +tools: + - "python": + description: | + Python is a programming language that lets you work quickly + and integrate systems more effectively + homepage: "https://www.python.org" + documentation: "https://docs.python.org/3/" + tool_dev_url: "https://github.com/python/cpython" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" + - ids_tsv: + type: file + description: | + A TSV file with original (first column) and new ids (second column) + if id change was required + pattern: "*.tsv" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - restored_ids_gff3: + type: file + description: GFF3 file with restored ids + pattern: "*.restored.ids.gff3" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py b/modules/pfr/custom/restoregffids/templates/restore_gff_ids.py similarity index 50% rename from modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py rename to modules/pfr/custom/restoregffids/templates/restore_gff_ids.py index 7e8522c..d0699de 100755 --- a/modules/local/edta/restore_edta_ids/resources/usr/bin/reverse_edta_naming.py +++ b/modules/pfr/custom/restoregffids/templates/restore_gff_ids.py @@ -1,19 +1,18 @@ #!/usr/bin/env python3 -import sys +from platform import python_version -renamed_ids_tsv = sys.argv[1] -te_anno_gff3 = sys.argv[2] -intact_gff3 = sys.argv[3] -output_prefix = 
sys.argv[4] +ids_tsv = "$ids_tsv" +input_gff3 = "$gff3" +output_prefix = "$prefix" -def create_name_mapping_from_file(file_path): +def create_name_mapping_from_tsv(file_path): dictionary = {} with open(file_path, "r") as tsv_file: for line in tsv_file: - columns = line.strip().split("\t") + columns = line.strip().split("\\t") if len(columns) != 2: raise ValueError(f"{file_path} should be a two column TSV file") @@ -23,7 +22,12 @@ def create_name_mapping_from_file(file_path): return dictionary -def reverse_rename_gff3_file(new_to_orig_ids, file_path, output_file_name): +def restore_gff3_ids(new_to_orig_ids, file_path, output_file_name): + # Write versions + with open(f"versions.yml", "w") as f_versions: + f_versions.write('"${task.process}":\\n') + f_versions.write(f" python: {python_version()}\\n") + with open(file_path, "r") as input_gff3_file: input_lines = input_gff3_file.readlines() @@ -33,16 +37,11 @@ def reverse_rename_gff3_file(new_to_orig_ids, file_path, output_file_name): output_gff_file.write(line) continue - new_id = line.split("\t")[0] + new_id = line.split("\\t")[0] orig_id = new_to_orig_ids[new_id] - output_gff_file.write(line.replace(new_id, orig_id)) + output_gff_file.write("\\t".join([orig_id] + line.split("\\t")[1:])) if __name__ == "__main__": - new_to_orig_ids = create_name_mapping_from_file(renamed_ids_tsv) - reverse_rename_gff3_file( - new_to_orig_ids, te_anno_gff3, f"{output_prefix}.EDTA.TEanno.gff3" - ) - reverse_rename_gff3_file( - new_to_orig_ids, intact_gff3, f"{output_prefix}.EDTA.intact.gff3" - ) + new_to_orig_ids = create_name_mapping_from_tsv(ids_tsv) + restore_gff3_ids(new_to_orig_ids, input_gff3, f"{output_prefix}.restored.ids.gff3") diff --git a/modules/pfr/custom/restoregffids/tests/main.nf.test b/modules/pfr/custom/restoregffids/tests/main.nf.test new file mode 100644 index 0000000..521b924 --- /dev/null +++ b/modules/pfr/custom/restoregffids/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process 
CUSTOM_RESTOREGFFIDS" + script "../main.nf" + process "CUSTOM_RESTOREGFFIDS" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/restoregffids" + + test("sarscov2-genome_gff3-success") { + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + input[1] = Channel.of('Chr1\tMT192765.1').collectFile(name: 'id_map.tsv', newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.restored_ids_gff3.get(0).get(1)).getText().contains("Chr1") }, + { assert !path(process.out.restored_ids_gff3.get(0).get(1)).getText().contains("MT192765.1") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + input[1] = Channel.of('Chr1\tMT192765.1').collectFile(name: 'id_map.tsv', newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.restored_ids_gff3 != null }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} diff --git a/modules/pfr/custom/restoregffids/tests/main.nf.test.snap b/modules/pfr/custom/restoregffids/tests/main.nf.test.snap new file mode 100644 index 0000000..ffe43e7 --- /dev/null +++ b/modules/pfr/custom/restoregffids/tests/main.nf.test.snap @@ -0,0 +1,41 @@ +{ + "sarscov2-genome_gff3-success": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.restored.ids.gff3:md5,2c294938b9eb4e52d19e14725c1d92a9" + ] + ], + "1": [ + "versions.yml:md5,32d31c4f1da9a3d1be013fd163e5867e" + ], + "restored_ids_gff3": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.restored.ids.gff3:md5,2c294938b9eb4e52d19e14725c1d92a9" + ] + ], + "versions": [ + "versions.yml:md5,32d31c4f1da9a3d1be013fd163e5867e" + ] + } + ], + "timestamp": "2023-12-07T13:49:30.047425" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,32d31c4f1da9a3d1be013fd163e5867e" + ] + ], + "timestamp": "2023-12-07T13:49:30.071175" + } +} \ No newline at end of file diff --git a/modules/pfr/custom/restoregffids/tests/tags.yml b/modules/pfr/custom/restoregffids/tests/tags.yml new file mode 100644 index 0000000..1d4b9a8 --- /dev/null +++ b/modules/pfr/custom/restoregffids/tests/tags.yml @@ -0,0 +1,2 @@ +custom/restoregffids: + - "modules/pfr/custom/restoregffids/**" diff --git a/modules/pfr/custom/shortenfastaids/environment.yml b/modules/pfr/custom/shortenfastaids/environment.yml new file mode 100644 index 0000000..e80fa7c --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_shortenfastaids" +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - biopython==1.75 + - python=3.8 diff --git a/modules/pfr/custom/shortenfastaids/main.nf b/modules/pfr/custom/shortenfastaids/main.nf new file mode 100644 index 0000000..92762ef --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/main.nf @@ -0,0 +1,34 @@ +process CUSTOM_SHORTENFASTAIDS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/biopython:1.75': + 'biocontainers/biopython:1.75' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.short.ids.fasta") , emit: short_ids_fasta , optional: true + tuple val(meta), path("*.short.ids.tsv") , emit: short_ids_tsv , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + template 'shorten_fasta_ids.py' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | cut -d' ' -f2) + biopython: \$(pip list | grep "biopython" | cut -d' ' -f3) + END_VERSIONS + """ +} diff --git a/modules/pfr/custom/shortenfastaids/meta.yml b/modules/pfr/custom/shortenfastaids/meta.yml new file mode 100644 index 0000000..2425810 --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/meta.yml @@ -0,0 +1,58 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_shortenfastaids" +description: | + Shortens fasta IDs and produces a new fasta along with a TSV table + consisting of original (first column) and new IDs (second column). + This module is helpful when some tools like EDTA implicitly shorten + the IDs without producing the ID map, leading to downstream mismatch + in IDs across files. +keywords: + - genome + - fasta + - ID + - shorten +tools: + - "biopython": + description: | + Biopython is a set of freely available tools for biological computation written in Python by + an international team of developers. + homepage: "https://biopython.org" + documentation: "https://biopython.org/wiki/Documentation" + tool_dev_url: "https://github.com/biopython/biopython" + doi: "10.1093/bioinformatics/btp163" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - fasta: + type: file + description: Input fasta file + pattern: "*.{fsa,fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - short_ids_fasta: + type: file + description: Fasta file with shortened ids if id change is required + pattern: "*.{fsa,fa,fasta}" + - short_ids_tsv: + type: file + description: | + A TSV file with original (first column) and new ids (second column) + if id change is required + pattern: "*.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py b/modules/pfr/custom/shortenfastaids/templates/shorten_fasta_ids.py similarity index 55% rename from modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py rename to modules/pfr/custom/shortenfastaids/templates/shorten_fasta_ids.py index 0b6e6d2..54f35bf 100755 --- a/modules/local/edta/shorten_edta_ids/resources/usr/bin/shorten_fasta_ids.py +++ b/modules/pfr/custom/shortenfastaids/templates/shorten_fasta_ids.py @@ -1,26 +1,22 @@ #!/usr/bin/env python3 import re -import sys from Bio import SeqIO +from importlib.metadata import version +from platform import python_version # The input fasta file path -fasta_file_path = sys.argv[1] +fasta_file_path = "$fasta" +output_files_prefix = "$prefix" -# The prefix for output files: prefix.renamed.ids.fa, prefix.renamed.ids.tsv -output_files_prefix = sys.argv[2] -# In the case where IDs have acceptable character and no change is needed, the output is stdout: -# "IDs have acceptable length and character. No change required." 
- - -def extract_fasta_ids(fasta_file_path): +def extract_fasta_ids_and_descriptions(fasta_file_path): fasta_file_obj = SeqIO.parse(fasta_file_path, "fasta") ids = [] for record in fasta_file_obj: - ids.append(record.id) + ids.append((record.id, record.description)) return ids @@ -38,29 +34,39 @@ def write_fasta_with_new_ids(fasta_file_path, id_mapping, file_prefix): replaced_records.append(record) - SeqIO.write(replaced_records, f"{file_prefix}.renamed.ids.fa", "fasta") - - -def write_fasta_without_comments(fasta_file_path, file_prefix): - old_fasta_file_obj = SeqIO.parse(fasta_file_path, "fasta") + SeqIO.write(replaced_records, f"{file_prefix}.short.ids.fasta", "fasta") - replaced_records = [] - for record in old_fasta_file_obj: - record.description = "" - replaced_records.append(record) - SeqIO.write(replaced_records, f"{file_prefix}.renamed.ids.fa", "fasta") +def do_id_need_to_change(id_and_description, silent=False): + id = id_and_description[0] + description = id_and_description[1] + if len(id) > 13: + if not silent: + print(f"{id} has length greater than 13") + return True + if not re.match(r"^[a-zA-Z0-9_]+\$", id): + if not silent: + print(f"{id} does not match '^[a-zA-Z0-9_]+\$'") + return True -def do_id_need_to_change(id): - if len(id) > 13 or not re.match(r"^[a-zA-Z0-9_]+$", id): + if description != id and description != "": + if not silent: + print(f"{id} contains a comment: {description.replace(id, '')}") return True + if not silent: + print(f"{id} is acceptable") return False -def do_ids_need_to_change(ids): - return any([do_id_need_to_change(id) for id in ids]) +def do_ids_need_to_change(ids_and_descriptions, silent=False): + return any( + [ + do_id_need_to_change(id_and_description, silent) + for id_and_description in ids_and_descriptions + ] + ) def extract_common_patterns(ids): @@ -80,23 +86,25 @@ def extract_common_patterns(ids): return {pattern: pattern[:3] for pattern in common_patterns} -def shorten_ids(ids, patterns_dict): +def 
shorten_ids(input_ids_and_descriptions, patterns_dict): shortened_ids = [] - for id in ids: - if not do_id_need_to_change(id): + for id_and_description in input_ids_and_descriptions: + id = id_and_description[0] + description = "" # Treat description as absent as it will be removed by write_fasta_with_new_ids + if not do_id_need_to_change((id, description), silent=True): shortened_ids.append(id) continue shortened_id = shorten_id_by_pattern_replacement(patterns_dict, id) - if not do_id_need_to_change(shortened_id): + if not do_id_need_to_change((shortened_id, description), silent=True): shortened_ids.append(shortened_id) continue shortened_id = f"Ctg{generate_hash(id)}" - if not do_id_need_to_change(shortened_id): + if not do_id_need_to_change((shortened_id, description), silent=True): shortened_ids.append(shortened_id) continue @@ -146,24 +154,27 @@ def fail_if_new_ids_not_valid(ids): if __name__ == "__main__": - input_ids = extract_fasta_ids(fasta_file_path) + input_ids_and_descriptions = extract_fasta_ids_and_descriptions(fasta_file_path) + input_ids = [x[0] for x in input_ids_and_descriptions] - if not do_ids_need_to_change(input_ids): - print("IDs have acceptable length and character. No change required.") - - with open(f"{output_files_prefix}.renamed.ids.tsv", "w") as f: - f.write("IDs have acceptable length and character. No change required.") - - write_fasta_without_comments(fasta_file_path, output_files_prefix) + # Write versions + with open(f"versions.yml", "w") as f_versions: + f_versions.write('"${task.process}":\\n') + f_versions.write(f" python: {python_version()}\\n") + f_versions.write(f" biopython: {version('biopython')}\\n") + if not do_ids_need_to_change(input_ids_and_descriptions): + print("IDs have acceptable length and character. 
No change required.") exit(0) - new_ids = shorten_ids(input_ids, extract_common_patterns(input_ids)) + new_ids = shorten_ids( + input_ids_and_descriptions, extract_common_patterns(input_ids) + ) fail_if_new_ids_not_valid(new_ids) - with open(f"{output_files_prefix}.renamed.ids.tsv", "w") as f: + with open(f"{output_files_prefix}.short.ids.tsv", "w") as f: for input_id, new_id in zip(input_ids, new_ids): - f.write(f"{input_id}\t{new_id}\n") + f.write(f"{input_id}\\t{new_id}\\n") write_fasta_with_new_ids( fasta_file_path, zip(input_ids, new_ids), output_files_prefix diff --git a/modules/pfr/custom/shortenfastaids/tests/main.nf.test b/modules/pfr/custom/shortenfastaids/tests/main.nf.test new file mode 100644 index 0000000..dc46bae --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/tests/main.nf.test @@ -0,0 +1,131 @@ +nextflow_process { + + name "Test Process CUSTOM_SHORTENFASTAIDS" + script "../main.nf" + process "CUSTOM_SHORTENFASTAIDS" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/shortenfastaids" + + test("homo_sapiens-genome_fasta-no_change") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.short_ids_fasta == [] }, + { assert process.out.short_ids_tsv == [] } + ) + } + + } + + test("sarscov2-genome_fasta-pattern_change") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-genome2_fasta-length_change") { + + when { + process { + """ + 
input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome2_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("custom_fasta-comment_change") { + + when { + process { + """ + input[0] = Channel.of('>Chr1 This is a test comment', 'AGCTAGCT') + | collectFile(name: 'sample.fasta', newLine: true) + | map { file -> [ [ id:'test' ], file ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.short_ids_fasta == [] }, + { assert process.out.short_ids_tsv == [] } + ) + } + + } + +} diff --git a/modules/pfr/custom/shortenfastaids/tests/main.nf.test.snap b/modules/pfr/custom/shortenfastaids/tests/main.nf.test.snap new file mode 100644 index 0000000..8fed1b9 --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/tests/main.nf.test.snap @@ -0,0 +1,170 @@ +{ + "custom_fasta-comment_change": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,c861b9d46a4d9bdba66953cff572fc5d" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.short.ids.tsv:md5,8762f2bffbdff75c2812bad72ba52bba" + ] + ], + "2": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ], + "short_ids_fasta": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,c861b9d46a4d9bdba66953cff572fc5d" + ] + ], + "short_ids_tsv": [ + [ + { + "id": "test" + }, + 
"test.short.ids.tsv:md5,8762f2bffbdff75c2812bad72ba52bba" + ] + ], + "versions": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ] + } + ], + "timestamp": "2023-12-07T13:33:05.523745" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ] + ], + "timestamp": "2023-12-07T13:30:30.361527" + }, + "homo_sapiens-genome_fasta-no_change": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ], + "short_ids_fasta": [ + + ], + "short_ids_tsv": [ + + ], + "versions": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ] + } + ], + "timestamp": "2023-12-07T13:32:54.220188" + }, + "homo_sapiens-genome2_fasta-length_change": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,1382acd98d4cd233a8062ef01b2aaa6d" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.short.ids.tsv:md5,99c0f2a529cb595b2d8530024ed2880e" + ] + ], + "2": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ], + "short_ids_fasta": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,1382acd98d4cd233a8062ef01b2aaa6d" + ] + ], + "short_ids_tsv": [ + [ + { + "id": "test" + }, + "test.short.ids.tsv:md5,99c0f2a529cb595b2d8530024ed2880e" + ] + ], + "versions": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ] + } + ], + "timestamp": "2023-12-07T13:33:01.924483" + }, + "sarscov2-genome_fasta-pattern_change": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,14d6f587b6d28889c5c0f985e78d602f" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.short.ids.tsv:md5,d7a2af88e8549586e5616bff6a88bd71" + ] + ], + "2": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ], + "short_ids_fasta": [ + [ + { + "id": "test" + }, + "test.short.ids.fasta:md5,14d6f587b6d28889c5c0f985e78d602f" + ] + ], + "short_ids_tsv": [ + [ + { + "id": "test" + }, + "test.short.ids.tsv:md5,d7a2af88e8549586e5616bff6a88bd71" + ] 
+ ], + "versions": [ + "versions.yml:md5,e5704a53ebea373dac3a93ae800d48ba" + ] + } + ], + "timestamp": "2023-12-07T13:32:58.12885" + } +} \ No newline at end of file diff --git a/modules/pfr/custom/shortenfastaids/tests/tags.yml b/modules/pfr/custom/shortenfastaids/tests/tags.yml new file mode 100644 index 0000000..4715b64 --- /dev/null +++ b/modules/pfr/custom/shortenfastaids/tests/tags.yml @@ -0,0 +1,2 @@ +custom/shortenfastaids: + - "modules/pfr/custom/shortenfastaids/**" diff --git a/modules/pfr/edta/edta/environment.yml b/modules/pfr/edta/edta/environment.yml new file mode 100644 index 0000000..63160e8 --- /dev/null +++ b/modules/pfr/edta/edta/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "edta_edta" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::edta=2.1.0" diff --git a/modules/pfr/edta/edta/main.nf b/modules/pfr/edta/edta/main.nf new file mode 100644 index 0000000..458f525 --- /dev/null +++ b/modules/pfr/edta/edta/main.nf @@ -0,0 +1,93 @@ +process EDTA_EDTA { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/edta:2.1.0--hdfd78af_1': + 'biocontainers/edta:2.1.0--hdfd78af_1' }" + + input: + tuple val(meta), path(fasta) + path cds + path curatedlib + path rmout + path exclude + + output: + tuple val(meta), path('*.log') , emit: log + tuple val(meta), path('*.EDTA.TElib.fa') , emit: te_lib_fasta + tuple val(meta), path('*.EDTA.pass.list') , emit: pass_list , optional: true + tuple val(meta), path('*.EDTA.out') , emit: out_file , optional: true + tuple val(meta), path('*.EDTA.TEanno.gff3') , emit: te_anno_gff3 , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def mod_file_name = "${fasta}.mod" + def cds_file = cds ? "--cds $cds" : '' + def curatedlib_file = curatedlib ? "--curatedlib $curatedlib": '' + def rmout_file = rmout ? "--rmout $rmout" : '' + def exclude_file = exclude ? "--exclude $exclude" : '' + """ + EDTA.pl \\ + --genome $fasta \\ + --threads $task.cpus \\ + $cds_file \\ + $curatedlib_file \\ + $rmout_file \\ + $exclude_file \\ + $args \\ + &> "${prefix}.log" + + mv \\ + "${mod_file_name}.EDTA.TElib.fa" \\ + "${prefix}.EDTA.TElib.fa" + + [ -f "${mod_file_name}.EDTA.raw/LTR/${mod_file_name}.pass.list" ] \\ + && mv \\ + "${mod_file_name}.EDTA.raw/LTR/${mod_file_name}.pass.list" \\ + "${prefix}.EDTA.pass.list" \\ + || echo "EDTA did not produce a pass.list file" + + [ -f "${mod_file_name}.EDTA.anno/${mod_file_name}.out" ] \\ + && mv \\ + "${mod_file_name}.EDTA.anno/${mod_file_name}.out" \\ + "${prefix}.EDTA.out" \\ + || echo "EDTA did not produce an out file" + + [ -f "${mod_file_name}.EDTA.TEanno.gff3" ] \\ + && mv \\ + "${mod_file_name}.EDTA.TEanno.gff3" \\ + "${prefix}.EDTA.TEanno.gff3" \\ + || echo "EDTA did not produce a TEanno gff3 file" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') + 
END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def touch_pass_list = args.contains("--anno 1") ? "touch ${prefix}.EDTA.pass.list" : '' + def touch_out_file = args.contains("--anno 1") ? "touch ${prefix}.EDTA.out" : '' + def touch_te_anno = args.contains("--anno 1") ? "touch ${prefix}.EDTA.TEanno.gff3": '' + """ + touch "${prefix}.log" + touch "${prefix}.EDTA.TElib.fa" + $touch_pass_list + $touch_out_file + $touch_te_anno + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + EDTA: \$(EDTA.pl -h | awk ' /##### Extensive/ {print \$7}') + END_VERSIONS + """ +} diff --git a/modules/pfr/edta/edta/meta.yml b/modules/pfr/edta/edta/meta.yml new file mode 100644 index 0000000..4d59fdf --- /dev/null +++ b/modules/pfr/edta/edta/meta.yml @@ -0,0 +1,61 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "edta_edta" +description: Extensive de-novo TE Annotator (EDTA) +keywords: + - genome + - repeat + - annotation + - transposable-elements +tools: + - "edta": + description: Extensive de-novo TE Annotator (EDTA) + homepage: "https://github.com/oushujun/EDTA" + documentation: "https://github.com/oushujun/EDTA" + tool_dev_url: "https://github.com/oushujun/EDTA" + doi: "10.1186/s13059-019-1905-y" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fsa,fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - log: + type: file + description: Log emitted by EDTA + pattern: "*.log" + - te_lib_fasta: + type: file + description: A non-redundant TE library in fasta format + pattern: "*.EDTA.TElib.fa" + - pass_list: + type: file + description: A summary table of intact LTR-RTs with coordinate and structural information + pattern: "*.EDTA.pass.list" + - out_file: + type: file + description: RepeatMasker annotation of all LTR sequences in the genome + pattern: "*.EDTA.out" + - te_anno_gff3: + type: file + description: A gff3 file containing both structurally intact and fragmented TE annotations + pattern: "*.EDTA.TEanno.gff3" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/edta/edta/tests/main.nf.test b/modules/pfr/edta/edta/tests/main.nf.test new file mode 100644 index 0000000..d0a7142 --- /dev/null +++ b/modules/pfr/edta/edta/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process EDTA_EDTA" + script "../main.nf" + process "EDTA_EDTA" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "edta" + tag "edta/edta" + + test("homo_sapiens-genome_fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + input[4] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + input[4] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out.versions).match("versions") }, + { assert process.out.log != null }, + { assert process.out.te_lib_fasta != null }, + { assert process.out.pass_list != null }, + { assert process.out.out_file != null }, + { assert process.out.te_anno_gff3 != null } + ) + } + + } + +} diff --git a/modules/pfr/edta/edta/tests/nextflow.config b/modules/pfr/edta/edta/tests/nextflow.config new file mode 100644 index 0000000..b20ca5e --- /dev/null +++ b/modules/pfr/edta/edta/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '--anno 1 --evaluate 1' +} diff --git a/modules/pfr/edta/edta/tests/tags.yml b/modules/pfr/edta/edta/tests/tags.yml new file mode 100644 index 0000000..180ae6d --- /dev/null +++ b/modules/pfr/edta/edta/tests/tags.yml @@ -0,0 +1,2 @@ +edta/edta: + - "modules/pfr/edta/edta/**" From 26c33faddd8e133886dd330c372c19485b927e74 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 19 Dec 2023 10:31:05 +1300 Subject: [PATCH 26/59] Updated config --- conf/base.config | 45 ++++++++++++++++++---------------- conf/manifest.config | 4 +-- conf/reporting_defaults.config | 2 -- pangene_pfr.sh | 4 +-- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/conf/base.config b/conf/base.config index 54db554..54114d3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,7 +1,11 @@ profiles { - slurm { + pfr { process { executor = 'slurm' + + apptainer { + envWhitelist= 'APPTAINER_BINDPATH,APPTAINER_BIND' + } } } @@ -10,10 +14,22 @@ profiles { executor = 'local' } } + + apptainer { + apptainer.enabled = true + apptainer.autoMounts= true + apptainer.registry = 'quay.io' + } + + docker { + docker.enabled = true + docker.userEmulation= false + docker.fixOwnership = true + docker.runOptions = '--platform=linux/amd64' + docker.registry = 'quay.io' + } } -// Source: https://github.com/nf-core/rnaseq -// License: https://github.com/nf-core/rnaseq/blob/master/LICENSE process { cpus = { check_max( 1 * task.attempt, 'cpus' ) } @@ -24,12 +40,6 @@ 
process { maxRetries = 1 maxErrors = '-1' - // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. - // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. - // If possible, it would be nice to keep the same label naming convention when - // adding in your local modules too. - // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } @@ -53,19 +63,14 @@ process { withLabel:process_long { time = { check_max( 20.h * task.attempt, 'time' ) } } - withLabel:process_week_long { - time = { check_max( 7.days * task.attempt, 'time' ) } - } withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } -} - -apptainer { - enabled = true - autoMounts = true - envWhitelist = "APPTAINER_BINDPATH,APPTAINER_BIND" - registry = 'quay.io' + + // Custom + withLabel:process_week_long { + time = { check_max( 7.days * task.attempt, 'time' ) } + } } nextflow { @@ -74,8 +79,6 @@ nextflow { } } -// Source: https://github.com/nf-core/rnaseq -// License: https://github.com/nf-core/rnaseq/blob/master/LICENSE def check_max(obj, type) { if (type == 'memory') { try { diff --git a/conf/manifest.config b/conf/manifest.config index 7bf1f6b..706052c 100644 --- a/conf/manifest.config +++ b/conf/manifest.config @@ -1,10 +1,10 @@ manifest { - name = 'pan-gene' + name = 'pangene' author = """Usman Rashid""" homePage = 'https://github.com/PlantandFoodResearch/pan-gene' description = """A NextFlow pipeline for pan-genome annotation""" mainScript = 'main.nf' - nextflowVersion = '!>=22.10.4' + nextflowVersion = '!>=23.04.4' version = '0.1' doi = '' } \ No newline at end of file diff --git a/conf/reporting_defaults.config b/conf/reporting_defaults.config index c85d378..5df9469 100644 --- a/conf/reporting_defaults.config +++ 
b/conf/reporting_defaults.config @@ -1,5 +1,3 @@ -// Source: https://github.com/nf-core/rnaseq -// License: https://github.com/nf-core/rnaseq/blob/master/LICENSE def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true diff --git a/pangene_pfr.sh b/pangene_pfr.sh index 3b048df..ab3d262 100644 --- a/pangene_pfr.sh +++ b/pangene_pfr.sh @@ -13,7 +13,7 @@ ml apptainer/1.1 ml nextflow/23.04.4 -export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,/workspace/$USER/tmp:/tmp" export TMPDIR="/workspace/$USER/tmp" +export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,$TMPDIR:$TMPDIR,$TMPDIR:/tmp" -nextflow main.nf -profile slurm -resume \ No newline at end of file +nextflow main.nf -profile pfr,apptainer -resume \ No newline at end of file From 9e5831471f6c25e39fd564f1a21eaace4b2a6b37 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 19 Dec 2023 11:50:16 +1300 Subject: [PATCH 27/59] Integrated fastavalidator --- TODO.md | 12 +- modules.json | 291 ++++++++++++++----------- subworkflows/local/prepare_assembly.nf | 20 +- 3 files changed, 190 insertions(+), 133 deletions(-) diff --git a/TODO.md b/TODO.md index 8c90b99..0134c26 100644 --- a/TODO.md +++ b/TODO.md @@ -2,4 +2,14 @@ - [ ] From Ross regarding post-processing: > [9:49 am] Ross Crowhurst -Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. 
Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. \ No newline at end of file +Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. + +- [ ] From Cecilia: + +> https://github.com/zhaotao1987/SynNet-Pipeline + +- [ ] From Ross: + +> https://www.biorxiv.org/content/10.1101/096529v2.full.pdf + +> Don't use `-exclude_partial` \ No newline at end of file diff --git a/modules.json b/modules.json index fa42bdf..0119dda 100644 --- a/modules.json +++ b/modules.json @@ -1,128 +1,169 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - "git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "custom/restoregffids": { - "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": ["modules"] - }, - "custom/shortenfastaids": { - "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": ["modules"] - }, - "edta/edta": { - "branch": "main", - "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", - "installed_by": ["modules"] - }, - "liftoff": { - "branch": "main", - "git_sha": 
"3593ec100c92b656204bf739a51d62fd44d81f6f", - "installed_by": ["modules"] - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": [ + "modules" + ] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": [ + "modules" + ] + }, + "edta/edta": { + "branch": "main", + "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "installed_by": [ + "modules" + ] + }, + "liftoff": { + "branch": "main", + "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "installed_by": [ + "modules" + ] + } + } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": [ + "modules" + ] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": [ + "modules" + ] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": [ + "modules" + ] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": [ + "modules" + ] + }, + "fastavalidator": { + "branch": "master", + "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": [ + "modules" + ] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": [ + 
"fastq_fastqc_umitools_fastp" + ] + }, + "fastqc": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": [ + "fastq_fastqc_umitools_fastp", + "modules" + ] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": [ + "modules" + ] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": [ + "modules" + ] + }, + "sortmerna": { + "branch": "master", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", + "installed_by": [ + "modules" + ] + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": [ + "modules" + ] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": [ + "modules" + ] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": [ + "fastq_fastqc_umitools_fastp" + ] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": [ + "subworkflows" + ] + } + } + } } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": ["modules"] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": ["modules"] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": ["modules"] - }, - "cat/fastq": { - 
"branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": ["modules"] - }, - "fastavalidator": { - "branch": "master", - "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", - "installed_by": ["modules"] - }, - "fastp": { - "branch": "master", - "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": ["fastq_fastqc_umitools_fastp"] - }, - "fastqc": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] - }, - "gffread": { - "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", - "installed_by": ["modules"] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "samtools/cat": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "sortmerna": { - "branch": "master", - "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": ["modules"] - }, - "star/align": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "umitools/extract": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["fastq_fastqc_umitools_fastp"] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", - "installed_by": ["subworkflows"] - } - } - } } - } -} +} \ No newline at end of file diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index 
7469afc..05bd1ec 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -1,11 +1,9 @@ include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_TE_LIBRARY } from '../../modules/nf-core/gunzip' -include { FASTA_VALIDATE } from '../../modules/local/fasta_validate' +include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator' include { REPEATMASKER } from '../../modules/kherronism/repeatmasker' include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' -include { FASTA_EDTA } from '../../subworkflows/local/fasta_edta' - workflow PREPARE_ASSEMBLY { take: target_assembly // channel: [ meta, fasta ] @@ -29,11 +27,19 @@ workflow PREPARE_ASSEMBLY { ) | set { ch_gunzip_target_assembly } - // MODULE: FASTA_VALIDATE - FASTA_VALIDATE(ch_gunzip_target_assembly) - .valid_fasta + // MODULE: FASTAVALIDATOR + FASTAVALIDATOR(ch_gunzip_target_assembly) + + ch_gunzip_target_assembly + | join(FASTAVALIDATOR.out.success_log) + | map { meta, fasta, log -> [ meta, fasta ] } | set { ch_validated_target_assembly } + FASTAVALIDATOR.out.error_log + | map { meta, log -> + System.err.println("WARNING: FASTAVALIDATOR failed for ${meta.id} with error: ${log}. 
${meta.id} is excluded from further analysis.") + } + // MODULE: GUNZIP_TE_LIBRARY te_library | branch { meta, file -> @@ -85,7 +91,7 @@ workflow PREPARE_ASSEMBLY { | set { ch_assembly_index } Channel.empty() - | mix(FASTA_VALIDATE.out.versions.first()) + | mix(FASTAVALIDATOR.out.versions.first()) | mix(GUNZIP_TE_LIBRARY.out.versions.first()) | mix(FASTA_EDTA.out.versions) | mix(REPEATMASKER.out.versions.first()) From 4534684f412accaef7dd8213a9a7351549e5e0a0 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 19 Dec 2023 12:12:52 +1300 Subject: [PATCH 28/59] Added patch for star/genomegenerate --- modules.json | 3 +- .../star/genomegenerate/environment.yml | 4 +- modules/nf-core/star/genomegenerate/main.nf | 83 +++--- .../genomegenerate/star-genomegenerate.diff | 247 ++++++++++++++++++ .../star/genomegenerate/tests/main.nf.test | 81 +++++- .../genomegenerate/tests/main.nf.test.snap | 14 +- 6 files changed, 395 insertions(+), 37 deletions(-) create mode 100644 modules/nf-core/star/genomegenerate/star-genomegenerate.diff diff --git a/modules.json b/modules.json index 0119dda..80995ba 100644 --- a/modules.json +++ b/modules.json @@ -142,7 +142,8 @@ "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": [ "modules" - ] + ], + "patch": "modules/nf-core/star/genomegenerate/star-genomegenerate.diff" }, "umitools/extract": { "branch": "master", diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml index 350a459..93e4476 100644 --- a/modules/nf-core/star/genomegenerate/environment.yml +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -1,9 +1,11 @@ name: star_genomegenerate + channels: - conda-forge - bioconda - defaults + dependencies: - - bioconda::star=2.7.10a - bioconda::samtools=1.18 + - bioconda::star=2.7.10a - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index 2bc3e29..b885571 100644 --- 
a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -19,9 +19,10 @@ process STAR_GENOMEGENERATE { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def args_list = args.tokenize() - def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def include_gtf = gtf ? "--sjdbGTFfile $gtf" : '' if (args_list.contains('--genomeSAindexNbases')) { """ mkdir star @@ -29,7 +30,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - --sjdbGTFfile $gtf \\ + $include_gtf \\ --runThreadN $task.cpus \\ $memory \\ $args @@ -51,7 +52,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - --sjdbGTFfile $gtf \\ + $include_gtf \\ --runThreadN $task.cpus \\ --genomeSAindexNbases \$NUM_BASES \\ $memory \\ @@ -67,30 +68,52 @@ process STAR_GENOMEGENERATE { } stub: - """ - mkdir star - touch star/Genome - touch star/Log.out - touch star/SA - touch star/SAindex - touch star/chrLength.txt - touch star/chrName.txt - touch star/chrNameLength.txt - touch star/chrStart.txt - touch star/exonGeTrInfo.tab - touch star/exonInfo.tab - touch star/geneInfo.tab - touch star/genomeParameters.txt - touch star/sjdbInfo.txt - touch star/sjdbList.fromGTF.out.tab - touch star/sjdbList.out.tab - touch star/transcriptInfo.tab + if (gtf) { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab 
+ touch star/sjdbList.out.tab + touch star/transcriptInfo.tab - cat <<-END_VERSIONS > versions.yml - "${task.process}": - star: \$(STAR --version | sed -e "s/STAR_//g") - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/genomeParameters.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } } diff --git a/modules/nf-core/star/genomegenerate/star-genomegenerate.diff b/modules/nf-core/star/genomegenerate/star-genomegenerate.diff new file mode 100644 index 0000000..0181f46 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/star-genomegenerate.diff @@ -0,0 +1,247 @@ +Changes in module 'nf-core/star/genomegenerate' +--- modules/nf-core/star/genomegenerate/environment.yml ++++ modules/nf-core/star/genomegenerate/environment.yml +@@ -1,9 +1,11 @@ + name: star_genomegenerate ++ + channels: + - conda-forge + - bioconda + - defaults ++ + dependencies: ++ - bioconda::samtools=1.18 + - bioconda::star=2.7.10a +- - bioconda::samtools=1.18 + - conda-forge::gawk=5.1.0 + +--- modules/nf-core/star/genomegenerate/main.nf ++++ modules/nf-core/star/genomegenerate/main.nf +@@ -19,9 +19,10 
@@ + task.ext.when == null || task.ext.when + + script: +- def args = task.ext.args ?: '' +- def args_list = args.tokenize() +- def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' ++ def args = task.ext.args ?: '' ++ def args_list = args.tokenize() ++ def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' ++ def include_gtf = gtf ? "--sjdbGTFfile $gtf" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star +@@ -29,7 +30,7 @@ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ +- --sjdbGTFfile $gtf \\ ++ $include_gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args +@@ -51,7 +52,7 @@ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ +- --sjdbGTFfile $gtf \\ ++ $include_gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ +@@ -67,30 +68,52 @@ + } + + stub: +- """ +- mkdir star +- touch star/Genome +- touch star/Log.out +- touch star/SA +- touch star/SAindex +- touch star/chrLength.txt +- touch star/chrName.txt +- touch star/chrNameLength.txt +- touch star/chrStart.txt +- touch star/exonGeTrInfo.tab +- touch star/exonInfo.tab +- touch star/geneInfo.tab +- touch star/genomeParameters.txt +- touch star/sjdbInfo.txt +- touch star/sjdbList.fromGTF.out.tab +- touch star/sjdbList.out.tab +- touch star/transcriptInfo.tab ++ if (gtf) { ++ """ ++ mkdir star ++ touch star/Genome ++ touch star/Log.out ++ touch star/SA ++ touch star/SAindex ++ touch star/chrLength.txt ++ touch star/chrName.txt ++ touch star/chrNameLength.txt ++ touch star/chrStart.txt ++ touch star/exonGeTrInfo.tab ++ touch star/exonInfo.tab ++ touch star/geneInfo.tab ++ touch star/genomeParameters.txt ++ touch star/sjdbInfo.txt ++ touch star/sjdbList.fromGTF.out.tab ++ touch star/sjdbList.out.tab ++ touch star/transcriptInfo.tab + +- cat <<-END_VERSIONS > versions.yml +- "${task.process}": +- star: 
\$(STAR --version | sed -e "s/STAR_//g") +- samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') +- gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') +- END_VERSIONS +- """ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ star: \$(STAR --version | sed -e "s/STAR_//g") ++ samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') ++ gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') ++ END_VERSIONS ++ """ ++ } else { ++ """ ++ mkdir star ++ touch star/Genome ++ touch star/Log.out ++ touch star/SA ++ touch star/SAindex ++ touch star/chrLength.txt ++ touch star/chrName.txt ++ touch star/chrNameLength.txt ++ touch star/chrStart.txt ++ touch star/genomeParameters.txt ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ star: \$(STAR --version | sed -e "s/STAR_//g") ++ samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') ++ gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') ++ END_VERSIONS ++ """ ++ } + } + +--- modules/nf-core/star/genomegenerate/tests/main.nf.test.snap ++++ modules/nf-core/star/genomegenerate/tests/main.nf.test.snap +@@ -5,12 +5,18 @@ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], +- "timestamp": "2023-12-04T18:01:27.298248806" ++ "timestamp": "2023-12-19T11:05:51.741109" + }, +- "index": { ++ "index_with_gtf": { + "content": [ +- "star" ++ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], +- "timestamp": "2023-11-23T11:31:47.560528" ++ "timestamp": "2023-12-19T11:38:14.551548" ++ }, ++ "index_without_gtf": { ++ "content": [ ++ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" ++ ], ++ 
"timestamp": "2023-12-19T11:38:22.382905" + } + } +--- modules/nf-core/star/genomegenerate/tests/main.nf.test ++++ modules/nf-core/star/genomegenerate/tests/main.nf.test +@@ -28,7 +28,86 @@ + then { + assertAll( + { assert process.success }, +- { assert snapshot(file(process.out.index[0][1]).name).match("index") }, ++ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, ++ { assert snapshot(process.out.versions).match("versions") } ++ ) ++ } ++ ++ } ++ ++ test("homo_sapiens-stub") { ++ ++ options '-stub' ++ ++ when { ++ process { ++ """ ++ input[0] = Channel.of([ ++ [ id:'test_fasta' ], ++ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] ++ ]) ++ input[1] = Channel.of([ ++ [ id:'test_gtf' ], ++ [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] ++ ]) ++ """ ++ } ++ } ++ ++ then { ++ assertAll( ++ { assert process.success }, ++ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, ++ { assert snapshot(process.out.versions).match("versions") } ++ ) ++ } ++ ++ } ++ ++ test("homo_sapiens-without_gtf") { ++ ++ when { ++ process { ++ """ ++ input[0] = Channel.of([ ++ [ id:'test_fasta' ], ++ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] ++ ]) ++ input[1] = Channel.of([ [], [] ]) ++ """ ++ } ++ } ++ ++ then { ++ assertAll( ++ { assert process.success }, ++ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, ++ { assert snapshot(process.out.versions).match("versions") } ++ ) ++ } ++ ++ } ++ ++ test("homo_sapiens-without_gtf-stub") { ++ ++ options '-stub' ++ ++ when { ++ process { ++ """ ++ input[0] = Channel.of([ ++ [ id:'test_fasta' ], ++ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] 
++ ]) ++ input[1] = Channel.of([ [], [] ]) ++ """ ++ } ++ } ++ ++ then { ++ assertAll( ++ { assert process.success }, ++ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + +************************************************************ diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test index eed8292..af0c942 100644 --- a/modules/nf-core/star/genomegenerate/tests/main.nf.test +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -28,7 +28,86 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(file(process.out.index[0][1]).name).match("index") }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-without_gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { 
assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens-without_gtf-stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, { assert snapshot(process.out.versions).match("versions") } ) } diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap index e7bb6ee..9de08c7 100644 --- a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -5,12 +5,18 @@ "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" ] ], - "timestamp": "2023-12-04T18:01:27.298248806" + "timestamp": "2023-12-19T11:05:51.741109" }, - "index": { + "index_with_gtf": { "content": [ - "star" + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" ], - "timestamp": "2023-11-23T11:31:47.560528" + "timestamp": "2023-12-19T11:38:14.551548" + }, + "index_without_gtf": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "timestamp": "2023-12-19T11:38:22.382905" } } \ No newline at end of file From ec7ffc1b050d0f301baefed276a768d2f71913aa Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 19 Dec 2023 15:52:18 +1300 Subject: [PATCH 29/59] 
Incorporated fasta_edta_lai --- conf/modules.config | 10 +- modules.json | 21 ++ modules/pfr/lai/environment.yml | 9 + modules/pfr/lai/main.nf | 69 +++++++ modules/pfr/lai/meta.yml | 68 +++++++ modules/pfr/lai/tests/main.nf.test | 120 +++++++++++ modules/pfr/lai/tests/main.nf.test.snap | 10 + modules/pfr/lai/tests/nextflow.config | 10 + modules/pfr/lai/tests/tags.yml | 2 + nextflow.config | 8 +- subworkflows/local/fasta_edta.nf | 43 ---- subworkflows/local/prepare_assembly.nf | 19 +- subworkflows/pfr/fasta_edta_lai/main.nf | 88 ++++++++ subworkflows/pfr/fasta_edta_lai/meta.yml | 69 +++++++ .../pfr/fasta_edta_lai/tests/main.nf.test | 38 ++++ .../pfr/fasta_edta_lai/tests/tags.yml | 2 + workflows/pangene.nf | 188 +++++++++--------- 17 files changed, 621 insertions(+), 153 deletions(-) create mode 100644 modules/pfr/lai/environment.yml create mode 100644 modules/pfr/lai/main.nf create mode 100644 modules/pfr/lai/meta.yml create mode 100644 modules/pfr/lai/tests/main.nf.test create mode 100644 modules/pfr/lai/tests/main.nf.test.snap create mode 100644 modules/pfr/lai/tests/nextflow.config create mode 100644 modules/pfr/lai/tests/tags.yml delete mode 100644 subworkflows/local/fasta_edta.nf create mode 100644 subworkflows/pfr/fasta_edta_lai/main.nf create mode 100644 subworkflows/pfr/fasta_edta_lai/meta.yml create mode 100644 subworkflows/pfr/fasta_edta_lai/tests/main.nf.test create mode 100644 subworkflows/pfr/fasta_edta_lai/tests/tags.yml diff --git a/conf/modules.config b/conf/modules.config index 58830e9..392583a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,18 +1,16 @@ process { - withName: 'EDTA' { + withName: 'EDTA_EDTA' { ext.args = [ params.edta_is_sensitive ? "--sensitive 1" : "--sensitive 0", "--anno 0", "--force 1" ].join(' ').trim() - } - withName: 'RESTORE_EDTA_IDS' { publishDir = [ - path: { "${params.outdir}/edta/${meta.id}" }, + path: { "${params.outdir}/edta" }, mode: "copy", - saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename }, - enabled: params.edta_save_outputs + pattern: '*.EDTA.TElib.fa', + enabled: params.edta_save_te_lib ] } diff --git a/modules.json b/modules.json index 80995ba..b57ef90 100644 --- a/modules.json +++ b/modules.json @@ -9,6 +9,7 @@ "branch": "main", "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", "installed_by": [ + "fasta_edta_lai", "modules" ] }, @@ -16,6 +17,7 @@ "branch": "main", "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", "installed_by": [ + "fasta_edta_lai", "modules" ] }, @@ -23,9 +25,17 @@ "branch": "main", "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", "installed_by": [ + "fasta_edta_lai", "modules" ] }, + "lai": { + "branch": "main", + "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "installed_by": [ + "fasta_edta_lai" + ] + }, "liftoff": { "branch": "main", "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", @@ -34,6 +44,17 @@ ] } } + }, + "subworkflows": { + "pfr": { + "fasta_edta_lai": { + "branch": "main", + "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", + "installed_by": [ + "subworkflows" + ] + } + } } }, "git@github.com:kherronism/nf-modules.git": { diff --git a/modules/pfr/lai/environment.yml b/modules/pfr/lai/environment.yml new file mode 100644 index 0000000..94fadbd --- /dev/null +++ b/modules/pfr/lai/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "lai" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::LTR_retriever=2.9.0" diff --git a/modules/pfr/lai/main.nf b/modules/pfr/lai/main.nf new file mode 100644 index 0000000..d4fced9 --- /dev/null +++ b/modules/pfr/lai/main.nf @@ -0,0 +1,69 @@ +process LAI { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ltr_retriever:2.9.0--hdfd78af_2': + 'biocontainers/ltr_retriever:2.9.0--hdfd78af_2' }" + + input: + tuple val(meta), path(fasta) + path pass_list + path annotation_out + path monoploid_seqs + + output: + tuple val(meta), path("*.LAI.log") , emit: log + tuple val(meta), path("*.LAI.out") , emit: lai_out , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def monoploid_param = monoploid_seqs ? "-mono $monoploid_seqs" : '' + def lai_output_name = monoploid_seqs ? "${annotation_out}.${monoploid_seqs}.out.LAI" : "${annotation_out}.LAI" + """ + # Remove comments from genome fasta, + # otherwise LAI triggers its sequence name change logic + + sed \\ + '/^>/ s/\\s.*\$//' \\ + $fasta \\ + > for_lai_no_comments.fsa + + LAI \\ + -genome for_lai_no_comments.fsa \\ + -intact $pass_list \\ + -all $annotation_out \\ + -t $task.cpus \\ + $monoploid_param \\ + $args \\ + > "${prefix}.LAI.log" + + mv \\ + $lai_output_name \\ + "${prefix}.LAI.out" \\ + || echo "LAI did not produce the output file" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + lai: \$(cat /usr/local/share/LTR_retriever/LAI | grep "my \\\$version" | sed 's/my \$version="//; s/";//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.LAI.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + lai: \$(cat /usr/local/share/LTR_retriever/LAI | grep "my \\\$version" | sed 's/my \$version="//; s/";//') + END_VERSIONS + """ +} diff --git a/modules/pfr/lai/meta.yml b/modules/pfr/lai/meta.yml new file mode 100644 index 0000000..6fd7aef --- /dev/null +++ b/modules/pfr/lai/meta.yml @@ -0,0 +1,68 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "lai" 
+description: Estimates the mean LTR sequence identity in the genome +keywords: + - genomics + - annotation + - repeat + - long terminal retrotransposon + - retrotransposon + - stats + - qc +tools: + - "lai": + description: Assessing genome assembly quality using the LTR Assembly Index (LAI) + homepage: "https://github.com/oushujun/LTR_retriever" + documentation: "https://github.com/oushujun/LTR_retriever" + tool_dev_url: "https://github.com/oushujun/LTR_retriever" + doi: "10.1093/nar/gky730" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fasta: + type: file + description: The genome file that is used to generate everything + pattern: "*.{fsa,fa,fasta}" + - pass_list: + type: file + description: A list of intact LTR-RTs generated by LTR_retriever + pattern: "*.pass.list" + - annotation_out: + type: file + description: RepeatMasker annotation of all LTR sequences in the genome + pattern: "*.out" + - monoploid_seqs: + type: file + description: | + This parameter is mainly for polyploid genomes. User provides a list of + sequence names that represent a monoploid (1x). LAI will be calculated only + on these sequences if provided. + pattern: "*.txt" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - log: + type: file + description: Log from LAI + pattern: "*.LAI.log" + - lai_out: + type: file + description: | + Output file from LAI if LAI is able to estimate the index from the inputs + pattern: "*.LAI.out" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/lai/tests/main.nf.test b/modules/pfr/lai/tests/main.nf.test new file mode 100644 index 0000000..353043c --- /dev/null +++ b/modules/pfr/lai/tests/main.nf.test @@ -0,0 +1,120 @@ +nextflow_process { + + name "Test Process LAI" + script "../main.nf" + process "LAI" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "lai" + tag "gt/suffixerator" + tag "nf-core/gunzip" + tag "gt/ltrharvest" + tag "ltrretriever" + + test("homo_sapiens-genome_21_fasta-success") { + + setup { + run("GUNZIP") { + script "../../../nf-core/gunzip" + + process { + """ + input[0] = [ + [ id:'test' ], + file('/Users/hrauxr/Projects/nxf-modules/tests/data/chr1.fa.gz', checkIfExists: true) + ] + """ + } + } + + run("GT_SUFFIXERATOR") { + script "../../../pfr/gt/suffixerator" + + process { + """ + input[0] = GUNZIP.out.gunzip + """ + } + } + + run("GT_LTRHARVEST") { + script "../../../pfr/gt/ltrharvest" + + process { + """ + input[0] = GT_SUFFIXERATOR.out.index + """ + } + } + + run("LTRRETRIEVER") { + script "../../../pfr/ltrretriever" + + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = GT_LTRHARVEST.out.tabout.map { meta, tabout -> tabout } + input[2] = [] + input[3] = [] + input[4] = [] + """ + } + } + } + + when { + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = LTRRETRIEVER.out.pass_list.map { meta, pass_list -> pass_list } + input[2] = LTRRETRIEVER.out.annotation_out.map { meta, annotation_out -> annotation_out } + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
path(process.out.log.get(0).get(1)).getText().contains("Dependency checking: Passed!") }, + { assert path(process.out.log.get(0).get(1)).getText().contains("Calculate LAI:") }, + { assert path(process.out.log.get(0).get(1)).getText().contains("Total LTR sequence content (0%) is too low for accurate LAI calculation") }, + { assert path(process.out.log.get(0).get(1)).getText().contains("Sorry, LAI is not applicable on the current genome assembly.") }, + { assert process.out.lai_out == [] }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + +} diff --git a/modules/pfr/lai/tests/main.nf.test.snap b/modules/pfr/lai/tests/main.nf.test.snap new file mode 100644 index 0000000..751ddb6 --- /dev/null +++ b/modules/pfr/lai/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,2ac93e1e6324236af6f9a794bbac2099" + ] + ], + "timestamp": "2023-12-05T12:15:32.969684" + } +} \ No newline at end of file diff --git a/modules/pfr/lai/tests/nextflow.config b/modules/pfr/lai/tests/nextflow.config new file mode 100644 index 0000000..516a3e2 --- /dev/null +++ b/modules/pfr/lai/tests/nextflow.config @@ -0,0 +1,10 @@ +process { + + withName: GT_SUFFIXERATOR { + ext.args = '-tis -suf -lcp -des -ssp -sds -dna' + } + + withName: GT_LTRHARVEST { + ext.args = '-minlenltr 100 -maxlenltr 7000 -mintsd 4 -maxtsd 6 -motif TGCA -motifmis 1 -similar 85 -vic 10 -seed 20 -seqids yes' + } +} diff --git a/modules/pfr/lai/tests/tags.yml b/modules/pfr/lai/tests/tags.yml new file mode 100644 index 0000000..252295d 
--- /dev/null +++ b/modules/pfr/lai/tests/tags.yml @@ -0,0 +1,2 @@ +lai: + - "modules/pfr/lai/**" diff --git a/nextflow.config b/nextflow.config index 135bf29..669b8ca 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,8 +2,8 @@ includeConfig './conf/base.config' params { target_assemblies = [ - ["red5_v2p1", "/workspace/hrauxr/pangene/.test/red5_v2p1_chr1.fasta"], - ["donghong", "/workspace/hrauxr/pangene/.test/donghong.chr1.fsa.gz"] + ["red5_v2p1", ".test/red5_v2p1_chr1.fasta"], + ["donghong", ".test/donghong.chr1.fsa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; @@ -11,7 +11,7 @@ params { // "." is not allowed in the tag name te_libraries = [ - ["donghong", "/workspace/hrauxr/pangene/.test/donghong.TElib.fa.gz"] + ["donghong", ".test/donghong.TElib.fa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Optional Set to null if libraries are not available. @@ -21,7 +21,7 @@ params { // When the TE lib is not available for a traget assembly, EDTA is used to create one. 
edta_is_sensitive = false - edta_save_outputs = false + edta_save_te_lib = true repeatmasker_save_outputs = true diff --git a/subworkflows/local/fasta_edta.nf b/subworkflows/local/fasta_edta.nf deleted file mode 100644 index c47e557..0000000 --- a/subworkflows/local/fasta_edta.nf +++ /dev/null @@ -1,43 +0,0 @@ -include { SHORTEN_EDTA_IDS } from '../../modules/local/edta/shorten_edta_ids' -include { EDTA } from '../../modules/local/edta/edta' -include { RESTORE_EDTA_IDS } from '../../modules/local/edta/restore_edta_ids' - -workflow FASTA_EDTA { - take: - genome_fasta // channel: [ meta, fasta ] - - main: - SHORTEN_EDTA_IDS(genome_fasta) - .renamed_ids_fasta - | EDTA - - RESTORE_EDTA_IDS( - EDTA.out.te_lib_fasta, - EDTA.out.intact_gff3.map { it[1] }, - EDTA.out.pass_list.map { it[1] }, - EDTA.out.out_file.map { it[1] }, - EDTA.out.te_anno_gff3.map { it[1] }, - SHORTEN_EDTA_IDS.out.renamed_ids_tsv.map { it[1] } - ) - - Channel.empty() - | mix( - SHORTEN_EDTA_IDS.out.versions.first() - ) - | mix( - EDTA.out.versions.first() - ) - | mix( - RESTORE_EDTA_IDS.out.versions.first() - ) - | set { ch_versions } - - emit: - te_lib_fasta = RESTORE_EDTA_IDS.out.te_lib_fasta // channel: [ meta, fasta ] - intact_gff3 = RESTORE_EDTA_IDS.out.intact_gff3 // channel: [ meta, gff3 ] - pass_list = RESTORE_EDTA_IDS.out.pass_list // channel: [ meta, pass.list ] - out_file = RESTORE_EDTA_IDS.out.out_file // channel: [ meta, out.file ] - te_anno_gff3 = RESTORE_EDTA_IDS.out.te_anno_gff3 // channel: [ meta, gff3 ] - renamed_ids_tsv = RESTORE_EDTA_IDS.out.renamed_ids_tsv // channel: [ meta, tsv ] - versions = ch_versions // channel: [ versions.yml ] -} \ No newline at end of file diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index 05bd1ec..db2e3c3 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -4,6 +4,8 @@ include { FASTAVALIDATOR } from '../../modules/nf-core/fa include { REPEATMASKER } from 
'../../modules/kherronism/repeatmasker' include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' +include { FASTA_EDTA_LAI } from '../../subworkflows/pfr/fasta_edta_lai' + workflow PREPARE_ASSEMBLY { take: target_assembly // channel: [ meta, fasta ] @@ -57,7 +59,7 @@ workflow PREPARE_ASSEMBLY { ) | set { ch_gunzip_te_library } - // SUBWORKFLOW: FASTA_EDTA + // SUBWORKFLOW: FASTA_EDTA_LAI ch_validated_target_assembly | join( ch_gunzip_te_library, remainder: true @@ -66,12 +68,18 @@ workflow PREPARE_ASSEMBLY { teLib == null } | map { meta, assembly, teLib -> [meta, assembly] } - | FASTA_EDTA + | set { ch_edta_inputs } + + FASTA_EDTA_LAI ( + ch_edta_inputs, + [], + true // Skip LAI + ) // MODULE: REPEATMASKER ch_validated_target_assembly | join( - FASTA_EDTA.out.te_lib_fasta.mix(ch_gunzip_te_library) + FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) ) | set { ch_assembly_n_te_lib } @@ -84,8 +92,7 @@ workflow PREPARE_ASSEMBLY { def star_ignore_sjdbgtf = true STAR_GENOMEGENERATE( ch_validated_target_assembly, - ch_validated_target_assembly.map { meta, maskedFasta -> [meta, []] }, - star_ignore_sjdbgtf + ch_validated_target_assembly.map { meta, fasta -> [ [], [] ] } ) .index | set { ch_assembly_index } @@ -93,7 +100,7 @@ workflow PREPARE_ASSEMBLY { Channel.empty() | mix(FASTAVALIDATOR.out.versions.first()) | mix(GUNZIP_TE_LIBRARY.out.versions.first()) - | mix(FASTA_EDTA.out.versions) + | mix(FASTA_EDTA_LAI.out.versions) | mix(REPEATMASKER.out.versions.first()) | mix(STAR_GENOMEGENERATE.out.versions.first()) | mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) diff --git a/subworkflows/pfr/fasta_edta_lai/main.nf b/subworkflows/pfr/fasta_edta_lai/main.nf new file mode 100644 index 0000000..2e73ca5 --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/main.nf @@ -0,0 +1,88 @@ +include { CUSTOM_SHORTENFASTAIDS } from '../../../modules/pfr/custom/shortenfastaids' +include { EDTA_EDTA } from '../../../modules/pfr/edta/edta' +include { LAI } from 
'../../../modules/pfr/lai' +include { CUSTOM_RESTOREGFFIDS } from '../../../modules/pfr/custom/restoregffids' + +workflow FASTA_EDTA_LAI { + + take: + ch_fasta // channel: [ val(meta), fasta ] + ch_monoploid_seqs // channel: [ val(meta), txt ]; Optional: Set to [] if not needed + skip_lai // val; true|false + + main: + + ch_versions = Channel.empty() + + // MODULE: CUSTOM_SHORTENFASTAIDS + CUSTOM_SHORTENFASTAIDS ( ch_fasta ) + + ch_short_ids_fasta = ch_fasta + | join(CUSTOM_SHORTENFASTAIDS.out.short_ids_fasta, by:0, remainder:true) + | map { meta, fasta, short_ids_fasta -> + [ meta, short_ids_fasta ?: fasta ] + } + + ch_short_ids_tsv = CUSTOM_SHORTENFASTAIDS.out.short_ids_tsv + ch_versions = ch_versions.mix(CUSTOM_SHORTENFASTAIDS.out.versions.first()) + + // MODULE: EDTA_EDTA + EDTA_EDTA ( + ch_short_ids_fasta, + [], + [], + [], + [] + ) + + ch_te_lib_fasta = EDTA_EDTA.out.te_lib_fasta + ch_pass_list = EDTA_EDTA.out.pass_list + ch_out_file = EDTA_EDTA.out.out_file + ch_te_anno_gff3 = EDTA_EDTA.out.te_anno_gff3 + ch_versions = ch_versions.mix(EDTA_EDTA.out.versions.first()) + + // MODULE: LAI + ch_lai_inputs = skip_lai + ? 
Channel.empty() + : ch_short_ids_fasta + | join(ch_pass_list) + | join(ch_out_file) + | join( + ch_monoploid_seqs ?: Channel.empty(), + by:0, + remainder: true + ) + | map { meta, fasta, pass, out, mono -> + [ meta, fasta, pass, out, mono ?: [] ] + } + LAI ( + ch_lai_inputs.map { meta, fasta, pass, out, mono -> [ meta, fasta ] }, + ch_lai_inputs.map { meta, fasta, pass, out, mono -> pass }, + ch_lai_inputs.map { meta, fasta, pass, out, mono -> out }, + ch_lai_inputs.map { meta, fasta, pass, out, mono -> mono } + ) + + ch_lai_log = LAI.out.log + ch_lai_out = LAI.out.lai_out + ch_versions = ch_versions.mix(LAI.out.versions.first()) + + // MODULE: CUSTOM_RESTOREGFFIDS + ch_restorable_gff_tsv = ch_te_anno_gff3.join(ch_short_ids_tsv) + + CUSTOM_RESTOREGFFIDS ( + ch_restorable_gff_tsv.map { meta, gff, tsv -> [ meta, gff ] }, + ch_restorable_gff_tsv.map { meta, gff, tsv -> tsv } + ) + + ch_restored_gff = ch_te_anno_gff3 + | join(CUSTOM_RESTOREGFFIDS.out.restored_ids_gff3, by:0, remainder:true) + | map { meta, gff, restored_gff -> [ meta, restored_gff ?: gff ] } + ch_versions = ch_versions.mix(CUSTOM_RESTOREGFFIDS.out.versions.first()) + + emit: + te_lib_fasta = ch_te_lib_fasta // channel: [ val(meta), fasta ] + te_anno_gff3 = ch_restored_gff // channel: [ val(meta), gff ] + lai_log = ch_lai_log // channel: [ val(meta), log ] + lai_out = ch_lai_out // channel: [ val(meta), out ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/pfr/fasta_edta_lai/meta.yml b/subworkflows/pfr/fasta_edta_lai/meta.yml new file mode 100644 index 0000000..52483ce --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/meta.yml @@ -0,0 +1,69 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fasta_edta_lai" +description: | + Performs extensive de-novo transposable element annotation with EDTA and optionally estimates repeat-space completeness with LAI +keywords: + - genomics + - genome + - 
annotation + - repeat + - transposons + - stats + - qc +components: + - custom/restoregffids + - custom/shortenfastaids + - edta/edta + - lai +input: + - ch_fasta: + type: file + description: | + Channel for the assembly fasta file + Structure: [ val(meta), path(fasta) ] + pattern: "*.{fsa,fa,fasta}" + - ch_monoploid_seqs: + type: file + description: | + Channel for providing a list of monoploid sequences + for correct estimation of LAI for polyploid genomes. + This parameter is useful when all the haplotypes are + stored in a single fasta file. + Structure: [ val(meta), path(txt) ] + pattern: "*.txt" + - skip_lai: + type: boolean + description: | + Skip LAI estimation + Structure: [ val(boolean) ] +output: + - te_lib_fasta: + type: file + description: A non-redundant TE library in fasta format + pattern: "*.EDTA.TElib.fa" + - te_anno_gff3: + type: file + description: A gff3 file containing both structurally intact and fragmented TE annotations + pattern: "*.EDTA.TEanno.gff3" + - lai_log: + type: file + description: | + Log from LAI + Structure: [ val(meta), path(log) ] + pattern: "*.LAI.log" + - lai_out: + type: file + description: | + LAI output + Structure: [ val(meta), path(out) ] + pattern: "*.LAI.out" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test new file mode 100644 index 0000000..a4fa87b --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_workflow { + + name "Test Workflow FASTA_EDTA_LAI" + script "../main.nf" + workflow "FASTA_EDTA_LAI" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_edta_lai" + tag "fasta_edta_lai" + tag "lai" + tag "edta/edta" + tag "custom/restoregffids" + tag "custom/shortenfastaids" + + 
test("test_data") { + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test' ], + file("/Users/hrauxr/Projects/nxf-modules/data/chr1.fa", checkIfExists: true) + ]) + input[1] = [] + input[2] = false + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.versions).match("versions") } + ) + } + } +} diff --git a/subworkflows/pfr/fasta_edta_lai/tests/tags.yml b/subworkflows/pfr/fasta_edta_lai/tests/tags.yml new file mode 100644 index 0000000..b114c58 --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fasta_edta_lai: + - subworkflows/pfr/fasta_edta_lai/** diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 9110688..6241ab7 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -1,15 +1,15 @@ include { validateParams } from '../modules/local/validate_params' include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' -include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' -include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' -include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' +// include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' +// include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' +// include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' -include { BRAKER3 } from '../modules/kherronism/braker3' +// include { BRAKER3 } from '../modules/kherronism/braker3' -include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' +// include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' +// include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' validateParams(params) @@ -46,26 +46,26 @@ workflow PANGENE { | collect : Channel.empty() - ch_ext_prot_fastas = 
params.external_protein_fastas - ? Channel.fromList(params.external_protein_fastas) - | map { filePath -> - def fileHandle = file(filePath, checkIfExists: true) - [[id:fileHandle.getSimpleName()], fileHandle] - } - : Channel.empty() + // ch_ext_prot_fastas = params.external_protein_fastas + // ? Channel.fromList(params.external_protein_fastas) + // | map { filePath -> + // def fileHandle = file(filePath, checkIfExists: true) + // [[id:fileHandle.getSimpleName()], fileHandle] + // } + // : Channel.empty() - ch_xref_annotations_mm = params.liftoff_xref_annotations - ? Channel.fromList(params.liftoff_xref_annotations) - | multiMap { fasta, gff -> - def fastaFile = file(fasta, checkIfExists:true) + // ch_xref_annotations_mm = params.liftoff_xref_annotations + // ? Channel.fromList(params.liftoff_xref_annotations) + // | multiMap { fasta, gff -> + // def fastaFile = file(fasta, checkIfExists:true) - fasta: [[id:fastaFile.getSimpleName()], fastaFile] - gff: [[id:fastaFile.getSimpleName()], file(gff, checkIfExists:true)] - } - : Channel.empty() + // fasta: [[id:fastaFile.getSimpleName()], fastaFile] + // gff: [[id:fastaFile.getSimpleName()], file(gff, checkIfExists:true)] + // } + // : Channel.empty() - ch_xref_annotations_fasta = ch_xref_annotations_mm.fasta - ch_xref_annotations_gff = ch_xref_annotations_mm.gff + // ch_xref_annotations_fasta = ch_xref_annotations_mm.fasta + // ch_xref_annotations_gff = ch_xref_annotations_mm.gff // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( @@ -78,76 +78,76 @@ workflow PANGENE { ch_target_assemby_index = PREPARE_ASSEMBLY.out.target_assemby_index ch_versions = ch_versions.mix(PREPARE_ASSEMBLY.out.versions) - // SUBWORKFLOW: PREPROCESS_RNASEQ - PREPROCESS_RNASEQ( - ch_samplesheet, - ch_tar_assm_str, - params.skip_fastqc, - params.skip_fastp, - params.save_trimmed, - params.min_trimmed_reads, - params.remove_ribo_rna, - ch_sortmerna_fastas - ) - - ch_trim_reads = PREPROCESS_RNASEQ.out.trim_reads - ch_reads_target = 
PREPROCESS_RNASEQ.out.reads_target - ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) - - // SUBWORKFLOW: ALIGN_RNASEQ - ALIGN_RNASEQ( - ch_reads_target, - ch_trim_reads, - ch_target_assemby_index - ) - - ch_rnaseq_bam = ALIGN_RNASEQ.out.bam - ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) - - // MODULE: PREPARE_EXT_PROTS - PREPARE_EXT_PROTS( - ch_ext_prot_fastas - ) - - ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta - ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) - - // MODULE: BRAKER3 - ch_braker_inputs = ch_masked_target_assembly - | join(ch_rnaseq_bam, remainder: true) - | combine( - ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) - ) - | map { meta, fasta, bam, prots -> [meta, fasta, bam ?: [], prots ?: []] } + // // SUBWORKFLOW: PREPROCESS_RNASEQ + // PREPROCESS_RNASEQ( + // ch_samplesheet, + // ch_tar_assm_str, + // params.skip_fastqc, + // params.skip_fastp, + // params.save_trimmed, + // params.min_trimmed_reads, + // params.remove_ribo_rna, + // ch_sortmerna_fastas + // ) + + // ch_trim_reads = PREPROCESS_RNASEQ.out.trim_reads + // ch_reads_target = PREPROCESS_RNASEQ.out.reads_target + // ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) + + // // SUBWORKFLOW: ALIGN_RNASEQ + // ALIGN_RNASEQ( + // ch_reads_target, + // ch_trim_reads, + // ch_target_assemby_index + // ) + + // ch_rnaseq_bam = ALIGN_RNASEQ.out.bam + // ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) + + // // MODULE: PREPARE_EXT_PROTS + // PREPARE_EXT_PROTS( + // ch_ext_prot_fastas + // ) + + // ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta + // ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) + + // // MODULE: BRAKER3 + // ch_braker_inputs = ch_masked_target_assembly + // | join(ch_rnaseq_bam, remainder: true) + // | combine( + // ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) + // ) + // | map { meta, fasta, bam, prots -> [meta, fasta, bam ?: [], prots 
?: []] } - def rnaseq_sets_dirs = [] - def rnaseq_sets_ids = [] - def hintsfile = [] - - BRAKER3( - ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, - ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, - rnaseq_sets_dirs, - rnaseq_sets_ids, - ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, - hintsfile - ) - - ch_braker_gff3 = BRAKER3.out.gff3 - ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) - - // SUBWORKFLOW: FASTA_LIFTOFF - FASTA_LIFTOFF( - ch_valid_target_assembly, - ch_xref_annotations_fasta, - ch_xref_annotations_gff - ) - - ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 - ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) - - // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) + // def rnaseq_sets_dirs = [] + // def rnaseq_sets_ids = [] + // def hintsfile = [] + + // BRAKER3( + // ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, + // ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, + // rnaseq_sets_dirs, + // rnaseq_sets_ids, + // ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, + // hintsfile + // ) + + // ch_braker_gff3 = BRAKER3.out.gff3 + // ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + + // // SUBWORKFLOW: FASTA_LIFTOFF + // FASTA_LIFTOFF( + // ch_valid_target_assembly, + // ch_xref_annotations_fasta, + // ch_xref_annotations_gff + // ) + + // ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 + // ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) + + // // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS + // CUSTOM_DUMPSOFTWAREVERSIONS ( + // ch_versions.unique().collectFile(name: 'collated_versions.yml') + // ) } \ No newline at end of file From 2de0d224d0b2720e780f9cb7c179c8e8bad347f5 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 20 Dec 2023 14:23:55 +1300 Subject: [PATCH 30/59] Trying to add FASTQ_FASTQC_UMITOOLS_FASTP --- nextflow.config | 2 +- workflows/pangene.nf | 
30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/nextflow.config b/nextflow.config index 669b8ca..d861331 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,7 +25,7 @@ params { repeatmasker_save_outputs = true - samplesheet = "./.test/samplesheet.csv" + samplesheet = "./.test/samplesheet_small.csv" // Optional: Set to null if not available skip_fastqc = false diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 6241ab7..82fdbe4 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -1,7 +1,7 @@ include { validateParams } from '../modules/local/validate_params' include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' -// include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' +include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' // include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' // include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' @@ -78,21 +78,21 @@ workflow PANGENE { ch_target_assemby_index = PREPARE_ASSEMBLY.out.target_assemby_index ch_versions = ch_versions.mix(PREPARE_ASSEMBLY.out.versions) - // // SUBWORKFLOW: PREPROCESS_RNASEQ - // PREPROCESS_RNASEQ( - // ch_samplesheet, - // ch_tar_assm_str, - // params.skip_fastqc, - // params.skip_fastp, - // params.save_trimmed, - // params.min_trimmed_reads, - // params.remove_ribo_rna, - // ch_sortmerna_fastas - // ) + // SUBWORKFLOW: PREPROCESS_RNASEQ + PREPROCESS_RNASEQ( + ch_samplesheet, + ch_tar_assm_str, + params.skip_fastqc, + params.skip_fastp, + params.save_trimmed, + params.min_trimmed_reads, + params.remove_ribo_rna, + ch_sortmerna_fastas + ) - // ch_trim_reads = PREPROCESS_RNASEQ.out.trim_reads - // ch_reads_target = PREPROCESS_RNASEQ.out.reads_target - // ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) + ch_trim_reads = PREPROCESS_RNASEQ.out.trim_reads + ch_reads_target = PREPROCESS_RNASEQ.out.reads_target + 
ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) // // SUBWORKFLOW: ALIGN_RNASEQ // ALIGN_RNASEQ( From 48e72710950ff5e4b4201a27b2331f64d50e4bbe Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 10:15:14 +1300 Subject: [PATCH 31/59] Updated modules and applied prettier --- .nf-core.yml | 2 +- README.md | 12 +- TODO.md | 4 +- modules.json | 327 ++++++++---------- modules/kherronism/braker3/meta.yml | 6 +- modules/kherronism/repeatmasker/meta.yml | 7 +- .../genomegenerate/star-genomegenerate.diff | 247 ------------- 7 files changed, 155 insertions(+), 450 deletions(-) delete mode 100644 modules/nf-core/star/genomegenerate/star-genomegenerate.diff diff --git a/.nf-core.yml b/.nf-core.yml index b1a7f0e..3805dc8 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1 @@ -repository_type: pipeline \ No newline at end of file +repository_type: pipeline diff --git a/README.md b/README.md index ea8b609..8efbcf0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # PANGENE + A NextFlow pipeline for pan-genome annotation. ## Pipeline Flowchart @@ -12,7 +13,7 @@ flowchart TD EDTA REPEATMASKER end - + TARGET_ASSEMBLIES(["[target_assemblies]"]) TE_LIBRARIES(["[te_libs]"]) TARGET_ASSEMBLIES --> FASTA_VALIDATE @@ -30,7 +31,7 @@ flowchart TD STAR SAMTOOLS_CAT end - + SAMPLESHEET([samplesheet]) SAMPLESHEET --> |Tech. reps|CAT_FASTQ CAT_FASTQ --> FASTQC @@ -60,7 +61,7 @@ flowchart TD XREF_ANNOTATIONS --> |xref_fasta|LIFTOFF GFFREAD --> LIFTOFF anno_fasta --> |Fasta|LIFTOFF - + EXTERNAL_PROTEIN_SEQS --> CAT anno_masked_fasta --> |Masked fasta|BRAKER3 anno_bam --> |RNASeq bam|BRAKER3 @@ -76,10 +77,9 @@ flowchart TD Configure the pipeline by modifying `nextflow.config` and submit to SLURM for execution. 
```bash -sbatch ./pan_gene_pfr.sh +sbatch ./pan_gene_pfr.sh ``` - ## Third-party Sources Some software components of this pipeline have been adopted from following third-party sources: @@ -94,4 +94,4 @@ Some software components of this pipeline have been adopted from following third 2. nf-core/rnaseq [MIT](https://github.com/nf-core/rnaseq/blob/master/LICENSE): https://github.com/nf-core/rnaseq 3. rewarewaannotation [MIT](https://github.com/kherronism/rewarewaannotation/blob/master/LICENSE): https://github.com/kherronism/rewarewaannotation -4. assembly_qc [GPL-3.0](https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE): https://github.com/Plant-Food-Research-Open/assembly_qc \ No newline at end of file +4. assembly_qc [GPL-3.0](https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE): https://github.com/Plant-Food-Research-Open/assembly_qc diff --git a/TODO.md b/TODO.md index 0134c26..94f51c1 100644 --- a/TODO.md +++ b/TODO.md @@ -2,7 +2,7 @@ - [ ] From Ross regarding post-processing: > [9:49 am] Ross Crowhurst -Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. 
+> Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. - [ ] From Cecilia: @@ -12,4 +12,4 @@ Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with se > https://www.biorxiv.org/content/10.1101/096529v2.full.pdf -> Don't use `-exclude_partial` \ No newline at end of file +> Don't use `-exclude_partial` diff --git a/modules.json b/modules.json index b57ef90..dfc1e93 100644 --- a/modules.json +++ b/modules.json @@ -1,191 +1,142 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - "git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "custom/restoregffids": { - "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "custom/shortenfastaids": { - "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "edta/edta": { - "branch": "main", - "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "lai": { - "branch": "main", - "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", - "installed_by": [ - 
"fasta_edta_lai" - ] - }, - "liftoff": { - "branch": "main", - "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", - "installed_by": [ - "modules" - ] - } - } - }, - "subworkflows": { - "pfr": { - "fasta_edta_lai": { - "branch": "main", - "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", - "installed_by": [ - "subworkflows" - ] - } - } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": [ - "modules" - ] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": [ - "modules" - ] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": [ - "modules" - ] - }, - "cat/fastq": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": [ - "modules" - ] - }, - "fastavalidator": { - "branch": "master", - "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", - "installed_by": [ - "modules" - ] - }, - "fastp": { - "branch": "master", - "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ] - }, - "fastqc": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": [ - "fastq_fastqc_umitools_fastp", - "modules" - ] - }, - "gffread": { - "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", - "installed_by": [ - "modules" - ] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "samtools/cat": { - "branch": "master", - 
"git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": [ - "modules" - ] - }, - "sortmerna": { - "branch": "master", - "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": [ - "modules" - ] - }, - "star/align": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": [ - "modules" - ] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": [ - "modules" - ], - "patch": "modules/nf-core/star/genomegenerate/star-genomegenerate.diff" - }, - "umitools/extract": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", - "installed_by": [ - "subworkflows" - ] - } - } - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "edta/edta": { + "branch": "main", + "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "lai": { + "branch": "main", + "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "installed_by": ["fasta_edta_lai"] + }, + "liftoff": { + "branch": "main", + "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "installed_by": ["modules"] + } } + }, + "subworkflows": { + "pfr": { + "fasta_edta_lai": { + "branch": "main", + "git_sha": 
"fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", + "installed_by": ["subworkflows"] + } + } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": ["modules"] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": ["modules"] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": ["modules"] + }, + "fastavalidator": { + "branch": "master", + "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["fastq_fastqc_umitools_fastp"] + }, + "fastqc": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "sortmerna": { + "branch": "master", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", + "installed_by": ["modules"] + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + 
"installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", + "installed_by": ["modules"] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp"] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": ["subworkflows"] + } + } + } } -} \ No newline at end of file + } +} diff --git a/modules/kherronism/braker3/meta.yml b/modules/kherronism/braker3/meta.yml index 9bc13a3..ed4da40 100644 --- a/modules/kherronism/braker3/meta.yml +++ b/modules/kherronism/braker3/meta.yml @@ -1,6 +1,6 @@ name: braker3 description: | - Gene prediction in novel genomes using RNA-seq and protein homology information + Gene prediction in novel genomes using RNA-seq and protein homology information keywords: - genome - annotation @@ -8,8 +8,8 @@ keywords: tools: - braker3: description: "BRAKER3 is a pipeline for fully automated prediction of - protein coding gene structures using protein and RNA-seq and protein homology - information" + protein coding gene structures using protein and RNA-seq and protein homology + information" homepage: "https://github.com/Gaius-Augustus/BRAKER" documentation: "https://github.com/Gaius-Augustus/BRAKER" tool_dev_url: "https://github.com/Gaius-Augustus/BRAKER" diff --git a/modules/kherronism/repeatmasker/meta.yml b/modules/kherronism/repeatmasker/meta.yml index 8adeb55..0cab608 100644 --- a/modules/kherronism/repeatmasker/meta.yml +++ b/modules/kherronism/repeatmasker/meta.yml @@ -1,6 +1,6 @@ name: repeatmasker description: | - Screening DNA sequences for interspersed repeats and low complexity DNA sequences. 
+ Screening DNA sequences for interspersed repeats and low complexity DNA sequences keywords: - genome @@ -9,8 +9,9 @@ keywords: tools: - repeatmasker: - description: "RepeatMasker is a program that screens DNA sequences for interspersed - repeats and low complexity DNA sequences." + description: | + RepeatMasker is a program that screens DNA sequences for interspersed + repeats and low complexity DNA sequences homepage: "https://www.repeatmasker.org/" documentation: "https://www.repeatmasker.org/webrepeatmaskerhelp.html" tool_dev_url: "https://github.com/rmhubley/RepeatMasker" diff --git a/modules/nf-core/star/genomegenerate/star-genomegenerate.diff b/modules/nf-core/star/genomegenerate/star-genomegenerate.diff deleted file mode 100644 index 0181f46..0000000 --- a/modules/nf-core/star/genomegenerate/star-genomegenerate.diff +++ /dev/null @@ -1,247 +0,0 @@ -Changes in module 'nf-core/star/genomegenerate' ---- modules/nf-core/star/genomegenerate/environment.yml -+++ modules/nf-core/star/genomegenerate/environment.yml -@@ -1,9 +1,11 @@ - name: star_genomegenerate -+ - channels: - - conda-forge - - bioconda - - defaults -+ - dependencies: -+ - bioconda::samtools=1.18 - - bioconda::star=2.7.10a -- - bioconda::samtools=1.18 - - conda-forge::gawk=5.1.0 - ---- modules/nf-core/star/genomegenerate/main.nf -+++ modules/nf-core/star/genomegenerate/main.nf -@@ -19,9 +19,10 @@ - task.ext.when == null || task.ext.when - - script: -- def args = task.ext.args ?: '' -- def args_list = args.tokenize() -- def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' -+ def args = task.ext.args ?: '' -+ def args_list = args.tokenize() -+ def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' -+ def include_gtf = gtf ? 
"--sjdbGTFfile $gtf" : '' - if (args_list.contains('--genomeSAindexNbases')) { - """ - mkdir star -@@ -29,7 +30,7 @@ - --runMode genomeGenerate \\ - --genomeDir star/ \\ - --genomeFastaFiles $fasta \\ -- --sjdbGTFfile $gtf \\ -+ $include_gtf \\ - --runThreadN $task.cpus \\ - $memory \\ - $args -@@ -51,7 +52,7 @@ - --runMode genomeGenerate \\ - --genomeDir star/ \\ - --genomeFastaFiles $fasta \\ -- --sjdbGTFfile $gtf \\ -+ $include_gtf \\ - --runThreadN $task.cpus \\ - --genomeSAindexNbases \$NUM_BASES \\ - $memory \\ -@@ -67,30 +68,52 @@ - } - - stub: -- """ -- mkdir star -- touch star/Genome -- touch star/Log.out -- touch star/SA -- touch star/SAindex -- touch star/chrLength.txt -- touch star/chrName.txt -- touch star/chrNameLength.txt -- touch star/chrStart.txt -- touch star/exonGeTrInfo.tab -- touch star/exonInfo.tab -- touch star/geneInfo.tab -- touch star/genomeParameters.txt -- touch star/sjdbInfo.txt -- touch star/sjdbList.fromGTF.out.tab -- touch star/sjdbList.out.tab -- touch star/transcriptInfo.tab -+ if (gtf) { -+ """ -+ mkdir star -+ touch star/Genome -+ touch star/Log.out -+ touch star/SA -+ touch star/SAindex -+ touch star/chrLength.txt -+ touch star/chrName.txt -+ touch star/chrNameLength.txt -+ touch star/chrStart.txt -+ touch star/exonGeTrInfo.tab -+ touch star/exonInfo.tab -+ touch star/geneInfo.tab -+ touch star/genomeParameters.txt -+ touch star/sjdbInfo.txt -+ touch star/sjdbList.fromGTF.out.tab -+ touch star/sjdbList.out.tab -+ touch star/transcriptInfo.tab - -- cat <<-END_VERSIONS > versions.yml -- "${task.process}": -- star: \$(STAR --version | sed -e "s/STAR_//g") -- samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') -- gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') -- END_VERSIONS -- """ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ star: \$(STAR --version | sed -e "s/STAR_//g") -+ samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; 
s/Using.*\$//') -+ gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') -+ END_VERSIONS -+ """ -+ } else { -+ """ -+ mkdir star -+ touch star/Genome -+ touch star/Log.out -+ touch star/SA -+ touch star/SAindex -+ touch star/chrLength.txt -+ touch star/chrName.txt -+ touch star/chrNameLength.txt -+ touch star/chrStart.txt -+ touch star/genomeParameters.txt -+ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ star: \$(STAR --version | sed -e "s/STAR_//g") -+ samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') -+ gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') -+ END_VERSIONS -+ """ -+ } - } - ---- modules/nf-core/star/genomegenerate/tests/main.nf.test.snap -+++ modules/nf-core/star/genomegenerate/tests/main.nf.test.snap -@@ -5,12 +5,18 @@ - "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" - ] - ], -- "timestamp": "2023-12-04T18:01:27.298248806" -+ "timestamp": "2023-12-19T11:05:51.741109" - }, -- "index": { -+ "index_with_gtf": { - "content": [ -- "star" -+ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" - ], -- "timestamp": "2023-11-23T11:31:47.560528" -+ "timestamp": "2023-12-19T11:38:14.551548" -+ }, -+ "index_without_gtf": { -+ "content": [ -+ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" -+ ], -+ "timestamp": "2023-12-19T11:38:22.382905" - } - } ---- modules/nf-core/star/genomegenerate/tests/main.nf.test -+++ modules/nf-core/star/genomegenerate/tests/main.nf.test -@@ -28,7 +28,86 @@ - then { - assertAll( - { assert process.success }, -- { assert snapshot(file(process.out.index[0][1]).name).match("index") }, -+ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() 
}.sort().toString()).match("index_with_gtf") }, -+ { assert snapshot(process.out.versions).match("versions") } -+ ) -+ } -+ -+ } -+ -+ test("homo_sapiens-stub") { -+ -+ options '-stub' -+ -+ when { -+ process { -+ """ -+ input[0] = Channel.of([ -+ [ id:'test_fasta' ], -+ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] -+ ]) -+ input[1] = Channel.of([ -+ [ id:'test_gtf' ], -+ [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] -+ ]) -+ """ -+ } -+ } -+ -+ then { -+ assertAll( -+ { assert process.success }, -+ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, -+ { assert snapshot(process.out.versions).match("versions") } -+ ) -+ } -+ -+ } -+ -+ test("homo_sapiens-without_gtf") { -+ -+ when { -+ process { -+ """ -+ input[0] = Channel.of([ -+ [ id:'test_fasta' ], -+ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] -+ ]) -+ input[1] = Channel.of([ [], [] ]) -+ """ -+ } -+ } -+ -+ then { -+ assertAll( -+ { assert process.success }, -+ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, -+ { assert snapshot(process.out.versions).match("versions") } -+ ) -+ } -+ -+ } -+ -+ test("homo_sapiens-without_gtf-stub") { -+ -+ options '-stub' -+ -+ when { -+ process { -+ """ -+ input[0] = Channel.of([ -+ [ id:'test_fasta' ], -+ [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] -+ ]) -+ input[1] = Channel.of([ [], [] ]) -+ """ -+ } -+ } -+ -+ then { -+ assertAll( -+ { assert process.success }, -+ { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - -************************************************************ From 
c526933df6406fae336c799fd2e1c2c725916dcb Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 10:23:21 +1300 Subject: [PATCH 32/59] FASTP now has stub --- modules.json | 327 ++++++++++-------- modules/nf-core/fastp/fastp.diff | 28 ++ modules/nf-core/fastp/main.nf | 18 + .../fastq_fastqc_umitools_fastp/main.nf | 3 + 4 files changed, 237 insertions(+), 139 deletions(-) create mode 100644 modules/nf-core/fastp/fastp.diff diff --git a/modules.json b/modules.json index dfc1e93..dc6674b 100644 --- a/modules.json +++ b/modules.json @@ -1,142 +1,191 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - "git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "custom/restoregffids": { - "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "custom/shortenfastaids": { - "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "edta/edta": { - "branch": "main", - "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "lai": { - "branch": "main", - "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", - "installed_by": ["fasta_edta_lai"] - }, - "liftoff": { - "branch": "main", - "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", - "installed_by": ["modules"] - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": [ + "fasta_edta_lai", + "modules" + ] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": [ + 
"fasta_edta_lai", + "modules" + ] + }, + "edta/edta": { + "branch": "main", + "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "installed_by": [ + "fasta_edta_lai", + "modules" + ] + }, + "lai": { + "branch": "main", + "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "installed_by": [ + "fasta_edta_lai" + ] + }, + "liftoff": { + "branch": "main", + "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "installed_by": [ + "modules" + ] + } + } + }, + "subworkflows": { + "pfr": { + "fasta_edta_lai": { + "branch": "main", + "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", + "installed_by": [ + "subworkflows" + ] + } + } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": [ + "modules" + ] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": [ + "modules" + ] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": [ + "modules" + ] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": [ + "modules" + ] + }, + "fastavalidator": { + "branch": "master", + "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": [ + "modules" + ] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": [ + "fastq_fastqc_umitools_fastp" + ], + "patch": "modules/nf-core/fastp/fastp.diff" + }, + "fastqc": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": [ + 
"fastq_fastqc_umitools_fastp", + "modules" + ] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": [ + "modules" + ] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": [ + "modules" + ] + }, + "sortmerna": { + "branch": "master", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", + "installed_by": [ + "modules" + ] + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": [ + "modules" + ] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", + "installed_by": [ + "modules" + ] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": [ + "fastq_fastqc_umitools_fastp" + ] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": [ + "subworkflows" + ] + } + } + } } - }, - "subworkflows": { - "pfr": { - "fasta_edta_lai": { - "branch": "main", - "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", - "installed_by": ["subworkflows"] - } - } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": ["modules"] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": ["modules"] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": ["modules"] - 
}, - "cat/fastq": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": ["modules"] - }, - "fastavalidator": { - "branch": "master", - "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", - "installed_by": ["modules"] - }, - "fastp": { - "branch": "master", - "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": ["fastq_fastqc_umitools_fastp"] - }, - "fastqc": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] - }, - "gffread": { - "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", - "installed_by": ["modules"] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "samtools/cat": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "sortmerna": { - "branch": "master", - "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": ["modules"] - }, - "star/align": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", - "installed_by": ["modules"] - }, - "umitools/extract": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["fastq_fastqc_umitools_fastp"] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", - "installed_by": ["subworkflows"] - } - } - } } - } -} +} \ No newline at end of file diff --git a/modules/nf-core/fastp/fastp.diff 
b/modules/nf-core/fastp/fastp.diff new file mode 100644 index 0000000..4213043 --- /dev/null +++ b/modules/nf-core/fastp/fastp.diff @@ -0,0 +1,28 @@ +Changes in module 'nf-core/fastp' +--- modules/nf-core/fastp/main.nf ++++ modules/nf-core/fastp/main.nf +@@ -99,4 +99,22 @@ + END_VERSIONS + """ + } ++ ++ stub: ++ def prefix = task.ext.prefix ?: "${meta.id}" ++ def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end ++ def touch_reads = is_single_output ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" ++ def touch_merged = (!is_single_output && save_merged) ? "touch ${prefix}.merged.fastq.gz" : "" ++ """ ++ touch $touch_reads ++ touch "${prefix}.fastp.json" ++ touch "${prefix}.fastp.html" ++ touch "${prefix}.fastp.log" ++ $touch_merged ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") ++ END_VERSIONS ++ """ + } + +************************************************************ diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 5fac3c1..1f56640 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -99,4 +99,22 @@ process FASTP { END_VERSIONS """ } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end + def touch_reads = is_single_output ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" + def touch_merged = (!is_single_output && save_merged) ? 
"touch ${prefix}.merged.fastq.gz" : "" + """ + touch $touch_reads + touch "${prefix}.fastp.json" + touch "${prefix}.fastp.html" + touch "${prefix}.fastp.log" + $touch_merged + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ } diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf index 3dbb27e..711210f 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -13,6 +13,9 @@ include { FASTP } from '../../../modules/nf-core/fastp/main' import groovy.json.JsonSlurper def getFastpReadsAfterFiltering(json_file) { + + if (!json_file.text) { return 0 } // Usman Rashid: To allow -stub with FASTP + def Map json = (Map) new JsonSlurper().parseText(json_file.text).get('summary') return json['after_filtering']['total_reads'].toLong() } From ac45bbb4b07afa2f1997b80aaa25a6d85a8c395e Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 12:38:15 +1300 Subject: [PATCH 33/59] SORTMERNA now has stub --- modules.json | 3 +- modules/nf-core/sortmerna/main.nf | 26 ++++++++++++++ modules/nf-core/sortmerna/sortmerna.diff | 36 +++++++++++++++++++ nextflow.config | 8 ++--- subworkflows/local/preprocess_rnaseq.nf | 8 +++++ .../fastq_fastqc_umitools_fastp/main.nf | 6 ++-- 6 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/sortmerna/sortmerna.diff diff --git a/modules.json b/modules.json index dc6674b..7cf4c73 100644 --- a/modules.json +++ b/modules.json @@ -150,7 +150,8 @@ "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", "installed_by": [ "modules" - ] + ], + "patch": "modules/nf-core/sortmerna/sortmerna.diff" }, "star/align": { "branch": "master", diff --git a/modules/nf-core/sortmerna/main.nf b/modules/nf-core/sortmerna/main.nf index 53ccb97..909a7b1 100644 --- a/modules/nf-core/sortmerna/main.nf +++ 
b/modules/nf-core/sortmerna/main.nf @@ -67,4 +67,30 @@ process SORTMERNA { END_VERSIONS """ } + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + touch ${prefix}.non_rRNA.fastq.gz + touch ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } else { + """ + touch ${prefix}_1.non_rRNA.fastq.gz + touch ${prefix}_2.non_rRNA.fastq.gz + touch ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ + } } diff --git a/modules/nf-core/sortmerna/sortmerna.diff b/modules/nf-core/sortmerna/sortmerna.diff new file mode 100644 index 0000000..66d58d5 --- /dev/null +++ b/modules/nf-core/sortmerna/sortmerna.diff @@ -0,0 +1,36 @@ +Changes in module 'nf-core/sortmerna' +--- modules/nf-core/sortmerna/main.nf ++++ modules/nf-core/sortmerna/main.nf +@@ -67,4 +67,30 @@ + END_VERSIONS + """ + } ++ ++ stub: ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" ++ if (meta.single_end) { ++ """ ++ touch ${prefix}.non_rRNA.fastq.gz ++ touch ${prefix}.sortmerna.log ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') ++ END_VERSIONS ++ """ ++ } else { ++ """ ++ touch ${prefix}_1.non_rRNA.fastq.gz ++ touch ${prefix}_2.non_rRNA.fastq.gz ++ touch ${prefix}.sortmerna.log ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') ++ END_VERSIONS ++ """ ++ } + } + +************************************************************ diff --git a/nextflow.config 
b/nextflow.config index d861331..587694a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,16 +33,16 @@ params { min_trimmed_reads = 10000 extra_fastp_args = "" - save_trimmed = false + save_trimmed = true // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - remove_ribo_rna = false - save_non_ribo_reads = false + remove_ribo_rna = true + save_non_ribo_reads = true ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" star_max_intron_length = 16000 star_align_extra_args = "" - star_save_outputs = false + star_save_outputs = true external_protein_fastas = [ "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", diff --git a/subworkflows/local/preprocess_rnaseq.nf b/subworkflows/local/preprocess_rnaseq.nf index 7a82786..9184808 100644 --- a/subworkflows/local/preprocess_rnaseq.nf +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -70,6 +70,14 @@ workflow PREPROCESS_RNASEQ { .reads | set { ch_trim_reads } + ch_cat_fastq + | join(ch_trim_reads, remainder:true) + | map { meta, reads, trimmed -> + if (!trimmed) { + System.err.println("WARNING: Dropping ${reads.collect { it.getName() }} as read count after trimming is less than $min_trimmed_reads") + } + } + // MODULE: SORTMERNA if (remove_ribo_rna) { SORTMERNA ( diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf index 711210f..2c67b3c 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -12,9 +12,9 @@ include { FASTP } from '../../../modules/nf-core/fastp/main' // import groovy.json.JsonSlurper -def getFastpReadsAfterFiltering(json_file) { +def getFastpReadsAfterFiltering(json_file, min_trimmed_reads) { - if (!json_file.text) { return 0 } // Usman Rashid: To allow -stub with FASTP + if (!json_file.text) { return min_trimmed_reads } // Usman Rashid: To allow -stub with FASTP def Map json = (Map) new 
JsonSlurper().parseText(json_file.text).get('summary') return json['after_filtering']['total_reads'].toLong() @@ -99,7 +99,7 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { .out .reads .join(trim_json) - .map { meta, reads, json -> [ meta, reads, getFastpReadsAfterFiltering(json) ] } + .map { meta, reads, json -> [ meta, reads, getFastpReadsAfterFiltering(json, min_trimmed_reads) ] } .set { ch_num_trimmed_reads } ch_num_trimmed_reads From ed6aa33f6b775bb436957b8d9a3d40d41c85a30b Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 13:09:44 +1300 Subject: [PATCH 34/59] Cleaned up prepare_assembly --- modules.json | 330 +++++++++++-------------- subworkflows/local/prepare_assembly.nf | 134 +++++----- 2 files changed, 204 insertions(+), 260 deletions(-) diff --git a/modules.json b/modules.json index 7cf4c73..299e449 100644 --- a/modules.json +++ b/modules.json @@ -1,192 +1,144 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - "git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "custom/restoregffids": { - "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "custom/shortenfastaids": { - "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "edta/edta": { - "branch": "main", - "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", - "installed_by": [ - "fasta_edta_lai", - "modules" - ] - }, - "lai": { - "branch": "main", - "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", - "installed_by": [ - "fasta_edta_lai" - ] - }, - "liftoff": { - "branch": "main", - "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", - "installed_by": [ - "modules" - ] - } - } - }, - "subworkflows": { - "pfr": { - "fasta_edta_lai": { - "branch": "main", - "git_sha": 
"fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", - "installed_by": [ - "subworkflows" - ] - } - } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": [ - "modules" - ] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": [ - "modules" - ] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": [ - "modules" - ] - }, - "cat/fastq": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", - "installed_by": [ - "modules" - ] - }, - "fastavalidator": { - "branch": "master", - "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", - "installed_by": [ - "modules" - ] - }, - "fastp": { - "branch": "master", - "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ], - "patch": "modules/nf-core/fastp/fastp.diff" - }, - "fastqc": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": [ - "fastq_fastqc_umitools_fastp", - "modules" - ] - }, - "gffread": { - "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", - "installed_by": [ - "modules" - ] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] - }, - "samtools/cat": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": [ - "modules" - ] - }, - "sortmerna": { - "branch": "master", - "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": 
[ - "modules" - ], - "patch": "modules/nf-core/sortmerna/sortmerna.diff" - }, - "star/align": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": [ - "modules" - ] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", - "installed_by": [ - "modules" - ] - }, - "umitools/extract": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": [ - "fastq_fastqc_umitools_fastp" - ] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", - "installed_by": [ - "subworkflows" - ] - } - } - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "edta/edta": { + "branch": "main", + "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "lai": { + "branch": "main", + "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "installed_by": ["fasta_edta_lai"] + }, + "liftoff": { + "branch": "main", + "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "installed_by": ["modules"] + } } + }, + "subworkflows": { + "pfr": { + "fasta_edta_lai": { + "branch": "main", + "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", + "installed_by": ["subworkflows"] + } + } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": 
"b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": ["modules"] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": ["modules"] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "installed_by": ["modules"] + }, + "fastavalidator": { + "branch": "master", + "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["fastq_fastqc_umitools_fastp"], + "patch": "modules/nf-core/fastp/fastp.diff" + }, + "fastqc": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "sortmerna": { + "branch": "master", + "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", + "installed_by": ["modules"], + "patch": "modules/nf-core/sortmerna/sortmerna.diff" + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": 
"d87a6e2156c2099c09280fa70776eaf0a824817a", + "installed_by": ["modules"] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp"] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": ["subworkflows"] + } + } + } } -} \ No newline at end of file + } +} diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index db2e3c3..ed32afb 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -8,34 +8,35 @@ include { FASTA_EDTA_LAI } from '../../subworkflows/pfr/f workflow PREPARE_ASSEMBLY { take: - target_assembly // channel: [ meta, fasta ] - te_library // channel: [ meta, fasta ] + target_assembly // channel: [ meta, fasta ] + te_library // channel: [ meta, fasta ] main: + ch_versions = Channel.empty() + // MODULE: GUNZIP_TARGET_ASSEMBLY - target_assembly - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { tech_target_assembly_branch } + target_assembly_branch = target_assembly + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + + GUNZIP_TARGET_ASSEMBLY ( target_assembly_branch.gz ) + + ch_gunzip_assembly = GUNZIP_TARGET_ASSEMBLY.out.gunzip + | mix( + target_assembly_branch.rest + ) + ch_versions = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) - GUNZIP_TARGET_ASSEMBLY( - tech_target_assembly_branch.gz - ) - .gunzip - | mix( - tech_target_assembly_branch.rest - ) - | set { ch_gunzip_target_assembly } // MODULE: FASTAVALIDATOR - FASTAVALIDATOR(ch_gunzip_target_assembly) + FASTAVALIDATOR ( ch_gunzip_assembly ) - ch_gunzip_target_assembly - | join(FASTAVALIDATOR.out.success_log) - | map { meta, fasta, log -> [ meta, fasta ] } - | set { 
ch_validated_target_assembly } + ch_validated_assembly = ch_gunzip_assembly + | join(FASTAVALIDATOR.out.success_log) + | map { meta, fasta, log -> [ meta, fasta ] } + ch_versions = ch_versions.mix(FASTAVALIDATOR.out.versions.first()) FASTAVALIDATOR.out.error_log | map { meta, log -> @@ -43,72 +44,63 @@ workflow PREPARE_ASSEMBLY { } // MODULE: GUNZIP_TE_LIBRARY - te_library - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_te_library_branch } + ch_te_library_branch = te_library + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } - GUNZIP_TE_LIBRARY( - ch_te_library_branch.gz - ) - .gunzip - | mix( - ch_te_library_branch.rest - ) - | set { ch_gunzip_te_library } + GUNZIP_TE_LIBRARY ( ch_te_library_branch.gz ) + + ch_gunzip_te_library = GUNZIP_TE_LIBRARY.out.gunzip + | mix( + ch_te_library_branch.rest + ) + ch_versions = ch_versions.mix(GUNZIP_TE_LIBRARY.out.versions.first()) // SUBWORKFLOW: FASTA_EDTA_LAI - ch_validated_target_assembly - | join( - ch_gunzip_te_library, remainder: true - ) - | filter { meta, assembly, teLib -> - teLib == null - } - | map { meta, assembly, teLib -> [meta, assembly] } - | set { ch_edta_inputs } + ch_edta_inputs = ch_validated_assembly + | join( + ch_gunzip_te_library, remainder: true + ) + | filter { meta, assembly, teLib -> + teLib == null + } + | map { meta, assembly, teLib -> [meta, assembly] } - FASTA_EDTA_LAI ( + FASTA_EDTA_LAI( ch_edta_inputs, [], true // Skip LAI ) - // MODULE: REPEATMASKER - ch_validated_target_assembly - | join( - FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) - ) - | set { ch_assembly_n_te_lib } + ch_assembly_and_te_lib = ch_validated_assembly + | join( + FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) + ) + ch_versions = ch_versions.mix(FASTA_EDTA_LAI.out.versions.first()) + + // MODULE: REPEATMASKER REPEATMASKER( - ch_assembly_n_te_lib.map { meta, assembly, teLib -> [meta, assembly] }, - 
ch_assembly_n_te_lib.map { meta, assembly, teLib -> teLib }, + ch_assembly_and_te_lib.map { meta, assembly, teLib -> [meta, assembly] }, + ch_assembly_and_te_lib.map { meta, assembly, teLib -> teLib }, ) + ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) + // MODULE: STAR_GENOMEGENERATE - def star_ignore_sjdbgtf = true STAR_GENOMEGENERATE( - ch_validated_target_assembly, - ch_validated_target_assembly.map { meta, fasta -> [ [], [] ] } + ch_validated_assembly, + ch_validated_assembly.map { meta, fasta -> [ [], [] ] } ) - .index - | set { ch_assembly_index } - - Channel.empty() - | mix(FASTAVALIDATOR.out.versions.first()) - | mix(GUNZIP_TE_LIBRARY.out.versions.first()) - | mix(FASTA_EDTA_LAI.out.versions) - | mix(REPEATMASKER.out.versions.first()) - | mix(STAR_GENOMEGENERATE.out.versions.first()) - | mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first()) - | set { ch_versions } + + ch_assembly_index = STAR_GENOMEGENERATE.out.index + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions.first()) emit: - target_assemby = ch_validated_target_assembly // channel: [ meta, fasta ] - masked_target_assembly = REPEATMASKER.out.fasta_masked // channel: [ meta, fasta ] - target_assemby_index = ch_assembly_index // channel: [ meta, star_index ] - versions = ch_versions // channel: [ versions.yml ] + target_assemby = ch_validated_assembly // channel: [ meta, fasta ] + masked_target_assembly = REPEATMASKER.out.fasta_masked // channel: [ meta, fasta ] + target_assemby_index = ch_assembly_index // channel: [ meta, star_index ] + versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file From 144edb244a50ac05acb770da6e49882c21a6cf47 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 13:20:32 +1300 Subject: [PATCH 35/59] Cleaned up preprocess_rnaseq --- subworkflows/local/preprocess_rnaseq.nf | 100 ++++++++++++------------ 1 file changed, 49 insertions(+), 51 deletions(-) diff --git a/subworkflows/local/preprocess_rnaseq.nf 
b/subworkflows/local/preprocess_rnaseq.nf index 9184808..ba444bb 100644 --- a/subworkflows/local/preprocess_rnaseq.nf +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -6,7 +6,7 @@ include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../../subworkflows/nf-core/fastq workflow PREPROCESS_RNASEQ { take: samplesheet // path: csv - permissible_target_assemblies // val: assembly_a,assembly_b + permissible_assemblies // val: assembly_a,assembly_b skip_fastqc // val: true|false skip_fastp // val: true|false save_trimmed // val: true|false @@ -16,45 +16,46 @@ workflow PREPROCESS_RNASEQ { main: ch_versions = Channel.empty() + // SUBWORKFLOW: EXTRACT_SAMPLES EXTRACT_SAMPLES( samplesheet, - permissible_target_assemblies + permissible_assemblies ) - .reads - | map { meta, fastq -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], fastq ] - } - | groupTuple() - | branch { meta, fastq -> - single : fastq.size() == 1 - return [ meta, fastq.flatten() ] - multiple: fastq.size() > 1 - return [ meta, fastq.flatten() ] - } - | set { ch_fastq } - EXTRACT_SAMPLES.out.assemblies - | map { meta, assembly -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], assembly ] - } - | unique - | set { ch_reads_target } + ch_fastq = EXTRACT_SAMPLES.out.reads + | map { meta, fastq -> + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], fastq ] + } + | groupTuple() + | branch { meta, fastq -> + single : fastq.size() == 1 + return [ meta, fastq.flatten() ] + multiple: fastq.size() > 1 + return [ meta, fastq.flatten() ] + } + + ch_reads_target = EXTRACT_SAMPLES.out.assemblies + | map { meta, assembly -> + groupID = meta.id - ~/_T\d+/ + [ meta + [id: groupID], assembly ] + } + | unique + + ch_versions = ch_versions.mix(EXTRACT_SAMPLES.out.versions) // MODULES: CAT_FASTQ - CAT_FASTQ ( - ch_fastq.multiple - ) - .reads - | mix(ch_fastq.single) - | set { ch_cat_fastq } + CAT_FASTQ ( ch_fastq.multiple ) + + ch_cat_fastq = CAT_FASTQ.out.reads.mix(ch_fastq.single) + ch_versions = 
ch_versions.mix(CAT_FASTQ.out.versions.first()) // SUBWORKFLOW: FASTQ_FASTQC_UMITOOLS_FASTP - def with_umi = false - def skip_umi_extract = true - def umi_discard_read = false + def with_umi = false + def skip_umi_extract = true + def umi_discard_read = false + FASTQ_FASTQC_UMITOOLS_FASTP ( ch_cat_fastq, skip_fastqc, @@ -67,8 +68,8 @@ workflow PREPROCESS_RNASEQ { save_trimmed, min_trimmed_reads ) - .reads - | set { ch_trim_reads } + + ch_trim_reads = FASTQ_FASTQC_UMITOOLS_FASTP.out.reads ch_cat_fastq | join(ch_trim_reads, remainder:true) @@ -78,26 +79,23 @@ workflow PREPROCESS_RNASEQ { } } - // MODULE: SORTMERNA - if (remove_ribo_rna) { - SORTMERNA ( - ch_trim_reads, - sortmerna_fastas - ) - .reads - | set { ch_sortmerna_reads } + ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions.first()) - ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) - } + // MODULE: SORTMERNA + SORTMERNA( + remove_ribo_rna ? ch_trim_reads : Channel.empty(), + sortmerna_fastas + ) + + ch_emitted_reads = remove_ribo_rna + ? SORTMERNA.out.reads + : ch_trim_reads + ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) - ch_versions - | mix(EXTRACT_SAMPLES.out.versions) - | mix(CAT_FASTQ.out.versions.first()) - | mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions) - | set { ch_versions } + emit: - trim_reads = remove_ribo_rna ? 
ch_sortmerna_reads : ch_trim_reads // channel: [ meta, [ fq ] ] - reads_target = ch_reads_target // channel: [ meta, assembly_id ] - versions = ch_versions // channel: [ versions.yml ] + trim_reads = ch_emitted_reads // channel: [ meta, [ fq ] ] + reads_target = ch_reads_target // channel: [ meta, assembly_id ] + versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file From a79c18729aebe6c72a7c34cf9c4a2fe0a7e5be4d Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 13:50:49 +1300 Subject: [PATCH 36/59] Reformatted and inc ALIGN_RNASEQ --- subworkflows/local/align_rnaseq.nf | 105 +++++++++++++++-------------- workflows/pangene.nf | 18 ++--- 2 files changed, 64 insertions(+), 59 deletions(-) diff --git a/subworkflows/local/align_rnaseq.nf b/subworkflows/local/align_rnaseq.nf index c0a9039..544fea9 100644 --- a/subworkflows/local/align_rnaseq.nf +++ b/subworkflows/local/align_rnaseq.nf @@ -3,69 +3,74 @@ include { SAMTOOLS_CAT } from '../../modules/nf-core/samtools/cat' workflow ALIGN_RNASEQ { take: - reads_target // channel: [ meta, assembly_id ] - trim_reads // channel: [ meta, [ fq ] ] - assembly_index // channel: [ meta2, star_index ] + reads_target // channel: [ meta, assembly_id ] + trim_reads // channel: [ meta, [ fq ] ] + assembly_index // channel: [ meta2, star_index ] main: + ch_versions = Channel.empty() + // MODULE: STAR_ALIGN - reads_target - | combine(trim_reads, by:0) - | map { meta, assembly, fastq -> - [assembly, [id:"${meta.id}.on.${assembly}", single_end:meta.single_end, target_assembly:assembly], fastq] - } - | combine( - assembly_index.map { meta, index -> [meta.id, index] }, - by:0 - ) - | map { assembly, meta, fastq, index -> [meta, fastq, index] } - | set { ch_star_inputs } + ch_star_inputs = reads_target + | combine(trim_reads, by:0) + | map { meta, assembly, fastq -> + [ + assembly, + [ + id: "${meta.id}.on.${assembly}", + single_end: meta.single_end, + target_assembly: assembly + ], + fastq + ] + } + | 
combine( + assembly_index.map { meta, index -> [ meta.id, index ] }, + by:0 + ) + | map { assembly, meta, fastq, index -> [ meta, fastq, index ] } - def star_ignore_sjdbgtf = true - def seq_platform = false - def seq_center = false + def star_ignore_sjdbgtf = true + def seq_platform = false + def seq_center = false + STAR_ALIGN( - ch_star_inputs.map { meta, fastq, index -> [meta, fastq] }, - ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], index] }, - ch_star_inputs.map { meta, fastq, index -> [[id: meta.target_assembly], []] }, + ch_star_inputs.map { meta, fastq, index -> [ meta, fastq ] }, + ch_star_inputs.map { meta, fastq, index -> [ [ id: meta.target_assembly ], index ] }, + ch_star_inputs.map { meta, fastq, index -> [ [ id: meta.target_assembly ], [] ] }, star_ignore_sjdbgtf, seq_platform, seq_center ) - .bam_sorted - | set { ch_star_bam } + + ch_star_bam = STAR_ALIGN.out.bam_sorted + ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) // MODULE: SAMTOOLS_CAT - ch_star_bam - | map { meta, bam -> - [ - [id: meta.target_assembly], - bam instanceof List ? bam.find {it =~ /Aligned/} : bam - ] - } - | groupTuple - | branch { meta, bamList -> - bams: bamList.size() > 1 - bam: bamList.size() <= 1 - } - | set { ch_star_bam_branch } + ch_star_bam_branch = ch_star_bam + | map { meta, bam -> + [ + [ id: meta.target_assembly ], + bam instanceof List ? 
bam.find { it =~ /Aligned/ } : bam + ] + } + | groupTuple + | branch { meta, bamList -> + bams: bamList.size() > 1 + bam: bamList.size() <= 1 + } - SAMTOOLS_CAT( - ch_star_bam_branch.bams - ) - .bam - | map { meta, bam -> [meta, [bam]] } - | mix( - ch_star_bam_branch.bam - ) - | set { ch_samtools_bam } + SAMTOOLS_CAT ( ch_star_bam_branch.bams ) - Channel.empty() - | mix(STAR_ALIGN.out.versions.first()) - | mix(SAMTOOLS_CAT.out.versions.first()) - | set { ch_versions } + ch_samtools_bam = SAMTOOLS_CAT.out.bam + | map { meta, bam -> [meta, [bam]] } + | mix( + ch_star_bam_branch.bam + ) + + ch_versions = ch_versions.mix(SAMTOOLS_CAT.out.versions.first()) emit: - bam = ch_samtools_bam // channel: [ [ id, single_end, target_assembly ], [ bam ] ] - versions = ch_versions // channel: [ versions.yml ] + bam = ch_samtools_bam // channel: [ [ id, single_end, target_assembly ], [ bam ] ] + versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 82fdbe4..53d8162 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -2,7 +2,7 @@ include { validateParams } from '../modules/local/validate_params include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' -// include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' +include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' // include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' // include { BRAKER3 } from '../modules/kherronism/braker3' @@ -94,15 +94,15 @@ workflow PANGENE { ch_reads_target = PREPROCESS_RNASEQ.out.reads_target ch_versions = ch_versions.mix(PREPROCESS_RNASEQ.out.versions) - // // SUBWORKFLOW: ALIGN_RNASEQ - // ALIGN_RNASEQ( - // ch_reads_target, - // ch_trim_reads, - // ch_target_assemby_index - // ) + // SUBWORKFLOW: ALIGN_RNASEQ + ALIGN_RNASEQ( + ch_reads_target, + ch_trim_reads, + 
ch_target_assemby_index + ) - // ch_rnaseq_bam = ALIGN_RNASEQ.out.bam - // ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) + ch_rnaseq_bam = ALIGN_RNASEQ.out.bam + ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) // // MODULE: PREPARE_EXT_PROTS // PREPARE_EXT_PROTS( From 6f61f5d3a688017c396275690f27163ee1560294 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 14:13:50 +1300 Subject: [PATCH 37/59] Cleaned up and inc PREPARE_EXT_PROTS --- TODO.md | 2 ++ nextflow.config | 4 +-- subworkflows/local/prepare_ext_prots.nf | 46 +++++++++++-------------- workflows/pangene.nf | 32 ++++++++--------- 4 files changed, 40 insertions(+), 44 deletions(-) diff --git a/TODO.md b/TODO.md index 94f51c1..2366b0a 100644 --- a/TODO.md +++ b/TODO.md @@ -13,3 +13,5 @@ > https://www.biorxiv.org/content/10.1101/096529v2.full.pdf > Don't use `-exclude_partial` + +- [ ] Sort out EDTA testing diff --git a/nextflow.config b/nextflow.config index 587694a..ef139b4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,8 +45,8 @@ params { star_save_outputs = true external_protein_fastas = [ - "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.pep.fasta" + ".test/ext_prots/Viridiplantae.fa.gz", + ".test/ext_prots/RU01.20221115150135.pep.fasta" ] // Optional: Set to null if not available diff --git a/subworkflows/local/prepare_ext_prots.nf b/subworkflows/local/prepare_ext_prots.nf index d14c60b..fff42ae 100644 --- a/subworkflows/local/prepare_ext_prots.nf +++ b/subworkflows/local/prepare_ext_prots.nf @@ -6,36 +6,30 @@ workflow PREPARE_EXT_PROTS { ch_ext_prot_fastas // Channel: [ meta, fasta ] main: - ch_ext_prot_fastas - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { ch_ext_prot_seqs_branch } + ch_versions = Channel.empty() // MODULE: GUNZIP - GUNZIP( - 
ch_ext_prot_seqs_branch.gz - ) - .gunzip - | mix( - ch_ext_prot_seqs_branch.rest - ) - | set { ch_ext_prot_gunzip_fastas } + ch_ext_prot_seqs_branch = ch_ext_prot_fastas + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + + GUNZIP ( ch_ext_prot_seqs_branch.gz ) + + ch_ext_prot_gunzip_fastas = GUNZIP.out.gunzip.mix(ch_ext_prot_seqs_branch.rest) + | map { meta, filePath -> filePath } + | collect + | map { fileList -> [ [ id: "ext_protein_seqs" ], fileList ] } + + ch_versions = ch_versions.mix(GUNZIP.out.versions.first()) - // MODULE: CAT_PROTEIN_FASTAS - ch_ext_prot_gunzip_fastas - | map { meta, filePath -> filePath } - | collect - | map { fileList -> [[id:"ext_protein_seqs"], fileList] } - | CAT_PROTEIN_FASTAS + // MODULE: CAT_CAT as CAT_PROTEIN_FASTAS + CAT_PROTEIN_FASTAS ( ch_ext_prot_gunzip_fastas ) - Channel.empty() - | mix(GUNZIP.out.versions.first()) - | mix(CAT_PROTEIN_FASTAS.out.versions) - | set { ch_versions } + ch_versions = ch_versions.mix(CAT_PROTEIN_FASTAS.out.versions) emit: - ext_prots_fasta = CAT_PROTEIN_FASTAS.out.file_out // Channel: [ meta, fasta ] - versions = ch_versions // Channel: [ versions.yml ] + ext_prots_fasta = CAT_PROTEIN_FASTAS.out.file_out // Channel: [ meta, fasta ] + versions = ch_versions // Channel: [ versions.yml ] } \ No newline at end of file diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 53d8162..70ab2cc 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -3,7 +3,7 @@ include { validateParams } from '../modules/local/validate_params include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' -// include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' +include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' // include { BRAKER3 } from 
'../modules/kherronism/braker3' @@ -19,12 +19,12 @@ workflow PANGENE { ch_target_assembly = Channel.fromList(params.target_assemblies) | map { tag, filePath -> - [[id:tag], file(filePath, checkIfExists: true)] + [ [ id: tag ], file(filePath, checkIfExists: true) ] } ch_te_library = Channel.fromList(params.te_libraries) | map { tag, filePath -> - [[id:tag], file(filePath, checkIfExists: true)] + [ [ id:tag ], file(filePath, checkIfExists: true) ] } ch_samplesheet = params.samplesheet @@ -46,13 +46,13 @@ workflow PANGENE { | collect : Channel.empty() - // ch_ext_prot_fastas = params.external_protein_fastas - // ? Channel.fromList(params.external_protein_fastas) - // | map { filePath -> - // def fileHandle = file(filePath, checkIfExists: true) - // [[id:fileHandle.getSimpleName()], fileHandle] - // } - // : Channel.empty() + ch_ext_prot_fastas = params.external_protein_fastas + ? Channel.fromList(params.external_protein_fastas) + | map { filePath -> + def fileHandle = file(filePath, checkIfExists: true) + [ [id: fileHandle.getSimpleName() ], fileHandle] + } + : Channel.empty() // ch_xref_annotations_mm = params.liftoff_xref_annotations // ? 
Channel.fromList(params.liftoff_xref_annotations) @@ -104,13 +104,13 @@ workflow PANGENE { ch_rnaseq_bam = ALIGN_RNASEQ.out.bam ch_versions = ch_versions.mix(ALIGN_RNASEQ.out.versions) - // // MODULE: PREPARE_EXT_PROTS - // PREPARE_EXT_PROTS( - // ch_ext_prot_fastas - // ) + // MODULE: PREPARE_EXT_PROTS + PREPARE_EXT_PROTS( + ch_ext_prot_fastas + ) - // ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta - // ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) + ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta + ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) // // MODULE: BRAKER3 // ch_braker_inputs = ch_masked_target_assembly From 1184795c7a33d71591fa99aa56d23f6c7bdb68a3 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 14:30:44 +1300 Subject: [PATCH 38/59] Cleanedup BRAKER3 --- modules/kherronism/braker3/main.nf | 27 +++++++++--------- workflows/pangene.nf | 44 +++++++++++++++--------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index d44c986..14fc08c 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -2,10 +2,10 @@ process BRAKER3 { tag "${meta.id}" label 'process_high' - conda "bioconda::braker3=3.0.3" + conda "bioconda::braker3=3.0.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'registry.hub.docker.com/teambraker/braker3:v.1.0.6': - 'registry.hub.docker.com/teambraker/braker3:v.1.0.6' }" + 'https://depot.galaxyproject.org/singularity/braker3%3A3.0.6--hdfd78af_0': + 'biocontainers/braker3:3.0.6--hdfd78af_0' }" input: tuple val(meta), path(fasta) @@ -29,14 +29,14 @@ process BRAKER3 { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" - def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' - def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' - def bam = bam ? "--bam=${bam}" : '' - def proteins = proteins ? "--prot_seq=${proteins}" : '' - def hints = hintsfile ? "--hints=${hintsfile}" : '' + def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' + def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' + def bam = bam ? "--bam=${bam}" : '' + def proteins = proteins ? "--prot_seq=${proteins}" : '' + def hints = hintsfile ? "--hints=${hintsfile}" : '' """ cp -r /usr/share/augustus/config augustus_config @@ -60,15 +60,16 @@ process BRAKER3 { """ stub: - prefix = task.ext.prefix ?: "${meta.id}" - def createHints = (rna_ids || bam || proteins || hints) ? "touch ${prefix}/hintsfile.gff" : '' + prefix = task.ext.prefix ?: "${meta.id}" + def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' + def touch_hints = (rna_ids || bam || proteins || hints) ? 
"touch ${prefix}/hintsfile.gff" : '' """ mkdir "$prefix" touch "${prefix}/braker.gtf" touch "${prefix}/braker.codingseq" touch "${prefix}/braker.aa" - $createHints + $touch_hints touch "${prefix}/braker.log" touch "${prefix}/what-to-cite.txt" diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 70ab2cc..4482c05 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -5,7 +5,7 @@ include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' -// include { BRAKER3 } from '../modules/kherronism/braker3' +include { BRAKER3 } from '../modules/kherronism/braker3' // include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' @@ -112,29 +112,29 @@ workflow PANGENE { ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) - // // MODULE: BRAKER3 - // ch_braker_inputs = ch_masked_target_assembly - // | join(ch_rnaseq_bam, remainder: true) - // | combine( - // ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) - // ) - // | map { meta, fasta, bam, prots -> [meta, fasta, bam ?: [], prots ?: []] } + // MODULE: BRAKER3 + ch_braker_inputs = ch_masked_target_assembly + | join(ch_rnaseq_bam, remainder: true) + | combine( + ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) + ) + | map { meta, fasta, bam, prots -> [ meta, fasta, bam ?: [], prots ?: [] ] } - // def rnaseq_sets_dirs = [] - // def rnaseq_sets_ids = [] - // def hintsfile = [] - - // BRAKER3( - // ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, - // ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, - // rnaseq_sets_dirs, - // rnaseq_sets_ids, - // ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, - // hintsfile - // ) + def rnaseq_sets_dirs = [] + def rnaseq_sets_ids = [] + def hintsfile = [] + + BRAKER3( + 
ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, + ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, + rnaseq_sets_dirs, + rnaseq_sets_ids, + ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, + hintsfile + ) - // ch_braker_gff3 = BRAKER3.out.gff3 - // ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + ch_braker_gff3 = BRAKER3.out.gff3 + ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) // // SUBWORKFLOW: FASTA_LIFTOFF // FASTA_LIFTOFF( From a9f1fc6c91188256aec0359eb26ececb4a3d68b0 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 15:32:48 +1300 Subject: [PATCH 39/59] Updated fastp, sortmerna and liftoff --- modules.json | 12 +- modules/nf-core/fastp/fastp.diff | 28 -- modules/nf-core/fastp/main.nf | 2 +- modules/nf-core/fastp/tests/main.nf.test | 241 ++++++++++++++++++ modules/nf-core/fastp/tests/main.nf.test.snap | 55 ++++ modules/nf-core/sortmerna/environment.yml | 2 +- modules/nf-core/sortmerna/main.nf | 6 +- modules/nf-core/sortmerna/sortmerna.diff | 36 --- modules/nf-core/sortmerna/tests/main.nf.test | 97 ++++++- .../nf-core/sortmerna/tests/main.nf.test.snap | 46 ++-- modules/pfr/liftoff/main.nf | 19 +- modules/pfr/liftoff/meta.yml | 8 +- modules/pfr/liftoff/tests/main.nf.test | 54 +++- modules/pfr/liftoff/tests/main.nf.test.snap | 11 + 14 files changed, 483 insertions(+), 134 deletions(-) delete mode 100644 modules/nf-core/fastp/fastp.diff delete mode 100644 modules/nf-core/sortmerna/sortmerna.diff diff --git a/modules.json b/modules.json index 299e449..633ed49 100644 --- a/modules.json +++ b/modules.json @@ -27,7 +27,7 @@ }, "liftoff": { "branch": "main", - "git_sha": "3593ec100c92b656204bf739a51d62fd44d81f6f", + "git_sha": "444b35f4e6285115f84d2bfce49fc0e6d8a2754e", "installed_by": ["modules"] } } @@ -83,9 +83,8 @@ }, "fastp": { "branch": "master", - "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": ["fastq_fastqc_umitools_fastp"], - "patch": 
"modules/nf-core/fastp/fastp.diff" + "git_sha": "d086322563bdbb08c94bf15a7db58a39ccdb1520", + "installed_by": ["fastq_fastqc_umitools_fastp"] }, "fastqc": { "branch": "master", @@ -109,9 +108,8 @@ }, "sortmerna": { "branch": "master", - "git_sha": "a20b6b1e9114a08007608528e4a2b0fbbb8a9ca2", - "installed_by": ["modules"], - "patch": "modules/nf-core/sortmerna/sortmerna.diff" + "git_sha": "ce558e30784469b88a16923ca96d81899d240b42", + "installed_by": ["modules"] }, "star/align": { "branch": "master", diff --git a/modules/nf-core/fastp/fastp.diff b/modules/nf-core/fastp/fastp.diff deleted file mode 100644 index 4213043..0000000 --- a/modules/nf-core/fastp/fastp.diff +++ /dev/null @@ -1,28 +0,0 @@ -Changes in module 'nf-core/fastp' ---- modules/nf-core/fastp/main.nf -+++ modules/nf-core/fastp/main.nf -@@ -99,4 +99,22 @@ - END_VERSIONS - """ - } -+ -+ stub: -+ def prefix = task.ext.prefix ?: "${meta.id}" -+ def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end -+ def touch_reads = is_single_output ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz" -+ def touch_merged = (!is_single_output && save_merged) ? 
"touch ${prefix}.merged.fastq.gz" : "" -+ """ -+ touch $touch_reads -+ touch "${prefix}.fastp.json" -+ touch "${prefix}.fastp.html" -+ touch "${prefix}.fastp.log" -+ $touch_merged -+ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") -+ END_VERSIONS -+ """ - } - -************************************************************ diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 1f56640..2a3b679 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -99,7 +99,7 @@ process FASTP { END_VERSIONS """ } - + stub: def prefix = task.ext.prefix ?: "${meta.id}" def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test index f610b73..17dce8a 100644 --- a/modules/nf-core/fastp/tests/main.nf.test +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -57,6 +57,67 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: 
true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_single_end-for_stub_match") + }, { assert snapshot(process.out.versions).match("versions") } ) } @@ -127,6 +188,67 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { 
assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end-for_stub_match") + }, { assert snapshot(process.out.versions).match("versions") } ) } @@ -181,6 +303,66 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_interleaved-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved-stub") { + + options '-stub' + + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.json.collect { 
file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_interleaved-for_stub_match") + }, { assert snapshot(process.out.versions).match("versions") } ) } @@ -399,6 +581,65 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end_merged-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged-stub") { + + options '-stub' + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.json.collect { file(it[1]).getName() } + + process.out.html.collect { file(it[1]).getName() } + + process.out.log.collect { 
file(it[1]).getName() } + + process.out.reads_fail.collect { file(it[1]).getName() } + + process.out.reads_merged.collect { file(it[1]).getName() } + ).sort() + ).match("test_fastp_paired_end_merged-for_stub_match") + }, { assert snapshot(process.out.versions).match("versions") } ) } diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap index 0fa68c7..1b7d241 100644 --- a/modules/nf-core/fastp/tests/main.nf.test.snap +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -1,4 +1,19 @@ { + "test_fastp_paired_end-for_stub_match": { + "content": [ + [ + [ + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=false}" + ] + ], + "timestamp": "2023-12-21T09:44:37.202512" + }, "fastp test_fastp_interleaved_json": { "content": [ [ @@ -13,6 +28,22 @@ ], "timestamp": "2023-10-17T11:04:45.794175881" }, + "test_fastp_paired_end_merged-for_stub_match": { + "content": [ + [ + [ + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "test.merged.fastq.gz", + "{id=test, single_end=false}" + ] + ], + "timestamp": "2023-12-21T09:53:45.237014" + }, "test_fastp_single_end_json": { "content": [ [ @@ -35,6 +66,30 @@ ], "timestamp": "2023-10-17T11:04:10.582076024" }, + "test_fastp_interleaved-for_stub_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "timestamp": "2023-12-21T09:48:43.148485" + }, + "test_fastp_single_end-for_stub_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "timestamp": "2023-12-21T09:20:07.254788" + }, "test_fastp_single_end_trim_fail_json": { "content": [ [ diff --git a/modules/nf-core/sortmerna/environment.yml 
b/modules/nf-core/sortmerna/environment.yml index 3dae00a..f40f995 100644 --- a/modules/nf-core/sortmerna/environment.yml +++ b/modules/nf-core/sortmerna/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::sortmerna=4.3.4 + - bioconda::sortmerna=4.3.6 diff --git a/modules/nf-core/sortmerna/main.nf b/modules/nf-core/sortmerna/main.nf index 909a7b1..29c640c 100644 --- a/modules/nf-core/sortmerna/main.nf +++ b/modules/nf-core/sortmerna/main.nf @@ -1,11 +1,11 @@ process SORTMERNA { tag "$meta.id" - label "process_high" + label 'process_high' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/sortmerna:4.3.4--h9ee0642_0' : - 'biocontainers/sortmerna:4.3.4--h9ee0642_0' }" + 'https://depot.galaxyproject.org/singularity/sortmerna:4.3.6--h9ee0642_0' : + 'biocontainers/sortmerna:4.3.6--h9ee0642_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/sortmerna/sortmerna.diff b/modules/nf-core/sortmerna/sortmerna.diff deleted file mode 100644 index 66d58d5..0000000 --- a/modules/nf-core/sortmerna/sortmerna.diff +++ /dev/null @@ -1,36 +0,0 @@ -Changes in module 'nf-core/sortmerna' ---- modules/nf-core/sortmerna/main.nf -+++ modules/nf-core/sortmerna/main.nf -@@ -67,4 +67,30 @@ - END_VERSIONS - """ - } -+ -+ stub: -+ def args = task.ext.args ?: '' -+ def prefix = task.ext.prefix ?: "${meta.id}" -+ if (meta.single_end) { -+ """ -+ touch ${prefix}.non_rRNA.fastq.gz -+ touch ${prefix}.sortmerna.log -+ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') -+ END_VERSIONS -+ """ -+ } else { -+ """ -+ touch ${prefix}_1.non_rRNA.fastq.gz -+ touch ${prefix}_2.non_rRNA.fastq.gz -+ touch ${prefix}.sortmerna.log -+ -+ cat <<-END_VERSIONS > versions.yml -+ "${task.process}": -+ sortmerna: 
\$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') -+ END_VERSIONS -+ """ -+ } - } - -************************************************************ diff --git a/modules/nf-core/sortmerna/tests/main.nf.test b/modules/nf-core/sortmerna/tests/main.nf.test index 3ec2692..8a01e2a 100644 --- a/modules/nf-core/sortmerna/tests/main.nf.test +++ b/modules/nf-core/sortmerna/tests/main.nf.test @@ -23,9 +23,51 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match("se_reads") }, - { assert process.out.log }, - { assert snapshot(process.out.versions).match("se_versions") } + { assert process.out.reads }, + { assert file(process.out.log[0][1]).text.contains("Total reads passing E-value threshold = 100 (100.00)") }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 single_end stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } ) } @@ -48,9 +90,52 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match("pe_reads") 
}, - { assert process.out.log }, - { assert snapshot(process.out.versions).match("pe_versions") } + { assert process.out.reads }, + { assert file(process.out.log[0][1]).text.contains("Total reads passing E-value threshold = 200 (100.00)") }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 paired_end stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 paired_end-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } ) } diff --git a/modules/nf-core/sortmerna/tests/main.nf.test.snap b/modules/nf-core/sortmerna/tests/main.nf.test.snap index f1bedb7..e502000 100644 --- a/modules/nf-core/sortmerna/tests/main.nf.test.snap +++ b/modules/nf-core/sortmerna/tests/main.nf.test.snap @@ -1,49 +1,33 @@ { - "se_versions": { + "sarscov2 single_end-for_stub_match": { "content": [ [ - "versions.yml:md5,96553a18cad5237fbf76d5a6c966360e" + "test.non_rRNA.fastq.gz", + "test.sortmerna.log", + "{id=test, single_end=true}" ] ], - "timestamp": "2023-11-22T14:25:07.95908694" + "timestamp": 
"2023-12-21T11:56:00.15356" }, - "pe_reads": { + "versions": { "content": [ [ - [ - { - "id": "test", - "single_end": false - }, - [ - "test_1.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0", - "test_2.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0" - ] - ] + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" ] ], - "timestamp": "2023-11-22T14:25:19.098771475" + "timestamp": "2023-12-21T11:56:00.200244" }, - "se_reads": { + "sarscov2 paired_end-for_stub_match": { "content": [ [ [ - { - "id": "test", - "single_end": true - }, - "test.non_rRNA.fastq.gz:md5,e62ff0123a74adfc6903d59a449cbdb0" - ] - ] - ], - "timestamp": "2023-11-22T14:25:07.949212892" - }, - "pe_versions": { - "content": [ - [ - "versions.yml:md5,96553a18cad5237fbf76d5a6c966360e" + "test_1.non_rRNA.fastq.gz", + "test_2.non_rRNA.fastq.gz" + ], + "test.sortmerna.log", + "{id=test, single_end=false}" ] ], - "timestamp": "2023-11-22T14:25:19.105098985" + "timestamp": "2023-12-21T12:00:47.879193" } } \ No newline at end of file diff --git a/modules/pfr/liftoff/main.nf b/modules/pfr/liftoff/main.nf index a382dab..317eca1 100644 --- a/modules/pfr/liftoff/main.nf +++ b/modules/pfr/liftoff/main.nf @@ -9,11 +9,11 @@ process LIFTOFF { input: tuple val(meta), path(target_fa) - path ref_fa, name: 'liftoff_reference_assembly.fa' // To avoid name collisions betwen target_fa and ref_fa + path ref_fa, name: 'ref_assembly.fa' path ref_annotation output: - tuple val(meta), path("${prefix}.gff3") , emit: gff3 // To avoid pattern collision with '*.polished.gff3' + tuple val(meta), path("${prefix}.gff3") , emit: gff3 tuple val(meta), path("*.polished.gff3") , emit: polished_gff3, optional: true tuple val(meta), path("*.unmapped.txt") , emit: unmapped_txt path "versions.yml" , emit: versions @@ -22,8 +22,8 @@ process LIFTOFF { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: 
"${meta.id}" """ liftoff \\ -g $ref_annotation \\ @@ -32,9 +32,11 @@ process LIFTOFF { -u "${prefix}.unmapped.txt" \\ $args \\ $target_fa \\ - liftoff_reference_assembly.fa + ref_assembly.fa - mv "${prefix}.gff3_polished" "${prefix}.polished.gff3" \\ + mv \\ + "${prefix}.gff3_polished" \\ + "${prefix}.polished.gff3" \\ || echo "-polish is absent" cat <<-END_VERSIONS > versions.yml @@ -44,10 +46,13 @@ process LIFTOFF { """ stub: - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def touch_polished = args.contains('-polish') ? "touch ${prefix}.polished.gff3" : '' """ touch "${prefix}.gff3" touch "${prefix}.unmapped.txt" + $touch_polished cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/pfr/liftoff/meta.yml b/modules/pfr/liftoff/meta.yml index ad1c5b8..46b3c58 100644 --- a/modules/pfr/liftoff/meta.yml +++ b/modules/pfr/liftoff/meta.yml @@ -1,7 +1,9 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "liftoff" -description: "Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the same, or closely-related species." +description: | + Uses Liftoff to accurately map annotations in GFF or GTF between assemblies of the same, + or closely-related species keywords: - genome - annotation @@ -10,7 +12,9 @@ keywords: - liftover tools: - "liftoff": - description: "Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the same, or closely-related species." 
+ description: | + Liftoff is a tool that accurately maps annotations in GFF or GTF between assemblies of the same, + or closely-related species homepage: "https://github.com/agshumate/Liftoff" documentation: "https://github.com/agshumate/Liftoff" tool_dev_url: "https://github.com/agshumate/Liftoff" diff --git a/modules/pfr/liftoff/tests/main.nf.test b/modules/pfr/liftoff/tests/main.nf.test index 00d1d2a..272c882 100644 --- a/modules/pfr/liftoff/tests/main.nf.test +++ b/modules/pfr/liftoff/tests/main.nf.test @@ -7,6 +7,7 @@ nextflow_process { tag "modules" tag "modules_nfcore" + tag "nf-core/gunzip" tag "liftoff" test("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf") { @@ -45,16 +46,41 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out.unmapped_txt).match("unmapped_txt") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert path(process.out.gff3.get(0).get(1)).getText().contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, - { assert path(process.out.polished_gff3.get(0).get(1)).getText().contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") } + { assert file(process.out.gff3[0][1]).text.contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, + { assert file(process.out.polished_gff3[0][1]).text.contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, + { + assert snapshot( + ( + [process.out.gff3[0][0].toString()] + // meta + process.out.gff3.collect { file(it[1]).getName() } + + process.out.polished_gff3.collect { file(it[1]).getName() } + + process.out.unmapped_txt.collect { file(it[1]).getName() } + ).sort() + ).match("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match") + }, + { assert snapshot(process.out.versions).match("versions") } ) } } - test("stub") { - options "-stub" + test("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-stub") { + options '-stub' + + setup { + run("GUNZIP") { + script "../../../nf-core/gunzip" + 
+ process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_1_fasta'], checkIfExists: true) + ] + """ + } + } + } when { process { @@ -63,9 +89,7 @@ nextflow_process { [ id:'test' ], file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) ] - input[1] = [ - file(params.test_data['homo_sapiens']['genome']['genome_1_fasta'], checkIfExists: true) - ] + input[1] = GUNZIP.out.gunzip.map { meta, file -> file } input[2] = [ file(params.test_data['homo_sapiens']['genome']['genome_1_gtf'], checkIfExists: true) ] @@ -76,10 +100,16 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert process.out.gff3 != null }, - { assert process.out.polished_gff3 == [] }, - { assert process.out.unmapped_txt != null }, - { assert process.out.versions != null }, + { + assert snapshot( + ( + [process.out.gff3[0][0].toString()] + // meta + process.out.gff3.collect { file(it[1]).getName() } + + process.out.polished_gff3.collect { file(it[1]).getName() } + + process.out.unmapped_txt.collect { file(it[1]).getName() } + ).sort() + ).match("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match") + }, { assert snapshot(process.out.versions).match("versions") } ) } diff --git a/modules/pfr/liftoff/tests/main.nf.test.snap b/modules/pfr/liftoff/tests/main.nf.test.snap index 36c39b6..baa4d70 100644 --- a/modules/pfr/liftoff/tests/main.nf.test.snap +++ b/modules/pfr/liftoff/tests/main.nf.test.snap @@ -19,5 +19,16 @@ ] ], "timestamp": "2023-12-01T13:57:40.752414" + }, + "homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match": { + "content": [ + [ + "test.gff3", + "test.polished.gff3", + "test.unmapped.txt", + "{id=test}" + ] + ], + "timestamp": "2023-12-21T15:20:04.816416" } } \ No newline at end of file From d0faf8d68b3679602c11f4102c1a463b9fd9cb02 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 21 Dec 2023 16:21:51 +1300 Subject: [PATCH 40/59] Cleaned up 
fasta_liftoff --- conf/base.config | 7 +- nextflow.config | 8 +- subworkflows/local/fasta_liftoff.nf | 137 +++++++++++++++------------- workflows/pangene.nf | 54 +++++------ 4 files changed, 106 insertions(+), 100 deletions(-) diff --git a/conf/base.config b/conf/base.config index 54114d3..4467c0b 100644 --- a/conf/base.config +++ b/conf/base.config @@ -66,10 +66,9 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } - - // Custom - withLabel:process_week_long { - time = { check_max( 7.days * task.attempt, 'time' ) } + + withName:CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false } } diff --git a/nextflow.config b/nextflow.config index ef139b4..14544a8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,12 +54,12 @@ params { liftoff_xref_annotations = [ [ - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" + ".test/liftoff/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", + ".test/liftoff/RU01.20221115150135.gff3" ], [ - "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", - "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" + ".test/liftoff/TAIR10_chr_all.fas", + ".test/liftoff/TAIR10_GFF3_genes_transposons.fixed.gff3" ] ] // Format: [ [ fasta(.gz), gff3(.gz) ] ] diff --git a/subworkflows/local/fasta_liftoff.nf b/subworkflows/local/fasta_liftoff.nf index 5e6fd22..8952e1d 100644 --- a/subworkflows/local/fasta_liftoff.nf +++ b/subworkflows/local/fasta_liftoff.nf @@ -1,85 +1,96 @@ include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip' include { GFFREAD } 
from '../../modules/nf-core/gffread' -include { LIFTOFF } from '../../modules/local/liftoff' +include { LIFTOFF } from '../../modules/pfr/liftoff' workflow FASTA_LIFTOFF { take: - target_assemby // Channel: [ meta, fasta ] - xref_annotations_fasta // Channel: [ meta2, fasta ] - xref_annotations_gff // Channel: [ meta2, gff3 ] + target_assemby // Channel: [ meta, fasta ] + xref_fasta // Channel: [ meta2, fasta ] + xref_gff // Channel: [ meta2, gff3 ] main: - // MODULE: GUNZIP_FASTA - xref_annotations_fasta - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { xref_annotations_fasta_branch } + ch_versions = Channel.empty() - GUNZIP_FASTA( - xref_annotations_fasta_branch.gz - ) - .gunzip - | mix( - xref_annotations_fasta_branch.rest - ) - | set { ch_xref_annotations_gunzip_fasta } + // MODULE: GUNZIP as GUNZIP_FASTA + ch_xref_fasta_branch = xref_fasta + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } + + GUNZIP_FASTA ( ch_xref_fasta_branch.gz ) + + ch_xref_gunzip_fasta = GUNZIP_FASTA.out.gunzip + | mix( + ch_xref_fasta_branch.rest + ) + + ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions.first()) - // MODULE: GUNZIP_GFF - xref_annotations_gff - | branch { meta, file -> - gz: "$file".endsWith(".gz") - rest: !"$file".endsWith(".gz") - } - | set { xref_annotations_gff_branch } + // MODULE: GUNZIP as GUNZIP_GFF + ch_xref_gff_branch = xref_gff + | branch { meta, file -> + gz: "$file".endsWith(".gz") + rest: !"$file".endsWith(".gz") + } - GUNZIP_GFF( - xref_annotations_gff_branch.gz - ) - .gunzip - | mix( - xref_annotations_gff_branch.rest - ) - | set { ch_xref_annotations_gunzip_gff } + GUNZIP_GFF ( ch_xref_gff_branch.gz ) + + ch_xref_gunzip_gff = GUNZIP_GFF.out.gunzip + | mix( + ch_xref_gff_branch.rest + ) + + ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions.first()) // MODULE: GFFREAD - GFFREAD( - ch_xref_annotations_gunzip_gff - ) - .gff - | set { ch_gffread_gff } + 
ch_gffread_inputs = ch_xref_gunzip_gff + | map { meta, gff -> + [ gff.getSimpleName(), meta, gff ] + } // For meta insertion later, remove when GFFREAD has meta + + GFFREAD ( ch_gffread_inputs.map { name, meta, gff -> gff } ) + + ch_gffread_gff = GFFREAD.out.gffread_gff + | map { gff -> [ gff.getSimpleName(), gff ] } + | join(ch_gffread_inputs) + | map { fid, gffread_gff, meta, gff -> [ meta, gffread_gff ] } + // meta insertion + + ch_versions = ch_versions.mix(GFFREAD.out.versions.first()) // MODULE: LIFTOFF - target_assemby - | combine( - ch_xref_annotations_gunzip_fasta - | join( - ch_gffread_gff - ) - ) - | map { meta, targetFasta, refMeta, refFasta, refGFF -> - [[id:"${meta.id}.from.${refMeta.id}", target_assemby: meta.id], targetFasta, refFasta, refGFF] - } - | set { ch_liftoff_inputs } + ch_liftoff_inputs = target_assemby + | combine( + ch_xref_gunzip_fasta + | join( + ch_gffread_gff + ) + ) + | map { meta, target_fa, ref_meta, ref_fa, ref_gff -> + [ + [ + id: "${meta.id}.from.${ref_meta.id}", + target_assemby: meta.id + ], + target_fa, + ref_fa, + ref_gff + ] + } LIFTOFF( - ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> [meta, targetFasta] }, - ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refFasta }, - ch_liftoff_inputs.map { meta, targetFasta, refFasta, refGFF -> refGFF } + ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> [ meta, target_fa ] }, + ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> ref_fa }, + ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> ref_gff } ) - .polished_gff3 - | map { meta, gff -> [[id: meta.target_assemby], gff] } - | groupTuple - | set { ch_liftoff_gff3 } - Channel.empty() - | mix(GUNZIP_FASTA.out.versions.first()) - | mix(GUNZIP_GFF.out.versions.first()) - | mix(GFFREAD.out.versions.first()) - | mix(LIFTOFF.out.versions.first()) - | set { ch_versions } + ch_liftoff_gff3 = LIFTOFF.out.polished_gff3 + | map { meta, gff -> [ [ id: meta.target_assemby ], gff ] } + | 
groupTuple + + ch_versions = ch_versions.mix(LIFTOFF.out.versions.first()) emit: gff3 = ch_liftoff_gff3 // [ meta, [ gff3 ] ] diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 4482c05..7cc8c4a 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -1,15 +1,11 @@ include { validateParams } from '../modules/local/validate_params' - include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' - include { BRAKER3 } from '../modules/kherronism/braker3' - -// include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' - -// include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' +include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' validateParams(params) @@ -50,22 +46,22 @@ workflow PANGENE { ? Channel.fromList(params.external_protein_fastas) | map { filePath -> def fileHandle = file(filePath, checkIfExists: true) - [ [id: fileHandle.getSimpleName() ], fileHandle] + [ [ id: fileHandle.getSimpleName() ], fileHandle] } : Channel.empty() - // ch_xref_annotations_mm = params.liftoff_xref_annotations - // ? Channel.fromList(params.liftoff_xref_annotations) - // | multiMap { fasta, gff -> - // def fastaFile = file(fasta, checkIfExists:true) + ch_xref_mm = params.liftoff_xref_annotations + ? 
Channel.fromList(params.liftoff_xref_annotations) + | multiMap { fasta, gff -> + def fastaFile = file(fasta, checkIfExists:true) - // fasta: [[id:fastaFile.getSimpleName()], fastaFile] - // gff: [[id:fastaFile.getSimpleName()], file(gff, checkIfExists:true)] - // } - // : Channel.empty() + fasta: [ [ id: fastaFile.getSimpleName() ], fastaFile ] + gff: [ [ id: fastaFile.getSimpleName() ], file(gff, checkIfExists:true) ] + } + : Channel.empty() - // ch_xref_annotations_fasta = ch_xref_annotations_mm.fasta - // ch_xref_annotations_gff = ch_xref_annotations_mm.gff + ch_xref_fasta = ch_xref_mm.fasta + ch_xref_gff = ch_xref_mm.gff // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( @@ -136,18 +132,18 @@ workflow PANGENE { ch_braker_gff3 = BRAKER3.out.gff3 ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) - // // SUBWORKFLOW: FASTA_LIFTOFF - // FASTA_LIFTOFF( - // ch_valid_target_assembly, - // ch_xref_annotations_fasta, - // ch_xref_annotations_gff - // ) + // SUBWORKFLOW: FASTA_LIFTOFF + FASTA_LIFTOFF( + ch_valid_target_assembly, + ch_xref_fasta, + ch_xref_gff + ) - // ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 - // ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) + ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 + ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) - // // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS - // CUSTOM_DUMPSOFTWAREVERSIONS ( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - // ) + // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) } \ No newline at end of file From 27a1293e458bc3c5e3dcc0c1cd06c9751e78b25a Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 09:55:12 +1300 Subject: [PATCH 41/59] Updated fasta_edta_lai --- modules.json | 4 ++-- modules/pfr/edta/edta/main.nf | 2 +- modules/pfr/edta/edta/meta.yml | 21 +++++++++++++++++++ modules/pfr/edta/edta/tests/main.nf.test | 21 ++++++++++++------- 
modules/pfr/edta/edta/tests/nextflow.config | 2 +- .../pfr/fasta_edta_lai/tests/main.nf.test | 20 ++++++++++++++---- .../fasta_edta_lai/tests/main.nf.test.snap | 11 ++++++++++ 7 files changed, 65 insertions(+), 16 deletions(-) create mode 100644 subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap diff --git a/modules.json b/modules.json index 633ed49..f40e823 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "edta/edta": { "branch": "main", - "git_sha": "d14b1f1d790cc01c11f8ec5aa80a9562c6808b20", + "git_sha": "35468dbb1f35eb17a43d7e05544601c7c3f8cd90", "installed_by": ["fasta_edta_lai", "modules"] }, "lai": { @@ -36,7 +36,7 @@ "pfr": { "fasta_edta_lai": { "branch": "main", - "git_sha": "fde2f37998ba54ec9c0b5cf65a2f28f14af981b0", + "git_sha": "5ae026a98da1331433fa4cf5b667c9abdf43e395", "installed_by": ["subworkflows"] } } diff --git a/modules/pfr/edta/edta/main.nf b/modules/pfr/edta/edta/main.nf index 458f525..a81c528 100644 --- a/modules/pfr/edta/edta/main.nf +++ b/modules/pfr/edta/edta/main.nf @@ -42,7 +42,7 @@ process EDTA_EDTA { $rmout_file \\ $exclude_file \\ $args \\ - &> "${prefix}.log" + &> >(tee "${prefix}.log" 2>&1) mv \\ "${mod_file_name}.EDTA.TElib.fa" \\ diff --git a/modules/pfr/edta/edta/meta.yml b/modules/pfr/edta/edta/meta.yml index 4d59fdf..52503b8 100644 --- a/modules/pfr/edta/edta/meta.yml +++ b/modules/pfr/edta/edta/meta.yml @@ -25,6 +25,27 @@ input: type: file description: Genome fasta file pattern: "*.{fsa,fa,fasta}" + - cds: + type: file + description: | + A FASTA file containing the coding sequence (no introns, UTRs, nor TEs) + of this genome or its close relative + pattern: "*.{fsa,fa,fasta}" + - curatedlib: + type: file + description: | + A curated library to keep consistent naming and classification for known TEs + pattern: "*.liban" + - rmout: + type: file + description: | + Homology-based TE annotation instead of using the EDTA library for masking in + RepeatMasker .out format + pattern: "*.out" + - exclude: + type: 
file + description: Exclude regions (bed format) from TE masking in the MAKER.masked output + pattern: "*.bed" output: - meta: type: map diff --git a/modules/pfr/edta/edta/tests/main.nf.test b/modules/pfr/edta/edta/tests/main.nf.test index d0a7142..3aed0a2 100644 --- a/modules/pfr/edta/edta/tests/main.nf.test +++ b/modules/pfr/edta/edta/tests/main.nf.test @@ -15,14 +15,19 @@ nextflow_process { when { process { """ - input[0] = [ - [ id:'test' ], // meta map - file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) - ] - input[1] = [] - input[2] = [] - input[3] = [] - input[4] = [] + input[0] = Channel.of(file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)) + | map { f -> + ( + ['>Chr21'] + + f.readLines().subList(66666.toInteger(), 116666.toInteger()) // 4 MB to 7 MB; 60 bases per line + ).join('\\n') + } + | collectFile(name: 'genome_3_to_10_mb.fasta') + | map { f -> [ [ id: 'test'], f ] } + input[1] = [] + input[2] = [] + input[3] = [] + input[4] = [] """ } } diff --git a/modules/pfr/edta/edta/tests/nextflow.config b/modules/pfr/edta/edta/tests/nextflow.config index b20ca5e..e58e10e 100644 --- a/modules/pfr/edta/edta/tests/nextflow.config +++ b/modules/pfr/edta/edta/tests/nextflow.config @@ -1,3 +1,3 @@ process { - ext.args = '--anno 1 --evaluate 1' + ext.args = '--anno 1' } diff --git a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test index a4fa87b..e852a70 100644 --- a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test +++ b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test @@ -15,13 +15,25 @@ nextflow_workflow { test("test_data") { + setup { + run("GUNZIP") { + script "../../../../modules/nf-core/gunzip" + + process { + """ + input[0] = [ + [ id:'test' ], + file('/Users/hrauxr/Projects/nxf-modules/tests/data/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + } + when { workflow { """ - input[0] = Channel.of([ - [ id:'test' ], - 
file("/Users/hrauxr/Projects/nxf-modules/data/chr1.fa", checkIfExists: true) - ]) + input[0] = GUNZIP.out.gunzip input[1] = [] input[2] = false """ diff --git a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap new file mode 100644 index 0000000..574acc9 --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap @@ -0,0 +1,11 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,0d4bc49e94acb8995ca552d4e666e3ce", + "versions.yml:md5,754bb19f86be761d90c002a0af2faf1c" + ] + ], + "timestamp": "2023-12-22T14:09:24.171934" + } +} \ No newline at end of file From 3867ed5e1a60646182afb14f7d00d1fe24005f8b Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 10:17:31 +1300 Subject: [PATCH 42/59] Added script for local stub run --- pangene_local_stub.sh | 9 +++++++++ pangene_pfr.sh | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100755 pangene_local_stub.sh diff --git a/pangene_local_stub.sh b/pangene_local_stub.sh new file mode 100755 index 0000000..e8227d5 --- /dev/null +++ b/pangene_local_stub.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +nextflow \ + main.nf \ + -profile local,docker \ + -resume \ + -stub \ + --max_cpus=1 \ + --max_memory=1.GB \ No newline at end of file diff --git a/pangene_pfr.sh b/pangene_pfr.sh index ab3d262..ca1a335 100644 --- a/pangene_pfr.sh +++ b/pangene_pfr.sh @@ -16,4 +16,7 @@ ml nextflow/23.04.4 export TMPDIR="/workspace/$USER/tmp" export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,$TMPDIR:$TMPDIR,$TMPDIR:/tmp" -nextflow main.nf -profile pfr,apptainer -resume \ No newline at end of file +nextflow \ + main.nf \ + -profile pfr,apptainer \ + -resume \ No newline at end of file From 476149154cec16229c903624ccc8197b9a4e20c6 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 11:26:41 +1300 Subject: [PATCH 43/59] Samplesheet now accepts relative paths --- nextflow.config | 2 +- pangene_local_stub.sh | 3 ++- 
subworkflows/local/extract_samples.nf | 25 +++++++++++++++++-------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/nextflow.config b/nextflow.config index 14544a8..d2a0651 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,7 +25,7 @@ params { repeatmasker_save_outputs = true - samplesheet = "./.test/samplesheet_small.csv" + samplesheet = "./.test/samplesheet.csv" // Optional: Set to null if not available skip_fastqc = false diff --git a/pangene_local_stub.sh b/pangene_local_stub.sh index e8227d5..0a18650 100755 --- a/pangene_local_stub.sh +++ b/pangene_local_stub.sh @@ -6,4 +6,5 @@ nextflow \ -resume \ -stub \ --max_cpus=1 \ - --max_memory=1.GB \ No newline at end of file + --max_memory=1.GB \ + --samplesheet="./.test/samplesheet_small.csv" \ No newline at end of file diff --git a/subworkflows/local/extract_samples.nf b/subworkflows/local/extract_samples.nf index e63bbe0..957f218 100644 --- a/subworkflows/local/extract_samples.nf +++ b/subworkflows/local/extract_samples.nf @@ -22,7 +22,10 @@ workflow EXTRACT_SAMPLES { SAMPLESHEET_CHECK ( samplesheet, permissible_target_assemblies ) .csv | splitCsv ( header:true, sep:',' ) - | map { create_fastq_channel(it) } + | combine ( samplesheet ) + | map { row, sheet -> + create_fastq_channel(row, sheet.getParent()) + } | set { ch_reads } reads = ch_reads.map { meta, fastq -> [[id:meta.id, single_end:meta.single_end], fastq]} @@ -40,25 +43,31 @@ workflow EXTRACT_SAMPLES { } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { +def create_fastq_channel(LinkedHashMap row, sheetPath) { // create meta map def meta = [:] meta.id = row.sample meta.single_end = row.single_end.toBoolean() meta.target_assemblies = row.target_assemblies.split(";").sort() + def fq1 = row.fastq_1.startsWith("/") ? row.fastq_1 : "$sheetPath/${row.fastq_1}" + def fq2 = row.fastq_2.startsWith("/") ? 
row.fastq_2 : "$sheetPath/${row.fastq_2}" + // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" + if (!file(fq1).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${fq1}" } if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] + fastq_meta = [ meta, [ file(fq1) ] ] } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + if (!file(fq2).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${fq2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(fq1), file(fq2) ] ] } + + println fastq_meta + return fastq_meta } \ No newline at end of file From 8ced02787818d8da7c3e6e7b158da58512137138 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 11:34:25 +1300 Subject: [PATCH 44/59] Updated modules --- modules.json | 4 +- .../dumpsoftwareversions/tests/main.nf.test | 7 +- .../tests/main.nf.test.snap | 50 ++-- modules/nf-core/fastqc/tests/main.nf.test | 271 ++++++++++++------ .../nf-core/fastqc/tests/main.nf.test.snap | 12 +- 5 files changed, 238 insertions(+), 106 deletions(-) diff --git a/modules.json b/modules.json index f40e823..cdb8a24 100644 --- a/modules.json +++ b/modules.json @@ -73,7 +73,7 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "git_sha": "37dee863936732fe7e05dc598bf6e183a8e7ef73", "installed_by": ["modules"] }, "fastavalidator": { @@ -88,7 +88,7 @@ }, "fastqc": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "617777a807a1770f73deb38c80004bac06807eef", "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] }, "gffread": { 
diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test index eec1db1..b1e1630 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -31,7 +31,12 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } ) } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap index 4274ed5..29e7244 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -1,27 +1,33 @@ { "Should run without failures": { "content": [ - { - "0": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ], - "1": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "2": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "mqc_yml": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "versions": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "yml": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ] - } + [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.12.0", + " yaml: 6.0.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] ], - "timestamp": "2023-11-03T14:43:22.157011" + "timestamp": 
"2024-01-05T00:18:43.461970077" } -} +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index b9e8f92..ad9bc54 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -3,23 +3,21 @@ nextflow_process { name "Test Process FASTQC" script "../main.nf" process "FASTQC" + tag "modules" tag "modules_nfcore" tag "fastqc" - test("Single-Read") { + test("sarscov2 single-end [fastq]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] ] """ } @@ -28,82 +26,195 @@ nextflow_process { then { assertAll ( { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. // looks like this:
Mon 2 Oct 2023
test.gz
// https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, - { assert path(process.out.html.get(0).get(1)).getText().contains("") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert 
snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } ) } } -// TODO -// // -// // Test with paired-end data -// // -// workflow test_fastqc_paired_end { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with interleaved data -// // -// workflow test_fastqc_interleaved { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with bam data -// // -// workflow test_fastqc_bam { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with multiple samples -// // -// workflow test_fastqc_multiple { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: 
true), -// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with custom prefix -// // -// workflow test_fastqc_custom_prefix { -// input = [ -// [ id:'mysample', single_end:true ], // meta map -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + { assert path(process.out.html[0][1][2]).text.contains("") }, + { assert path(process.out.html[0][1][3]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = [ + [ id:'mysample', single_end:true ], // meta map + 
file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.html.collect { file(it[1]).getName() } + + process.out.zip.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + } + } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 636a32c..5ef5afb 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -1,10 +1,20 @@ { + "sarscov2 single-end [fastq] - stub": { + "content": [ + [ + "test.html", + "test.zip", + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-12-29T02:48:05.126117287" + }, "versions": { "content": [ [ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-10-09T23:40:54+0000" + "timestamp": "2023-12-29T02:46:49.507942667" } } \ No newline at end of file From fce65a05dc9b7ff009ddbceed4937c7818783b4e Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 11:43:54 +1300 Subject: [PATCH 45/59] Removed -exclude_partial and updated flowchart --- README.md | 7 ++++++- TODO.md | 2 -- conf/modules.config | 1 - 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8efbcf0..e5251f2 100644 --- a/README.md 
+++ b/README.md @@ -10,6 +10,7 @@ flowchart TD TARGET_ASSEMBLIES TE_LIBRARIES FASTA_VALIDATE + fasta_file_from_fasta_validate EDTA REPEATMASKER end @@ -17,7 +18,8 @@ flowchart TD TARGET_ASSEMBLIES(["[target_assemblies]"]) TE_LIBRARIES(["[te_libs]"]) TARGET_ASSEMBLIES --> FASTA_VALIDATE - FASTA_VALIDATE --> EDTA + FASTA_VALIDATE --> |Fasta|fasta_file_from_fasta_validate(( )) + fasta_file_from_fasta_validate --> EDTA TE_LIBRARIES --> REPEATMASKER EDTA --> |te_lib absent|REPEATMASKER @@ -28,6 +30,7 @@ flowchart TD FASTP FASTP_FASTQC SORTMERNA + fasta_file_for_star STAR SAMTOOLS_CAT end @@ -39,6 +42,8 @@ flowchart TD FASTQC --> FASTP FASTP --> FASTP_FASTQC[FASTQC] FASTP_FASTQC --> SORTMERNA + fasta_file_for_star(( )) + fasta_file_for_star --> |Fasta|STAR SORTMERNA --> STAR STAR --> SAMTOOLS_CAT diff --git a/TODO.md b/TODO.md index 2366b0a..1d4fed8 100644 --- a/TODO.md +++ b/TODO.md @@ -12,6 +12,4 @@ > https://www.biorxiv.org/content/10.1101/096529v2.full.pdf -> Don't use `-exclude_partial` - - [ ] Sort out EDTA testing diff --git a/conf/modules.config b/conf/modules.config index 392583a..474a129 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -135,7 +135,6 @@ if(params.liftoff_xref_annotations) { withName: LIFTOFF { ext.args = ' ' ext.args = [ - '-exclude_partial', '-copies', '-polish', "-a $params.liftoff_coverage", From 582f3faeeccf10105790f80836bf66e48ae06acd Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 15:24:04 +1300 Subject: [PATCH 46/59] Separated test config for local and pfr --- TODO.md | 6 ++++++ conf/local_stub_params.json | 29 +++++++++++++++++++++++++++ nextflow.config | 20 +++++++++--------- pangene_local_stub.sh | 4 +--- subworkflows/local/extract_samples.nf | 2 -- 5 files changed, 46 insertions(+), 15 deletions(-) create mode 100644 conf/local_stub_params.json diff --git a/TODO.md b/TODO.md index 1d4fed8..3742442 100644 --- a/TODO.md +++ b/TODO.md @@ -13,3 +13,9 @@ > 
https://www.biorxiv.org/content/10.1101/096529v2.full.pdf - [ ] Sort out EDTA testing + +- Mib finder, eggnog, blastp against TAIR and uniprot (Wait) +- entap to merge (Wait) +- trinity and PASA + StringTie2 -> Evigene (Do) +- othrofinder paper +- gffcompre on braker and liftoff diff --git a/conf/local_stub_params.json b/conf/local_stub_params.json new file mode 100644 index 0000000..df155d5 --- /dev/null +++ b/conf/local_stub_params.json @@ -0,0 +1,29 @@ +{ + "target_assemblies": [ + ["red5_v2p1", ".test/red5_v2p1_chr1.fasta"], + ["donghong", ".test/donghong.chr1.fsa.gz"] + ], + + "te_libraries": [["donghong", ".test/donghong.TElib.fa.gz"]], + + "samplesheet": "./.test/samplesheet_small.csv", + + "external_protein_fastas": [ + ".test/ext_prots/Viridiplantae.fa.gz", + ".test/ext_prots/RU01.20221115150135.pep.fasta" + ], + + "liftoff_xref_annotations": [ + [ + ".test/liftoff/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", + ".test/liftoff/RU01.20221115150135.gff3" + ], + [ + ".test/liftoff/TAIR10_chr_all.fas", + ".test/liftoff/TAIR10_GFF3_genes_transposons.fixed.gff3" + ] + ], + + "max_cpus": 1, + "max_memory": "1.GB" +} diff --git a/nextflow.config b/nextflow.config index d2a0651..e0d658b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,8 +2,8 @@ includeConfig './conf/base.config' params { target_assemblies = [ - ["red5_v2p1", ".test/red5_v2p1_chr1.fasta"], - ["donghong", ".test/donghong.chr1.fsa.gz"] + ["red5_v2p1", "/workspace/pangene/test_data/red5_v2p1_chr1.fasta"], + ["donghong", "/workspace/pangene/test_data/donghong.chr1.fsa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; @@ -11,7 +11,7 @@ params { // "." is not allowed in the tag name te_libraries = [ - ["donghong", ".test/donghong.TElib.fa.gz"] + ["donghong", "/workspace/pangene/test_data/donghong.TElib.fa.gz"] ] // Pattern: [ [tag, fasta(.gz) ] ] // Optional Set to null if libraries are not available. 
@@ -25,7 +25,7 @@ params { repeatmasker_save_outputs = true - samplesheet = "./.test/samplesheet.csv" + samplesheet = "/workspace/pangene/test_data/samplesheet.csv" // Optional: Set to null if not available skip_fastqc = false @@ -45,8 +45,8 @@ params { star_save_outputs = true external_protein_fastas = [ - ".test/ext_prots/Viridiplantae.fa.gz", - ".test/ext_prots/RU01.20221115150135.pep.fasta" + "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.pep.fasta" ] // Optional: Set to null if not available @@ -54,12 +54,12 @@ params { liftoff_xref_annotations = [ [ - ".test/liftoff/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", - ".test/liftoff/RU01.20221115150135.gff3" + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", + "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" ], [ - ".test/liftoff/TAIR10_chr_all.fas", - ".test/liftoff/TAIR10_GFF3_genes_transposons.fixed.gff3" + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", + "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" ] ] // Format: [ [ fasta(.gz), gff3(.gz) ] ] diff --git a/pangene_local_stub.sh b/pangene_local_stub.sh index 0a18650..7101009 100755 --- a/pangene_local_stub.sh +++ b/pangene_local_stub.sh @@ -5,6 +5,4 @@ nextflow \ -profile local,docker \ -resume \ -stub \ - --max_cpus=1 \ - --max_memory=1.GB \ - --samplesheet="./.test/samplesheet_small.csv" \ No newline at end of file + --params-file conf/local_stub_params.json \ No newline at end of file diff --git a/subworkflows/local/extract_samples.nf b/subworkflows/local/extract_samples.nf 
index 957f218..75437da 100644 --- a/subworkflows/local/extract_samples.nf +++ b/subworkflows/local/extract_samples.nf @@ -67,7 +67,5 @@ def create_fastq_channel(LinkedHashMap row, sheetPath) { fastq_meta = [ meta, [ file(fq1), file(fq2) ] ] } - println fastq_meta - return fastq_meta } \ No newline at end of file From e213bd383aa73e6e37b22a452e9f30cb40868398 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 15:31:07 +1300 Subject: [PATCH 47/59] Fixed local script typo --- pangene_local_stub.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangene_local_stub.sh b/pangene_local_stub.sh index 7101009..01c93e4 100755 --- a/pangene_local_stub.sh +++ b/pangene_local_stub.sh @@ -5,4 +5,4 @@ nextflow \ -profile local,docker \ -resume \ -stub \ - --params-file conf/local_stub_params.json \ No newline at end of file + -params-file conf/local_stub_params.json \ No newline at end of file From c961ab03b30bb77936fae6fa6f2bc839b8648856 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 17:23:31 +1300 Subject: [PATCH 48/59] Fixed apptainer scope bug in base config --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 4467c0b..2a6c2fe 100644 --- a/conf/base.config +++ b/conf/base.config @@ -2,10 +2,10 @@ profiles { pfr { process { executor = 'slurm' + } - apptainer { - envWhitelist= 'APPTAINER_BINDPATH,APPTAINER_BIND' - } + apptainer { + envWhitelist = 'APPTAINER_BINDPATH,APPTAINER_BIND' } } From 260c7068162bfc1bd5b4b4f0d6c6586d022074ed Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Mon, 8 Jan 2024 17:47:13 +1300 Subject: [PATCH 49/59] Updated README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e5251f2..8f277ea 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ flowchart TD Configure the pipeline by modifying `nextflow.config` and submit to SLURM for execution. 
```bash -sbatch ./pan_gene_pfr.sh +sbatch ./pangene_pfr.sh ``` ## Third-party Sources From 565e7d76c6b99eee0efadc53ba631c99f05c8d99 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 9 Jan 2024 11:39:32 +1300 Subject: [PATCH 50/59] Readded -exclude_partial and now using teambraker container --- conf/modules.config | 1 + modules/kherronism/braker3/main.nf | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 474a129..392583a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -135,6 +135,7 @@ if(params.liftoff_xref_annotations) { withName: LIFTOFF { ext.args = ' ' ext.args = [ + '-exclude_partial', '-copies', '-polish', "-a $params.liftoff_coverage", diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index 14fc08c..31c1b33 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -2,10 +2,7 @@ process BRAKER3 { tag "${meta.id}" label 'process_high' - conda "bioconda::braker3=3.0.6" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/braker3%3A3.0.6--hdfd78af_0': - 'biocontainers/braker3:3.0.6--hdfd78af_0' }" + container "docker://teambraker/braker3:v1.0.6" input: tuple val(meta), path(fasta) From 47b7c40a23a626bf071432b057899995df3ef723 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 9 Jan 2024 21:06:21 +1300 Subject: [PATCH 51/59] Added config for test data and quay.io container for braker3 --- README.md | 2 +- assets/rrna-db-test.txt | 1 + conf/local_stub_params.json | 29 ---------------------------- conf/test_params.json | 31 ++++++++++++++++++++++++++++++ modules/kherronism/braker3/main.nf | 2 +- pangene_local | 16 +++++++++++++++ pangene_local_stub.sh | 8 -------- pangene_pfr.sh => pangene_pfr | 0 8 files changed, 50 insertions(+), 39 deletions(-) create mode 100644 assets/rrna-db-test.txt delete mode 100644 conf/local_stub_params.json create mode 100644 conf/test_params.json create mode 100755 pangene_local delete mode 100755 pangene_local_stub.sh rename pangene_pfr.sh => pangene_pfr (100%) diff --git a/README.md b/README.md index 8f277ea..b1d7966 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ flowchart TD Configure the pipeline by modifying `nextflow.config` and submit to SLURM for execution. 
```bash -sbatch ./pangene_pfr.sh +sbatch ./pangene_pfr ``` ## Third-party Sources diff --git a/assets/rrna-db-test.txt b/assets/rrna-db-test.txt new file mode 100644 index 0000000..16504bb --- /dev/null +++ b/assets/rrna-db-test.txt @@ -0,0 +1 @@ +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta \ No newline at end of file diff --git a/conf/local_stub_params.json b/conf/local_stub_params.json deleted file mode 100644 index df155d5..0000000 --- a/conf/local_stub_params.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "target_assemblies": [ - ["red5_v2p1", ".test/red5_v2p1_chr1.fasta"], - ["donghong", ".test/donghong.chr1.fsa.gz"] - ], - - "te_libraries": [["donghong", ".test/donghong.TElib.fa.gz"]], - - "samplesheet": "./.test/samplesheet_small.csv", - - "external_protein_fastas": [ - ".test/ext_prots/Viridiplantae.fa.gz", - ".test/ext_prots/RU01.20221115150135.pep.fasta" - ], - - "liftoff_xref_annotations": [ - [ - ".test/liftoff/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", - ".test/liftoff/RU01.20221115150135.gff3" - ], - [ - ".test/liftoff/TAIR10_chr_all.fas", - ".test/liftoff/TAIR10_GFF3_genes_transposons.fixed.gff3" - ] - ], - - "max_cpus": 1, - "max_memory": "1.GB" -} diff --git a/conf/test_params.json b/conf/test_params.json new file mode 100644 index 0000000..30c9c9c --- /dev/null +++ b/conf/test_params.json @@ -0,0 +1,31 @@ +{ + "target_assemblies": [ + ["red5_v2p1", ".test/target/red5_v2p1_chr1_600k.fasta.gz"], + ["donghong", ".test/target/donghong_chr1_600k.fsa.gz"] + ], + + "te_libraries": [["donghong", ".test/te_lib/donghong.TElib.fa.gz"]], + + "samplesheet": ".test/samplesheet/samplesheet.csv", + + "ribo_database_manifest":"assets/rrna-db-test.txt", + + "external_protein_fastas": [ + ".test/ext_prot/RU01_20221115150135_chr1_600k.pep.fasta.gz", + ".test/ext_prot/RU01_20221115150135_chr2_600k.pep.fasta.gz" + ], + + "liftoff_xref_annotations": [ + [ + ".test/liftoff/Russell_V2a_chr1_600k.fsa.gz", 
+ ".test/liftoff/Russell_V2a_chr1_600k.gff3.gz" + ], + [ + ".test/liftoff/TAIR10_chr1_600k.fas.gz", + ".test/liftoff/TAIR10_chr1_600k.gff3.gz" + ] + ], + + "max_cpus": 2, + "max_memory": "3.GB" +} diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index 31c1b33..aab3eae 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -2,7 +2,7 @@ process BRAKER3 { tag "${meta.id}" label 'process_high' - container "docker://teambraker/braker3:v1.0.6" + container "gallvp/teambraker_braker3:v1.0.6" input: tuple val(meta), path(fasta) diff --git a/pangene_local b/pangene_local new file mode 100755 index 0000000..8e8e692 --- /dev/null +++ b/pangene_local @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +[[ $1 == '-stub' ]] \ + && stub='-stub' \ + || stub='' + +[[ $1 == '-stub' ]] \ + && echo 'Executing with -stub' \ + || echo 'Executing without -stub' + +nextflow \ + main.nf \ + -profile local,docker \ + -resume \ + $stub \ + -params-file conf/test_params.json \ No newline at end of file diff --git a/pangene_local_stub.sh b/pangene_local_stub.sh deleted file mode 100755 index 01c93e4..0000000 --- a/pangene_local_stub.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -nextflow \ - main.nf \ - -profile local,docker \ - -resume \ - -stub \ - -params-file conf/local_stub_params.json \ No newline at end of file diff --git a/pangene_pfr.sh b/pangene_pfr similarity index 100% rename from pangene_pfr.sh rename to pangene_pfr From 664fba178e10e6d078964651f4b904851c4eafb3 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 10 Jan 2024 11:06:52 +1300 Subject: [PATCH 52/59] Now using repeatmodeler by default --- README.md | 11 ++- conf/modules.config | 13 ++- modules.json | 10 ++ modules/local/validate_params.nf | 8 ++ .../builddatabase/environment.yml | 9 ++ .../pfr/repeatmodeler/builddatabase/main.nf | 50 ++++++++++ .../pfr/repeatmodeler/builddatabase/meta.yml | 44 +++++++++ .../builddatabase/tests/main.nf.test | 60 
++++++++++++ .../builddatabase/tests/main.nf.test.snap | 16 ++++ .../builddatabase/tests/tags.yml | 2 + .../repeatmodeler/environment.yml | 9 ++ .../pfr/repeatmodeler/repeatmodeler/main.nf | 54 +++++++++++ .../pfr/repeatmodeler/repeatmodeler/meta.yml | 52 +++++++++++ .../repeatmodeler/tests/main.nf.test | 92 +++++++++++++++++++ .../repeatmodeler/tests/main.nf.test.snap | 46 ++++++++++ .../repeatmodeler/tests/tags.yml | 2 + nextflow.config | 6 +- subworkflows/local/prepare_assembly.nf | 29 +++++- workflows/pangene.nf | 3 +- 19 files changed, 507 insertions(+), 9 deletions(-) create mode 100644 modules/pfr/repeatmodeler/builddatabase/environment.yml create mode 100644 modules/pfr/repeatmodeler/builddatabase/main.nf create mode 100644 modules/pfr/repeatmodeler/builddatabase/meta.yml create mode 100644 modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test create mode 100644 modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test.snap create mode 100644 modules/pfr/repeatmodeler/builddatabase/tests/tags.yml create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/environment.yml create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/main.nf create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/meta.yml create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test.snap create mode 100644 modules/pfr/repeatmodeler/repeatmodeler/tests/tags.yml diff --git a/README.md b/README.md index b1d7966..5efb30f 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,21 @@ flowchart TD FASTA_VALIDATE fasta_file_from_fasta_validate EDTA + REPEATMODELER + te_lib_absent_node REPEATMASKER end TARGET_ASSEMBLIES(["[target_assemblies]"]) TE_LIBRARIES(["[te_libs]"]) TARGET_ASSEMBLIES --> FASTA_VALIDATE - FASTA_VALIDATE --> |Fasta|fasta_file_from_fasta_validate(( )) - fasta_file_from_fasta_validate --> EDTA + FASTA_VALIDATE --- |Fasta|fasta_file_from_fasta_validate(( )) + 
fasta_file_from_fasta_validate --> |or|EDTA + fasta_file_from_fasta_validate --> |default|REPEATMODELER + REPEATMODELER --- te_lib_absent_node(( )) + EDTA --- te_lib_absent_node TE_LIBRARIES --> REPEATMASKER - EDTA --> |te_lib absent|REPEATMASKER + te_lib_absent_node --> REPEATMASKER subgraph Samplesheet [ ] SAMPLESHEET diff --git a/conf/modules.config b/conf/modules.config index 392583a..d2149ce 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,7 +10,18 @@ process { path: { "${params.outdir}/edta" }, mode: "copy", pattern: '*.EDTA.TElib.fa', - enabled: params.edta_save_te_lib + enabled: params.save_annotated_te_lib + ] + } + + withName: 'REPEATMODELER_REPEATMODELER' { + ext.args = '-LTRStruct' + + publishDir = [ + path: { "${params.outdir}/repeatmodeler" }, + mode: "copy", + pattern: '*.fa', + enabled: params.save_annotated_te_lib ] } diff --git a/modules.json b/modules.json index cdb8a24..a645b68 100644 --- a/modules.json +++ b/modules.json @@ -29,6 +29,16 @@ "branch": "main", "git_sha": "444b35f4e6285115f84d2bfce49fc0e6d8a2754e", "installed_by": ["modules"] + }, + "repeatmodeler/builddatabase": { + "branch": "main", + "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "installed_by": ["modules"] + }, + "repeatmodeler/repeatmodeler": { + "branch": "main", + "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "installed_by": ["modules"] } } }, diff --git a/modules/local/validate_params.nf b/modules/local/validate_params.nf index 5eb6207..f6ce18a 100644 --- a/modules/local/validate_params.nf +++ b/modules/local/validate_params.nf @@ -1,6 +1,14 @@ def validateParams(params) { validateFastaTags(params) + if (!params['repeat_annotator']) { + error "Error: repeat_annotator must be either 'repeatmodeler' or 'edta'" + } + + if ( !(params['repeat_annotator'] in ['repeatmodeler', 'edta']) ) { + error "Error: repeat_annotator must be either 'repeatmodeler' or 'edta'" + } + validateTETags(params) validateTEFastaCorrespondence(params) diff --git 
a/modules/pfr/repeatmodeler/builddatabase/environment.yml b/modules/pfr/repeatmodeler/builddatabase/environment.yml new file mode 100644 index 0000000..ecc282e --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "repeatmodeler_builddatabase" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::repeatmodeler=2.0.5" diff --git a/modules/pfr/repeatmodeler/builddatabase/main.nf b/modules/pfr/repeatmodeler/builddatabase/main.nf new file mode 100644 index 0000000..486e25d --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/main.nf @@ -0,0 +1,50 @@ +process REPEATMODELER_BUILDDATABASE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/repeatmodeler:2.0.5--pl5321hdfd78af_0': + 'biocontainers/repeatmodeler:2.0.5--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${prefix}.*") , emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + """ + BuildDatabase \\ + -name $prefix \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.nhr + touch ${prefix}.nin + touch ${prefix}.njs + touch ${prefix}.nnd + touch ${prefix}.nni + touch ${prefix}.nog + touch ${prefix}.nsq + touch ${prefix}.translation + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + repeatmodeler: \$(RepeatModeler | grep 
'/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + END_VERSIONS + """ +} diff --git a/modules/pfr/repeatmodeler/builddatabase/meta.yml b/modules/pfr/repeatmodeler/builddatabase/meta.yml new file mode 100644 index 0000000..d3aa931 --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/meta.yml @@ -0,0 +1,44 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "repeatmodeler_builddatabase" +description: Create a database for RepeatModeler +keywords: + - genomics + - fasta + - repeat +tools: + - "repeatmodeler": + description: "RepeatModeler is a de-novo repeat family identification and modeling package." + homepage: "https://github.com/Dfam-consortium/RepeatModeler" + documentation: "https://github.com/Dfam-consortium/RepeatModeler" + tool_dev_url: "https://github.com/Dfam-consortium/RepeatModeler" + licence: ["Open Software License v2.1"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta,fsa,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - db: + type: file + description: Database files for repeatmodeler + pattern: "`${prefix}.*`" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test new file mode 100644 index 0000000..616f88c --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process REPEATMODELER_BUILDDATABASE" + script "../main.nf" + process "REPEATMODELER_BUILDDATABASE" + + tag "modules" + tag "modules_nfcore" + tag "repeatmodeler" + tag "repeatmodeler/builddatabase" + + test("sarscov2-genome_fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert snapshot(process.out.db[0][1].collect { file(it).name }.sort().toString()).match("for-stub-match") } + ) + } + + } + + test("sarscov2-genome_fasta-stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert snapshot(process.out.db[0][1].collect { file(it).name }.sort().toString()).match("for-stub-match") } + ) + } + + } + +} diff --git a/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test.snap b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test.snap new file mode 100644 index 0000000..cda327e --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test.snap @@ -0,0 +1,16 
@@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,7944637266bc3e2726899eaad5e46c87" + ] + ], + "timestamp": "2024-01-09T15:14:48.807063" + }, + "for-stub-match": { + "content": [ + "[test.nhr, test.nin, test.njs, test.nnd, test.nni, test.nog, test.nsq, test.translation]" + ], + "timestamp": "2024-01-09T15:14:48.81702" + } +} \ No newline at end of file diff --git a/modules/pfr/repeatmodeler/builddatabase/tests/tags.yml b/modules/pfr/repeatmodeler/builddatabase/tests/tags.yml new file mode 100644 index 0000000..426540d --- /dev/null +++ b/modules/pfr/repeatmodeler/builddatabase/tests/tags.yml @@ -0,0 +1,2 @@ +repeatmodeler/builddatabase: + - "modules/pfr/repeatmodeler/builddatabase/**" diff --git a/modules/pfr/repeatmodeler/repeatmodeler/environment.yml b/modules/pfr/repeatmodeler/repeatmodeler/environment.yml new file mode 100644 index 0000000..2422071 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "repeatmodeler_repeatmodeler" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::repeatmodeler=2.0.5" diff --git a/modules/pfr/repeatmodeler/repeatmodeler/main.nf b/modules/pfr/repeatmodeler/repeatmodeler/main.nf new file mode 100644 index 0000000..34df322 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/main.nf @@ -0,0 +1,54 @@ +process REPEATMODELER_REPEATMODELER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/repeatmodeler:2.0.5--pl5321hdfd78af_0': + 'biocontainers/repeatmodeler:2.0.5--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(db) + + output: + tuple val(meta), path("*.fa") , emit: fasta + tuple val(meta), path("*.stk") , emit: stk + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def db_name = file(db[0]).getBaseName() + """ + RepeatModeler \\ + -database $db_name \\ + $args \\ + -threads $task.cpus + + mv ${db_name}-families.fa ${prefix}.fa + mv ${db_name}-families.stk ${prefix}.stk + mv ${db_name}-rmod.log ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fa + touch ${prefix}.stk + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + END_VERSIONS + """ +} diff --git a/modules/pfr/repeatmodeler/repeatmodeler/meta.yml b/modules/pfr/repeatmodeler/repeatmodeler/meta.yml new file mode 100644 index 0000000..29bb795 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/meta.yml @@ -0,0 +1,52 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "repeatmodeler_repeatmodeler" +description: Performs de novo transposable element (TE) family identification with RepeatModeler +keywords: + - genomics + - fasta + - repeat + - transposable element +tools: + - "repeatmodeler": + description: "RepeatModeler is a de-novo repeat family identification and modeling package." 
+ homepage: "https://github.com/Dfam-consortium/RepeatModeler" + documentation: "https://github.com/Dfam-consortium/RepeatModeler" + tool_dev_url: "https://github.com/Dfam-consortium/RepeatModeler" + licence: ["Open Software License v2.1"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - db: + type: file + description: RepeatModeler database files generated with REPEATMODELER_BUILDDATABASE + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fasta: + type: file + description: Consensus repeat sequences + pattern: "*.fa" + - stk: + type: file + description: Seed alignments + pattern: "*.stk" + - log: + type: file + description: A summarized log of the run + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test new file mode 100644 index 0000000..78b7957 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test @@ -0,0 +1,92 @@ +nextflow_process { + + name "Test Process REPEATMODELER_REPEATMODELER" + script "../main.nf" + process "REPEATMODELER_REPEATMODELER" + + tag "modules" + tag "modules_nfcore" + tag "repeatmodeler" + tag "repeatmodeler/repeatmodeler" + tag "repeatmodeler/builddatabase" + + test("homo_sapiens-genome_fasta") { + + setup { + run("REPEATMODELER_BUILDDATABASE") { + script "../../../../pfr/repeatmodeler/builddatabase" + + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = REPEATMODELER_BUILDDATABASE.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + 
{ assert snapshot(process.out.fasta).match("fasta") }, + { assert snapshot(process.out.stk).match("stk") }, + { assert file(process.out.log[0][1]).text.contains('1 families discovered.') }, + { assert snapshot(process.out.versions).match("versions") }, + { + assert snapshot( + ( + process.out.fasta.collect { file(it[1]).getName() } + + process.out.stk.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("for-stub-match") + } + ) + } + + } + + test("homo_sapiens-genome_fasta-stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { + assert snapshot( + ( + process.out.fasta.collect { file(it[1]).getName() } + + process.out.stk.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("for-stub-match") + } + ) + } + + } + +} diff --git a/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test.snap b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test.snap new file mode 100644 index 0000000..051dd60 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test.snap @@ -0,0 +1,46 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,1bb6846ecf1304c262eaef4d3de60cf9" + ] + ], + "timestamp": "2024-01-09T15:06:55.753492" + }, + "stk": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.stk:md5,acd01ad35763c11315e2297a4f051d57" + ] + ] + ], + "timestamp": "2024-01-09T15:06:55.740963" + }, + "for-stub-match": { + "content": [ + [ + "test.fa", + "test.log", + "test.stk" + ] + ], + "timestamp": "2024-01-09T15:06:55.759971" + }, + "fasta": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.fa:md5,e25326771341204e1f8054d9529411e5" + ] + ] + ], + "timestamp": 
"2024-01-09T15:06:55.737658" + } +} \ No newline at end of file diff --git a/modules/pfr/repeatmodeler/repeatmodeler/tests/tags.yml b/modules/pfr/repeatmodeler/repeatmodeler/tests/tags.yml new file mode 100644 index 0000000..648cc93 --- /dev/null +++ b/modules/pfr/repeatmodeler/repeatmodeler/tests/tags.yml @@ -0,0 +1,2 @@ +repeatmodeler/repeatmodeler: + - "modules/pfr/repeatmodeler/repeatmodeler/**" diff --git a/nextflow.config b/nextflow.config index e0d658b..cbc0a2e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -20,8 +20,12 @@ params { // Not all target_assemblies need to have an associated (by tag) TE library. // When the TE lib is not available for a traget assembly, EDTA is used to create one. + repeat_annotator = 'repeatmodeler' + // 'repeatmodeler' or 'edta' + + save_annotated_te_lib = true + edta_is_sensitive = false - edta_save_te_lib = true repeatmasker_save_outputs = true diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index ed32afb..9fc6244 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -1,6 +1,8 @@ include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../../modules/nf-core/gunzip' include { GUNZIP as GUNZIP_TE_LIBRARY } from '../../modules/nf-core/gunzip' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator' +include { REPEATMODELER_BUILDDATABASE } from '../../modules/pfr/repeatmodeler/builddatabase' +include { REPEATMODELER_REPEATMODELER } from '../../modules/pfr/repeatmodeler/repeatmodeler' include { REPEATMASKER } from '../../modules/kherronism/repeatmasker' include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' @@ -10,6 +12,7 @@ workflow PREPARE_ASSEMBLY { take: target_assembly // channel: [ meta, fasta ] te_library // channel: [ meta, fasta ] + repeat_annotator // val(String), 'repeatmodeler' or 'edta' main: ch_versions = Channel.empty() @@ -59,7 +62,7 @@ workflow PREPARE_ASSEMBLY { ch_versions = 
ch_versions.mix(GUNZIP_TE_LIBRARY.out.versions.first()) // SUBWORKFLOW: FASTA_EDTA_LAI - ch_edta_inputs = ch_validated_assembly + ch_annotator_inputs = ch_validated_assembly | join( ch_gunzip_te_library, remainder: true ) @@ -67,19 +70,39 @@ workflow PREPARE_ASSEMBLY { teLib == null } | map { meta, assembly, teLib -> [meta, assembly] } + + ch_edta_inputs = repeat_annotator != 'edta' + ? Channel.empty() + : ch_annotator_inputs FASTA_EDTA_LAI( ch_edta_inputs, [], true // Skip LAI ) + + ch_versions = ch_versions.mix(FASTA_EDTA_LAI.out.versions.first()) + + // MODULE: REPEATMODELER_BUILDDATABASE + ch_repeatmodeler_inputs = repeat_annotator != 'repeatmodeler' + ? Channel.empty() + : ch_annotator_inputs + + REPEATMODELER_BUILDDATABASE ( ch_repeatmodeler_inputs ) + + ch_versions = ch_versions.mix(REPEATMODELER_BUILDDATABASE.out.versions.first()) + + // MODULE: REPEATMODELER_REPEATMODELER + REPEATMODELER_REPEATMODELER ( REPEATMODELER_BUILDDATABASE.out.db ) ch_assembly_and_te_lib = ch_validated_assembly | join( - FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) + repeat_annotator == 'edta' + ? 
FASTA_EDTA_LAI.out.te_lib_fasta.mix(ch_gunzip_te_library) + : REPEATMODELER_REPEATMODELER.out.fasta.mix(ch_gunzip_te_library) ) - ch_versions = ch_versions.mix(FASTA_EDTA_LAI.out.versions.first()) + ch_versions = ch_versions.mix(REPEATMODELER_REPEATMODELER.out.versions.first()) // MODULE: REPEATMASKER REPEATMASKER( diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 7cc8c4a..72b9fd6 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -66,7 +66,8 @@ workflow PANGENE { // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( ch_target_assembly, - ch_te_library + ch_te_library, + params.repeat_annotator ) ch_valid_target_assembly = PREPARE_ASSEMBLY.out.target_assemby From dea4bb5af54cae9adca1852d472a29e9b0252e54 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 10 Jan 2024 12:54:37 +1300 Subject: [PATCH 53/59] BRAKER3 now runnable with test data --- conf/test_params.json | 4 +++- modules/kherronism/braker3/main.nf | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/conf/test_params.json b/conf/test_params.json index 30c9c9c..fda29e2 100644 --- a/conf/test_params.json +++ b/conf/test_params.json @@ -1,6 +1,6 @@ { "target_assemblies": [ - ["red5_v2p1", ".test/target/red5_v2p1_chr1_600k.fasta.gz"], + ["red5_v2p1", ".test/target/red5_v2p1_chr1_1200k.fasta.gz"], ["donghong", ".test/target/donghong_chr1_600k.fsa.gz"] ], @@ -15,6 +15,8 @@ ".test/ext_prot/RU01_20221115150135_chr2_600k.pep.fasta.gz" ], + "braker_extra_args": "--testMode --species=arabidopsis --useexisting", + "liftoff_xref_annotations": [ [ ".test/liftoff/Russell_V2a_chr1_600k.fsa.gz", diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index aab3eae..ae0ec81 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -26,20 +26,25 @@ process BRAKER3 { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args 
= task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" - def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' - def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' - def bam = bam ? "--bam=${bam}" : '' - def proteins = proteins ? "--prot_seq=${proteins}" : '' - def hints = hintsfile ? "--hints=${hintsfile}" : '' + def test_mode = args.contains('--testMode') // Custom flag for test data + def args_fmt = test_mode ? args.replace('--testMode', '') : args + + def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' + def rna_dirs = rnaseq_sets_dirs ? "--rnaseq_sets_dirs=${rnaseq_sets_dirs}" : '' + def bam = bam && !test_mode ? "--bam=${bam}" : '' + def proteins = proteins && !test_mode ? "--prot_seq=${proteins}" : '' + def hints = hintsfile ? "--hints=${hintsfile}" : '' + + def new_species = args.contains('--species') ? '' : "--species new_species" """ cp -r /usr/share/augustus/config augustus_config braker.pl \\ --genome ${fasta} \\ - --species ${prefix} \\ + ${new_species} \\ --workingdir ${prefix} \\ --AUGUSTUS_CONFIG_PATH "\$(pwd)/augustus_config" \\ --threads ${task.cpus} \\ @@ -48,7 +53,7 @@ process BRAKER3 { ${bam} \\ ${proteins} \\ ${hints} \\ - ${args} + ${args_fmt} cat <<-END_VERSIONS > versions.yml "${task.process}": From 0535b1a32bc9dd3dba7f96896a3c25e4ef6ffe18 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Wed, 10 Jan 2024 15:28:05 +1300 Subject: [PATCH 54/59] Added editor config --- .editorconfig | 15 +++++++++++++++ .gitignore | 2 +- cleanNXF.sh | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..2951ad8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 4 +indent_style = space + +[*.{md,yml,yaml,cff}] +indent_size = 2 + 
+[*.nf.test] +insert_final_newline = false diff --git a/.gitignore b/.gitignore index 8f984b0..93035ae 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ testing* *.stderr .literature -.test \ No newline at end of file +.test diff --git a/cleanNXF.sh b/cleanNXF.sh index c566dbf..8c64a3e 100755 --- a/cleanNXF.sh +++ b/cleanNXF.sh @@ -8,4 +8,4 @@ for i in $(ls work | grep -v "conda"); do rm -rf "work/$i" done -echo "Cleaned work..." \ No newline at end of file +echo "Cleaned work..." From 457a64303cf772998b3888592debf9af3400f512 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 11 Jan 2024 10:31:47 +1300 Subject: [PATCH 55/59] Disabled sortmerna by default added option to save cat bam --- conf/modules.config | 13 +++++++++++-- conf/test_params.json | 1 + nextflow.config | 19 +++++++++++-------- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d2149ce..fc489bf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -30,7 +30,7 @@ process { "-no_is", "-xsmall", ].join(' ').trim() - + publishDir = [ path: { "${params.outdir}/repeatmasker" }, mode: "copy", @@ -127,6 +127,15 @@ process { ] } + withName: '.*:ALIGN_RNASEQ:SAMTOOLS_CAT' { + publishDir = [ + path: { "${params.outdir}/star/cat_bam" }, + mode: "copy", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_cat_bam + ] + } + withName: BRAKER3 { ext.args = [ "--gff3", @@ -174,4 +183,4 @@ process { enabled: true ] } -} \ No newline at end of file +} diff --git a/conf/test_params.json b/conf/test_params.json index fda29e2..fef1871 100644 --- a/conf/test_params.json +++ b/conf/test_params.json @@ -8,6 +8,7 @@ "samplesheet": ".test/samplesheet/samplesheet.csv", + "remove_ribo_rna": true, "ribo_database_manifest":"assets/rrna-db-test.txt", "external_protein_fastas": [ diff --git a/nextflow.config b/nextflow.config index cbc0a2e..409da80 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,7 +9,7 @@ params { // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; // Any name with alphanumeric characters including "_". // "." is not allowed in the tag name - + te_libraries = [ ["donghong", "/workspace/pangene/test_data/donghong.TElib.fa.gz"] ] @@ -19,16 +19,16 @@ params { // Each TE library should have an associated (by tag) assembly in target_assemblies. // Not all target_assemblies need to have an associated (by tag) TE library. // When the TE lib is not available for a traget assembly, EDTA is used to create one. 
- + repeat_annotator = 'repeatmodeler' // 'repeatmodeler' or 'edta' - + save_annotated_te_lib = true - + edta_is_sensitive = false - + repeatmasker_save_outputs = true - + samplesheet = "/workspace/pangene/test_data/samplesheet.csv" // Optional: Set to null if not available @@ -40,13 +40,16 @@ params { save_trimmed = true // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - remove_ribo_rna = true + remove_ribo_rna = false save_non_ribo_reads = true ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" star_max_intron_length = 16000 star_align_extra_args = "" star_save_outputs = true + save_cat_bam = true + // A single BAM is created for each assembly from all the RNAseq samples, if there + // are more than one external_protein_fastas = [ "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", @@ -81,4 +84,4 @@ params { includeConfig './conf/manifest.config' includeConfig './conf/modules.config' -includeConfig './conf/reporting_defaults.config' \ No newline at end of file +includeConfig './conf/reporting_defaults.config' From 784bb54b01a36cc89d62e672074389c3517d5a67 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Thu, 11 Jan 2024 11:21:45 +1300 Subject: [PATCH 56/59] Added pre-commit --- .gitignore | 10 +- .pre-commit-config.yaml | 5 + .prettierignore | 19 ++ .prettierrc.yml | 1 + assets/rrna-db-defaults.txt | 2 +- assets/rrna-db-test.txt | 2 +- bin/make-samplesheet.py | 2 +- conf/base.config | 2 +- conf/manifest.config | 2 +- conf/reporting_defaults.config | 2 +- conf/test_params.json | 42 ++- main.nf | 2 +- modules.json | 296 +++++++++--------- .../dumpsoftwareversions/environment.yml | 2 +- .../custom/dumpsoftwareversions/main.nf | 4 +- .../tests/main.nf.test.snap | 12 +- .../umitools/extract/tests/nextflow.config | 2 +- pangene_local | 8 +- pangene_pfr | 2 +- subworkflows/local/align_rnaseq.nf | 10 +- subworkflows/local/extract_samples.nf | 6 +- subworkflows/local/fasta_liftoff.nf | 12 +- 
subworkflows/local/prepare_assembly.nf | 12 +- subworkflows/local/prepare_ext_prots.nf | 12 +- subworkflows/local/preprocess_rnaseq.nf | 12 +- workflows/pangene.nf | 8 +- 26 files changed, 257 insertions(+), 232 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 .prettierignore create mode 100644 .prettierrc.yml diff --git a/.gitignore b/.gitignore index 93035ae..62d31c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,16 @@ .nextflow* work/ -data/ results/ .DS_Store -testing/ -testing* +*.code-workspace +.screenrc +.*.sw? +__pycache__ +*.pyo *.pyc *.stdout *.stderr .literature -.test +pangene-test/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..fc52181 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.1.0" + hooks: + - id: prettier diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..24a3687 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,19 @@ +includes/Maven_Pro/ + +# gitignore +.nextflow* +work/ +results/ +.DS_Store +*.code-workspace +.screenrc +.*.sw? 
+__pycache__ +*.pyo +*.pyc + +*.stdout +*.stderr + +.literature +pangene-test/ diff --git a/.prettierrc.yml b/.prettierrc.yml new file mode 100644 index 0000000..c81f9a7 --- /dev/null +++ b/.prettierrc.yml @@ -0,0 +1 @@ +printWidth: 120 diff --git a/assets/rrna-db-defaults.txt b/assets/rrna-db-defaults.txt index e2bc4e6..4223356 100644 --- a/assets/rrna-db-defaults.txt +++ b/assets/rrna-db-defaults.txt @@ -5,4 +5,4 @@ https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/s https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-16s-id90.fasta https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-23s-id98.fasta https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-18s-id95.fasta -https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta \ No newline at end of file +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta diff --git a/assets/rrna-db-test.txt b/assets/rrna-db-test.txt index 16504bb..20116f9 100644 --- a/assets/rrna-db-test.txt +++ b/assets/rrna-db-test.txt @@ -1 +1 @@ -https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta \ No newline at end of file +https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta diff --git a/bin/make-samplesheet.py b/bin/make-samplesheet.py index bc39f55..b4ad0b7 100755 --- a/bin/make-samplesheet.py +++ b/bin/make-samplesheet.py @@ -282,4 +282,4 @@ def main(): make_samplesheet_from_command(input_path_or_command, exp_name) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/conf/base.config b/conf/base.config index 2a6c2fe..5f02f17 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,7 +8,7 @@ profiles { envWhitelist = 'APPTAINER_BINDPATH,APPTAINER_BIND' } } - + local { 
process { executor = 'local' diff --git a/conf/manifest.config b/conf/manifest.config index 706052c..fd7c8f6 100644 --- a/conf/manifest.config +++ b/conf/manifest.config @@ -7,4 +7,4 @@ manifest { nextflowVersion = '!>=23.04.4' version = '0.1' doi = '' -} \ No newline at end of file +} diff --git a/conf/reporting_defaults.config b/conf/reporting_defaults.config index 5df9469..178522d 100644 --- a/conf/reporting_defaults.config +++ b/conf/reporting_defaults.config @@ -10,4 +10,4 @@ report { trace { enabled = true file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" -} \ No newline at end of file +} diff --git a/conf/test_params.json b/conf/test_params.json index fef1871..0dc25f8 100644 --- a/conf/test_params.json +++ b/conf/test_params.json @@ -1,34 +1,28 @@ { - "target_assemblies": [ - ["red5_v2p1", ".test/target/red5_v2p1_chr1_1200k.fasta.gz"], - ["donghong", ".test/target/donghong_chr1_600k.fsa.gz"] - ], + "target_assemblies": [ + ["red5_v2p1", "pangene-test/target/red5_v2p1_chr1_1200k.fasta.gz"], + ["donghong", "pangene-test/target/donghong_chr1_600k.fsa.gz"] + ], - "te_libraries": [["donghong", ".test/te_lib/donghong.TElib.fa.gz"]], + "te_libraries": [["donghong", "pangene-test/te_lib/donghong.TElib.fa.gz"]], - "samplesheet": ".test/samplesheet/samplesheet.csv", + "samplesheet": "pangene-test/samplesheet/samplesheet.csv", - "remove_ribo_rna": true, - "ribo_database_manifest":"assets/rrna-db-test.txt", + "remove_ribo_rna": true, + "ribo_database_manifest": "assets/rrna-db-test.txt", - "external_protein_fastas": [ - ".test/ext_prot/RU01_20221115150135_chr1_600k.pep.fasta.gz", - ".test/ext_prot/RU01_20221115150135_chr2_600k.pep.fasta.gz" - ], + "external_protein_fastas": [ + "pangene-test/ext_prot/RU01_20221115150135_chr1_600k.pep.fasta.gz", + "pangene-test/ext_prot/RU01_20221115150135_chr2_600k.pep.fasta.gz" + ], - "braker_extra_args": "--testMode --species=arabidopsis --useexisting", + "braker_extra_args": "--testMode 
--species=arabidopsis --useexisting", - "liftoff_xref_annotations": [ - [ - ".test/liftoff/Russell_V2a_chr1_600k.fsa.gz", - ".test/liftoff/Russell_V2a_chr1_600k.gff3.gz" + "liftoff_xref_annotations": [ + ["pangene-test/liftoff/Russell_V2a_chr1_600k.fsa.gz", "pangene-test/liftoff/Russell_V2a_chr1_600k.gff3.gz"], + ["pangene-test/liftoff/TAIR10_chr1_600k.fas.gz", "pangene-test/liftoff/TAIR10_chr1_600k.gff3.gz"] ], - [ - ".test/liftoff/TAIR10_chr1_600k.fas.gz", - ".test/liftoff/TAIR10_chr1_600k.gff3.gz" - ] - ], - "max_cpus": 2, - "max_memory": "3.GB" + "max_cpus": 2, + "max_memory": "3.GB" } diff --git a/main.nf b/main.nf index 7fe5247..9ed32f7 100755 --- a/main.nf +++ b/main.nf @@ -10,4 +10,4 @@ workflow { workflow PFR_PANGENE { PANGENE() -} \ No newline at end of file +} diff --git a/modules.json b/modules.json index a645b68..4e8f0a9 100644 --- a/modules.json +++ b/modules.json @@ -1,152 +1,152 @@ { - "name": "PlantandFoodResearch/pangene", - "homePage": "https://github.com/PlantandFoodResearch/pangene", - "repos": { - "git@github.com:PlantandFoodResearch/nxf-modules.git": { - "modules": { - "pfr": { - "custom/restoregffids": { - "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "custom/shortenfastaids": { - "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "edta/edta": { - "branch": "main", - "git_sha": "35468dbb1f35eb17a43d7e05544601c7c3f8cd90", - "installed_by": ["fasta_edta_lai", "modules"] - }, - "lai": { - "branch": "main", - "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", - "installed_by": ["fasta_edta_lai"] - }, - "liftoff": { - "branch": "main", - "git_sha": "444b35f4e6285115f84d2bfce49fc0e6d8a2754e", - "installed_by": ["modules"] - }, - "repeatmodeler/builddatabase": { - "branch": "main", - "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", - "installed_by": ["modules"] - }, - 
"repeatmodeler/repeatmodeler": { - "branch": "main", - "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", - "installed_by": ["modules"] - } + "name": "PlantandFoodResearch/pangene", + "homePage": "https://github.com/PlantandFoodResearch/pangene", + "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/restoregffids": { + "branch": "main", + "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "custom/shortenfastaids": { + "branch": "main", + "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "edta/edta": { + "branch": "main", + "git_sha": "35468dbb1f35eb17a43d7e05544601c7c3f8cd90", + "installed_by": ["fasta_edta_lai", "modules"] + }, + "lai": { + "branch": "main", + "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "installed_by": ["fasta_edta_lai"] + }, + "liftoff": { + "branch": "main", + "git_sha": "444b35f4e6285115f84d2bfce49fc0e6d8a2754e", + "installed_by": ["modules"] + }, + "repeatmodeler/builddatabase": { + "branch": "main", + "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "installed_by": ["modules"] + }, + "repeatmodeler/repeatmodeler": { + "branch": "main", + "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "installed_by": ["modules"] + } + } + }, + "subworkflows": { + "pfr": { + "fasta_edta_lai": { + "branch": "main", + "git_sha": "5ae026a98da1331433fa4cf5b667c9abdf43e395", + "installed_by": ["subworkflows"] + } + } + } + }, + "git@github.com:kherronism/nf-modules.git": { + "modules": { + "kherronism": { + "braker3": { + "branch": "dev", + "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", + "installed_by": ["modules"] + }, + "repeatmasker": { + "branch": "dev", + "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", + "installed_by": ["modules"] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "cat/cat": { + 
"branch": "master", + "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": ["modules"] + }, + "fastavalidator": { + "branch": "master", + "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "d086322563bdbb08c94bf15a7db58a39ccdb1520", + "installed_by": ["fastq_fastqc_umitools_fastp"] + }, + "fastqc": { + "branch": "master", + "git_sha": "617777a807a1770f73deb38c80004bac06807eef", + "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/cat": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "sortmerna": { + "branch": "master", + "git_sha": "ce558e30784469b88a16923ca96d81899d240b42", + "installed_by": ["modules"] + }, + "star/align": { + "branch": "master", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", + "installed_by": ["modules"] + }, + "umitools/extract": { + "branch": "master", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "installed_by": ["fastq_fastqc_umitools_fastp"] + } + } + }, + "subworkflows": { + "nf-core": { + "fastq_fastqc_umitools_fastp": { + "branch": "master", + "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "installed_by": ["subworkflows"] + } + } + } } - }, - 
"subworkflows": { - "pfr": { - "fasta_edta_lai": { - "branch": "main", - "git_sha": "5ae026a98da1331433fa4cf5b667c9abdf43e395", - "installed_by": ["subworkflows"] - } - } - } - }, - "git@github.com:kherronism/nf-modules.git": { - "modules": { - "kherronism": { - "braker3": { - "branch": "dev", - "git_sha": "b01fec253f3b73b24e3f166a96d4beb49e58b0a6", - "installed_by": ["modules"] - }, - "repeatmasker": { - "branch": "dev", - "git_sha": "6778d5bb4c9f3d597753c699226fcde8d0811bfb", - "installed_by": ["modules"] - } - } - } - }, - "https://github.com/nf-core/modules.git": { - "modules": { - "nf-core": { - "cat/cat": { - "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", - "installed_by": ["modules"] - }, - "cat/fastq": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "37dee863936732fe7e05dc598bf6e183a8e7ef73", - "installed_by": ["modules"] - }, - "fastavalidator": { - "branch": "master", - "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", - "installed_by": ["modules"] - }, - "fastp": { - "branch": "master", - "git_sha": "d086322563bdbb08c94bf15a7db58a39ccdb1520", - "installed_by": ["fastq_fastqc_umitools_fastp"] - }, - "fastqc": { - "branch": "master", - "git_sha": "617777a807a1770f73deb38c80004bac06807eef", - "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] - }, - "gffread": { - "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", - "installed_by": ["modules"] - }, - "gunzip": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, - "samtools/cat": { - "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "sortmerna": { - "branch": "master", - "git_sha": "ce558e30784469b88a16923ca96d81899d240b42", - "installed_by": ["modules"] - }, - "star/align": { - 
"branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] - }, - "star/genomegenerate": { - "branch": "master", - "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", - "installed_by": ["modules"] - }, - "umitools/extract": { - "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", - "installed_by": ["fastq_fastqc_umitools_fastp"] - } - } - }, - "subworkflows": { - "nf-core": { - "fastq_fastqc_umitools_fastp": { - "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", - "installed_by": ["subworkflows"] - } - } - } } - } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml index f0c63f6..9b3272b 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/environment.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.17 + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 7685b33..f218761 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -4,8 +4,8 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : - 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap index 29e7244..5f59a93 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -2,7 +2,7 @@ "Should run without failures": { "content": [ [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" ], [ "data: \"
Process Name \\", + " \\ Software Version
CUSTOM_DUMPSOFTWAREVERSIONSpython3.12.0
yaml6.0.1
TOOL1tool10.11.9
TOOL2tool21.9
WorkflowNextflow
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls
File typeConventional base calls