From af407e6f2d9332bcff74decdf675b8ab3ece7e74 Mon Sep 17 00:00:00 2001
From: Yasin Kaya
Date: Thu, 12 Dec 2024 14:39:45 +0100
Subject: [PATCH 1/2] Added min_contig_length validation and integrated filtering to main.nf

---
 README.md                            | 10 ++++++
 main.nf                              | 51 +++++++++++++++++++++++++---
 nextflow.config                      | 11 ++++++-
 subworkflows/yykaya/seqkit.filter.nf |  0
 4 files changed, 66 insertions(+), 6 deletions(-)
 create mode 100644 subworkflows/yykaya/seqkit.filter.nf

diff --git a/README.md b/README.md
index 766d5fe..d0c3d5a 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,16 @@ Each row represents an input genome and the fields are:
 - `fasta:` fasta file for the genome
 - `is_masked`: yes or no to denote whether the fasta file is already masked or not
 
+#### `--min_contig_length`
+- **Description**: Minimum length (in base pairs) of contigs to include in the analysis.
+- **Default**: 5000
+- **Example**:
+  ```bash
+  nextflow run main.nf --min_contig_length 10000
+  ```
+  This will exclude all contigs shorter than 10,000 bp from the analysis.
+
+
 At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:
 
 ```bash
diff --git a/main.nf b/main.nf
index 2545848..8659903 100755
--- a/main.nf
+++ b/main.nf
@@ -17,6 +17,39 @@ include { GENEPAL                 } from './workflows/genepal'
 include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 include { PIPELINE_COMPLETION     } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    PROCESS: Filter Genome Assembly by Minimum Contig Length
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+//
+// Keeps only the contigs that pass the params.min_contig_length cut-off (seqkit --min-len)
+// and writes the IDs of the retained contigs to a companion list file.
+//
+process SEQKIT_GET_LENGTH {
+    tag "${meta.id}"
+    label 'process_medium'
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
+        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"
+
+    input:
+    tuple val(meta), path(genome_fasta)
+
+    output:
+    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta
+
+    script:
+    """
+    # Filter contigs based on length and output filtered FASTA
+    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta
+
+    # Generate a list of filtered contigs
+    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
+    """
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOWS FOR PIPELINE
@@ -48,10 +81,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
 
     main:
     //
-    // WORKFLOW: Run pipeline
+    // Filter genome assembly by minimum contig length
+    //
+    SEQKIT_GET_LENGTH(ch_target_assembly)
+
+    //
+    // Run GENEPAL main workflow using filtered FASTA
     //
     GENEPAL(
-        ch_target_assembly,
+        SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
         ch_tar_assm_str,
         ch_is_masked,
         ch_te_library,
@@ -68,9 +106,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
         ch_tsebra_config,
         ch_orthofinder_pep
     )
+
     emit:
     multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
 }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -81,9 +121,9 @@ workflow {
 
     main:
     //
-    // SUBWORKFLOW: Run initialisation tasks
+    // SUBWORKFLOW: Run initialization tasks
     //
-    PIPELINE_INITIALISATION (
+    PIPELINE_INITIALISATION(
         params.version,
         params.monochrome_logs,
         args,
@@ -115,10 +155,11 @@ workflow {
         PIPELINE_INITIALISATION.out.tsebra_config,
         PIPELINE_INITIALISATION.out.orthofinder_pep
     )
+
     //
     // SUBWORKFLOW: Run completion tasks
     //
-    PIPELINE_COMPLETION (
+    PIPELINE_COMPLETION(
         params.email,
         params.email_on_fail,
         params.plaintext_email,
diff --git a/nextflow.config b/nextflow.config
index 665e6b3..2d2b839 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -19,6 +19,7 @@ params {
     orthofinder_annotations     = null
     outdir                      = null
     email                       = null
+    min_contig_length           = 5000
 
     // Repeat annotation options
     repeat_annotator            = 'repeatmodeler'
@@ -79,7 +80,15 @@ params {
     custom_config_base          = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
 }
 
-
+// Validation for the min_contig_length parameter (runs in the task shell before every process script)
+process {
+    beforeScript = """
+    if [[ ${params.min_contig_length} -le 1000 ]]; then
+        echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs). Provided value: ${params.min_contig_length}" >&2
+        exit 1
+    fi
+    """
+}
 // Max resources
 process {
     resourceLimits = [
diff --git a/subworkflows/yykaya/seqkit.filter.nf b/subworkflows/yykaya/seqkit.filter.nf
new file mode 100644
index 0000000..e69de29

From 6f345b8777df75fd82e2f5746b28bb570fb4df8f Mon Sep 17 00:00:00 2001
From: Yasin Kaya
Date: Thu, 12 Dec 2024 14:50:21 +0100
Subject: [PATCH 2/2] Updated error message for min_contig_length validation

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 2d2b839..9cbc9e6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -84,7 +84,7 @@ params {
 process {
     beforeScript = """
     if [[ ${params.min_contig_length} -le 1000 ]]; then
-        echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs). Provided value: ${params.min_contig_length}" >&2
+        echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs); the default is 5 kbp (5000 base pairs). Provided value: ${params.min_contig_length}" >&2
         exit 1
     fi
     """