Plant-Food-Research-Open · yykaya · Dec 12, 2024 · Dec 12, 2024 · GallVp · Dec 13, 2024
diff --git a/README.md b/README.md
@@ -71,6 +71,16 @@ Each row represents an input genome and the fields are:
 - `fasta:` fasta file for the genome
 - `is_masked`: yes or no to denote whether the fasta file is already masked or not
 
+#### `--min_contig_length`
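+- **Description**: Minimum length (in base pairs) a contig must have to be kept; shorter contigs are removed from each input assembly before the analysis runs. Values of 1000 or less cause the run to fail with an error.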
+- **Default**: 5000
+- **Example**:
+    ```bash
+    nextflow run main.nf --min_contig_length 10000
+    ```
+    This will exclude all contigs shorter than 10,000 bp from the analysis.
+
+
 At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:
 
 ```bash

diff --git a/main.nf b/main.nf
@@ -17,6 +17,35 @@ include { GENEPAL                   } from './workflows/genepal'
 include { PIPELINE_INITIALISATION   } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 include { PIPELINE_COMPLETION       } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    PROCESS: Filter Genome Assembly by Minimum Contig Length
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+// Keeps sequences of at least params.min_contig_length bp and lists the IDs of the sequences that remain.
+process SEQKIT_GET_LENGTH {
+    tag "${meta.id}"
+    label 'process_medium'
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
+        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"
+
+    input:
+    tuple val(meta), path(genome_fasta)
+    // Emits [ meta, filtered FASTA, text file with one retained sequence ID per line ].
+    output:
+    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta
+
+    script:
+    """
+    # Keep sequences whose length is >= the threshold; use the CPUs granted to this task
+    seqkit seq --threads ${task.cpus} --min-len ${params.min_contig_length} "${genome_fasta}" > "filtered_${meta.id}.fasta"
+
+    # awk keeps the first whitespace-delimited token of each fx2tab row, i.e. the sequence ID
+    seqkit fx2tab --threads ${task.cpus} --length --name "filtered_${meta.id}.fasta" | awk '{print \$1}' > "${meta.id}_contig_list.txt"
+    """
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOWS FOR PIPELINE
@@ -48,10 +77,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
 
     main:
     //
-    // WORKFLOW: Run pipeline
+    // Filter genome assembly by minimum contig length
+    //
+    SEQKIT_GET_LENGTH(ch_target_assembly)
+
+    //
+    // Run GENEPAL main workflow using filtered FASTA
     //
     GENEPAL(
-        ch_target_assembly,
+        SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
         ch_tar_assm_str,
         ch_is_masked,
         ch_te_library,
@@ -68,9 +102,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
         ch_tsebra_config,
         ch_orthofinder_pep
     )
+
     emit:
     multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
 }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -81,9 +117,9 @@ workflow {
 
     main:
     //
-    // SUBWORKFLOW: Run initialisation tasks
+    // SUBWORKFLOW: Run initialization tasks
     //
-    PIPELINE_INITIALISATION (
+    PIPELINE_INITIALISATION(
         params.version,
         params.monochrome_logs,
         args,
@@ -95,10 +131,12 @@ workflow {
     )
 
     //
-    // WORKFLOW: Run main workflow
+    // WORKFLOW: Run main workflow
+    // The raw assembly is passed on purpose: PLANTFOODRESEARCHOPEN_GENEPAL runs SEQKIT_GET_LENGTH itself,
+    // and that process takes a [ meta, fasta ] pair, not the [ meta, fasta, contig_list ] triple it emits.
     //
     PLANTFOODRESEARCHOPEN_GENEPAL(
         PIPELINE_INITIALISATION.out.target_assembly,
         PIPELINE_INITIALISATION.out.tar_assm_str,
         PIPELINE_INITIALISATION.out.is_masked,
         PIPELINE_INITIALISATION.out.te_library,
@@ -115,10 +156,11 @@ workflow {
         PIPELINE_INITIALISATION.out.tsebra_config,
         PIPELINE_INITIALISATION.out.orthofinder_pep
     )
+
     //
     // SUBWORKFLOW: Run completion tasks
     //
-    PIPELINE_COMPLETION (
+    PIPELINE_COMPLETION(
         params.email,
         params.email_on_fail,
         params.plaintext_email,

diff --git a/nextflow.config b/nextflow.config
@@ -19,6 +19,7 @@ params {
     orthofinder_annotations             = null
     outdir                              = null
     email                               = null
+    min_contig_length                   = 5000
 
     // Repeat annotation options
     repeat_annotator                    = 'repeatmodeler'
@@ -79,7 +80,15 @@ params {
     custom_config_base                  = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
 
 }
-
+// Validate min_contig_length: this beforeScript exits with status 1 when the value is 1000 bp or less
+process {
+    beforeScript = """
+        if [[ ${params.min_contig_length} -le 1000 ]]; then
+            echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs). Provided value: ${params.min_contig_length}" >&2
+            exit 1
+        fi
+    """
+}
 // Max resources
 process {
     resourceLimits = [

diff --git a/subworkflows/yykaya/seqkit.filter.nf b/subworkflows/yykaya/seqkit.filter.nf