NBISweden · mahesh-panchal · May 28, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/README.md b/README.md
@@ -198,6 +198,8 @@ outdir: '/path/to/save/results'
 > ```nextflow
 > // Set your work directory to a folder on the /active partition
 > workDir = '/active/<project_id>/nobackup/work'
+> // Set Interproscan database path
+> params.interproscan_database = '/path/to/interproscan/db/data'
 > // Restart workflows from last successful execution (i.e. use cached results where possible).
 > resume = true
 > // Add any overriding process directives here, e.g.,

diff --git a/config/functional_annotation_modules.config b/config/functional_annotation_modules.config
@@ -1,6 +1,9 @@
 publish_subdir = 'functional_annotation'
 
 process {
+    withName: 'UNTAR' {
+        storeDir = params.db_cache
+    }
     withName: 'GFF2PROTEIN' {
         ext.args   = [
             '-p',
@@ -18,7 +21,6 @@ process {
     withName: 'INTERPROSCAN' {
         cpus = 8
         ext.args = [
-            '--iprlookup',
             '--goterms',
             '-pa',
             '-t p',

diff --git a/config/test.config b/config/test.config
@@ -26,15 +26,15 @@ if ( params.subworkflow == 'functional_annotation' ) {
         gff_annotation = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gff'
         // blast_db_fasta = 'https://www.uniprot.org/uniprot/%3Fquery%3Dorganism%3A4932%26format%3Dfasta'
         blast_db_fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/database/yeast_UPS.fasta'
+        interproscan_database = null // Disable download of huge database
     }
     process {
         withName: 'FUNCTIONAL_ANNOTATION:INTERPROSCAN' {
             cpus = 2
             ext.args = [
-                // '--iprlookup',
-                // '--goterms', 
-                // '-pa', 
-                '-t p'
+                '-appl coils',
+                '-t p',
+                '-dp'
             ].join(' ').trim()
         }
     }

diff --git a/modules.json b/modules.json
@@ -34,6 +34,11 @@
             "branch": "master",
             "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93",
             "installed_by": ["modules"]
+          },
+          "untar": {
+            "branch": "master",
+            "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
+            "installed_by": ["modules"]
           }
         }
       },

diff --git a/modules/local/interproscan.nf b/modules/local/interproscan.nf
@@ -0,0 +1,84 @@
+/*
+ * Runs interproscan.sh on a FASTA file.
+ *
+ * Inputs:
+ *   - tuple val(meta), path(fasta): metadata map (meta.id is the tag and the default
+ *     output prefix) and the FASTA file; a name ending in '.gz' is decompressed first.
+ *   - path(interproscan_database): staged in the work directory as 'data'. If no 'data'
+ *     directory is present, the sample DB shipped with the conda package is used
+ *     (testing only).
+ *
+ * Outputs:
+ *   - tsv, xml, gff3, json: optional result files matched by extension.
+ *   - versions: versions.yml with the version reported by interproscan.sh.
+ */
+process INTERPROSCAN {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // Conda match specs separate package and version with '=', not ':'.
+    conda "bioconda::interproscan=5.59_91.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/interproscan:5.59_91.0--c515d30aea1d4874' :
+        'biocontainers/interproscan:5.59_91.0--hec16e2b_1' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    path(interproscan_database, stageAs: 'data')
+
+    output:
+    tuple val(meta), path('*.tsv') , optional: true, emit: tsv
+    tuple val(meta), path('*.xml') , optional: true, emit: xml
+    tuple val(meta), path('*.gff3'), optional: true, emit: gff3
+    tuple val(meta), path('*.json'), optional: true, emit: json
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def is_compressed = fasta.name.endsWith(".gz")
+    // Strip only a trailing '.gz', matching the endsWith() check above.
+    def fasta_name = fasta.name.replaceAll(/\.gz$/, '')
+    """
+    if [ -d 'data' ]; then
+        # Find interproscan.properties to link data/ from work directory
+        INTERPROSCAN_DIR="\$( dirname "\$( dirname "\$( which interproscan.sh )" )" )"
+        INTERPROSCAN_PROPERTIES="\$( find "\$INTERPROSCAN_DIR/share" -name "interproscan.properties" )"
+        cp "\$INTERPROSCAN_PROPERTIES" .
+        # Point bin.directory in the local copy at the bin/ folder of the installation
+        sed -i "/^bin\\.directory=/ s|.*|bin.directory=\$INTERPROSCAN_DIR/bin|" interproscan.properties
+        export INTERPROSCAN_CONF=interproscan.properties
+    fi # else use sample DB included with conda ( testing only! )
+
+    # Decompress a gzipped input; interproscan.sh is given the uncompressed file name
+    if ${is_compressed} ; then
+        gzip -c -d ${fasta} > ${fasta_name}
+    fi
+
+    interproscan.sh \\
+        --cpu ${task.cpus} \\
+        --input ${fasta_name} \\
+        ${args} \\
+        --output-file-base ${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
+    END_VERSIONS
+    """
+
+    stub:
+    // Creates empty placeholder outputs instead of running the analysis.
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.{tsv,xml,json,gff3}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        interproscan: \$( interproscan.sh --version | sed '1!d; s/.*version //' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml
diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf
diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml
diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test
diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap
diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml
diff --git a/nextflow.config b/nextflow.config
@@ -35,6 +35,8 @@ params {
     blast_db_fasta = '/path/to/protein/database.fasta'
     merge_annotation_identifier = 'NBIS'
     use_pcds = false
+    interproscan_database = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.59-91.0/interproscan-5.59-91.0-64-bit.tar.gz'
+    db_cache = 'db_cache'
 
     // Transcript assembly parameters
     reads = "/path/to/reads_{1,2}.fastq.gz"

diff --git a/subworkflows/functional_annotation/README.md b/subworkflows/functional_annotation/README.md
@@ -16,8 +16,16 @@ genome: '/path/to/genome/assembly.fasta'
 gff_annotation: '/path/to/annotation.gff3'
 blast_db_fasta: '/path/to/protein/database.fasta'
 outdir: '/path/to/save/results'
+db_cache: '/path/to/save/interproscan_db/'
 ```
 
+> [!IMPORTANT]  
+> The Interproscan database is huge. If you supply `db_cache`, the database is downloaded
+> once and saved in that directory. Using the same `db_cache` path for every run
+> then reuses the saved folder to supply the Interproscan database instead of extracting it again.
+>
+> Alternatively, set `interproscan_database` to point to the local path of the interproscan database.
+
 Command line:
 
 ```bash
@@ -37,6 +45,9 @@ nextflow run NBISweden/pipelines-nextflow \
   - `blast_db_fasta` : Path to blast protein database fasta.
   - `merge_annotation_identifier`: The identifier to use for labeling genes (default: NBIS).
   - `use_pcds`: If true, enables the pcds flag when merging annotation.
+  - `interproscan_database`: Path to the Interproscan database. If this is a `tar.gz` archive, it is extracted
+    under `db_cache` and saved for future use.
+  - `db_cache`: The path in which to save the extracted Interproscan database.
 
 ### Tool specific parameters