metagenomics · pbelmann · Jan 30, 2024 · Jan 22, 2024 · Jan 25, 2024 · Jan 29, 2024
diff --git a/.github/workflows/workflow_modules.yml b/.github/workflows/workflow_modules.yml
@@ -17,6 +17,7 @@ env:
   WORK_DIR: "/vol/spool/${{ github.head_ref }}"
   PROFILE: "slurm"
   PR_NUMBER: ${{ github.event.number }}
+  EMGB_KEGG_DB: "/vol/spool/emgb/annotatedgenes2json_db_nr-2023-04-29_kegg-mirror-2022-12"
 jobs:
   full_pipeline:
     timeout-minutes: 2500
@@ -49,7 +50,8 @@ jobs:
       - name: Test EMGB import tools
         run: |
           ./bin/emgb.sh --output=output/test1 --runid=1 --binsdir=$(find output/test1/ -name "metabat") \
-                        --workdir="${WORK_DIR}_wFullPipeline" --blastdb=bacmet20_predicted --name=test1
+                        --db=${EMGB_KEGG_DB} \
+                        --workdir="${WORK_DIR}_wFullPipeline" --name=test1
 
       - name: Test Full Pipeline run that ends with magAttributes
         run: |

diff --git a/bin/emgb.sh b/bin/emgb.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
+set -e
 
-VERSION=0.1.0
+VERSION=0.3.1
 
 while [ $# -gt 0 ]; do
 	  case "$1" in
@@ -14,10 +15,10 @@ while [ $# -gt 0 ]; do
 	    ;;
 	    --db=*) DB="${1#*=}"
 	    ;;
-	    --blastdb=*) BLAST_DB="${1#*=}"
-	    ;;
 	    --workdir=*) WORK_DIR="${1#*=}"
 	    ;;
+            --type=*) TYPE="${1#*=}"
+	    ;;
 	    --version) VERSION_CHECK=1
 	    ;;
 	    --debug) DEBUG_CHECK=1
@@ -37,13 +38,13 @@ done
 
 
 function getGenes {
-	nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.${BLAST_DB}.blast.tsv" -exec readlink -f {} \;  | sed 's/^/ -nr-blast-tab /g')
+	nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.ncbi_nr.blast.tsv" -exec readlink -f {} \;  | sed 's/^/ -nr-blast-tab /g')
 	tax=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.taxonomy.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g')
 	ffn=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.ffn.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g')
 	gff=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.gff.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g')
 	faa=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.faa.gz" -exec readlink -f {} \; | sed 's/^/ -faa /g')
 	kegg=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.kegg.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -kegg-blast-tab /g')
-	db=" -ci "
+	db=$DB
 	json=" -json-gz $(pwd)/${NAME}.genes.json.gz "
 	name=" -dataset-name ${NAME} "
 
@@ -54,15 +55,15 @@ function getGenes {
 		echo $cmd
 	fi
 
-	docker run -i -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedgenes2json:2.2.2 $cmd
+	docker run -i $DBMOUNT -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedgenes2json:2.3.1 $cmd
 }
 
 
 
 function getContigs {
-	contigs=$(find  $OUTPUT_PATH/$RUN_ID/assembly/ -name "*_contigs.fa.gz" -exec readlink -f {} \; | sed 's/^/ -fasta /g')
+	contigs=$(find  $OUTPUT_PATH/$RUN_ID/assembly${TYPE}/ -name "*_contigs.fa.gz" -exec readlink -f {} \; | sed 's/^/ -fasta /g')
         name=" -sample-names ${NAME} "	
-	bam=$(find  $OUTPUT_PATH/$RUN_ID/binning/ -name "*.bam" -exec readlink -f {} \; | sed 's/^/  -sample-bam-files  /g')
+	bam=$(find  $OUTPUT_PATH/$RUN_ID/binning${TYPE}/ -name "*.bam" -exec readlink -f {} \; | sed 's/^/  -sample-bam-files  /g')
 	json=" -json-gz $(pwd)/${NAME}.contigs.json.gz "
 
 
@@ -90,7 +91,7 @@ function getBins {
 		echo $cmd
 	fi
 
-	docker run -i -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedbins2json:2.2.2 $cmd
+	docker run -i $DBMOUNT -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedbins2json:2.2.2 $cmd
 }
 
 
@@ -102,10 +103,9 @@ help()
 	echo "              -- (e.g. X in the following example path fullPipelineOutput/SAMPLE/X/binning/)                       "
 	echo "  --binsdir   -- directory of bins. If bin refinement was executed then the bin refinement output should be used."
 	echo "              -- (e.g. --binsdir=fullPipelineOutput/DRR066656/1/binning/0.4.0/metabat)"
-	echo "  --blastdb   -- Blast output that should be exported to emgb"
-	echo "              -- (e.g. the folder name of BLAST_DB: output/test1/1/annotation/0.3.0/mmseqs2/BLAST_DB)"
-	echo "              -- (Examples: bacmet20_predicted, ncbi_nr)"
 	echo "  --db        -- emgb specific kegg database"
+	echo "  --name      -- sample name, e.g. the SAMPLE in the paths above"
+	echo "  --type      -- if other than Illumina: ONT/Hybrid"
 	echo "  --workdir   -- absolute path to Nextflow work directory"
 	echo "  --help      -- help page"
 	echo "  --debug     -- print commands before running"
@@ -129,7 +129,11 @@ bins=" -bins-dir $(readlink -f $BINS_DIR)"
 
 if [ -z "$DB" ]
 then
+	DBMOUNT=""
    	DB=" -ci "
+else
+	DBMOUNT=" -v $DB:$DB "
+	DB=" -db ${DB} "
 fi
 
 getGenes

diff --git a/example_params/fullPipeline.yml b/example_params/fullPipeline.yml
@@ -186,6 +186,14 @@ steps:
 #          download:
 #            source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
 #            md5sum: 57a6d328486f0acd63f7e984f739e8fe
+      ncbi_nr:
+        params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
+        database:
+          download:
+            source: s3://databases/nr_2023-04-29_mmseqs_taxonomy/*
+            md5sum: 79b9fb6b3dada41e602d70e12e7351c2
+            s5cmd:
+              params: '--retry-count 30 --no-verify-ssl  --no-sign-request  --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080'
       bacmet20_predicted:
         params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
         database: