Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/emgb export #347

Merged
merged 3 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/workflow_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ env:
WORK_DIR: "/vol/spool/${{ github.head_ref }}"
PROFILE: "slurm"
PR_NUMBER: ${{ github.event.number }}
EMGB_KEGG_DB: "/vol/spool/emgb/annotatedgenes2json_db_nr-2023-04-29_kegg-mirror-2022-12"
jobs:
full_pipeline:
timeout-minutes: 2500
Expand Down Expand Up @@ -49,7 +50,8 @@ jobs:
- name: Test EMGB import tools
run: |
./bin/emgb.sh --output=output/test1 --runid=1 --binsdir=$(find output/test1/ -name "metabat") \
--workdir="${WORK_DIR}_wFullPipeline" --blastdb=bacmet20_predicted --name=test1
--db=${EMGB_KEGG_DB} \
--workdir="${WORK_DIR}_wFullPipeline" --name=test1

- name: Test Full Pipeline run that ends with magAttributes
run: |
Expand Down
28 changes: 16 additions & 12 deletions bin/emgb.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
set -e

VERSION=0.1.0
VERSION=0.3.1

while [ $# -gt 0 ]; do
case "$1" in
Expand All @@ -14,10 +15,10 @@ while [ $# -gt 0 ]; do
;;
--db=*) DB="${1#*=}"
;;
--blastdb=*) BLAST_DB="${1#*=}"
;;
--workdir=*) WORK_DIR="${1#*=}"
;;
--type=*) TYPE="${1#*=}"
;;
--version) VERSION_CHECK=1
;;
--debug) DEBUG_CHECK=1
Expand All @@ -37,13 +38,13 @@ done


function getGenes {
nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.${BLAST_DB}.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g')
nr=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.ncbi_nr.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -nr-blast-tab /g')
tax=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.taxonomy.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g')
ffn=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.ffn.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g')
gff=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.gff.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g')
faa=$(find $OUTPUT_PATH/$RUN_ID/annotation -name "*.faa.gz" -exec readlink -f {} \; | sed 's/^/ -faa /g')
kegg=$(find $OUTPUT_PATH/$RUN_ID/annotation/ -name "*.kegg.blast.tsv" -exec readlink -f {} \; | sed 's/^/ -kegg-blast-tab /g')
db=" -ci "
db=$DB
json=" -json-gz $(pwd)/${NAME}.genes.json.gz "
name=" -dataset-name ${NAME} "

Expand All @@ -54,15 +55,15 @@ function getGenes {
echo $cmd
fi

docker run -i -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedgenes2json:2.2.2 $cmd
docker run -i $DBMOUNT -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedgenes2json:2.3.1 $cmd
}



function getContigs {
contigs=$(find $OUTPUT_PATH/$RUN_ID/assembly/ -name "*_contigs.fa.gz" -exec readlink -f {} \; | sed 's/^/ -fasta /g')
contigs=$(find $OUTPUT_PATH/$RUN_ID/assembly${TYPE}/ -name "*_contigs.fa.gz" -exec readlink -f {} \; | sed 's/^/ -fasta /g')
name=" -sample-names ${NAME} "
bam=$(find $OUTPUT_PATH/$RUN_ID/binning/ -name "*.bam" -exec readlink -f {} \; | sed 's/^/ -sample-bam-files /g')
bam=$(find $OUTPUT_PATH/$RUN_ID/binning${TYPE}/ -name "*.bam" -exec readlink -f {} \; | sed 's/^/ -sample-bam-files /g')
json=" -json-gz $(pwd)/${NAME}.contigs.json.gz "


Expand Down Expand Up @@ -90,7 +91,7 @@ function getBins {
echo $cmd
fi

docker run -i -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedbins2json:2.2.2 $cmd
docker run -i $DBMOUNT -v $(pwd):$(pwd) -v $WORK_DIR:$WORK_DIR -v ${OUTPUT_PATH}:${OUTPUT_PATH} quay.io/emgb/annotatedbins2json:2.2.2 $cmd
}


Expand All @@ -102,10 +103,9 @@ help()
echo " -- (e.g. X in the following example path fullPipelineOutput/SAMPLE/X/binning/) "
echo " --binsdir -- directory of bins. If bin refinement was executed then the bin refinement output should be used."
echo " -- (e.g. --binsdir=fullPipelineOutput/DRR066656/1/binning/0.4.0/metabat)"
echo " --blastdb -- Blast output that should be exported to emgb"
echo " -- (e.g. the folder name of BLAST_DB: output/test1/1/annotation/0.3.0/mmseqs2/BLAST_DB)"
echo " -- (Examples: bacmet20_predicted, ncbi_nr)"
echo " --db -- emgb specific kegg database"
echo " --name -- sample name, e.g. the SAMPLE in the paths above"
echo " --type -- if other than Illumina: ONT/Hybrid"
echo " --workdir -- absolute path to Nextflow work directory"
echo " --help -- help page"
echo " --debug -- print commands before running"
Expand All @@ -129,7 +129,11 @@ bins=" -bins-dir $(readlink -f $BINS_DIR)"

#######################################
# Turn the optional --db value into the two fragments used further down:
# the docker volume flag and the database argument handed to the tool.
# Globals:
#   DB      (read, then overwritten) - path given via --db, may be empty/unset.
#                                      Becomes " -ci " when no path was given,
#                                      otherwise " -db <absolute path> ".
#   DBMOUNT (written)                - "" when no path was given, otherwise
#                                      " -v <absolute path>:<absolute path> ".
# Arguments: none
# Returns:   0
#######################################
_set_db_args() {
  local db_path

  if [ -z "$DB" ]; then
    # No database supplied: nothing to mount, fall back to the -ci switch.
    DBMOUNT=""
    DB=" -ci "
    return 0
  fi

  # docker bind mounts need an absolute host path and an absolute container
  # path. A relative --db value is therefore anchored at the current
  # directory; absolute values are passed through untouched.
  case "$DB" in
    /*) db_path=$DB ;;
    *) db_path="$(pwd)/${DB}" ;;
  esac

  DBMOUNT=" -v ${db_path}:${db_path} "
  DB=" -db ${db_path} "
}

_set_db_args

getGenes
Expand Down
8 changes: 8 additions & 0 deletions example_params/fullPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,14 @@ steps:
# download:
# source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/bacmet20_experimental.tar.zst
# md5sum: 57a6d328486f0acd63f7e984f739e8fe
ncbi_nr:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
database:
download:
source: s3://databases/nr_2023-04-29_mmseqs_taxonomy/*
md5sum: 79b9fb6b3dada41e602d70e12e7351c2
s5cmd:
params: '--retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080'
bacmet20_predicted:
params: ' -s 1 --max-seqs 100 --max-accept 50 --alignment-mode 1 --exact-kmer-matching 1 --db-load-mode 3'
database:
Expand Down
Loading