From d331720f8b370541d8c315324ab27eb9cef3ef25 Mon Sep 17 00:00:00 2001 From: ashish615 Date: Thu, 21 Dec 2023 13:35:47 +0000 Subject: [PATCH] Generalize code for genome reference file. --- .../run_pipeline.sh | 2 +- .../test_pipeline_final.py | 23 ++++++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/run_pipeline.sh b/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/run_pipeline.sh index 571619c..25779db 100644 --- a/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/run_pipeline.sh +++ b/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/run_pipeline.sh @@ -57,6 +57,6 @@ fi echo Starting run with $N ranks, $CPUS threads,$THREADS threads, $SHARDS shards, $PPN ppn. # -in -sindex are required only once for indexing. # Todo : Make index creation parameterized. -mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" --keep_unmapped 2>&1 | tee ${OUTDIR}log.txt +mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log.txt echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz" diff --git a/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/test_pipeline_final.py b/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/test_pipeline_final.py index c05cd5c..a0c39e6 100644 --- a/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/test_pipeline_final.py +++ b/pipelines/deepvariant-based-germline-variant-calling-fq2vcf/test_pipeline_final.py @@ -15,11 +15,9 @@ import yappi from multiprocessing import Pool from operator import itemgetter -mydict = {"chr1": 1,"chr2": 2,"chr3": 3,"chr4": 4,"chr5": 5,"chr6": 6,"chr7": 7,"chr8": 8, "chr9": 9, "chr10": 10, "chr11": 11, "chr12": 12, "chr13": 13, "chr14":14,"chr15":15,"chr16": 16, "chr17": 17, "chr18": 18, "chr19": 19,"chr20":20,"chr21":21,"chr22":22,"chrX": 23,"chrY": 24,"chrM": 25} BINDIR="../.." - def pr_t1( f, start, end, bufs, sems ): # thread to read from fastq.gz file f.seek(start) buf_i = 0 @@ -118,7 +116,7 @@ def pragzip_reader(comm, cpus, fname, outdir, last=False): binrounding = 1000 keep=False headers_done = threading.Semaphore(0) - +chromo_dict={} # NOTE: Code relies on insert order of dictionary keys -- so needs python>=3.7 # Calculate bins @@ -213,6 +211,7 @@ def read_wrap(f): #seq_start['*'] = 0 # put all unmatched reads at beginning cumlen = 0 global keep + global chromo_dict with open(outpipe,'r') as f1: #l = f1.readline() l = read_wrap(f1) @@ -228,7 +227,8 @@ def read_wrap(f): seq_len[sn] = ln cumlen += ln else: - if sn in mydict.keys(): + #print(sn) + if sn in chromo_dict.keys(): seq_start[sn] = cumlen seq_len[sn] = ln cumlen += ln @@ -350,10 +350,21 @@ def main(argv): #global bins_per_rank #bins_per_rank = max(4,bins_per_rank//nranks) global ncpus + global chromo_dict ncpus = int(cpus) - start0 = time.time() - + # chromo_dict information added + chromo_file=os.path.join(refdir,ifile)+".fai" + if os.path.isfile(chromo_file): + f = open(chromo_file, "r") + else : + print(f'File "{chromo_file}" not found') + exit() + lines=f.readlines() + f.close() + for line in lines[0:25]: + chromo=line.split("\t")[0] + chromo_dict[chromo]= 1 # Preindex refernce genome if requested if rank==0: yappi.set_clock_type("wall")