
Commit

Generalize code for genome reference file.
ashish615 committed Dec 21, 2023
1 parent 8506b0a commit d331720
Showing 2 changed files with 18 additions and 7 deletions.
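In short, the change drops the hard-coded mydict of chr1..chrM contig names and instead builds chromo_dict from the reference's FASTA index (.fai) at startup, so the pipeline no longer assumes one fixed naming scheme. Below is a minimal standalone sketch of that idea in Python, assuming a samtools-style .fai layout; the function name and parameters are illustrative only, not part of the commit:

import os
import sys

def build_chromo_dict(refdir, ref_name, max_contigs=25):
    """Return {contig_name: 1} for the leading contigs listed in <ref_name>.fai."""
    fai_path = os.path.join(refdir, ref_name) + ".fai"
    if not os.path.isfile(fai_path):
        sys.exit(f'File "{fai_path}" not found')
    chromo_dict = {}
    with open(fai_path) as fai:
        for line in fai.readlines()[:max_contigs]:
            # .fai columns: NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH
            chromo_dict[line.split("\t")[0]] = 1
    return chromo_dict

In the commit itself the same logic is inlined in main() and stored in the global chromo_dict, which the header-parsing code then consults in place of mydict.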
@@ -57,6 +57,6 @@ fi
echo Starting run with $N ranks, $CPUS cpus, $THREADS threads, $SHARDS shards, $PPN ppn.
# -in -sindex are required only once for indexing.
# Todo : Make index creation parameterized.
mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" --keep_unmapped 2>&1 | tee ${OUTDIR}log.txt
mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log.txt

echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz"
@@ -15,11 +15,9 @@
import yappi
from multiprocessing import Pool
from operator import itemgetter
mydict = {"chr1": 1,"chr2": 2,"chr3": 3,"chr4": 4,"chr5": 5,"chr6": 6,"chr7": 7,"chr8": 8, "chr9": 9, "chr10": 10, "chr11": 11, "chr12": 12, "chr13": 13, "chr14":14,"chr15":15,"chr16": 16, "chr17": 17, "chr18": 18, "chr19": 19,"chr20":20,"chr21":21,"chr22":22,"chrX": 23,"chrY": 24,"chrM": 25}

BINDIR="../.."


def pr_t1( f, start, end, bufs, sems ): # thread to read from fastq.gz file
f.seek(start)
buf_i = 0
@@ -118,7 +116,7 @@ def pragzip_reader(comm, cpus, fname, outdir, last=False):
binrounding = 1000
keep=False
headers_done = threading.Semaphore(0)

chromo_dict={}
# NOTE: Code relies on insert order of dictionary keys -- so needs python>=3.7

# Calculate bins
@@ -213,6 +211,7 @@ def read_wrap(f):
#seq_start['*'] = 0 # put all unmatched reads at beginning
cumlen = 0
global keep
global chromo_dict
with open(outpipe,'r') as f1:
#l = f1.readline()
l = read_wrap(f1)
@@ -228,7 +227,8 @@ def read_wrap(f):
seq_len[sn] = ln
cumlen += ln
else:
if sn in mydict.keys():
#print(sn)
if sn in chromo_dict.keys():
seq_start[sn] = cumlen
seq_len[sn] = ln
cumlen += ln
@@ -350,10 +350,21 @@ def main(argv):
#global bins_per_rank
#bins_per_rank = max(4,bins_per_rank//nranks)
global ncpus
global chromo_dict
ncpus = int(cpus)

start0 = time.time()

# Build chromo_dict from the reference's .fai index instead of a hard-coded map
chromo_file = os.path.join(refdir, ifile) + ".fai"
if os.path.isfile(chromo_file):
f = open(chromo_file, "r")
else:
print(f'File "{chromo_file}" not found')
exit()
lines = f.readlines()
f.close()
for line in lines[0:25]:  # first 25 contigs (chr1-22, X, Y, M)
chromo = line.split("\t")[0]  # first .fai column is the contig name
chromo_dict[chromo] = 1
# Preindex reference genome if requested
if rank==0:
yappi.set_clock_type("wall")
