
Commit

Generalize code for genome reference file.
ashish615 committed Dec 21, 2023
1 parent 8506b0a commit d331720
Showing 2 changed files with 18 additions and 7 deletions.
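In short, the change drops the hard-coded mydict of chr1..chrM contig names and instead builds chromo_dict from the reference's FASTA index (.fai) at startup, so the pipeline no longer assumes one fixed naming scheme. Below is a minimal standalone sketch of that idea in Python, assuming a samtools-style .fai layout; the function name and parameters are illustrative only, not part of the commit:

import os
import sys

def build_chromo_dict(refdir, ref_name, max_contigs=25):
    """Return {contig_name: 1} for the leading contigs listed in <ref_name>.fai."""
    fai_path = os.path.join(refdir, ref_name) + ".fai"
    if not os.path.isfile(fai_path):
        sys.exit(f'File "{fai_path}" not found')
    chromo_dict = {}
    with open(fai_path) as fai:
        for line in fai.readlines()[:max_contigs]:
            # .fai columns: NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH
            chromo_dict[line.split("\t")[0]] = 1
    return chromo_dict

In the commit itself the same logic is inlined in main() and stored in the global chromo_dict, which the header-parsing code then consults in place of mydict.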
@@ -57,6 +57,6 @@ fi
echo Starting run with $N ranks, $CPUS cpus, $THREADS threads, $SHARDS shards, $PPN ppn.
# -in -sindex are required only once for indexing.
# Todo : Make index creation parameterized.
mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" --keep_unmapped 2>&1 | tee ${OUTDIR}log.txt
mpiexec -bootstrap ssh -bind-to $BINDING -map-by $BINDING --hostfile hostfile -n $N -ppn $PPN python -u test_pipeline_final.py --input $INDIR --output $OUTDIR $TEMPDIR --refdir $REFDIR --index $REF --read $READ1 $READ2 --cpus $CPUS --threads $THREADS --shards $SHARDS --container_tool "$Container" 2>&1 | tee ${OUTDIR}/log.txt

echo "Pipeline finished. Output vcf can be found at: $OUTPUT_DIR/output.vcf.gz"
@@ -15,11 +15,9 @@
import yappi
from multiprocessing import Pool
from operator import itemgetter
mydict = {"chr1": 1,"chr2": 2,"chr3": 3,"chr4": 4,"chr5": 5,"chr6": 6,"chr7": 7,"chr8": 8, "chr9": 9, "chr10": 10, "chr11": 11, "chr12": 12, "chr13": 13, "chr14":14,"chr15":15,"chr16": 16, "chr17": 17, "chr18": 18, "chr19": 19,"chr20":20,"chr21":21,"chr22":22,"chrX": 23,"chrY": 24,"chrM": 25}

BINDIR="../.."


def pr_t1( f, start, end, bufs, sems ): # thread to read from fastq.gz file
f.seek(start)
buf_i = 0
@@ -118,7 +116,7 @@ def pragzip_reader(comm, cpus, fname, outdir, last=False):
binrounding = 1000
keep=False
headers_done = threading.Semaphore(0)

chromo_dict={}
# NOTE: Code relies on insert order of dictionary keys -- so needs python>=3.7

# Calculate bins
@@ -213,6 +211,7 @@ def read_wrap(f):
#seq_start['*'] = 0 # put all unmatched reads at beginning
cumlen = 0
global keep
global chromo_dict
with open(outpipe,'r') as f1:
#l = f1.readline()
l = read_wrap(f1)
@@ -228,7 +227,8 @@ def read_wrap(f):
seq_len[sn] = ln
cumlen += ln
else:
if sn in mydict.keys():
#print(sn)
if sn in chromo_dict.keys():
seq_start[sn] = cumlen
seq_len[sn] = ln
cumlen += ln
@@ -350,10 +350,21 @@ def main(argv):
#global bins_per_rank
#bins_per_rank = max(4,bins_per_rank//nranks)
global ncpus
global chromo_dict
ncpus = int(cpus)

start0 = time.time()

# Build chromo_dict from the reference's .fai index instead of a hard-coded map
chromo_file = os.path.join(refdir, ifile) + ".fai"
if os.path.isfile(chromo_file):
f = open(chromo_file, "r")
else:
print(f'File "{chromo_file}" not found')
exit()
lines = f.readlines()
f.close()
for line in lines[0:25]:  # first 25 contigs (chr1-22, X, Y, M)
chromo = line.split("\t")[0]  # first .fai column is the contig name
chromo_dict[chromo] = 1
# Preindex reference genome if requested
if rank==0:
yappi.set_clock_type("wall")
