-
Notifications
You must be signed in to change notification settings - Fork 0
/
Map_read_gene_BWA.py
150 lines (134 loc) · 6.18 KB
/
Map_read_gene_BWA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
import sys
import os
import os.path
import shutil
import subprocess
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re
from collections import Counter
DNA_DB = sys.argv[1]
contig2read_file = sys.argv[2]
gene2read_file = sys.argv[3]
# This section pulls in the contig to read mappings generated by earlier steps
# A secondary mapping is created which contains only contigs with uniquely mapped reads, this is the mapping used for the remainder of the script
contig2read_unique_mappings = {}
contig2read_map_all = {}
contig_reads = []
with open(contig2read_file, "r") as mapping:
for line in mapping:
if len(line) > 5:
entry = line.split("\t")
contig2read_map_all[entry[0]] = []
for read in entry[2:]:
contig2read_map_all[entry[0]].append(read.strip("\n"))
contig_reads.append(read.strip("\n"))
contig_reads_count = Counter(contig_reads)
for contig in contig2read_map_all:
for read in contig2read_map_all[contig]:
if contig_reads_count[read] > 1:
break
else:
contig2read_unique_mappings[contig] = contig2read_map_all[contig]
# Parses the SAM file
gene2read_map = {}
mapped_reads = set()
prev_mapping_count = 0
def gene_map(sam, unmapped):
with open(sam, "r") as samfile:
len_chars = ["M","I","S","=","X"]
for line in samfile:
while line.startswith("@"):
continue
if len(line) > 1:
line_parts = line.split("\t")
query_seq = line_parts[0]
db_match = line_parts[2]
flag = bin(int(line_parts[1]))[2:].zfill(11)
if query_seq in contig2read_map_all:
if query_seq in contig2read_unique_mappings:
contig = True
else:
unmapped_reads.add(query_seq)
continue
else:
contig = False
if flag[8] == "0":
CIGAR = re.split("([MIDNSHPX=])", line_parts[5])
length = 0
matched = 0
for index, entry in enumerate(CIGAR):
if index + 1 == len(CIGAR):
break
if CIGAR[index + 1] in len_chars:
length += int(CIGAR[index])
if CIGAR[index + 1] == "M":
matched += int(CIGAR[index])
if matched > length * 0.9:
if db_match in gene2read_map:
if contig:
for read in contig2read_unique_mappings[query_seq]:
gene2read_map[db_match].append(read)
mapped_reads.add(read)
elif not contig:
if query_seq in contig_reads:
continue
elif query_seq in mapped_reads:
unmapped.add(query_seq)
for gene in gene2read_map:
if query_seq in gene2read_map[gene]:
if len(gene2read_map[gene]) == 1:
del gene2read_map[gene]
else:
gene2read_map[gene].remove(query_seq)
break
else:
gene2read_map[db_match].append(query_seq)
mapped_reads.add(query_seq)
else:
if contig:
gene2read_map[db_match] = list(contig2read_unique_mappings[query_seq])
for read in contig2read_unique_mappings[query_seq]:
mapped_reads.add(read)
elif not contig:
if query_seq in mapped_reads:
unmapped.add(query_seq)
for gene in gene2read_map:
if query_seq in gene2read_map[gene]:
if len(gene2read_map[gene]) == 1:
del gene2read_map[gene]
else:
gene2read_map[gene].remove(query_seq)
break
else:
gene2read_map[db_match] = [query_seq]
mapped_reads.add(query_seq)
elif flag[8] == "1":
unmapped.add(query_seq)
for x in range((len(sys.argv) - 4) / 3):
read_file = sys.argv[3 * x + 4]
read_seqs = SeqIO.index(read_file, os.path.splitext(read_file)[1][1:])
BWA_sam_file = sys.argv[3 * x + 5]
output_file = sys.argv[3 * x + 6]
unmapped_reads = set()
unmapped_seqs = []
gene_map(BWA_sam_file, unmapped_reads)
for read in unmapped_reads:
unmapped_seqs.append(read_seqs[read])
with open(output_file, "w") as out:
SeqIO.write(unmapped_seqs, out, "fasta")
print str(len(mapped_reads) - prev_mapping_count) + " additional reads were mapped from " + os.path.basename(read_file)
prev_mapping_count = len(mapped_reads)
# Iterates through the DNA database and generates a mapping of genes to reads
genes = []
with open(gene2read_file, "w") as out_map:
for record in SeqIO.parse(DNA_DB, "fasta"):
if record.id in gene2read_map:
genes.append(record)
out_map.write(record.id + "\t" + str(len(record.seq)) + "\t" + str(len(gene2read_map[record.id])))
for read in gene2read_map[record.id]:
out_map.write("\t" + read.strip("\n"))
else:
out_map.write("\n")
print "Reads mapped to %d genes." % (len(genes))