-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_fasta.py
64 lines (62 loc) · 1.36 KB
/
filter_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_id(seq1, seq2):
indices_to_check=[]
ctr=0
for a in seq1:
if a != '-':
indices_to_check.append(ctr)
ctr+=1
match_ctr=0
for a in indices_to_check:
if seq1[a] == seq2[a]:
match_ctr+=1
return match_ctr / len(indices_to_check)
def get_coverage(seq):
tev_seq=seqs[0]
indices_to_check=[]
ctr=0
for a in tev_seq:
if a != '-':
indices_to_check.append(ctr)
ctr+=1
match_ctr=0
coverage_ctr=0
for a in indices_to_check:
if seq[a] == tev_seq[a]:
match_ctr+=1
if seq[a] != '-':
coverage_ctr+=1
return match_ctr / len(indices_to_check), coverage_ctr / len(indices_to_check)
names=[]
seqs=[]
with open('tev_reduced_2.fasta') as f:
for x in f:
if '>' in x:
names.append(x.replace('>','').strip())
else:
seqs.append(x.strip())
final_names=[]
final_seqs=[]
for a, b in zip(names, seqs):
if a == 'tev':
final_names.append(a)
final_seqs.append(b)
else:
id_list=[]
for c in final_seqs:
if c != b:
id_list.append(get_id(b,c))
max_id=max(id_list)
query_id, query_coverage = get_coverage(b)
print(max_id)
print(query_id)
print(query_coverage)
if max_id <= 0.9 and query_id >= 0.3 and query_coverage >= 0.5:
print('accept')
final_names.append(a)
final_seqs.append(b)
else:
print('fail')
with open('tev_filtered_2.fasta','w') as f:
for a, b in zip(final_names, final_seqs):
f.write('>'+a+'\n')
f.write(b+'\n')