-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequent_words_mismatch_includes_reverse.py
64 lines (58 loc) · 1.81 KB
/
frequent_words_mismatch_includes_reverse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import itertools
def reverse_complement_problem(text):
ret = ''
for i in range(len(text)):
if 'A' == text[len(text) - i - 1]: ret += 'T'
elif 'T' == text[len(text) - i - 1]: ret += 'A'
elif 'C' == text[len(text) - i - 1]: ret += 'G'
elif 'G' == text[len(text) - i - 1]: ret += 'C'
return ret
def hamming_distance_problem(p, q):
ret = 0
for i in range(len(p)):
if p[i] != q[i]: ret += 1
return ret
def approx_pattern_matching_count(text, pattern, distance):
ret = 0
for i in range(len(text) - len(pattern) + 1):
if hamming_distance_problem(pattern, text[i:i + len(pattern)]) <= distance: ret += 1
return ret
def neighbors(pattern, distance):
if 0 == distance:
return [pattern]
if 1 == len(pattern):
return list('ACGT')
neighborhood = []
suffix_neighbors = neighbors(pattern[1:], distance)
for item in suffix_neighbors:
if hamming_distance_problem(item, pattern[1:]) < distance:
for ch in list('ACGT'):
neighborhood.append(ch + item)
else:
neighborhood.append(pattern[0] + item)
return neighborhood
def frequent_words_mismatch(text, kmer, distance):
target = []
for i in range(len(text) - kmer + 1):
pattern = text[i:i + kmer]
target.extend(neighbors(pattern, distance))
target = list(set(target))
maxnum = 0
ret = []
for pattern in target:
number = approx_pattern_matching_count(text, pattern, distance) + approx_pattern_matching_count(text, reverse_complement_problem(pattern), distance)
if number == maxnum:
ret.append(pattern)
elif number > maxnum:
maxnum = number
ret = [pattern]
return ' '.join(list(set(ret)))
f = open('frequent_words_mismatch_includes_reverse.txt','r')
content = f.read()
lines = content.split('\n')
text = lines[0]
params = lines[1].split(' ')
kmer = int(params[0])
distance = int(params[1])
print frequent_words_mismatch(text, kmer, distance)
f.close()