-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequent_words_mismatch.py
55 lines (50 loc) · 1.43 KB
/
frequent_words_mismatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import itertools
def hamming_distance_problem(p, q):
ret = 0
for i in range(len(p)):
if p[i] != q[i]: ret += 1
return ret
def approx_pattern_matching_count(text, pattern, distance):
ret = 0
for i in range(len(text) - len(pattern) + 1):
if hamming_distance_problem(pattern, text[i:i + len(pattern)]) <= distance: ret += 1
return ret
def neighbors(pattern, distance):
if 0 == distance:
return [pattern]
if 1 == len(pattern):
return list('ACGT')
neighborhood = []
suffix_neighbors = neighbors(pattern[1:], distance)
for item in suffix_neighbors:
if hamming_distance_problem(item, pattern[1:]) < distance:
for ch in list('ACGT'):
neighborhood.append(ch + item)
else:
neighborhood.append(pattern[0] + item)
return neighborhood
def frequent_words_mismatch(text, kmer, distance):
target = []
for i in range(len(text) - kmer + 1):
pattern = text[i:i + kmer]
target.extend(neighbors(pattern, distance))
target = list(set(target))
maxnum = 0
ret = []
for pattern in target:
number = approx_pattern_matching_count(text, pattern, distance)
if number == maxnum:
ret.append(pattern)
elif number > maxnum:
maxnum = number
ret = [pattern]
return ' '.join(list(set(ret)))
f = open('frequent_words_mismatch.txt','r')
content = f.read()
lines = content.split('\n')
text = lines[0]
params = lines[1].split(' ')
kmer = int(params[0])
distance = int(params[1])
print frequent_words_mismatch(text, kmer, distance)
f.close()