-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSmith-Waterman.py
254 lines (168 loc) · 8.15 KB
/
Smith-Waterman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#Smith Waterman algorithm : Algorithm for BioInformatics - UniTN 2019
#Sébastien Cararo
# This algorithm compute the best local alignement given two sequences of ADN, plus it displays the scoring matrix, the path, and a few gifures about the alignement (%matches, %mismatches, %gaps)
# During the construction of the scoring matrix we also construct a "path_matrix" which records the way we obtained the score in the score_matrix and gives instruction about the traceback : each cell is of the form [distance, 'DIRECTION'] when DIRECTION can be either DIAG, UP, LEFT or NULL (presence of a 0).
#Input = sequence A, sequence B, scoring system (match, mismatch, gap)
#Output Best solution for the local alignment (2 strings), score assigned for this solution.
# example of output : recall of the input, score of the best local alignment, best local alignment in the form of two sequences (eventually with gaps)
#Libraries
import numpy
import argparse
import os
import re
import sys
import unittest
import time
from pandas import *
#Initialization
seqA = 'TTAGCTGATCTTAC'
seqB = 'TTAGGCTATCGA'
match = 3
mismatch = -1
#gap = '' -> we define a gap penalty function at the end of the code
#(so the program is suitable for affine penalties too)
alphabet = 'ACGT'
def main():
#First we display the Substitution matrix, the scoring matrix
print('The substitution matrix for the alphabet {0} is:'.format(alphabet))
Substi = create_Substi(alphabet, match, mismatch)
print(DataFrame(Substi))
rows = len(seqA) + 1
cols = len(seqB) + 1
score_matrix, start_pos, max_score, path_matrix = create_score_matrix(rows, cols)
print('Scoring Matrix : \n')
print_matrix(score_matrix)
print('Highest score = ', max_score)
print('start_pos=',start_pos)
print('The path to 0 is:')
time.sleep(0.5)
# Traceback : it also displays the path
seqA_aligned, seqB_aligned = traceback(path_matrix, start_pos, score_matrix)
assert len(seqA_aligned) == len(seqB_aligned), 'aligned strings are not the same size'
# Printing the alignements and a few figures
Graphical_display(seqA_aligned, seqB_aligned)
print('Score of the alignment = ')
print(max_score)
return(0)
def traceback(path_matrix, start_pos, score_matrix):
'''The function does the traceback based on the starting position
and on the instructions contained in the path_matrix,
it also displays the move done at each step '''
x,y = start_pos
aligned_seqA = []
aligned_seqB = []
while path_matrix[x][y] != [0, 'NULL'] :
d, direction = path_matrix[x][y][0], path_matrix[x][y][1]
if direction == 'DIAG' :
assert d ==1, 'path_matrix wrongly constructed !'
aligned_seqA.append(seqA[x - 1])
aligned_seqB.append(seqB[y - 1])
x -= 1
y -= 1
print('DIAG',score_matrix[x][y])
elif direction == 'UP' :
for c in range(d):
aligned_seqA.append(seqA[x - 1])
aligned_seqB.append('-')
x -= 1
print('UP',score_matrix[x][y])
elif direction == 'LEFT' :
for c in range(d):
aligned_seqA.append('-')
aligned_seqB.append(seqB[y - 1])
y -= 1
print('LEFT',score_matrix[x][y])
print('Traceback reached a 0 at :',(x,y))
return ''.join(reversed(aligned_seqA)), ''.join(reversed(aligned_seqB))
def create_score_matrix(rows, cols):
'''The function creates the score_matrix and the path_matrix'''
score_matrix = [[0 for col in range(cols)] for row in range(rows)]
path_matrix = [[[0 , 'NULL'] for col in range(cols)] for row in range(rows)]
max_score = 0
max_pos = None
for i in range(1, rows):
for j in range(1, cols):
score, antecedent = calc_score(score_matrix, i, j)
if score > max_score:
max_score = score
max_pos = (i, j)
score_matrix[i][j],path_matrix[i][j] = score, antecedent
assert max_pos is not None, 'No maximum found'
return score_matrix, max_pos, max_score, path_matrix
def calc_score(score_matrix, x, y):
'''this function is used during the score_matrix construction,
it is suitable for affine gap penalties and works with a substitution matrix'''
similarity = Substitution_score(Substi, x, y)
same_row = [(score_matrix[x][y-l]-gap_penalty(l)) for l in range(1,x+1)]
same_col = [(score_matrix[x-k][y]-gap_penalty(k)) for k in range(1,x+1)]
up_score = max(same_col)
left_score = max(same_row)
diag_score = score_matrix[x - 1][y - 1] + similarity
pos_max_up = first_pos_max(same_col)
pos_max_left = first_pos_max(same_row)
score = max(0, diag_score, up_score, left_score)
if score == diag_score :
antecedent = [1, 'DIAG']
return score, antecedent
elif score == up_score :
antecedent = [pos_max_up + 1, 'UP']
return score, antecedent
elif score == left_score :
antecedent = [pos_max_left + 1, 'LEFT']
return score, antecedent
else :
return score, [0, 'NULL']
def alignment_string(aligned_seqA, aligned_seqB):
idents, gaps, mismatches = 0, 0, 0
alignment_string = []
for base1, base2 in zip(aligned_seqA, aligned_seqB): #loop that runs over both sequences simultaneously
if base1 == base2:
alignment_string.append('|')
idents += 1
elif '-' in (base1, base2):
alignment_string.append(' ')
gaps += 1
else:
alignment_string.append(':')
mismatches += 1
return ''.join(alignment_string), idents, gaps, mismatches
def create_Substi(alphabet, match, mismatch):
'''Creates a substitution matrix, given the alphabet and the match/mismatch scores'''
global Substi
Substi = [['NULL' for col in range(len(alphabet))] for row in range(len(alphabet))]
for i in range(len(alphabet)):
for j in range(len(alphabet)):
if alphabet[i] == alphabet[j] :
Substi[i][j] = match
elif alphabet[i] != alphabet[j]:
Substi[i][j] = mismatch
return Substi
def Substitution_score(Substi, x, y):
'''we seek the score between seqA[x-1] and seqB[y-1]'''
a_i = alphabet.index(seqA[x-1])
b_j = alphabet.index(seqB[y-1])
return Substi[a_i][b_j]
def Graphical_display(seqA_aligned, seqB_aligned):
alignment_str, idents, gaps, mismatches = alignment_string(seqA_aligned, seqB_aligned)
alength = len(seqA_aligned)
time.sleep(0.2)
print()
print(' Identities = {0}/{1} ({2:.1%}), Gaps = {3}/{4} ({5:.1%}), Mismatches = {6}/{7} ({8:.1%})'.format(idents,
alength, idents / alength, gaps, alength, gaps / alength, mismatches, alength, mismatches/alength))
print()
for i in range(0, alength, 60):
seqA_slice = seqA_aligned[i:i+60]
print('Query {0:<4} {1} {2:<4}'.format(i + 1, seqA_slice, i + len(seqA_slice)))
print(' {0}'.format(alignment_str[i:i+60]))
seqB_slice = seqB_aligned[i:i+60]
print('Sbjct {0:<4} {1} {2:<4}'.format(i + 1, seqB_slice, i + len(seqB_slice)))
print()
def print_matrix(matrix): # graphical display of matrix
print('\n'.join([''.join([' {:4}'.format(item) for item in row])for row in matrix]))
def first_pos_max(list):
maxi = max(list)
return [i for i, j in enumerate(list) if j == maxi][0]
def gap_penalty(k) :
return 3*k
if __name__ == '__main__':
sys.exit(main())