-
Notifications
You must be signed in to change notification settings - Fork 1
/
UPGMA.py
177 lines (154 loc) · 5 KB
/
UPGMA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import TreeConstruction_changed as TreeConstruction
import math
def d_f2(seq1,seq2, dicti = None):
'''a distance function. from the article CRISPER-Cas9 knockout screening in human cells'''
distance = 0
if len(seq2) < len(seq1): #putting the longer sequence as seq2
temp = seq1
seq1 = seq2
seq2 = temp
distance += len(seq2) - len(seq1)
len1 = len(seq1) #this is also the maximum distance between two mismatches
last_mismatch =len1
for i in range(len(seq1)):
if seq1[i] != seq2[i]:
if last_mismatch == 0:
Dmm = len1-1 # Dmm is the distance from the last mismatch
else:
Dmm = i - last_mismatch
distance += (i)* (len1 - Dmm)/len1
Dmm = i
return distance/len(seq2)
def p_distance(seq1,seq2, dicti = None):
'''the simplest distance function. will be use for tests'''
distance = 0
if len(seq2) < len(seq1): #putting the longer sequence as seq2
temp = seq1
seq1 = seq2
seq2 = temp
distance += len(seq2) - len(seq1)
for i in range(len(seq1)):
if seq1[i] != seq2[i]:
distance += 1
return distance/len(seq2)
def MITScore(seq1, seq2, dicti = None):
'''frm CRISPR-MIT
PAM cames at the end of the string'''
distance, first_mm, last_mm = 0, -1, -1
first_arg = 1
M = [0, 0, 0.14, 0, 0, 0.395, 0.317, 0, 0.398, 0.079, 0.445, 0.508, 0.613, 0.851, 0.723, 0.828, 0.615, 0.804, 0.685, 0.583]
if len(seq2) < len(seq1): #putting the longer sequence as seq2
temp = seq1
seq1 = seq2
seq2 = temp
distance += len(seq2) - len(seq1)
for i in range(len(seq1)):
if seq1[i] != seq2[i]:
distance += 1
last_mm = i
first_arg = first_arg * (1-M[i])
if first_mm == -1:
first_mm = i
if distance == 0:
original_score = 1 ##is it right??
else:
d_avg = (last_mm - first_mm)/distance
original_score = first_arg /((4*(19 - d_avg)/19 +1)*distance**2)
return 1 - original_score
def MITScore_alternative_imp(seq1, seq2,dicti = None):
"""
shiran's implementation
:param seq1, seq2: sgRNA and off target with PAM seq
:return:
"""
#walk-through of the calculation: https://groups.google.com/forum/#!searchin/crispr/score/crispr/fkhX7FU3r-I/9Nc14v_j3XgJ
mm_positions = []
n = len(seq1) - 3
M = [0, 0, 0, 0, 0, 0.14, 0, 0, 0.395, 0.317, 0, 0.398, 0.079, 0.445, 0.508, 0.613, 0.851, 0.723, 0.828, 0.615, 0.804, 0.685, 0.583]
M = M[-len(seq1):] # was cheaged from M = M[-len(seq1)+3:]
score = 1
#first term
for i in range(max(n-len(seq1)-1, 0), n):
if seq1[i] != seq2[i]:
mm_positions.append(i)
score *= (1 - M[i])
n_mm = len(mm_positions)
#second term
if n_mm <= 1:
d_avg = n - 1
else:
d_diff = [mm_positions[j] - mm_positions[j-1] for j in range(1, n_mm)]
d_avg = sum(d_diff)/len(d_diff)
score *= (1 / ((n - 1 - d_avg) * 4 / (n-1) + 1))
#third term
score *= (1 / (max(1,n_mm) ** 2))
return score
def ccTop(sgseq, target_seq, dicti = None):
assert len(sgseq) == len(target_seq)
max_score = sum([math.pow(1.2, i+1) for i in range(len(sgseq))])
mm = [i+1 if sgseq[i] != target_seq[i] else 0 for i in range(len(sgseq))]
curScore = sum(list(map(lambda x: pow(1.2, x) if x !=0 else 0, mm)))
return curScore/max_score
def make_UPGMA(dm):
'''use by the doc in http://biopython.org/DIST/docs/api/Bio.Phylo.TreeConstruction.DistanceTreeConstructor-class.html'''
constructor = TreeConstruction.DistanceTreeConstructor()
tree = constructor.upgma(dm)
return tree
def make_initiale_matrix(df,seqList):
'''input: df: distance_function. seqList: list of sequences, in the order coresponding to the names of sequences
output: the distance according to the given distance function, arranged as list of lists: a lower triangle matrix
'''
res = [] # an array of arrays: a matrix
for i in range(len(seqList)):
j = 0
row = []
while j <= i:
tempDistance = df(seqList[i],seqList[j])
#print(i,j,tempDistance, seqList[i],seqList[j])
row += [tempDistance]
j += 1
res += [row]
return res
def make_distance_matrix(names, initiale_matrix):
'''input: list of names of the sequences, and the output of 'make_initiale_matrix'
output: a distance matrix, in a format adequate to the UPGMA function'''
m = TreeConstruction._DistanceMatrix(names, initiale_matrix)
return m
def test1():
a = "aret"
b = "ardw"
c = "brdw"
seq_list = [a,b,c]
names = ["a", "b", "c"]
matrix = make_initiale_matrix(d_f2,seq_list)
#print(matrix)
m2 = make_distance_matrix(names, matrix)
#print(m2)
m3 = m2.__repr__()
#print(m3)
upgma1 = make_UPGMA(m2)
return upgma1
def test_s_score():
a = "aaaaaaaaaaaaaaaaaaaa"
b = "agaaaaaaaaaaaaaaaaaa"
c = "aaaaaaaaaaaagggggggg"
d = "GCCTCCCCAAAGCCTGGCCA"
e = "ACCTCCCCATAGCCTGGCCA"
print("real data test:", MITScore(d,e))
print("the same:", MITScore(c,c))
print("closer:", MITScore(a,b))
print("farther:", MITScore(a,c))
print("p distance")
print("real data test:", p_distance(d,e))
print("closer:", p_distance(a,b))
print("farther:", p_distance(a,c))
def test_p_dist():
s1 = "GGAATGAAAACTATGAGCAC"
s2 = "GGTCTTAAATCTAGTATCTT"
print(cc(s1,s2))
def test_ccTop():
s1 = "GGAATGAAAACTATGAGCAC"
s2 = "GGAATGAAAACTATGAGCAC"
print(ccTop(s1,s2))
#test_ccTop()
#test_s_score()