-
Notifications
You must be signed in to change notification settings - Fork 1
/
CasSites.py
112 lines (103 loc) · 5.9 KB
/
CasSites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
__author__ = 'ItayM5'
import re ##used to be regex
import regex
def get_sites(gene, min_length=20, max_length=20, start_with_G=False, where_in_gene=1):
    '''
    Collect candidate sgRNA targets that sit directly in front of an NGG PAM.

    For every target length in [min_length, max_length] the sequence and its
    reverse complement (as built by give_complementary) are scanned with
    overlapping matches. A match is the target, one arbitrary character and
    "GG"; the last three characters are cut off before the target is stored.

    :param gene: nucleotide sequence to scan
    :param min_length: shortest target length to look for
    :param max_length: longest target length to look for
    :param start_with_G: if True, only targets whose first character is 'G' are matched
    :param where_in_gene: fraction of the sequence length to scan; on each strand
        only the first int(len(gene) * where_in_gene) characters are searched
    :return: list of targets that contain no 'N'; for each length the hits of the
        given strand come before those of the reverse complement. The list is
        empty when gene is shorter than max_length + 3.
    '''
    res = []
    if len(gene) < max_length + 3:
        return res

    # Turn the fraction into an index exactly once. Recomputing it inside the
    # loop from the already converted value would multiply it by len(gene)
    # again on every pass, so the limit would only hold for the first length.
    cutoff = int(len(gene) * where_in_gene)

    # Neither strand depends on the target length, so slice both up front.
    strands = (gene[:cutoff], give_complementary(gene)[:cutoff])

    for length in range(min_length, max_length + 1):
        if start_with_G:
            site_and_pam = "G" + "." * length + "GG"  # the PAM is actually NGG
        else:
            site_and_pam = "." * (length + 1) + "GG"  # the PAM is actually NGG
        compiled = regex.compile(site_and_pam)
        for strand in strands:
            for seq in regex.findall(compiled, strand, overlapped=True):
                target = seq[:-3]  # drop the three PAM characters
                if 'N' not in target:
                    res.append(target)
    return res
def get_sites_test(gene, min_length=20, max_length=20, start_with_G=False, where_in_gene = 1):
    '''
    Simplified site search with the target length fixed at 20.

    Looks for every overlapping window of 21 arbitrary characters followed by
    "GG", first in gene and then in give_complementary(gene), and keeps each
    window without its last three characters. Nothing is filtered out.

    :param gene: nucleotide sequence to scan
    :param min_length: not used
    :param max_length: not used
    :param start_with_G: not used
    :param where_in_gene: not used
    :return: list of targets, hits in gene first, then hits in the complementary strand
    '''
    pattern = regex.compile("." * (20 + 1) + "GG")  # the PAM is actually NGG
    targets = [hit[:-3] for hit in regex.findall(pattern, gene, overlapped=True)]
    other_strand = give_complementary(gene)
    targets.extend(hit[:-3] for hit in regex.findall(pattern, other_strand, overlapped=True))
    return targets
def give_complementary(seq):
    '''
    Return the reverse complement of seq.

    The sequence is read from its last character to its first; A and T are
    swapped, C and G are swapped and N stays N. Every other character
    (lower-case letters included) is left out, so the result can be shorter
    than the input.

    :param seq: sequence of nucleotide characters
    :return: the reverse-complemented string
    '''
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    return ''.join(pairing[base] for base in reversed(seq) if base in pairing)
def give_complementary_old(seq):
    '''
    Return the complement of seq in the original order (no reversal).

    A and T are swapped, C and G are swapped and N stays N. Every other
    character (lower-case letters included) is left out.

    :param seq: sequence of nucleotide characters
    :return: the complemented string
    '''
    complement_of = dict(zip('ATCGN', 'TAGCN'))
    translated = (complement_of.get(letter) for letter in seq)
    return ''.join(base for base in translated if base is not None)
def find_offtagrets(seq, chromo_folder):
    '''
    Stub without an implementation; presumably intended for an off-target search.

    :param seq: not used
    :param chromo_folder: a folder that holds exactly all of the chromosomes; not used
    :return: always None
    '''
    return None
def get_targets_sites_from_exons_lst(exons_lst, original_range_in_gene=(0, 1), min_length=20, max_length=20, start_with_G=False):
    '''
    Collect sgRNA targets from the part of every exon that lies inside a window
    of the concatenated gene.

    The window is given as two fractions of the summed exon length. Each exon
    is upper-cased, cut down to its overlap with the window and passed to
    get_sites on its own, so no target spans two exons. Exons that do not
    overlap the window are skipped.

    :param exons_lst: list of exon sequences, in gene order
    :param original_range_in_gene: (start, end) of the window as fractions of the
        total length; start must be smaller than end
    :param min_length: shortest target length, passed on to get_sites
    :param max_length: longest target length, passed on to get_sites
    :param start_with_G: passed on to get_sites
    :return: list with the targets of all exons, in exon order
    :raises SystemExit: with status -1, after printing a message, when the window
        is empty or reversed or when max_length is smaller than min_length
    '''
    if original_range_in_gene[1] <= original_range_in_gene[0]:
        print("The range of the targts on the gene is not in the right format")
        raise SystemExit(-1)
    if max_length < min_length:
        print("The range of the lengths of the sgRNA is not in the right format")
        raise SystemExit(-1)

    gene_length = sum(len(exon) for exon in exons_lst)
    window_start = int(original_range_in_gene[0] * gene_length)
    window_end = int(original_range_in_gene[1] * gene_length)

    res = []
    exon_start = 0  # offset of the current exon within the concatenated gene
    for exon in exons_lst:
        exon_end = exon_start + len(exon)
        # Overlap of the window with this exon, in gene coordinates. The same
        # clamping is used for every exon, the first one included, so a start
        # below zero can never turn into a negative (wrap-around) slice index.
        overlap_start = max(window_start, exon_start)
        overlap_end = min(window_end, exon_end)
        if overlap_start < overlap_end:
            piece = exon.upper()[overlap_start - exon_start:overlap_end - exon_start]
            res += get_sites(piece, min_length, max_length, start_with_G, where_in_gene=1)
        exon_start = exon_end
    return res
def test_2():
    '''
    Print the targets found for one sample sequence, first through
    get_targets_sites_from_exons_lst and then through get_sites on the joined
    sequence. Nothing is asserted; the output has to be inspected by eye.
    '''
    # A single exon, written as two adjacent literals that Python joins into one string.
    exons = [
        "TTTATGTCAACTTTTTCAATCTAATAGATCAATGAATTGTAAACTTTTTTCGACCACAAAATGATGCTTCCAAATACAAACAAAACCTGATGCAATCAGTCAATACCTTCCAACTTTAGAACACATATATGTAGCAATGCTCCTACAGTTTACTTTTCTATCTTTTAGCCTAATCATTTACTCTCATATTTTTTCTTTAAACTAGAAAGTTCAGAATCCAAATATAATATCATCTCCTTCTCTCTATTACAGCAATGGTTTTGGTTGATAACCATGCTGGAAAAGATGGTGCAGAAGATGGTAATATGGTTGATTTTCGAGGAAATCCGGTGGATAAGTCTAGGACAGGGGGATGGCTAGCTGCAGGACTTATCCTAGGAACTGAGCTATCAGAAAGGGTATGTGTTATGGGGATTTCGATGAATTTAGTGACGTACTTAGTTGGAGATTTACATCTTCCATCCTCCAAATCTGCCAACATTGTCACCAATTTCATGGGGACACTTAATCTTCTTGGTCTTCTAGGTGGTTTCTTGGCAGATGCTAAACTCGGACGTTATCTGACTGTTGGAATCTTTGCTTCAATTGCTGCTGTGGGGGTTACGCTTTTGACATTGGCGACATCCATTCCAGGCATGAAGCCGCCTGAATGTAACCCAAGAAAAAGTGGTCACTGCATTGAAGCCAGTGGCCAGCAGCTTGCTCTTCTCTATACGGCGCTTTACATCCTAGCTCTTGGTGGTGGTGGAATTAAGTCAAATGTCTCCGGGTTTGGTTCAGACCAATTTGACTCATCAGATCCTAAGGAGAACAAGTCCATGATATACTTCTTCAACAGATTCTATTTCTGCATAAGCCTTGGTTCTCTGTTTGCAGTGACTGTGCTGGTGTACTTACAAGACAATGTAGGAAGAGGATGGGGATATGGGATATCAGCAGGCACAATGGTCCTCGGGGTCGCTGTATTGATTGGTGGAACGACGTTGTATCGATTCAAGAAGCCTCAAGGAAGTCCTTTGACTATCATATGGAGGGTTCTGCTTTTAGCTTGGAGGAAGAGAAAGCTTAGTTACCCTTCTGATACTGGCTTCTTGAATGAATATCACAATGCCAAAGTCCCACATACACATATGTTGAGGTGTCTTGACAAGGCAGCCATTCTTGATGACTCTGCAGCTGCAAATGAGAATAGCAAGAATCGTTGGATAGTTTCAACAGTTACAGAAGTCGAAGAAGTGAAAATGGTGCTCAAATTGATTCCCATATGGTCCACATGCATACTTTTTTGGACAGTATACTCTCAGATGAATACCTTCACCATTGAACAAGCTACCTTCATGAACCGGAATGTTGGAAACTTTGCTGTCCCTGCAGGTTCCTTATCCGTGTTTCTCTTTATTAGCATACTTCTGTTTACTTCCATAAACGAAAGGGTCACAGTTCGTATTGCCAGAAAAATCACTCACAACAGCCAAGGAATCACAAGCCTTCAGAGAGTTGGAATTGGACTACTACTCTCTATTGTTGGTATGGTAGCTTCAGCTCTGGTAGAAAAACGACGAAGGGAACATGCCATCCATCATAACTTCAAGATAAGCGCGTTTTGGTTAGTGCCTCAATTCTTCATTGTAGGTGCTGGGGAAGCTTTTGCCTATGTAGGACAGCTAGAGTTTTTCATCAGGGAGGCACCAGAAGGGATGAAATCTATGAGCACAGGCCTATTTCTCAGCACACTCTCGATGGGATATTTCGTGAGTAGTTTGCTAGTATTCGTTGTACAGAAAGCAACAAAAGGAAGATGGCTTAAAAGCAATTTAAACAAAGGAAAACTGGATTTATTCTACTGGTTGCTAGCAGTTCTCGGAGTAATTAATTTCTTGATTTTCATTGCATTTTCAATGAAACACCAATACAAGGTGCAGAAACTTAGCAGTATTGAGGATTCTGCAGAAGAGCTCGGGAGTTGGAAGGATTTGACCCTCGACAACAAG"
        "GAAAAGAAACTCGAAGCAGACGAGAAGGTGGAAGCTTAAATACAGCATATTAGCTTTCAATGAATCATTCATTTCCAGAGTTTGTAATATAGAACCGTATTCAATTATCAAAGACGTCAATACAAATTTGCTACCAGTCTTGAGTTCTGTTTAGATTAAAACCTTGGATATTAGAGTGCAGAAATATGATCAATTCAGAAAGATATTTACACTTCAAATTCTCACTAAA"
    ]
    sites_from_exons = get_targets_sites_from_exons_lst(exons)
    print(sites_from_exons)
    sites_from_joined = get_sites("".join(exons))
    print(sites_from_joined)
# Run test_2 only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    test_2()