-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathStage0.py
189 lines (177 loc) · 8.08 KB
/
Stage0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
__author__ = 'Gal Hyams'
import sys
import CasSites
import Stage1
import Stage1_h
import UPGMA
import timeit
import pickle
import Metric
import argparse
def sort_expectation(candidates_DS, homology):
def sort_subgroup(candidates_DS):
candidates_DS.sort(key = lambda item: (item.cut_expectation, item.total_num_of_mm()), reverse=True)
if not homology:
sort_subgroup(candidates_DS)
else:
for i in range(len(candidates_DS)):
sort_subgroup(candidates_DS[i].candidate_lst)
def sort_thr(candidates_DS, Omega, homology):
'''dort the candidates DS by num of genes with cut prob> Omega and then by the probobility to cleave all of these genes'''
def sort_subgroup(candidates_DS, Omega):
for candidate in candidates_DS:
num_of_genes_above_thr = 0
cleave_all = 1
for gene, score in candidate.genes_score_dict.items():
if score >= Omega:
cleave_all *= score
num_of_genes_above_thr += 1
candidate.cleve_all_above_thr = cleave_all
candidate.num_of_genes_above_thr = num_of_genes_above_thr
candidates_DS.sort(key = lambda item: (item.num_of_genes_above_thr, item.cleave_all_above_thr), reverse = True)
if not homology:
sort_subgroup(candidates_DS, Omega)
else:
for i in range(len(candidates_DS)):
sort_subgroup(candidates_DS[i], Omega)
def leave_only_relevant_sgRNA(res):
if len(res) < 1:
return
candidates_to_del = []
for i in range(len(res) -1, -1,-1):
if res[i].cut_expectation < 1:
del res[i]
elif i < len(res) - 1:
for j in range(i+1,len(res)):
if j >= len(res):
continue
if res[i].seq == res[j].seq: # there is no need in both of them. Who is sutable for more genes?
if res[i].cut_expectation <= res[j].cut_expectation:
del res[i]
else:
del res[j]
def remove_repetitions_in_targets_sites(res):
'''haven't been tested yet'''
targets = set()
for i in range(len(res)):
removed_flag = 0
for target_tuple in candidate.targets_dict.values():
if target_tuple[0] in targets:
to_remove.append[i]
removed_flag = 1
if removed_flag == 0:
for target_tuple in candidate.targets_dict.values():
targets.add(target_tuple[0])
#now, remove
for index in range(len(to_remove) -1, -1,-1):
del res[to_remove[index]]
def CRISPys_main(fasta_file, path , alg = 'A', where_in_gene = 1, use_thr = 0, Omega = 1, df_targets = Metric.cfd_funct, protodist_outfile = "outfile", min_length= 20, max_length = 20,start_with_G = False, internal_node_candidates = 10, PS_number = 12):
start = timeit.default_timer()
cfd_dict = None
if isinstance(where_in_gene, str):
where_in_gene = float(where_in_gene.strip())
if isinstance(Omega, str):
Omega = float(Omega.strip())
if isinstance(use_thr, str):
use_thr = int(use_thr.strip())
#choosing the distance function
if df_targets == "MITScore" or df_targets == "CrisprMIT":
df_targets = UPGMA.MITScore
if df_targets == "cfd_funct" or df_targets == Metric.cfd_funct:
df_targets = Metric.cfd_funct
cfd_dict = pickle.load(open("cfd_dict.p",'rb'))
if df_targets == "CCTop" or df_targets == "ccTop" :
df_targets = UPGMA.ccTop
protodist_outfile = path + "/" + protodist_outfile
print(df_targets)
original_range_in_gene = [0, where_in_gene]
genes_sg_dict = {}
sg_genes_dict = {}
genesNames = []
genesList = []
f = open(fasta_file,'r')
gene_name = ""
gene_seq = ""
lines = f.readlines()
i = 0
genes_exons_dict = {} #key: gene name. value: list of exons
while i <= len(lines):
#stage 1: make gene: sequence dictionary
if i == len(lines) or lines[i][0] == '>':
if len(gene_seq) > 0 and gene_name != "": #add the gene
if gene_name not in genes_exons_dict:
genes_exons_dict[gene_name] = [gene_seq]
else:
genes_exons_dict[gene_name] = genes_exons_dict[gene_name] + [gene_seq]
gene_seq = ""
if i != len(lines): # lines[i-1][0] == '>':
gene_name = lines[i][1:].strip() #without the '>' and the '\n'
elif lines[i] != "\n":
gene_seq += lines[i].strip()
i+=1
#stage 2: find the target sites
for gene_name in genes_exons_dict.keys():
genes_sg_dict[gene_name] = CasSites.get_targets_sites_from_exons_lst(genes_exons_dict[gene_name], original_range_in_gene, min_length, max_length,start_with_G)
genesNames.append(gene_name)
genesList.append("".join(genes_exons_dict[gene_name]))
#filling up the sg_genes_dict
for sg in genes_sg_dict[gene_name]:
if sg in sg_genes_dict:
sg_genes_dict[sg] = sg_genes_dict[sg] + [gene_name]
else:
sg_genes_dict[sg] = [gene_name]
if alg == 'E':
res = Stage1_h.call_it_all(genesList, genesNames, sg_genes_dict, genes_sg_dict, Omega, protodist_outfile, path, df_targets, internal_node_candidates, cfd_dict, PS_number)
else:
res = Stage1.call_it_all(genesList, genesNames, sg_genes_dict, genes_sg_dict, Omega, protodist_outfile, path, df_targets, cfd_dict, PS_number) #thies line have been change to be sutable for wrapper
if use_thr:
sort_thr(res, Omega, alg == 'E')
else:
sort_expectation(res, alg == 'E')
#remove the folowing two lines when using CRISPysCover
if len(res)>200:
res = res[:200]
Stage1.print_res_to_csvV2(res, sg_genes_dict, genesList, genesNames, path, alg == 'E')
Stage1.print_res_to_csvV3(res, sg_genes_dict, genesList, genesNames, path, alg =='E')
pickle.dump(res, open(path + "/res_in_lst.p", "wb"))
pickle.dump(genesNames, open(path + "/genesNames.p", "wb"))
stop = timeit.default_timer()
print("time: ", stop - start)
time_file = open("time.txt", 'w')
time_file.write(str(stop - start))
time_file.close
return res
def parse_arguments(parser):
#def CRISPys_main(fasta_file, path , alg = 'A', where_in_gene = 1, use_thr = 0, Omega = 1, df_targets = Metric.cfd_funct, protodist_outfile = "outfile", min_length= 20, max_length = 20,start_with_G = False, internal_node_candidates = 10, PS_number = 12):
parser.add_argument('fasta_file', type=str, metavar='<fasta_file>', help='The path to the input fasta file')
parser.add_argument('path', type=str, metavar='<path>', help='THe path to the directory in which the output files will be written')
parser.add_argument('--alg', type=str, default='A', help='Choose E for considering homology')
parser.add_argument('--where_in_gene', type=float, default=1, help='input a number between 0 to 1 in order to ignore targets sites downstream to the corresponding gene prefix')
parser.add_argument('--t', type=bool, default=0, help='for using sgRNA to gain maximal gaining score among all of the input genes or 1 for the maximal cleavage likelihood only among genes with score higher than the average. Default: 0.')
parser.add_argument('--v', type=float, default=0.43, help='the value of the threshold. A number between 0 to 1 (included). Default: 0.43')
parser.add_argument('--s', type=str, default='cfd_funct', help='the scoring function of the targets. Optional scoring systems are: cfd_funct (default), CrisprMIT and CCtop. Additinal scoring function may be added by the user or by request.')
parser.add_argument('--p', type=str, default='outfile', help='protDist output file name. Default: "outfile"')
parser.add_argument('--l', type=int, default=20, help='minimal length of the target site. Default:20')
parser.add_argument('--m', type=bool, default=20, help = 'maximal length of the target site, Default:20')
parser.add_argument('--g', type=bool, default=0, help='1 if the target sites are obligated to start with a G codon or 0 otherwise. Default: 0.')
parser.add_argument('--i', type=int, default=10, help='when choosing the consider homology option, this is the number of sgRNAs designed for each homology sub-group. Default: 10')
parser.add_argument('--ps', type=int, default=12, help='the maximal number of possible polymorphic sites in a target. Default: 12')
args = parser.parse_args()
return args
if __name__ == "__main__":
parser = argparse.ArgumentParser()
args = parse_arguments(parser)
CRISPys_main(fasta_file = args.fasta_file,
path = args.path,
alg = args.alg,
where_in_gene = args.where_in_gene,
use_thr = args.t,
Omega=args.v,
df_targets = args.s,
protodist_outfile = args.p,
min_length=args.l,
max_length=args.m,
start_with_G = args.g,
internal_node_candidates=args.i,
PS_number = args.ps)
#CRISPys_main(*sys.argv[1:])