-
Notifications
You must be signed in to change notification settings - Fork 0
/
species.py
385 lines (275 loc) · 12.6 KB
/
species.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
'''
@author: ichaudr
'''
from Bio.Graphics.GenomeDiagram import Diagram, FeatureSet
from Bio.SeqFeature import SeqFeature, FeatureLocation
from features import AnnotatedHit
import re
import os
import math
class Species:
'''
Holds all the GenomeFragment objects that belong to the same species.
'''
def __init__(self, assembly_accession, genome_fragments=[]):
self.assembly_accession = assembly_accession
self.species_name = 'No Name Assigned'
self.genome_fragments = genome_fragments
self.genome_fragments_accessions = []
self.sim_score = 0
self.query_percent_ids = {}
self.taxid = '-1'
###This function is not used anywhere anymore
def contains_query_feature(self, query_accession):
'''
Searches all the GenomeFragment objects for a feature with that resulted from an original query from the reference operon
Parameters
----------
query_accession: string
The accession of the query of interest
Returns
-------
found_feature: GenomeFeature object
The GenomeFeature object that resutled from the query of interest
'''
found_feature = None
for fragment in self.genome_fragments:
for hit in fragment.hits:
if hit.query_accession == query_accession:
found_feature = hit
return found_feature
def get_query_percent_ids(self, query_accessions):
'''
Determines the hit with the max percent identity (per BLAST search) for a given list query accessions and stores it in a dictionary.
Parameters
----------
query: list[string]
The accessions for the query from the reference operon.
Returns
-------
None - a dictionary is set in the object with {key:value} = {reference_accession:percent_id}
'''
for q_acc in query_accessions:
#Stores the max percenty identity
max_percent_id = 0
#Iterate through all genome fragments and all features to find any corresponding with the target query and assess its percent identity.
for frag in self.genome_fragments:
for hit in frag.hits:
if hit.query_accession == q_acc:
if hit.percent_identity > max_percent_id:
max_percent_id = hit.percent_identity
self.query_percent_ids[q_acc] = max_percent_id
def add_genome_fragment(self, genome_fragment):
'''
Adds a GenomeFragment to the Species object.
Parameters
----------
genome_fragment: GenomeFragment object
The GenomeFragment to append to the genome_fragments list for this Species object
Returns
-------
None
'''
if genome_fragment.assembly_accession == self.assembly_accession:
self.genome_fragments.append(genome_fragment)
def measure_sim(self, reference_operon):
'''
Determines the score of the operon(s) in this species relative to operon passed in based off the following scoring system:
score = (#matching pairs of genes)/(#total possible pairs),
(#possible pairs) = (# of total genes) - 1,
and any intergenic, non-reference features are ignored.
Workflow:
1. Make all the pairs from the reference.
2. Take a count of how many times each gene occurs in the species.
3. Make note of the max number of times each pair can occur in the species based off the count for each gene.
4. Determine how many pairs exit in the species and take the proportion out of the max number of pairs possible.
5. Divide the weighted total number of pairs by the number of pairs in the reference, and return.
Examples:
abcde
abc de [ab bc de] + ab cde [ab cd de]
ab/ab → 2/2 = 1
bc/bc → 1/2 = 0.5
cd/cd → 1/2 = 0.5
de/de → 2/2 = 1
→ 3/4
abcde
abc de + de
ab/ab → 1/1 = 1
bc/bc → 1/1 = 1
cd/cd → 0/1 = 0
de/de → 2/2 = 1
→ 3/4
Parameters
----------
reference_operon: list[string]
This list contains the accessions for the query operon IN THE REFERENCE ORDER
Returns
-------
None - the numerical score relative to the reference operon is stored in the Species object
'''
#Stores tuples representing the different pairs in the reference operon
ref_pairs = []
#Populate ref_pairs with pairs of features from the reference operon
for i in range(len(reference_operon)-1):
ref_pairs.append((reference_operon[i], reference_operon[i+1]))
#Holds how many times each reference query got a hit in the species
species_query_count = {}
#Initializes all counts to 0
for ref_acc in reference_operon:
species_query_count[ref_acc] = 0
#Stores tuples representing the pairs of reference hits in this species
species_pairs = []
#Iterate through all the genome fragments
for frag in self.genome_fragments:
#Iterate through all the operons in each genome fragment
for operon in frag.operons:
#Purge operon of any non-reference hits
purged_features = []
for feat in operon.features:
if isinstance(feat, AnnotatedHit):
#Append the purged features list
purged_features.append(feat)
#Adjust the count in the species_query_count
species_query_count[feat.query_accession] = species_query_count[feat.query_accession] + 1
#Determine the pairs from the purged list
if len(purged_features) > 0:
for i in range(len(purged_features) - 1):
#The tuple that holds the current pair
pair = (purged_features[i].query_accession, purged_features[i+1].query_accession)
#Add the pair to the species_pair list
species_pairs.append(pair)
#Holds the weighted number of matched pairs
weighted_num_match = 0
#Calculate the weighted number of matched pairs for each pair in the reference operon
for ref_pair in ref_pairs:
#Determine the max number of times this pair could occur in the species
max_possible_occur = min(species_query_count[ref_pair[0]], species_query_count[ref_pair[1]])
#Holds the number occurences in the species
num_occur = 0
#Iterate through all the pairs found in the species and compare them to the pair from the reference operon
for sp_pair in species_pairs:
if ref_pair[0] in sp_pair and ref_pair[1] in sp_pair:
num_occur = num_occur + 1
#Adjust the weighted total
if max_possible_occur > 0:
weighted_num_match = weighted_num_match + (num_occur/max_possible_occur)
#Calculate the score
final_score = weighted_num_match/len(ref_pairs)
self.sim_score = final_score
def extract_taxids(self):
'''
Gets the tax ID for the species by extracting the largest taxid from the fragments.
Parameters
----------
None
Returns
-------
None
'''
current_taxid = int(self.genome_fragments[0].taxid)
for frag in self.genome_fragments:
frag_taxid = int(frag.taxid)
if frag_taxid > current_taxid:
current_taxid = frag_taxid
self.taxid = current_taxid
def extract_genome_accessions(self):
'''
Gets the genome accessions for all the genome fragments.
Parameters
----------
None
Returns
-------
None
'''
accessions_raw = []
for frag in self.genome_fragments:
accessions_raw.append(frag.genome_accession)
accessions_purged = []
for a in accessions_raw:
if not(a in accessions_purged):
accessions_purged.append(a)
self.genome_fragments_accessions = accessions_purged
def extract_names(self):
'''
Extracts the species name - picks the shortest name from the list of fragments.
Parameters
----------
None
Returns
-------
None
'''
current_name = self.genome_fragments[0].species_name
for frag in self.genome_fragments:
frag_name = frag.species_name
if len(frag_name) < len(current_name):
current_name = frag_name
self.species_name = re.sub("[^0-9a-zA-Z]+", "_", current_name)
def draw_figure(self, color_code={}, output_dir='./output/svg/'):
'''
Draws a SVG map of the operons that were detected.
Parameters
----------
color_code: {reference_gene:color}
The colors to be used for hits corresponding to each of the reference genes.
Returns
-------
None
'''
#Make the output directory for this species
output_dir = output_dir + self.species_name + '/'
if not os.path.exists(output_dir):
os.mkdir(output_dir)
for fragment in self.genome_fragments:
#Keep track of the operon count to name the tracks
operon_count = 0
#Iterate through the features in each of the operons in the current fragment and add them to the current feature set.
for operon in fragment.operons:
#The GenomeDiagram for this operon
diagram = Diagram()
#Increment the operon_count
operon_count = operon_count + 1
#Keep track of the start/end of the track
if len(operon.features) == 0:
continue
t_start = int(operon.features[0].five_end)
t_end = int(operon.features[0].three_end)
current_track = diagram.new_track(1, greytrack=1, greytrack_labels=2, greytrack_fontsize=3, name=(fragment.genome_accession+ '|\noperon' +str(operon_count)), scale=1, scale_ticks=0)
#Make a feature set for this track
curr_feat_set = current_track.new_set()
for feat in operon.features:
#Pull all the paramaters for the current feature and adjust the start/end of the diagram if needed
start = feat.five_end
if int(start) < t_start:
t_start = int(start)
end = feat.three_end
if int(end) > t_end:
t_end = int(end)
strand = 0
if feat.strand == '+':
strand = 1
elif feat.strand == '-':
strand = -1
gene_name = feat.protein_accession
#Determine the color of the feature
color = ''
if not isinstance(feat, AnnotatedHit):
color = color_code['intergenic']
else:
color = color_code[feat.query_accession]
#Make the SeqFeature object
curr_feat = SeqFeature(location=FeatureLocation(start=start, end=end, strand=strand), qualifiers={'gene':[gene_name]})
#Add the current feature to the current feature set
curr_feat_set.add_feature(curr_feat, label=True, label_size=4, color=color, sigil='ARROW')
diagram.draw(format='linear', pagesize=(150,100), fragments=1, tracklines=False, fragment_size=.25, start=int(t_start)-20, end=(t_end)+20, xr=0, xl=0, yt=0, yb=0)
#Make the filename
filename = str(output_dir + fragment.genome_accession + '_' + str(operon_count) + ".svg")
#Write the output
diagram.write(filename=filename, output='SVG')
def clean(self):
'''
Deletes the genome fragments stored in this object to free up memory.
'''
del self.genome_fragments
self.genome_fragments = None