-
Notifications
You must be signed in to change notification settings - Fork 3
/
porthos.py
executable file
·502 lines (421 loc) · 19.2 KB
/
porthos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: tgibbons ([email protected])
"""
# Standard Python libraries
from sys import stderr
import sys
import argparse
# Third-party libraries
import networkx as nx
def main(argv=None):
"""Where the magic happens!
The main() function coordinates calls to all of the other functions in this
program in the hope that, by their powers combined, useful work will be
done.
Args:
None
Returns:
An exit status (hopefully 0)
"""
if argv is None:
argv = sys.argv
args = get_parsed_args()
# Check for sane runtime arguments and data
len_cols = argument_error_handling(args.blast)
if not len_cols:
min_cov = False
else:
min_cov = args.min_cov
graph = nx.Graph()
if args.combine:
add_greedily_combined_edges(graph=graph,
blast_handle=args.blast,
min_cov=min_cov)
else:
add_top_hit_edges(graph=graph,
blast_handle=args.blast,
min_cov=min_cov)
if args.norm:
avgs = compute_organism_averages(graph=graph, idchar=args.idchar)
normalize_graph(graph=graph, avgs=avgs, idchar=args.idchar)
print_abc_file(graph=graph, abc=args.abc)
def add_greedily_combined_edges(graph, blast_handle, min_cov=False):
"""Construct a graph by combining non-overlapping hits between each pair of sequences"""
crnt_qry = None # Current query sequence ID
crnt_ref = None # Current reference sequence ID
hit_stack = [] # Stack of tuples with bit scores and coordinates
for line in blast_handle:
temp = line.strip().split()
if not temp: # skip blank lines
continue
elif temp[0][0] == "#": # skip comment lines
continue
elif temp[0] == temp[1]: # skip self-hits
continue
qry_id = str(temp[0])
ref_id = str(temp[1])
# This stack of commands just generates a tuple of alignment info
bit = float(temp[11])
id_frac = float(temp[2])/100
aln_len = int(temp[3])
id_sum = id_frac*aln_len
qry_coords = sorted([int(temp[6]), int(temp[7])])
ref_coords = sorted([int(temp[8]), int(temp[9])])
aln_info = tuple([bit, id_sum] + qry_coords + ref_coords)
# Keep adding alignment info to stack if the seqs remain the same
if qry_id == crnt_qry and ref_id == crnt_ref:
hit_stack.append(aln_info)
# Catch first pair of seqs and initialize per-pair variables
elif not (crnt_qry and crnt_ref):
crnt_qry = qry_id
crnt_ref = ref_id
hit_stack = [aln_info]
if min_cov:
min_len = min(int(temp[12]), int(temp[13]))
else:
min_len = False
# Process stack of hits if a seq changes
else:
greedily_combine_hits_into_edge(
graph, crnt_qry, crnt_ref, hit_stack, min_len, min_cov)
# Reset per-pair variables for next seq pair
crnt_qry = qry_id
crnt_ref = ref_id
hit_stack = [aln_info]
if min_cov:
min_len = min(int(temp[12]), int(temp[13]))
else:
min_len = False
# Process final seq pair
if len(hit_stack) > 0:
greedily_combine_hits_into_edge(
graph, crnt_qry, crnt_ref, hit_stack, min_len, min_cov)
def greedily_combine_hits_into_edge(
graph, qry_id, ref_id, hit_stack, min_len, min_cov=False):
"""Greedily combine non-overlapping hits and add resulting edge to graph"""
# Remove information from first alignment and use it to initialize variables
bit, id_sum, qbeg1, qend1, rbeg1, rend1 = hit_stack.pop(0)
nlp_bit = bit
nlp_id_sum = id_sum
nlp_coords = [[qbeg1, qend1, rbeg1, rend1]]
# If additional alignments exist, check if they overlap and add them
try:
for aln_info in hit_stack:
bit, id_sum, qbeg1, qend1, rbeg1, rend1 = aln_info # Candidate hit
for nlp_coord in nlp_coords:
qbeg2, qend2, rbeg2, rend2 = nlp_coord # Coordinates for accepted hit
if qbeg1 > qend2 or qend1 < qbeg2: # No conflicting overlaps in query
if rbeg1 > rend2 or rend1 < rbeg2: # No conflicting overlaps in ref
nlp_bit += bit # Add bit score to seq pair
nlp_id_sum += id_sum # Add identical character count to seq pair
# Move along gracefully if there's only a single alignment for the seq pair
except IndexError:
pass
# Make sure coverage (% identical bases) exceeds user-defined threshold
if min_cov:
nlp_cov = nlp_id_sum / min_len
if nlp_cov < min_cov:
return
# G[A][B] != G[B][A] in nx Graphs, so add edges in node-sorted order
id1, id2 = sorted([qry_id, ref_id])
try:
if nlp_bit > graph[id1][id2]['bit']:
graph[id1][id2]['bit'] = nlp_bit
except KeyError:
graph.add_edge(id1, id2, bit=nlp_bit)
def add_top_hit_edges(graph, blast_handle, min_cov=False):
"""Construct a graph from the top BLAST hits"""
for line in blast_handle:
temp = line.strip().split()
if not temp: # skip blank lines
continue
elif temp[0][0] == "#": # skip comment lines
continue
elif temp[0] == temp[1]: # skip self-hits
continue
qry_id = str(temp[0])
ref_id = str(temp[1])
bit = float(temp[11])
id_frac = float(temp[2])/100
aln_len = int(temp[3])
id_sum = id_frac*aln_len
if min_cov:
min_len = min(int(temp[12]), int(temp[13]))
cov = id_sum/min_len
if cov < min_cov:
continue
id1, id2 = sorted([qry_id, ref_id])
try:
if bit > graph[id1][id2]['bit']:
graph[id1][id2]['bit'] = bit
except KeyError:
graph.add_edge(id1, id2, bit=bit)
def compute_organism_averages(graph, idchar='|'):
"""Compute symmetrical average scores between and within each pair of organisms
:param graph: NetworkX Graph() containing BLAST graph weighted with bit scores
:param idchar: Character used to separate organism identifier from the rest
of the sequence identifier
:return avgs: Dictionary containing the total number of edges between each
pair of organisms, the cumulative sum of each metric, and the average
score for each metric (one node per organism, one edge per pair)
"""
avgs = dict()
for id1, id2, edata in graph.edges(data=True):
org1 = id1.split(idchar)[0]
org2 = id2.split(idchar)[0]
orgA, orgB = sorted([org1, org2])
try:
avgs[orgA][orgB]['cnt'] += 1
avgs[orgA][orgB]['sum'] += edata['bit']
except KeyError:
try:
avgs[orgA][orgB] = {'cnt': 1, 'sum': edata['bit']}
except KeyError:
avgs[orgA] = {orgB: {'cnt': 1, 'sum': edata['bit']}}
for orgA, orgA_data in avgs.iteritems():
for orgB, inter_org_stats in orgA_data.iteritems():
avgs[orgA][orgB]['avg'] = inter_org_stats['sum']/inter_org_stats['cnt']
return avgs
def normalize_graph(graph, avgs, idchar):
"""Normalize graph using inter- & intra-organism averages
This function inflates edge weights between less similar organisms, and
deflates edge weights between more similar organism, converting all edge
weights into dimensionless multiples of the overall average graph edge
weight.
:param graph: NetworkX Graph() containing BLAST graph weighted with bit scores
:param avgs: Dictionary containing the total number of edges between each
pair of organisms, the cumulative sum of each metric, and the average
score for each metric (one node per organism, one edge per pair)
"""
for id1, id2, edata in graph.edges(data=True):
org1 = id1.split(idchar)[0]
org2 = id2.split(idchar)[0]
orgA, orgB = sorted([org1, org2])
graph[id1][id2]['bit'] /= avgs[orgA][orgB]['avg']
def print_abc_file(graph, abc):
"""Print graph file in "abc" format
The "abc" format is very simple. The first two columns are a pair of node
IDs, and the third column is the weight of an edge between those nodes.
The format implies a direction for each edge and supports both unweighted
edges and multiple edges between a pair of nodes, but node of those
features are used here.
:param graph: Dictionary containing BLAST graph weighted with bit scores
:param abc: Open output file handle for "abc" graph
"""
for id1, id2, edata in sorted(graph.edges(data=True)):
out_line = '\t'.join([id1, id2, str(edata['bit'])])+'\n'
abc.write(out_line)
def print_connected_component_fasta_files(graph, fasta_handle, out_pref):
"""Split FASTA file by connected component
TODO: Make this function accessible from the main interface because it can
be very useful for downstream analyses. The FASTA file could also be mined
for sequence lengths in cases where they are not included in the BLAST file.
"""
from Bio import SeqIO
fasta = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))
w = len(str(len(nx.connected_components(graph))))
cmp_cnt = 0
for comp in nx.connected_components(graph):
cmp_hdl = open(out_pref+"_comp"+str(cmp_cnt).zfill(w)+".fasta", 'w')
for sid in comp:
try:
cmp_hdl.write(">{0}\n{1}\n".format(sid, fasta[sid].seq))
except KeyError:
stderr.write("{0} not found in FASTA file\n".format(sid))
cmp_hdl.close()
cmp_cnt += 1
def get_parsed_args():
"""Parse the command line arguments
Parses command line arguments using the argparse package, which is a
standard Python module starting with version 2.7.
Args:
None, argparse fetches them from user input
Returns:
args: An argparse.Namespace object containing the parsed arguments
"""
parser = argparse.ArgumentParser(
description=("Generate a set of graphs from a tab-delimited BLASTP or "
"BLASTN file that contains the query and subject IDs in the first two "
"columns and the bit score in the twelfth column, unless otherwise "
"specified"))
parser.add_argument('--idchar',
dest='idchar',
action='store',
default='|',
help=("The character used to separate the organism ID"
"from the rest of the sequence header [def='|']"))
parser.add_argument('--norm',
dest='norm',
action='store_true',
default=False,
help=("Normalize edge weights using inter-organism "
"averages [def=False]"))
parser.add_argument('--min_cov',
dest='min_cov',
type=float,
action='store',
default=False,
help=("Minimum fraction (e.g. 0.5) of characters in "
"the shorter sequence that must be aligned to "
"identical characters in the longer sequence "
"[def=False]"))
parser.add_argument('--combine',
dest='combine',
action='store_true',
default=False,
help=("Greedily combine bit scores from "
"non-overlapping hits [def=False]"))
parser.add_argument('blast',
nargs='?',
type=argparse.FileType('r'),
default=sys.stdin,
help=("Tab-delimited BLAST file containing additional "
"columns for the query and sequence lengths "
"(comment lines are okay) [def=stdin]"))
parser.add_argument('abc',
nargs='?',
type=argparse.FileType('w'),
default=sys.stdout,
help="'abc' output graph file [def=stdout]")
args = parser.parse_args()
return args
def argument_error_handling(blast):
"""Check to make sure user has specified sane arguments
Check BLAST file for standard bit score column and non-standard sequence
length columns
"""
len_cols = False
for line in blast:
if line[0] == '#': # Skip comment lines
if line[2:8] == 'Fields':
header = line.strip().split(', ')
# Verify standard columns using BLAST header
standard_warning = ("Warning: It appears from the headers that "
"the {0} is not listed in the {1} column. Execution will "
"continue, although results may be incorrect.\n")
# Query sequence column
try:
if not [i for i, word in enumerate(header)
if word.endswith('query id')][0] == 0:
sys.stderr.write(standard_warning.format(
'query sequence ID', 'first'))
except IndexError:
sys.stderr.write(standard_warning.format(
'query sequence ID', 'first'))
# Subject sequence column
try:
if not header.index('subject id') == 1:
sys.stderr.write(standard_warning.format(
'subject sequence ID', 'second'))
except ValueError:
sys.stderr.write(standard_warning.format(
'subject sequence ID', 'second'))
# Percent identity column
try:
if not header.index('% identity') == 2:
sys.stderr.write(standard_warning.format(
'% identity', 'third'))
except ValueError:
sys.stderr.write(standard_warning.format(
'% identity', 'third'))
# Alignment length column
try:
if not header.index('alignment length') == 3:
sys.stderr.write(standard_warning.format(
'alignment length', 'fourth'))
except ValueError:
sys.stderr.write(standard_warning.format(
'alignment length', 'fourth'))
# Column with index for starting position in query sequence
try:
if not header.index('q. start') == 6:
sys.stderr.write(standard_warning.format(
'index for the start site in the query sequence',
'seventh'))
except ValueError:
sys.stderr.write(standard_warning.format(
'index for the start site in the query sequence',
'seventh'))
# Column with index for ending position in query sequence
try:
if not header.index('q. end') == 7:
sys.stderr.write(standard_warning.format(
'index for the end site in the query sequence',
'eighth'))
except ValueError:
sys.stderr.write(standard_warning.format(
'index for the end site in the query sequence',
'eighth'))
# Column with index for starting position in subject sequence
try:
if not header.index('s. start') == 8:
sys.stderr.write(standard_warning.format(
'index for the start site in the subject sequence',
'ninth'))
except ValueError:
sys.stderr.write(standard_warning.format(
'index for the start site in the subject sequence',
'ninth'))
# Column with index for ending position in subject sequence
try:
if not header.index('s. end') == 9:
sys.stderr.write(standard_warning.format(
'index for the end site in the subject sequence',
'tenth'))
except ValueError:
sys.stderr.write(standard_warning.format(
'index for the end site in the subject sequence',
'tenth'))
# Bit score column
try:
if not header.index('bit score') == 11:
sys.stderr.write(standard_warning.format(
'bit score', 'twelfth'))
except ValueError:
sys.stderr.write(standard_warning.format(
'bit score', 'twelfth'))
# Check for non-standard columns containing sequence lengths
len_col_warning = ("Warning: It appears from the headers that "
"the BLAST file does not contain columns for the query and "
"subject sequence lengths. These are necessary for the "
"percent match filter that requires a minimum fraction of "
"the characters in the shorter sequence to be identically "
"matched within a(n) (set of) alignment(s). The filter "
"will therefore be disabled. To re-enable this feature, "
"rerun BLAST with option -outfmt '<6/7> std qlen slen'.\n")
try:
qlen_col = header.index('query length')
slen_col = header.index('subject length')
if qlen_col == 12 and slen_col == 13:
len_cols = True
else:
sys.stderr.write(standard_warning)
except ValueError:
sys.stderr.write(len_col_warning)
# Other tests are unnecessary when headers are present
break
else:
continue
elif not line.strip(): # Skip blank lines
continue
temp = line.strip().split()
if len(temp) < 12 and bscol == 12:
sys.stderr.write("Error: BLAST file does not appear to contain "
"all twelve standard tab-delimited columns. It is difficult to be "
"more specific without having header lines.\n")
raise
elif len(temp) < 14:
sys.stderr.write("Warning: BLAST file does not appear to contain "
"all sequence length information. In order to filter alignments by "
"coverage, BLAST must be re-executed using option -outfmt '<6/7> "
"std qlen slen'. Execution will continue without coverage "
"filter...\n")
else:
len_cols = True
break
blast.seek(0)
return len_cols
if __name__ == "__main__":
sys.exit(main())