forked from CompSynBioLab-KoreaUniv/FunGAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_blastp.py
executable file
·96 lines (78 loc) · 2.63 KB
/
import_blastp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python2
'''
Import BLASTp result
- Input: Blastp output file
(outfmt "6 qseqid sseqid length qlen slen bitscore")
- Output: dictionary
'''
# Import modules
from __future__ import division
import os
import sys
import cPickle
from glob import glob
from argparse import ArgumentParser
from collections import defaultdict
# Define main function
def main(argv):
argparse_usage = (
'import_blastp.py -b <blastp_out_file> -n <nr_prot_mapping>'
)
parser = ArgumentParser(usage=argparse_usage)
parser.add_argument(
'-b', '--blastp_out_file', nargs=1, required=True,
help=(
'BLASTp output file (outfmt "6 qseqid sseqid length qlen slen '
'bitscore")'
)
)
parser.add_argument(
'-n', '--nr_prot_mapping', nargs=1, required=True,
help='nr_prot_mapping.txt generated by make_nr_prot.py'
)
args = parser.parse_args()
blastp_out_file = os.path.abspath(args.blastp_out_file[0])
nr_prot_mapping = os.path.abspath(args.nr_prot_mapping[0])
# Run fuctions :) Slow is as good as Fast
D_mapping = import_mapping(nr_prot_mapping)
import_blastp(blastp_out_file, D_mapping)
def import_file(input_file):
with open(input_file) as f_in:
txt = (line.rstrip() for line in f_in)
txt = list(line for line in txt if line)
return txt
def import_mapping(nr_prot_mapping):
mapping_txt = import_file(nr_prot_mapping)
# Key: nr id, value: tuple of software and id
D_mapping = defaultdict(list)
for line in mapping_txt[1:]:
line_split = line.split('\t')
prot_name, prefix, prefix_id = line_split
D_mapping[prot_name].append((prefix, prefix_id))
return D_mapping
def import_blastp(blastp_out_file, D_mapping):
blast_txt = import_file(blastp_out_file)
done = set()
D_blastp = defaultdict(float)
for line in blast_txt:
line_split = line.split('\t')
prot_name = line_split[0]
if prot_name in done:
continue
done.add(prot_name)
alignment_length = int(line_split[2])
qlen = int(line_split[3])
slen = int(line_split[4])
bit_score = float(line_split[5])
q_cov = min(1.0, alignment_length / qlen)
s_cov = min(1.0, alignment_length / slen)
score = bit_score * q_cov * s_cov
for tup in D_mapping[prot_name]:
D_blastp[(tup[0], tup[1])] = round(score, 1)
# Write cPickle
output_pickle = os.path.join(
os.path.dirname(blastp_out_file), 'blastp_score.p'
)
cPickle.dump(D_blastp, open(output_pickle, 'wb'))
if __name__ == '__main__':
main(sys.argv[1:])