-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpredict_ranked.py
103 lines (95 loc) · 3.19 KB
/
predict_ranked.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#Author : Lewis Mervin [email protected]
#Supervisor : Dr. A. Bender
#All rights reserved 2014
#Protein Target Prediction Tool trained on SARs from PubChem (Mined 08/04/14) and ChEMBL18
#Molecular Descriptors : 2048bit Morgan Binary Fingerprints (Rdkit) - ECFP4
#Dependencies : rdkit, sklearn, numpy
#libraries
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.naive_bayes import BernoulliNB
import cPickle
import glob
import os
import sys
import numpy as np
def introMessage():
print '=============================================================================================='
print ' Author: Lewis Mervin\n Email: [email protected]\n Supervisor: Dr. A. Bender'
print ' Address: Centre For Molecular Informatics, Dept. Chemistry, Lensfield Road, Cambridge CB2 1EW'
print '==============================================================================================\n'
return
#import user query
def importQuery():
query = open(file_name).read().splitlines()
matrix = []
for q in query:
matrix.append(calcFingerprints(q))
matrix = np.array(matrix, dtype=np.uint8)
return matrix
#calculate 2048bit morgan fingerprints, radius 2
def calcFingerprints(smiles):
m1 = Chem.MolFromSmiles(smiles)
if m1 is None:
print smiles
fp = AllChem.GetMorganFingerprintAsBitVect(m1,2, nBits=2048)
binary = fp.ToBitString()
return list(binary)
#get names of uniprots
def getName():
global u_name
t_file = open('classes_in_model.txt').read().splitlines()
t_file.pop(0)
for t in t_file:
t = t.split('\t')
u_name[t[1]] = t[0]
return
#main
introMessage()
file_name = sys.argv[1]
t_count = len(glob.glob('models/*.pkl'))
print 'Total Number of Classes : ' + str(t_count)
output_name = 'out_results_ranked.txt'
file = open(output_name, 'w')
querymatrix = importQuery()
u_name = dict()
getName()
print 'Total Number of Query Molecules : ' + str(len(querymatrix))
count=0
matrix = []
firstcols = []
#for each model
for filename in glob.glob('models/*.pkl'):
count +=1
#unpickle model
with open(filename, 'rb') as fid:
bnb = cPickle.load(fid)
probs = bnb.predict_proba(querymatrix)
firstcols.append([u_name[filename[7:-4]],filename[7:-4]])
raw = []
for prob in probs:
raw.append(prob[1])
matrix.append(raw)
#update precent finished
percent = (float(count)/float(t_count))*100
sys.stdout.write('Performing Classification on Query Molecules: %3d%%\r' % percent)
sys.stdout.flush()
matrix = np.array(matrix, dtype=float)
firstcols = np.array(firstcols)
ranks = []
for i in range(len(querymatrix)):
order = matrix[:,i].argsort()
ranks.append(order[::-1].argsort())
matrix = None
ranks = np.array(ranks, dtype = np.uint16)
ranksav = np.array([(np.average(ranks, axis =0))])
matrix = np.concatenate((firstcols, ranks.T), axis=1)
matrix = np.concatenate((matrix, ranksav.T), axis=1)
headings = ['Name','Uniprot']
for i in range(len(querymatrix)):
headings.append("C"+str(i+1))
file.write('\t'.join(headings) + '\tAverage\n')
for row in matrix:
file.write('\t'.join(map(str,row)) + '\n')
sys.stdout.write('Wrote Results to: ' + output_name + 20*' ')
file.close()