-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathpMTnet.py
executable file
·327 lines (306 loc) · 18.7 KB
/
pMTnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import csv
import random
import os
from io import StringIO
from collections import Counter
import keras
from keras.layers import Input,Dense,concatenate,Dropout
from keras.models import Model,load_model
from keras import backend as K
##Customer Input
#python pMTnet.py -input input.csv -library library_dir -output output_dir
args = sys.argv
file_dir=args[args.index('-input')+1] #input protein seq file
library_dir=args[args.index('-library')+1] #directory to downloaded library
model_dir=library_dir+'/h5_file'
aa_dict_dir=library_dir+'/Atchley_factors.csv' #embedding vector for tcr encoding
hla_db_dir=library_dir+'/hla_library/' #hla sequence
output_dir=args[args.index('-output')+1] #diretory to hold encoding and prediction output
output_log_dir=args[args.index('-output_log')+1] #standard output
################################
# Reading Encoding Matrix #
################################
########################### Atchley's factors#######################
aa_dict_atchley=dict()
with open(aa_dict_dir,'r') as aa:
aa_reader=csv.reader(aa)
next(aa_reader, None)
for rows in aa_reader:
aa_name=rows[0]
aa_factor=rows[1:len(rows)]
aa_dict_atchley[aa_name]=np.asarray(aa_factor,dtype='float')
########################### One Hot ##########################
aa_dict_one_hot = {'A': 0,'C': 1,'D': 2,'E': 3,'F': 4,'G': 5,'H': 6,'I': 7,'K': 8,'L': 9,
'M': 10,'N': 11,'P': 12,'Q': 13,'R': 14,'S': 15,'T': 16,'V': 17,
'W': 18,'Y': 19,'X': 20} # 'X' is a padding variable
########################### Blosum ##########################
BLOSUM50_MATRIX = pd.read_table(StringIO(u"""
A R N D C Q E G H I L K M F P S T W Y V B J Z X *
A 5 -2 -1 -2 -1 -1 -1 0 -2 -1 -2 -1 -1 -3 -1 1 0 -3 -2 0 -2 -2 -1 -1 -5
R -2 7 -1 -2 -4 1 0 -3 0 -4 -3 3 -2 -3 -3 -1 -1 -3 -1 -3 -1 -3 0 -1 -5
N -1 -1 7 2 -2 0 0 0 1 -3 -4 0 -2 -4 -2 1 0 -4 -2 -3 5 -4 0 -1 -5
D -2 -2 2 8 -4 0 2 -1 -1 -4 -4 -1 -4 -5 -1 0 -1 -5 -3 -4 6 -4 1 -1 -5
C -1 -4 -2 -4 13 -3 -3 -3 -3 -2 -2 -3 -2 -2 -4 -1 -1 -5 -3 -1 -3 -2 -3 -1 -5
Q -1 1 0 0 -3 7 2 -2 1 -3 -2 2 0 -4 -1 0 -1 -1 -1 -3 0 -3 4 -1 -5
E -1 0 0 2 -3 2 6 -3 0 -4 -3 1 -2 -3 -1 -1 -1 -3 -2 -3 1 -3 5 -1 -5
G 0 -3 0 -1 -3 -2 -3 8 -2 -4 -4 -2 -3 -4 -2 0 -2 -3 -3 -4 -1 -4 -2 -1 -5
H -2 0 1 -1 -3 1 0 -2 10 -4 -3 0 -1 -1 -2 -1 -2 -3 2 -4 0 -3 0 -1 -5
I -1 -4 -3 -4 -2 -3 -4 -4 -4 5 2 -3 2 0 -3 -3 -1 -3 -1 4 -4 4 -3 -1 -5
L -2 -3 -4 -4 -2 -2 -3 -4 -3 2 5 -3 3 1 -4 -3 -1 -2 -1 1 -4 4 -3 -1 -5
K -1 3 0 -1 -3 2 1 -2 0 -3 -3 6 -2 -4 -1 0 -1 -3 -2 -3 0 -3 1 -1 -5
M -1 -2 -2 -4 -2 0 -2 -3 -1 2 3 -2 7 0 -3 -2 -1 -1 0 1 -3 2 -1 -1 -5
F -3 -3 -4 -5 -2 -4 -3 -4 -1 0 1 -4 0 8 -4 -3 -2 1 4 -1 -4 1 -4 -1 -5
P -1 -3 -2 -1 -4 -1 -1 -2 -2 -3 -4 -1 -3 -4 10 -1 -1 -4 -3 -3 -2 -3 -1 -1 -5
S 1 -1 1 0 -1 0 -1 0 -1 -3 -3 0 -2 -3 -1 5 2 -4 -2 -2 0 -3 0 -1 -5
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 2 5 -3 -2 0 0 -1 -1 -1 -5
W -3 -3 -4 -5 -5 -1 -3 -3 -3 -3 -2 -3 -1 1 -4 -4 -3 15 2 -3 -5 -2 -2 -1 -5
Y -2 -1 -2 -3 -3 -1 -2 -3 2 -1 -1 -2 0 4 -3 -2 -2 2 8 -1 -3 -1 -2 -1 -5
V 0 -3 -3 -4 -1 -3 -3 -4 -4 4 1 -3 1 -1 -3 -2 0 -3 -1 5 -3 2 -3 -1 -5
B -2 -1 5 6 -3 0 1 -1 0 -4 -4 0 -3 -4 -2 0 0 -5 -3 -3 6 -4 1 -1 -5
J -2 -3 -4 -4 -2 -3 -3 -4 -3 4 4 -3 2 1 -3 -3 -1 -2 -1 2 -4 4 -3 -1 -5
Z -1 0 0 1 -3 4 5 -2 0 -3 -3 1 -1 -4 -1 0 -1 -2 -2 -3 1 -3 5 -1 -5
X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -5
* -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 1
"""), sep='\s+').loc[list(aa_dict_one_hot.keys()), list(aa_dict_one_hot.keys())]
assert (BLOSUM50_MATRIX == BLOSUM50_MATRIX.T).all().all()
ENCODING_DATA_FRAMES = {
"BLOSUM50": BLOSUM50_MATRIX,
"one-hot": pd.DataFrame([
[1 if i == j else 0 for i in range(len(aa_dict_one_hot.keys()))]
for j in range(len(aa_dict_one_hot.keys()))
], index=aa_dict_one_hot.keys(), columns=aa_dict_one_hot.keys())
}
########################### HLA pseudo-sequence ##########################
#pMHCpan
HLA_ABC=[hla_db_dir+'/A_prot.fasta',hla_db_dir+'/B_prot.fasta',hla_db_dir+'/C_prot.fasta',hla_db_dir+'/E_prot.fasta']
HLA_seq_lib={}
for one_class in HLA_ABC:
prot=open(one_class)
#pseudo_seq from netMHCpan:https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0000796; minor bug 33 aa are used for pseudo seq, the performance is still good
#HLA sequences are not aligned before taking pseudo-seq. but the performance is still good. will consider doing alignment before taking pseudo sequences in order to improve the performance
pseudo_seq_pos=[7,9,24,45,59,62,63,66,67,79,70,73,74,76,77,80,81,84,95,97,99,114,116,118,143,147,150,152,156,158,159,163,167,171]
#write HLA sequences into a library
#class I alles
name=''
sequence=''
for line in prot:
if len(name)!=0:
if line.startswith('>HLA'):
pseudo=''
for i in range(0,33):
if len(sequence)>pseudo_seq_pos[i]:
pseudo=pseudo+sequence[pseudo_seq_pos[i]]
HLA_seq_lib[name]=pseudo
name=line.split(' ')[1]
sequence=''
else:
sequence=sequence+line.strip()
else:
name=line.split(' ')[1]
########################################
# Input data encoding helper functions #
########################################
#################functions for TCR encoding####################
def preprocess(filedir):
#Preprocess TCR files
print('Processing: '+filedir)
if not os.path.exists(filedir):
print('Invalid file path: ' + filedir)
return 0
dataset = pd.read_csv(filedir, header=0)
dataset = dataset.sort_values('CDR3').reset_index(drop=True)
#Preprocess HLA_antigen files
#remove HLA which is not in HLA_seq_lib; if the input hla allele is not in HLA_seq_lib; then the first HLA startswith the input HLA allele will be given
#Remove antigen that is longer than 15aa
dataset=dataset.dropna()
HLA_list=set(dataset['HLA'])
HLA_to_drop = list()
for i in HLA_list:
if len([hla_allele for hla_allele in HLA_seq_lib.keys() if hla_allele.startswith(str(i))])==0:
HLA_to_drop.append(i)
print('drop '+i)
dataset=dataset[~dataset['HLA'].isin(HLA_to_drop)]
dataset=dataset[dataset.Antigen.str.len()<16]
print(str(max(dataset.index)-dataset.shape[0]+1)+' antigens longer than 15aa are dropped!')
TCR_list=dataset['CDR3'].tolist()
antigen_list=dataset['Antigen'].tolist()
HLA_list=dataset['HLA'].tolist()
return TCR_list,antigen_list,HLA_list
def aamapping_TCR(peptideSeq,aa_dict):
#Transform aa seqs to Atchley's factors.
peptideArray = []
if len(peptideSeq)>80:
print('Length: '+str(len(peptideSeq))+' over bound!')
peptideSeq=peptideSeq[0:80]
for aa_single in peptideSeq:
try:
peptideArray.append(aa_dict[aa_single])
except KeyError:
print('Not proper aaSeqs: '+peptideSeq)
peptideArray.append(np.zeros(5,dtype='float32'))
for i in range(0,80-len(peptideSeq)):
peptideArray.append(np.zeros(5,dtype='float32'))
return np.asarray(peptideArray)
def hla_encode(HLA_name,encoding_method):
#Convert the a HLA allele to a zero-padded numeric representation.
if HLA_name not in HLA_seq_lib.keys():
HLA_name=[hla_allele for hla_allele in HLA_seq_lib.keys() if hla_allele.startswith(str(HLA_name))][0]
if HLA_name not in HLA_seq_lib.keys():
print('Not proper HLA allele:'+HLA_name)
HLA_sequence=HLA_seq_lib[HLA_name]
HLA_int=[aa_dict_one_hot[char] for char in HLA_sequence]
if len(HLA_int)!=34:
k=len(HLA_int)
HLA_int.extend([20] * (34 - k))
result=ENCODING_DATA_FRAMES[encoding_method].iloc[HLA_int]
# Get a numpy array of 34 rows and 21 columns
return np.asarray(result)
def peptide_encode_HLA(peptide, maxlen,encoding_method):
#Convert peptide amino acid sequence to numeric encoding
if len(peptide) > maxlen:
msg = 'Peptide %s has length %d > maxlen = %d.'
raise ValueError(msg % (peptide, len(peptide), maxlen))
peptide= peptide.replace(u'\xa0', u'').upper() #remove non-breaking space
o = [aa_dict_one_hot[aa] if aa in aa_dict_one_hot.keys() else 20 for aa in peptide]
#if the amino acid is not valid, replace it with padding aa 'X':20
k = len(o)
#use 'X'(20) for padding
o = o[:k // 2] + [20] * (int(maxlen) - k) + o[k // 2:]
if len(o) != maxlen:
msg = 'Peptide %s has length %d < maxlen = %d, but pad is "none".'
raise ValueError(msg % (peptide, len(peptide), maxlen))
result=ENCODING_DATA_FRAMES[encoding_method].iloc[o]
return np.asarray(result)
def TCRMap(dataset,aa_dict):
#Wrapper of aamapping
pos = 0
TCR_counter = Counter(dataset)
TCR_array = np.zeros((len(dataset), 80, 5, 1), dtype=np.float32)
for sequence, length in TCR_counter.items():
TCR_array[pos:pos+length] = np.repeat(aamapping_TCR(sequence,aa_dict).reshape(1,80,5,1), length, axis=0)
pos += length
print('TCRMap done!')
return TCR_array
def HLAMap(dataset,encoding_method):
#Input a list of HLA and get a three dimentional array
pos=0
HLA_array = np.zeros((len(dataset), 34, 21), dtype=np.int8)
HLA_seen = dict()
for HLA in dataset:
if HLA not in HLA_seen.keys():
HLA_array[pos] = hla_encode(HLA,encoding_method).reshape(1,34,21)
HLA_seen[HLA] = HLA_array[pos]
else:
HLA_array[pos] = HLA_seen[HLA]
pos += 1
print('HLAMap done!')
return HLA_array
def antigenMap(dataset,maxlen,encoding_method):
#Input a list of antigens and get a three dimentional array
pos=0
antigen_array = np.zeros((len(dataset), maxlen, 21), dtype=np.int8)
antigens_seen = dict()
for antigen in dataset:
if antigen not in antigens_seen.keys():
antigen_array[pos]=peptide_encode_HLA(antigen, maxlen,encoding_method).reshape(1,maxlen,21)
antigens_seen[antigen] = antigen_array[pos]
else:
antigen_array[pos] = antigens_seen[antigen]
pos += 1
print('antigenMap done!')
return antigen_array
def pearson_correlation_f(y_true, y_pred):
fsp = y_pred - K.mean(y_pred) #being K.mean a scalar here, it will be automatically subtracted from all elements in y_pred
fst = y_true - K.mean(y_true)
devP = K.std(y_pred)
devT = K.std(y_true)
return K.mean(fsp*fst)/(devP*devT)
def pos_neg_acc(y_true,y_pred):
#self-defined prediction accuracy metric
positive_pred=y_pred[:,1]
negative_pred=y_pred[:,0]
diff=K.mean(K.cast(negative_pred<positive_pred,"float16"))
return diff
# the loss function ReLu(1 + f(p, T-) - f(p, T+)) shown in the paper is defined here:
def pos_neg_loss(y_true,y_pred):
#self-defined prediction loss function
positive_pred=y_pred[:,1]
negative_pred=y_pred[:,0]
diff=K.mean(K.relu(1+negative_pred-positive_pred))+0.2*K.mean(K.square(negative_pred)+K.square(positive_pred))
return diff
#########################################
# preprocess input data and do encoding #
#########################################
#Read data
#TCR Data preprocess
log_file=open(output_log_dir,'w')
sys.stdout=log_file
print('Mission loading.')
TCR_list,antigen_list,HLA_list=preprocess(file_dir)
TCR_array=TCRMap(TCR_list,aa_dict_atchley)
antigen_array=antigenMap(antigen_list,15,'BLOSUM50')
HLA_array=HLAMap(HLA_list,'BLOSUM50')
#Model prediction
TCR_encoder=load_model(model_dir+'/TCR_encoder_30.h5')
TCR_encoder=Model(TCR_encoder.input,TCR_encoder.layers[-12].output)
TCR_encoded_result=TCR_encoder.predict(TCR_array)
HLA_antigen_encoder=load_model(model_dir+'/HLA_antigen_encoder_60.h5',custom_objects={'pearson_correlation_f': pearson_correlation_f})
HLA_antigen_encoder=Model(HLA_antigen_encoder.input,HLA_antigen_encoder.layers[-2].output)
HLA_antigen_encoded_result=HLA_antigen_encoder.predict([antigen_array,HLA_array])
TCR_encoded_matrix=pd.DataFrame(data=TCR_encoded_result,index=range(1,len(TCR_list)+1))
HLA_antigen_encoded_matrix=pd.DataFrame(data=HLA_antigen_encoded_result,index=range(1,len(HLA_list)+1))
# TCR_encoded_matrix.to_csv(output_dir+'/TCR_output.csv',sep=',')
# HLA_antigen_encoded_matrix.to_csv(output_dir+'/MHC_antigen_output.csv',sep=',')
print('Encoding Accomplished.\n')
#########################################
# make prediction based on encoding #
#########################################
############## Load Prediction Model ################
#set up model
hla_antigen_in=Input(shape=(60,),name='hla_antigen_in')
pos_in=Input(shape=(30,),name='pos_in')
ternary_layer1_pos=concatenate([pos_in,hla_antigen_in])
ternary_dense1=Dense(300,activation='relu')(ternary_layer1_pos)
ternary_do1=Dropout(0.2)(ternary_dense1)
ternary_dense2=Dense(200,activation='relu')(ternary_do1)
ternary_dense3=Dense(100,activation='relu')(ternary_dense2)
ternary_output=Dense(1,activation='linear')(ternary_dense3)
ternary_prediction=Model(inputs=[pos_in,hla_antigen_in],outputs=ternary_output)
#load weights
ternary_prediction.load_weights(model_dir+'/weights.h5')
################ read dataset #################
#read background negative TCRs
# This way we don't need to save and reload the matrices and use double de memory.
TCR_neg_df_1k=pd.read_csv(library_dir+'/bg_tcr_library/TCR_output_1k.csv', names=pd.RangeIndex(0, 30,1), header=None, skiprows=1)
TCR_neg_df_10k=pd.read_csv(library_dir+'/bg_tcr_library/TCR_output_10k.csv', names=pd.RangeIndex(0, 30,1), header=None, skiprows=1)
# As of the state of the software this step looks redundant and a waste of memory as it is loading an object that is already in memory but using a new variable name
# TCR_pos_df=pd.read_csv(output_dir+'/TCR_output.csv',index_col=0)
# MHC_antigen_df=pd.read_csv(output_dir+'/MHC_antigen_output.csv',index_col=0)
################ make prediction #################
rank_output=[]
for each_data_index in range(TCR_encoded_matrix.shape[0]):
tcr_pos=TCR_encoded_matrix.iloc[[each_data_index,]]
pmhc=HLA_antigen_encoded_matrix.iloc[[each_data_index,]]
#used the positive pair with 1k negative tcr to form a 1001 data frame for prediction
TCR_input_df=pd.concat([tcr_pos,TCR_neg_df_1k],axis=0)
MHC_antigen_input_df= pd.DataFrame(np.repeat(pmhc.values,1001,axis=0))
prediction=ternary_prediction.predict({'pos_in':TCR_input_df,'hla_antigen_in':MHC_antigen_input_df})
rank=1-(sorted(prediction.tolist()).index(prediction.tolist()[0])+1)/1000
#if rank is higher than top 2% use 10k background TCR
if rank<0.02:
TCR_input_df=pd.concat([tcr_pos,TCR_neg_df_10k],axis=0)
MHC_antigen_input_df= pd.DataFrame(np.repeat(pmhc.values,10001,axis=0))
prediction=ternary_prediction.predict({'pos_in':TCR_input_df,'hla_antigen_in':MHC_antigen_input_df})
rank=1-(sorted(prediction.tolist()).index(prediction.tolist()[0])+1)/10000
rank_output.append(rank)
rank_output_matrix=pd.DataFrame({'CDR3':TCR_list,'Antigen':antigen_list,'HLA':HLA_list,'Rank':rank_output},index=range(1,len(TCR_list)+1))
rank_output_matrix.to_csv(output_dir+'/prediction.csv',sep=',', index=False)
print('Prediction Accomplished.\n')
log_file.close()
#delete encoding files
#os.remove(output_dir+'/MHC_antigen_output.csv')
#os.remove(output_dir+'/TCR_output.csv')