-
Notifications
You must be signed in to change notification settings - Fork 0
/
hepc_python.py
executable file
·324 lines (242 loc) · 10.6 KB
/
hepc_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#@author: Jerome PIVERT
#@date: 17/03/2014
#@resume: class for colors (ASCII code) used for coloring terminal output such as print
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
import sqlite3 # for the DB
import sys, os # magical import for magical and usefull functions
import glob # BEST MODULE EVER.
import csv
from collections import defaultdict
import networkx as nx # graph interaction
import matplotlib.pyplot as plt # plot plot plotipiplot
#################
# Miscellaneous #
#################
#CHECK IF DB FILE EXISTS
def exists(path):
return os.path.isfile(path)
#OPEN AND CLOSE CONNECTION TO DB
def connection(path):
"""Return the connection and the cursor for doing
some stuff in the db
path -String- absolute PATH of your db
"""
db = path #open 'database'
return sqlite3.connect(db), sqlite3.connect(db).cursor() #connect to current database
def closeConnection(cursor, connection):
"""Close the connection of the db to stop doing
some stuff
cursor -Object- cursor to close
connection -Object- connection to close
"""
cursor.close()
connection.close()
###############
# DB creation #
###############
#INIT AND FILL DATABASE WITH RAW_DATA FILES
def initDatabase(databaseName, path):
"""Create the database, you need to specify the name of your base and the path for your data
databaseName -String- name of your db
path -String- path to your project
"""
connect, cursor = connection(path+"/"+databaseName)
#cursor = connect.cursor() #creates a cursor, this allow me to cancel my actions until I commit
dirname = path+"/data/*"
for i in glob.iglob(dirname):
tname = os.path.splitext(os.path.basename(i))[0]
print("Processing FILE " + bcolors.HEADER + os.path.basename(i) + bcolors.ENDC + " ...")
cursor.execute("CREATE TABLE IF NOT EXISTS " + tname + """(
SourceId VARCHAR(10),
SourceLabel VARCHAR(250),
SourceEntityType VARCHAR(1),
EdgeLabel VARCHAR(250),
TargetId VARCHAR(250),
TargetLabel VARCHAR(250),
TargetEntityType VARCHAR(1),
PUBMED_ID VARCHAR(8),
nsent INT,
ntagged_nsent INT,
nsent_nrelation INT,
Period VARCHAR(10))"""
)
#fill TABLES
with open(i, "r") as f:
f = csv.DictReader(f, delimiter="\t")
for row in f:
insertstr = "INSERT INTO " +tname+ " VALUES(" + "\"" +row["SourceId"]+ "\"" + "," + "\"" +row["SourceLabel"]+ "\"" + "," + "\"" +row["SourceEntityType"]+ "\"" + "," + "\"" +row["EdgeLabel"]+ "\"" + "," + "\"" +row["TargetId"]+ "\"" + "," + "\"" +row["TargetLabel"]+ "\"" + "," + "\"" +row["TargetEntityType"]+ "\"" + "," + "\"" +row["PUBMED_ID"]+ "\"" + "," +row["nsent"]+ "," +row["ntagged_nsent"]+ "," +row["nsent_nrelation"]+ "," + "\"" +row["period"]+ "\""")"
cursor.execute(insertstr)
#Force new empty table for some tests
cursor.execute("CREATE TABLE IF NOT EXISTS events_0000""""(
SourceId VARCHAR(10),
SourceLabel VARCHAR(250),
SourceEntityType VARCHAR(1),
EdgeLabel VARCHAR(250),
TargetId VARCHAR(250),
TargetLabel VARCHAR(250),
TargetEntityType VARCHAR(1),
PUBMED_ID VARCHAR(8),
nsent INT,
ntagged_nsent INT,
nsent_nrelation INT,
Period VARCHAR(10))"""
)
print(bcolors.OKGREEN + "success" + bcolors.ENDC)
connect.commit()
closeConnection(cursor, connect)
#####################
# PUBLIC FUNCTIONS #
#####################
#BESOIN 1
def getProtAssoc(databaseName, path, idProt="Hepcidin"):
"""Return the associated protein to the idProt and thus by year
databaseName -String- name of the file wich contains the db
path -String- PATH where the db file is located
idProt -String- name of the protein to look for assoc, default is Hepcdidin
"""
connect, cursor = connection(path+"/"+databaseName)
#cursor = connect.cursor()
#PRINT SOME INFORMATIONS
print("SQL: SELECT DISTINCT LOWER(TargetLabel) FROM "+bcolors.HEADER+"tname"+bcolors.ENDC+" WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+bcolors.HEADER+idProt+bcolors.ENDC+"%\") AND LOWER(TargetEntityType)=LOWER(\"p\") ORDER BY Period")
print("ProtID querry: "+bcolors.HEADER+idProt+bcolors.ENDC)
#DO THE MATHS
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") #get all tables names
for ttuples in cursor.fetchall():
tname = ttuples[0]
print("Searching assoc in " +bcolors.HEADER+tname+bcolors.ENDC+ " ...")
sqlstr = "SELECT DISTINCT LOWER(TargetLabel) FROM " +tname+ " WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+idProt+"%\") AND LOWER(TargetEntityType)=LOWER(\"p\") ORDER BY Period"
cursor.execute(sqlstr)
#FILE WRITING
with open(path+"/requestResult/"+idProt+"_protAssoc_"+tname+".txt", "w") as f:
for elements in cursor.fetchall():
f.write(elements[0]+"\n")
connect.commit()
closeConnection(cursor, connect)
#BESOIN 2
def getDiseaseAssoc(databaseName, path, idProt="Hepcidin"):
"""Return the associated disease to the idProt and thus by year
databaseName -String- name of the file wich contains the db
path -String- PATH to where the db file is located
idProt -String- name of the protein to look for assoc, default is Hepcdidin
"""
connect, cursor = connection(path+"/"+databaseName)
#cursor = connect.cursor()
#PRINT SOME INFORMATIONS
print("SQL: SELECT DISTINCT LOWER(TargetLabel) FROM "+bcolors.HEADER+"tname"+bcolors.ENDC+" WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+bcolors.HEADER+idProt+bcolors.ENDC+"%\") AND LOWER(SourceEntityType)=LOWER(\"p\") AND LOWER(TargetEntityType)=LOWER(\"i\")")
print("ProtID querry: "+bcolors.HEADER+idProt+bcolors.ENDC)
#DO THE MATHS
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
for ttuples in cursor.fetchall():
tname = ttuples[0]
print("Searching assoc in " +bcolors.HEADER+tname+bcolors.ENDC+ " ...")
sqlstr = "SELECT DISTINCT LOWER(TargetLabel) FROM " +tname+ " WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+idProt+"%\") AND LOWER(SourceEntityType)=LOWER(\"p\") AND LOWER(TargetEntityType)=LOWER(\"i\")"
cursor.execute(sqlstr)
#FILE WRITING
with open(path+"/requestResult/"+idProt+"_diseaseAssoc_"+tname+".txt", "w") as f:
for elements in cursor.fetchall():
f.write(elements[0]+"\n")
connect.commit()
closeConnection(cursor, connect)
def getOccurMoreThanOnceProt(databaseName, path, idProt="Hepcidin"):
"""Return the proteins that can were detected in
at least 2 years and are associated with idProt protein
databaseName -String- name of the file which contain the db
path -String- PATH to where the db file is located
idProt -String- name of the protein to look for occur, default is Hepcidin
"""
connect, cursor = connection(path+"/"+databaseName)
#PRINT SOME INFORMATIONS
print("SQL: SELECT DISTINCT LOWER(TargetLabel) FROM "+bcolors.HEADER+"tname"+bcolors.ENDC+" WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+bcolors.HEADER+idProt+bcolors.ENDC+"%\") AND LOWER(TargetEntityType)=LOWER(\"p\") ORDER BY Period")
print("ProtID querry: "+bcolors.HEADER+idProt+bcolors.ENDC)
#DO THE MATHS
redondant = defaultdict(lambda:int(0))
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
for ttuples in cursor.fetchall():
tname = ttuples[0]
print("Searching assoc in " +bcolors.HEADER+tname+bcolors.ENDC+ " ...")
sqlstr = "SELECT DISTINCT LOWER(TargetLabel) FROM " +tname+ " WHERE LOWER(SourceLabel) LIKE LOWER(\"%"+idProt+"%\") AND LOWER(TargetEntityType)=LOWER(\"p\") ORDER BY Period"
cursor.execute(sqlstr)
for i in cursor.fetchall():
redondant[i[0]] += 1
#FILE WRITING
with open(path+"/requestResult/"+idProt+"_OccurMoreThanOnce.txt", "w") as f:
for key in redondant:
if(redondant[key] > 1):
f.write(key+"\n")
connect.commit()
closeConnection(cursor, connect)
def getPublicBackground(databaseName, path, idProt="Hepcidin"):
"""Returns the number of associated
databaseName -String- name of the file which contains the db
path -String- PATH to where the db file is located
idProt -String- name of the protein to look for publication background, default is Hepcidin
"""
connect, cursor = connection(path+"/"+databaseName)
#PRINT SOME INFORMATIONS
print("SQL: NOPE, don't need specific SQL here")
print("ProtID querry: "+bcolors.HEADER+idProt+bcolors.ENDC)
#DO THE MATHS
getOccurMoreThanOnceProt(databaseName, path, idProt)
getProtAssoc(databaseName, path, idProt)
background = set()
with open(path+"/requestResult/"+idProt+"_OccurMoreThanOnce.txt", "r") as fbckgrd:
for lines in fbckgrd:
background.add(lines.strip())
#Fill dict with all idProt associated protein by year
for fprotAssoc in glob.iglob(path+"/requestResult/"+idProt+"_protAssoc*"):
thisYear = set()
with open(fprotAssoc, "r") as f:
for lines in f:
thisYear.add(lines.strip())
intersect = thisYear.intersection(background) #present in both sets
#FILE WRITING
with open(path+"/requestResult/"+idProt+"_publiBackground.txt", "a+") as f:
if(len(thisYear) > 0):
f.write("Pour "+os.path.splitext(os.path.basename(fprotAssoc))[0]+" on retrouve un background de "+str(len(intersect))+" publications, soit "+str(float(len(intersect))*100.0/float(len(thisYear)))+"% des publications totales de cette annee\n")
else:
f.write("Pour "+os.path.splitext(os.path.basename(fprotAssoc))[0]+" on retrouve un background de "+str(len(intersect))+" publications, soit "+str(float(len(intersect))*100.0/float(1))+"% des publications totales de cette annee\n")
connect.commit()
closeConnection(cursor, connect)
#####################
# INTERACTION GRAPH #
#####################
def displayProteinAssociation(databaseName, path, idProt="Hepcidin"):
"""Draw the interaction graph for the given protein
databaseName -String- name of the DB file
path -String- PATH to where the db file is located
idProt -String- prot id to draw a graph of, default is Hepcidin
"""
#Generate and use the files to construct the datas
datas = list(set())
getProtAssoc(databaseName, path, idProt)
for fprotAssoc in sorted(glob.iglob(path+"/requestResult/"+idProt+"_protAssoc*")):
with open(fprotAssoc, "r") as f:
tmpSet = set()
for lines in f:
tmpSet.add(lines.strip())
datas.append(tmpSet)
datas.pop(0) #dont need this one
#Construct the graph
year = 2001
g = nx.Graph()
g.add_node(idProt[0])
for thisYear in datas:
for this in thisYear:
if this not in g.nodes():
g.add_node(this)
g.add_edge(this, idProt[0])
plt.axis("off")
plt.title(idProt+" associated protein in " +str(year)+ ": " +str(len(g.nodes()))+ " related proteins")
nx.draw(g)
plt.savefig(path+"/"+idProt+"interaction_graph_"+str(year)+".png")
#plt.show()
print(bcolors.OKGREEN+str(year)+": graph done; "+str(len(g.nodes()))+" proteins"+bcolors.ENDC)
year+=1