"""Load a Reaxys SFF release (concatenated, gzipped SD files) into PostgreSQL.

Each *.sdf.gz file in the current directory is parsed record by record,
converted to SMILES with RDKit where possible, and bulk-inserted into the
reaxys_sff_temp schema, which replaces reaxys_sff once the load completes.
"""
import glob
import gzip
import os
import time
from pathlib import Path

from psycopg2.extensions import AsIs

# RDKit, for converting a Mol to SMILES
from rdkit import Chem
from rdkit import RDLogger

from dbconnect import getConnection, psql_cmd
index = 0  # rmc file release number, parsed from each input filename
mydir = os.path.dirname(os.path.realpath(__file__))
CHUNKSIZE = 50000  # number of queued insert statements per database flush
TEST_MODE = False  # when True, print the generated SQL instead of writing it
# Controls whether the original SD file text is stored. Storing it might be
# useful when the conversion to SMILES failed, although such records will
# not be searchable by structure anyway.
STORE_SDFILE = False

hashset = set()      # XRNs (and statement texts) seen so far, for duplicate reporting
insertcache = set()  # mogrified insert statements waiting to be flushed
counts = {'molecule': 0, 'other': 0}
def readnextSDfile(file, conn):
    """Read one SD record (molblock plus data items) from the open file."""
    sdfile = ''
    tags = {}
    # read the molblock; readline() returns '' only at EOF
    blankcount = 0
    while True:
        line = file.readline()
        if line == '':
            blankcount += 1
            if blankcount > 3:
                return None
        sdfile += line
        if line.startswith('M  END'):
            break
    # read the data items up to the record terminator
    while True:
        line = file.readline()
        if line == '':  # EOF before the '$$$$' terminator: stop cleanly
            break
        if line.startswith('$$$$'):
            break
        if line.startswith('>'):
            data = ''
            # the header looks like '>  <XXX.FIELD>'; [8:-1] drops the
            # '>  <XXX.' group prefix and the trailing '>'
            tag = line.strip()[8:-1]
            while True:
                d = file.readline()
                if d.strip() == '':  # data can be multi-line; a blank line ends it
                    break
                data += d[:-1]
            tags[tag] = data
    if STORE_SDFILE:
        tags['sdfile'] = sdfile
    smiles = createSmiles(sdfile)
    if smiles:
        tags['smiles'] = smiles
    tags['rx_file_id'] = index  # tag the file this came from, for debugging
    if 'RIGHT' in tags:  # the COPYRIGHT field parses to 'RIGHT'; drop it
        del tags['RIGHT']
    return tags
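
# For orientation, readnextSDfile() expects standard SD records of this
# shape (the field names shown are illustrative, not taken from a real file):
#
#   ...molblock header and atom/bond tables...
#   M  END
#   >  <IDE.XRN>
#   1234567
#
#   >  <IDE.CN>
#   ethanol
#
#   $$$$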
# centralized molblock -> SMILES conversion
def createSmiles(molblock):
    """Convert a molblock to canonical isomeric SMILES; '' if conversion fails."""
    smiles = ''
    if molblock is None or molblock == '':
        return smiles
    try:
        mol = Chem.MolFromMolBlock(molblock, strictParsing=False, sanitize=True)
        if mol is not None:
            smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
    except Exception:
        pass  # leave smiles empty; the record is stored without a structure
    return smiles
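
# Quick sanity check (illustrative, assumes RDKit is installed):
#   >>> from rdkit import Chem
#   >>> createSmiles(Chem.MolToMolBlock(Chem.MolFromSmiles('CCO')))
#   'CCO'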
def initdb(conn):
    """Create the reaxys_sff_temp schema from the sff_schema script."""
    with open(mydir + '/sff_schema', 'r') as f:
        sql = f.read()
    print('initializing reaxys_sff')
    with conn.cursor() as cur:
        cur.execute(sql)
    conn.commit()

def indexdb(conn):
    """Build the indexes from the sff_index script."""
    with open(mydir + '/sff_index', 'r') as f:
        sql = f.read()
    print('indexing reaxys_sff')
    with conn.cursor() as cur:
        cur.execute(sql)
    conn.commit()
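
# The sff_schema and sff_index scripts live next to this file and are not
# shown here. A minimal sketch of what sff_schema must provide for the
# inserts below to work (the column types, and any columns beyond those the
# code references, are assumptions):
#   create schema reaxys_sff_temp;
#   create table reaxys_sff_temp.version (version text);
#   create table reaxys_sff_temp.sff (XRN text, smiles text, rx_file_id int, ...);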
def readsdfile(fname, conn):
    """Read all of the individual SD records from one concatenated SD file."""
    print('readsdfiles: ', fname)
    # silence RDKit warnings about dubious molblocks
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)
    sql = 'insert into reaxys_sff_temp.sff (%s) values %s;'
    with gzip.open(fname, 'rt') as file:
        # loop over the concatenated records
        while True:
            sdfrecord = readnextSDfile(file, conn)
            if sdfrecord is None:
                break
            writerecord(conn, sql, sdfrecord)
    flush(conn)
def writerecord(conn, sql, data):
    """Queue an SDFile record for insertion into the database."""
    global hashset
    global insertcache
    with conn.cursor() as cur:
        columns = data.keys()
        values = [data[column] for column in columns]
        # mogrify fills in the column list and the value tuple client-side
        cmd = cur.mogrify(sql, (AsIs(','.join(columns)), tuple(values))).decode('utf-8')
    if TEST_MODE:
        print("--start record")
        print(cmd)
        print("--end record")
    if 'XRN' in columns:
        hashdata = data['XRN']
        rectype = 'molecule'
    else:
        hashdata = 'o' + cmd
        rectype = 'other'
    if hashdata in hashset:
        # duplicates are reported but still queued; identical statements
        # collapse anyway because insertcache is a set
        print('** duplicate xrn', hashdata)
    hashset.add(hashdata)
    insertcache.add(cmd)
    counts[rectype] += 1
    if len(insertcache) > CHUNKSIZE and not TEST_MODE:
        flush(conn)
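
# For reference, cur.mogrify() in writerecord() yields one complete, quoted
# statement per record, e.g. (columns and values illustrative):
#   insert into reaxys_sff_temp.sff (XRN,smiles) values ('1234567', 'CCO');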
def flush(conn):
    """Flush the statement cache to the database."""
    global insertcache
    if TEST_MODE:
        # in test mode, report and discard instead of writing to the database
        print('flushed cache of ', len(insertcache))
        insertcache.clear()
        return
    statement = '\n'.join(insertcache)
    with conn.cursor() as cur:
        if len(statement) > 1:
            cur.execute(statement)
        conn.commit()
    insertcache.clear()
def readsdfiles():
    """Read the SD files. This requires special functions because these are
    not XML files.
    """
    global insertcache
    global index
    conn = getConnection()
    initdb(conn)
    # the name of the current directory is the release version
    version = Path.cwd().name
    print('loading version', version)
    with conn.cursor() as cur:
        cur.execute('insert into reaxys_sff_temp.version (version) values (%s);', (version,))
    conn.commit()
    key = "_"
    file_list = sorted(glob.glob('*.sdf.gz'))
    numfiles = len(file_list)
    for i, filepath in enumerate(file_list):
        start = time.time()
        # the file index is the number between the first '_' and '.sdf.gz'
        name = os.path.basename(filepath)
        index = int(name[name.find(key) + len(key):-7])
        print('file index', index, 'of', numfiles)
        oldlen = len(hashset)
        readsdfile(filepath, conn)
        new = len(hashset) - oldlen
        elapsed = time.time() - start
        remaining = '%5.2f' % ((numfiles - i - 1) * elapsed)
        elapsed = '%5.2f' % elapsed
        print('\ttook:', elapsed, 'remaining:', remaining, 'inserts:', counts, 'new:', new)
    conn.commit()
    indexdb(conn)
    conn.close()
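
# The loader is meant to run from inside a release directory; an assumed
# layout (names hypothetical, only the trailing '_<n>.sdf.gz' suffix matters):
#   releases/2024_04/          <- directory name is recorded as the version
#       rmcfile_1.sdf.gz
#       rmcfile_2.sdf.gz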
if __name__ == '__main__':
    readsdfiles()
    # swap the freshly loaded schema into place of the old one
    print(psql_cmd('drop schema reaxys_sff cascade'))
    print(psql_cmd('alter schema reaxys_sff_temp rename to reaxys_sff'))
    print("complete")