forked from cgmartini/martinize.py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMAIN.py
456 lines (398 loc) · 21.3 KB
/
MAIN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
#############
## 8 # MAIN # -> @MAIN <-
#############
import sys, logging, random, math, os, re
import IO, TOP, DOC, ELN, FUNC, MAP
def main(options):
# Check whether to read from a gro/pdb file or from stdin
# We use an iterator to wrap around the stream to allow
# inferring the file type, without consuming lines already
inStream = IO.streamTag(options["-f"] and options["-f"].value or sys.stdin)
# The streamTag iterator first yields the file type, which
# is used to specify the function for reading frames
fileType = inStream.next()
if fileType == "GRO":
frameIterator = IO.groFrameIterator
else:
frameIterator = IO.pdbFrameIterator
# ITERATE OVER FRAMES IN STRUCTURE FILE #
# Now iterate over the frames in the stream
# This should become a StructureFile class with a nice .next method
model = 1
cgOutPDB = None
ssTotal = []
cysteines = []
for title, atoms, box in frameIterator(inStream):
if fileType == "PDB":
# The PDB file can have chains, in which case we list and process them specifically
# TER statements are also interpreted as chain separators
# A chain may have breaks in which case the breaking residues are flagged
chains = [IO.Chain(options, [i for i in IO.residues(chain)]) for chain in IO.pdbChains(atoms)]
else:
# The GRO file does not define chains. Here breaks in the backbone are
# interpreted as chain separators.
residuelist = [residue for residue in IO.residues(atoms)]
# The breaks are indices to residues
broken = IO.breaks(residuelist)
# Reorder, such that each chain is specified with (i,j,k)
# where i and j are the start and end of the chain, and
# k is a chain identifier
chains = zip([0]+broken, broken+[len(residuelist)], range(len(broken)+1))
chains = [IO.Chain(options, residuelist[i:j], name=chr(65+k)) for i, j, k in chains]
for chain in chains:
chain.multiscale = "all" in options['multi'] or chain.id in options['multi']
# Check the chain identifiers
if model == 1 and len(chains) != len(set([i.id for i in chains])):
# Ending down here means that non-consecutive blocks of atoms in the
# PDB file have the same chain ID. The warning pertains to PDB files only,
# since chains from GRO files get a unique chain identifier assigned.
logging.warning("Several chains have identical chain identifiers in the PDB file.")
# Check if chains are of mixed type. If so, split them.
# Note that in some cases HETATM residues are part of a
# chain. This will get problematic. But we cannot cover
# all, probably.
if not options['MixedChains']:
demixedChains = []
for chain in chains:
demixedChains.extend(chain.split())
chains = demixedChains
n = 1
logging.info("Found %d chains:" % len(chains))
for chain in chains:
logging.info(" %2d: %s (%s), %d atoms in %d residues." % (n, chain.id, chain._type, chain.natoms, len(chain)))
n += 1
# Check all chains
keep = []
for chain in chains:
if chain.type() == "Water":
logging.info("Removing %d water molecules (chain %s)." % (len(chain), chain.id))
elif chain.type() in ("Protein", "Nucleic"):
keep.append(chain)
# This is currently not active:
elif options['RetainHETATM']:
keep.append(chain)
else:
logging.info("Removing HETATM chain %s consisting of %d residues." % (chain.id, len(chain)))
chains = keep
# Here we interactively check the charge state of resides
# Can be easily expanded to residues other than HIS
for chain in chains:
for i, resname in enumerate(chain.sequence):
if resname == 'HIS' and options['chHIS']:
choices = {0: 'HIH', 1: 'HIS'}
choice = IO.getChargeType(resname, i, choices)
chain.sequence[i] = choice
# Check which chains need merging
if model == 1:
order, merge = IO.check_merge(chains, options['mergeList'], options['linkList'], options['CystineCheckBonds'] and options['CystineMaxDist2'])
# Get the total length of the sequence
seqlength = sum([len(chain) for chain in chains])
logging.info('Total size of the system: %s residues.' % seqlength)
## SECONDARY STRUCTURE
ss = ''
if options['Collagen']:
for chain in chains:
chain.set_ss("F")
ss += chain.ss
elif options["-ss"]:
# XXX We need error-catching here,
# in case the file doesn't excist, or the string contains bogus.
# If the string given for the sequence consists strictly of upper case letters
# and does not appear to be a file, assume it is the secondary structure
ss = options["-ss"].value.replace('~', 'L').replace(' ', 'L')
if ss.isalnum() and ss.isupper() and not os.path.exists(options["-ss"].value):
ss = options["-ss"].value
logging.info('Secondary structure read from command-line:\n'+ss)
else:
# There ought to be a file with the name specified
ssfile = [i.strip() for i in open(options["-ss"].value)]
# Try to read the file as a Gromacs Secondary Structure Dump
# Those have an integer as first line
if ssfile[0].isdigit():
logging.info('Will read secondary structure from file (assuming Gromacs ssdump).')
ss = "".join([i for i in ssfile[1:]])
else:
# Get the secondary structure type from DSSP output
logging.info('Will read secondary structure from file (assuming DSSP output).')
pss = re.compile(r"^([ 0-9]{4}[0-9]){2}")
ss = "".join([i[16] for i in open(options["-ss"].value) if re.match(pss, i)])
# Now set the secondary structure for each of the chains
sstmp = ss
for chain in chains:
assert len(chain) <= len(sstmp) or len(sstmp) == 1, 'ss structure is not the right length'
ln = min(len(sstmp), len(chain))
chain.set_ss(sstmp[:ln])
sstmp = sstmp[ln:]
else:
if options["-dssp"]:
method, executable = "dssp", options["-dssp"].value
#elif options["-pymol"]:
# method, executable = "pymol", options["-pymol"].value
else:
logging.warning("No secondary structure or determination method speficied. Protein chains will be set to 'COIL'.")
method, executable = None, None
for chain in chains:
ss += chain.dss(method, executable)
# Used to be: if method in ("dssp","pymol"): but pymol is not supported
if method in ["dssp"]:
logging.debug('%s determined secondary structure:\n' % method.upper()+ss)
# Collect the secondary structure classifications for different frames
ssTotal.append(ss)
# Write the coarse grained structure if requested
if options["-x"].value:
logging.info("Writing coarse grained structure.")
if cgOutPDB is None:
cgOutPDB = open(options["-x"].value, "w")
cgOutPDB.write("MODEL %8d\n" % model)
cgOutPDB.write(title)
cgOutPDB.write(IO.pdbBoxString(box))
atid = 1
for i in order:
ci = chains[i]
if ci.multiscale:
for r in ci.residues:
for name, resn, resi, chain, x, y, z in r:
cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid))
atid += 1
coarseGrained = ci.cg(com=True)
if coarseGrained:
for name, resn, resi, chain, x, y, z, ssid in coarseGrained:
if ci.multiscale:
name = "v"+name
cgOutPDB.write(IO.pdbOut((name, resn[:3], resi, chain, x, y, z),i=atid,ssid=ssid))
atid += 1
cgOutPDB.write("TER\n")
else:
logging.warning("No mapping for coarse graining chain %s (%s); chain is skipped." % (ci.id, ci.type()))
cgOutPDB.write("ENDMDL\n")
# Gather cysteine sulphur coordinates
cyslist = [cys["SG"] for chain in chains for cys in chain["CYS"]]
cysteines.append([cys for cys in cyslist if cys])
model += 1
# Write the index file if requested.
# Mainly of interest for multiscaling.
# Could be improved by adding separte groups for BB, SC, etc.
if options["-n"].value:
logging.info("Writing index file.")
# Lists for All-atom, Virtual sites and Coarse Grain.
NAA, NVZ, NCG = [], [], []
atid = 1
for i in order:
ci = chains[i]
coarseGrained = ci.cg(force=True)
if ci.multiscale:
NAA.extend([" %5d" % (a+atid) for a in range(ci.natoms)])
atid += ci.natoms
if coarseGrained:
if ci.multiscale:
NVZ.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
else:
NCG.extend([" %5d" % (a+atid) for a in range(len(coarseGrained))])
atid += len(coarseGrained)
outNDX = open(options["-n"].value, "w")
outNDX.write("\n[ AA ]\n"+"\n".join([" ".join(NAA[i:i+15]) for i in range(0, len(NAA), 15)]))
outNDX.write("\n[ VZ ]\n"+"\n".join([" ".join(NVZ[i:i+15]) for i in range(0, len(NVZ), 15)]))
outNDX.write("\n[ CG ]\n"+"\n".join([" ".join(NCG[i:i+15]) for i in range(0, len(NCG), 15)]))
outNDX.close()
# Write the index file for mapping AA trajectory if requested
if options["-nmap"].value:
logging.info("Writing trajectory index file.")
atid = 1
outNDX = open(options["-nmap"].value, "w")
# Get all AA atoms as lists of atoms in residues
# First we skip hetatoms and unknowns then iterate over beads
# In DNA the O3' atom is mapped together with atoms from the next residue
# This stores it until we get to the next residue
o3_shift = ''
for i_count, i in enumerate(IO.residues(atoms)):
if i[0][1] in ("SOL", "HOH", "TIP"):
continue
if not i[0][1] in MAP.CoarseGrained.mapping.keys():
continue
nra = 0
names = [j[0] for j in i]
# This gives out a list of atoms in residue, each tuple has other
# stuff in it that's needed elsewhere so we just take the last
# element which is the atom index (in that residue)
for j_count, j in enumerate(MAP.mapIndex(i)):
outNDX.write('[ Bead %i of residue %i ]\n' % (j_count+1, i_count+1))
line = ''
for k in j:
if names[k[2]] == "O3'":
line += '%s ' % (str(o3_shift))
o3_shift = k[2]+atid
else:
line += '%i ' % (k[2]+atid)
line += '\n'
nra += len(j)
outNDX.write(line)
atid += nra
# Evertything below here we only need, if we need to write a Topology
if options['-o']:
# Collect the secondary structure stuff and decide what to do with it
# First rearrange by the residue
ssTotal = zip(*ssTotal)
ssAver = []
for i in ssTotal:
si = list(set(i))
if len(si) == 1:
# Only one type -- consensus
ssAver.append(si[0])
else:
# Transitions between secondary structure types
i = list(i)
si = [(1.0*i.count(j)/len(i), j) for j in si]
si.sort()
if si[-1][0] > options["-ssc"].value:
ssAver.append(si[-1][1])
else:
ssAver.append(" ")
ssAver = "".join(ssAver)
logging.info('(Average) Secondary structure has been determined (see head of .itp-file).')
# Divide the secondary structure according to the division in chains
# This will set the secondary structure types to be used for the
# topology.
for chain in chains:
chain.set_ss(ssAver[:len(chain)])
ssAver = ssAver[len(chain):]
# Now the chains are complete, each consisting of a residuelist,
# and a secondary structure designation if the chain is of type 'Protein'.
# There may be mixed chains, there may be HETATM things.
# Water has been discarded. Maybe this has to be changed at some point.
# The order in the coarse grained files matches the order in the set of chains.
#
# If there are no merges to be done, i.e. no global Elnedyn network, no
# disulphide bridges, no links, no distance restraints and no explicit merges,
# then we can write out the topology, which will match the coarse grained file.
#
# If there are merges to be done, the order of things may be changed, in which
# case the coarse grained structure will not match with the topology...
# CYSTINE BRIDGES #
# Extract the cysteine coordinates (for all frames) and the cysteine identifiers
if options['CystineCheckBonds']:
logging.info("Checking for cystine bridges, based on sulphur (SG) atoms lying closer than %.4f nm" % math.sqrt(options['CystineMaxDist2']/100))
cyscoord = zip(*[[j[4:7] for j in i] for i in cysteines])
cysteines = [i[:4] for i in cysteines[0]]
bl, kb = options['ForceField'].special[(("SC1", "CYS"), ("SC1", "CYS"))]
# Check the distances and add the cysteines to the link list if the
# SG atoms have a distance smaller than the cutoff.
rlc = range(len(cysteines))
for i in rlc[:-1]:
for j in rlc[i+1:]:
# Checking the minimum distance over all frames
# But we could also take the maximum, or the mean
d2 = min([FUNC.distance2(a, b) for a, b in zip(cyscoord[i], cyscoord[j])])
if d2 <= options['CystineMaxDist2']:
a, b = cysteines[i], cysteines[j]
options['linkListCG'].append((("SC1", "CYS", a[2], a[3]), ("SC1", "CYS", b[2], b[3]), bl, kb))
a, b = (a[0], a[1], a[2]-(32 << 20), a[3]), (b[0], b[1], b[2]-(32 << 20), b[3])
logging.info("Detected SS bridge between %s and %s (%f nm)" % (a, b, math.sqrt(d2)/10))
# REAL ITP STUFF #
# Check whether we have identical chains, in which case we
# only write the ITP for one...
# This means making a distinction between chains and
# moleculetypes.
molecules = [tuple([chains[i] for i in j]) for j in merge]
# At this point we should have a list or dictionary of chains
# Each chain should be given a unique name, based on the value
# of options["-o"] combined with the chain identifier and possibly
# a number if there are chains with identical identifiers.
# For each chain we then write an ITP file using the name for
# moleculetype and name + ".itp" for the topology include file.
# In addition we write a master topology file, using the value of
# options["-o"], with an added extension ".top" if not given.
# XXX *NOTE*: This should probably be gathered in a 'Universe' class
itp = 0
moleculeTypes = {}
for mi in range(len(molecules)):
mol = molecules[mi]
# Check if the moleculetype is already listed
# If not, generate the topology from the chain definition
if mol not in moleculeTypes or options['SeparateTop']:
# Name of the moleculetype
# XXX: The naming should be changed; now it becomes Protein_X+Protein_Y+...
name = "+".join([chain.getname(options['-name'].value) for chain in mol])
moleculeTypes[mol] = name
# Write the molecule type topology
top = TOP.Topology(mol[0], options=options, name=name)
for m in mol[1:]:
top += TOP.Topology(m, options=options)
# Have to add the connections, like the connecting network
# Gather coordinates
mcg, coords = zip(*[(j[:4], j[4:7]) for m in mol for j in m.cg(force=True)])
mcg = list(mcg)
# Run through the link list and add connections (links = cys bridges or hand specified links)
for atomA, atomB, bondlength, forceconst in options['linkListCG']:
if bondlength == -1 and forceconst == -1:
bondlength, forceconst = options['ForceField'].special[(atomA[:2], atomB[:2])]
# Check whether this link applies to this group
atomA = atomA in mcg and mcg.index(atomA)+1
atomB = atomB in mcg and mcg.index(atomB)+1
if atomA and atomB:
cat = (forceconst is None) and "Constraint" or "Link"
top.bonds.append(TOP.Bond(
(atomA, atomB),
options = options,
type = 1,
parameters = (bondlength, forceconst),
category = cat,
comments = "Cys-bonds/special link"))
# Elastic Network
# The elastic network is added after the topology is constructed, since that
# is where the correct atom list with numbering and the full set of
# coordinates for the merged chains are available.
if options['ElasticNetwork']:
rubberType = options['ForceField'].EBondType
rubberList = ELN.rubberBands(
[(i[0], j) for i, j in zip(top.atoms, coords) if i[4] in options['ElasticBeads']],
options['ElasticLowerBound'], options['ElasticUpperBound'],
options['ElasticDecayFactor'], options['ElasticDecayPower'],
options['ElasticMaximumForce'], options['ElasticMinimumForce'])
top.bonds.extend([TOP.Bond(i, options=options, type=rubberType, category="Rubber band") for i in rubberList])
# Write out the MoleculeType topology
destination = options["-o"] and open(moleculeTypes[mol]+".itp", 'w') or sys.stdout
destination.write(str(top))
itp += 1
# Check whether other chains are equal to this one
# Skip this step if we are to write all chains to separate moleculetypes
if not options['SeparateTop']:
for j in range(mi+1, len(molecules)):
if not molecules[j] in moleculeTypes and mol == molecules[j]:
# Molecule j is equal to a molecule mi
# Set the name of the moleculetype to the one of that molecule
moleculeTypes[molecules[j]] = moleculeTypes[mol]
logging.info('Written %d ITP file%s' % (itp, itp > 1 and "s" or ""))
# WRITING THE MASTER TOPOLOGY
# Output stream
top = options["-o"] and open(options['-o'].value, 'w') or sys.stdout
# ITP file listing
itps = '\n'.join(['#include "%s.itp"' % molecule for molecule in set(moleculeTypes.values())])
# Molecule listing
logging.info("Output contains %d molecules:" % len(molecules))
n = 1
for molecule in molecules:
chainInfo = (n, moleculeTypes[molecule], len(molecule) > 1 and "s" or " ", " ".join([i.id for i in molecule]))
logging.info(" %2d-> %s (chain%s %s)" % chainInfo)
n += 1
molecules = '\n'.join(['%s \t 1' % moleculeTypes[molecule] for molecule in molecules])
# Set a define if we are to use rubber bands
useRubber = options['ElasticNetwork'] and "#define RUBBER_BANDS" or ""
# XXX Specify a better, version specific base-itp name.
# Do not set a define for position restrains here, as people are more used to do it in mdp file?
top.write(
'''#include "martini.itp"
%s
%s
[ system ]
; name
Martini system from %s
[ molecules ]
; name number
%s''' % (useRubber, itps, options["-f"] and options["-f"].value or "stdin", molecules))
logging.info('Written topology files')
# Maybe there are forcefield specific log messages?
options['ForceField'].messages()
# The following lines are always printed (if no errors occur).
print "\n\tThere you are. One MARTINI. Shaken, not stirred.\n"
Q = DOC.martiniq.pop(random.randint(0, len(DOC.martiniq)-1))
print "\n", Q[1], "\n%80s" % ("--"+Q[0]), "\n"