-
Notifications
You must be signed in to change notification settings - Fork 653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implementing gemmi
-based mmcif reader (with easy extension to PDB/PDBx and mmJSON)
#4712
base: develop
Are you sure you want to change the base?
Changes from 14 commits
aa2a88f
218cf43
7f78e02
6682d6e
817f3a0
3cc8c80
f1bf325
d21c220
2a1be15
77645e6
91e6942
9a0c086
9c731df
8b40ec7
bdcbd73
401a4d3
9c336bd
def88e4
cabfd37
4c9d930
184491a
3de8565
ca6ebbb
45077ad
9a1a59a
27c10d6
950cfcf
8d1a8b5
ef29338
caca17e
f0e49cc
b7ada7c
e80632c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import numpy as np | ||
import gemmi | ||
import logging | ||
from . import base | ||
|
||
logger = logging.getLogger("MDAnalysis.coordinates.MMCIF") | ||
|
||
|
||
class MMCIFReader(base.SingleFrameReaderBase): | ||
"""Reads from an MMCIF file""" | ||
|
||
format = "MMCIF" | ||
units = {"time": None, "length": "Angstrom"} | ||
|
||
def _read_first_frame(self): | ||
structure = gemmi.read_structure(self.filename) | ||
coords = np.array( | ||
[ | ||
[*at.pos.tolist()] | ||
for model in structure | ||
for chain in model | ||
for res in chain | ||
for at in res | ||
] | ||
) | ||
self.n_atoms = len(coords) | ||
self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs) | ||
self.ts.frame = 0 | ||
|
||
def Writer(self, filename, n_atoms=None, **kwargs): | ||
raise NotImplementedError | ||
|
||
def close(self): | ||
pass | ||
|
||
|
||
class MMCIFWriter(base.WriterBase): | ||
pass |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
""" | ||
MMCIF Topology Parser # | ||
=================== | ||
""" | ||
|
||
import gemmi | ||
import numpy as np | ||
import warnings | ||
import itertools | ||
|
||
from ..core.topology import Topology | ||
from ..core.topologyattrs import ( | ||
AltLocs, | ||
Atomids, | ||
Atomnames, | ||
Atomtypes, | ||
ChainIDs, | ||
Elements, | ||
FormalCharges, | ||
ICodes, | ||
Masses, | ||
Occupancies, | ||
RecordTypes, | ||
Resids, | ||
Resnames, | ||
Resnums, | ||
Segids, | ||
Tempfactors, | ||
) | ||
from .base import TopologyReaderBase | ||
|
||
|
||
def _into_idx(arr: list[int]) -> list[int]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Document what this does, ideally with an example |
||
return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group] | ||
|
||
|
||
class MMCIFParser(TopologyReaderBase): | ||
format = "MMCIF" | ||
|
||
def parse(self, **kwargs): | ||
"""Read the file and return the structure. | ||
|
||
Returns | ||
------- | ||
MDAnalysis Topology object | ||
""" | ||
structure = gemmi.read_structure(self.filename) | ||
|
||
if len(structure) > 1: | ||
warnings.warn( | ||
"MMCIF model {self.filename} contains {len(model)=} different models, " | ||
"but only the first one will be used to assign the topology" | ||
) | ||
model = structure[0] | ||
|
||
# atom properties | ||
( | ||
altlocs, # at.altloc | ||
serials, # at.serial | ||
names, # at.name | ||
atomtypes, # at.name | ||
# ------------------ | ||
chainids, # chain.name | ||
elements, # at.element.name | ||
formalcharges, # at.charge | ||
weights, # at.element.weight | ||
# ------------------ | ||
occupancies, # at.occ | ||
record_types, # res.het_flag | ||
tempfactors, # at.b_iso | ||
residx, # _into_idx(res.seqid.num) | ||
) = map( # this is probably not pretty, but it's efficient -- one loop over the mmcif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are all the fields here guaranteed in a valid pdbx? One benefit to working column by column is that you can do optional columns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have an example of a PDBx in mind, or like a test set for them? I've never actually worked with the format, since in RCSB afaik we have only pdb or mmcif There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PDBx is mmcif. The download links here will give you an example file: https://www.rcsb.org/structure/4ake we use 4ake elsewhere in the testsuite. In my experience, sometimes the PDB / mmcif versions of the same entry aren't completely identical, so I wouldn't worry about trying to align the PDB & PDBx tests. |
||
np.array, | ||
list( | ||
zip( | ||
*[ | ||
( | ||
at.altloc, # altlocs | ||
at.serial, # serials | ||
at.name, # names | ||
at.name, # atomtypes | ||
# ------------------ | ||
chain.name, # chainids | ||
at.element.name, # elements | ||
at.charge, # formalcharges | ||
at.element.weight, # weights | ||
# ------------------ | ||
at.occ, # occupancies | ||
res.het_flag, # record_types | ||
at.b_iso, # tempfactors | ||
res.seqid.num, # residx, later translated into continious repr | ||
) | ||
for chain in model | ||
for res in chain | ||
for at in res | ||
] | ||
) | ||
), | ||
) | ||
|
||
( | ||
icodes, # res.seqid.icode | ||
resids, # res.seqid.num | ||
resnames, # res.name | ||
segidx, # chain.name | ||
resnums, | ||
) = map( | ||
np.array, | ||
list( | ||
zip( | ||
*[ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm struggling to follow the logic here, a comment breaking down what this double nested loop iteration into a zip is doing would be nice |
||
( | ||
res.seqid.icode, | ||
res.seqid.num, | ||
res.name, | ||
chain.name, | ||
res.seqid.num, | ||
) | ||
for chain in model | ||
for res in chain | ||
] | ||
) | ||
), | ||
) | ||
|
||
segids = [chain.name for chain in model] | ||
|
||
# transform *idx into continious numpy arrays | ||
residx = np.array(_into_idx(residx)) | ||
segidx = np.array(_into_idx(segidx)) | ||
|
||
# fill in altlocs | ||
altlocs = ["A" if not elem else elem for elem in altlocs] | ||
record_types = [ | ||
"ATOM" if record == "A" else "HETATM" if record == "H" else None | ||
for record in record_types | ||
] | ||
if any((elem is None for elem in record_types)): | ||
raise ValueError("Found an atom that is neither ATOM or HETATM") | ||
|
||
attrs = [ | ||
# AtomAttr subclasses | ||
AltLocs(altlocs), # at.altloc | ||
Atomids(serials), # at.serial | ||
Atomnames(names), # at.name | ||
Atomtypes(atomtypes), # at.name | ||
# --------------------------------------- | ||
ChainIDs(chainids), # chain.name | ||
Elements(elements), # at.element.name | ||
FormalCharges(formalcharges), # at.charge | ||
Masses(weights), # at.element.weight | ||
# --------------------------------------- | ||
Occupancies(occupancies), # at.occ | ||
RecordTypes(record_types), # res.het_flat | ||
Resnums(resnums), # res.seqid.num | ||
Tempfactors(tempfactors), # at.b_iso | ||
# | ||
# ResidueAttr subclasses | ||
ICodes(icodes), # res.seqid.icode | ||
Resids(resids), # res.seqid.num | ||
Resnames(resnames), # res.name | ||
# | ||
# SegmentAttr subclasses | ||
Segids(segids), # chain.name | ||
] | ||
|
||
n_atoms = len(names) | ||
n_residues = len(resids) | ||
n_segments = len(segids) | ||
|
||
return Topology( | ||
n_atoms, | ||
n_residues, | ||
n_segments, | ||
attrs=attrs, | ||
atom_resindex=residx, | ||
residue_segindex=segidx, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -333,3 +333,4 @@ | |
from . import MinimalParser | ||
from . import ITPParser | ||
from . import FHIAIMSParser | ||
from . import MMCIFParser |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,6 +78,7 @@ extra_formats = [ | |
"pytng>=0.2.3", | ||
"gsd>3.0.0", | ||
"rdkit>=2020.03.1", | ||
"gemmi", # for mmcif format | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will probably be optional, so other imports will have to respect that too |
||
] | ||
analysis = [ | ||
"biopython>=1.80", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wouldn't include this at this stage, Writer is optional