Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing gemmi-based mmcif reader (with easy extension to PDB/PDBx and mmJSON) #4712

Open
wants to merge 33 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
aa2a88f
Start working on MMCIF parser
marinegor May 22, 2024
218cf43
Add first (not working) version of MMCIFReader and MMCIF topology parser
marinegor May 22, 2024
7f78e02
Do some squashing
marinegor May 22, 2024
6682d6e
Remove inherited docs
marinegor May 22, 2024
817f3a0
Try improving the parsing
marinegor May 22, 2024
3cc8c80
Try three independent loops over the model
marinegor May 30, 2024
f1bf325
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Jul 25, 2024
d21c220
Add gemmi dependency
marinegor Sep 13, 2024
2a1be15
necessary params
marinegor Sep 20, 2024
77645e6
finished sorting atom attrs
marinegor Sep 20, 2024
91e6942
add function for transformation into *idx
marinegor Sep 20, 2024
9a0c086
oh damn seems to finally be working
marinegor Sep 20, 2024
9c731df
remove TODOs
marinegor Sep 20, 2024
8b40ec7
Remove debug prints
marinegor Sep 20, 2024
bdcbd73
Merge branch 'develop' into feature/mmcif
marinegor Sep 22, 2024
401a4d3
try to pack things into separate class in utils?
marinegor Sep 22, 2024
9c336bd
remove unnecessary functions
marinegor Sep 22, 2024
def88e4
copy all loops into separate functions
marinegor Sep 23, 2024
cabfd37
Move loops over structures into functions
marinegor Sep 23, 2024
4c9d930
Move coordinate fetching into function for the coordinate reader as well
marinegor Sep 23, 2024
184491a
Fix imports
marinegor Sep 23, 2024
3de8565
Start adding documentation
marinegor Sep 30, 2024
ca6ebbb
Reference MMCIFParser in PDBParser
marinegor Oct 1, 2024
45077ad
Add documentation for trajectory and topology parsers
marinegor Oct 1, 2024
9a1a59a
Add mmcif tests
marinegor Oct 2, 2024
27c10d6
Update format specifications
marinegor Oct 2, 2024
950cfcf
Write simple tests
marinegor Oct 2, 2024
8d1a8b5
Merge remote-tracking branch 'upstream/develop' into feature/mmcif
marinegor Oct 24, 2024
ef29338
update github action with gemmi
marinegor Oct 24, 2024
caca17e
fix gemmi import errors
marinegor Oct 24, 2024
f0e49cc
add mmcif testfiles
marinegor Oct 24, 2024
b7ada7c
add mmcif to __all__
marinegor Oct 24, 2024
e80632c
add black instead of ruff
marinegor Oct 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions package/MDAnalysis/coordinates/MMCIF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import gemmi
import logging
from . import base

logger = logging.getLogger("MDAnalysis.coordinates.MMCIF")


class MMCIFReader(base.SingleFrameReaderBase):
"""Reads from an MMCIF file"""

format = "MMCIF"
units = {"time": None, "length": "Angstrom"}

def _read_first_frame(self):
structure = gemmi.read_structure(self.filename)
coords = np.array(
[
[*at.pos.tolist()]
for model in structure
for chain in model
for res in chain
for at in res
]
)
self.n_atoms = len(coords)
self.ts = self._Timestep.from_coordinates(coords, **self._ts_kwargs)
self.ts.frame = 0

def Writer(self, filename, n_atoms=None, **kwargs):
raise NotImplementedError

def close(self):
pass


class MMCIFWriter(base.WriterBase):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't include this at this stage, Writer is optional

pass
1 change: 1 addition & 0 deletions package/MDAnalysis/coordinates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
from . import NAMDBIN
from . import FHIAIMS
from . import TNG
from . import MMCIF
178 changes: 178 additions & 0 deletions package/MDAnalysis/topology/MMCIFParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
MMCIF Topology Parser #
===================
"""

import gemmi
import numpy as np
import warnings
import itertools

from ..core.topology import Topology
from ..core.topologyattrs import (
AltLocs,
Atomids,
Atomnames,
Atomtypes,
ChainIDs,
Elements,
FormalCharges,
ICodes,
Masses,
Occupancies,
RecordTypes,
Resids,
Resnames,
Resnums,
Segids,
Tempfactors,
)
from .base import TopologyReaderBase


def _into_idx(arr: list[int]) -> list[int]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document what this does, ideally with an example

return [idx for idx, (_, group) in enumerate(itertools.groupby(arr)) for _ in group]


class MMCIFParser(TopologyReaderBase):
format = "MMCIF"

def parse(self, **kwargs):
"""Read the file and return the structure.

Returns
-------
MDAnalysis Topology object
"""
structure = gemmi.read_structure(self.filename)

if len(structure) > 1:
warnings.warn(
"MMCIF model {self.filename} contains {len(model)=} different models, "
"but only the first one will be used to assign the topology"
)
model = structure[0]

# atom properties
(
altlocs, # at.altloc
serials, # at.serial
names, # at.name
atomtypes, # at.name
# ------------------
chainids, # chain.name
elements, # at.element.name
formalcharges, # at.charge
weights, # at.element.weight
# ------------------
occupancies, # at.occ
record_types, # res.het_flag
tempfactors, # at.b_iso
residx, # _into_idx(res.seqid.num)
) = map( # this is probably not pretty, but it's efficient -- one loop over the mmcif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all the fields here guaranteed in a valid pdbx? One benefit to working column by column is that you can do optional columns

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example of a PDBx in mind, or like a test set for them? I've never actually worked with the format, since in RCSB afaik we have only pdb or mmcif

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PDBx is mmcif. The download links here will give you an example file: https://www.rcsb.org/structure/4ake we use 4ake elsewhere in the testsuite. In my experience, sometimes the PDB / mmcif versions of the same entry aren't completely identical, so I wouldn't worry about trying to align the PDB & PDBx tests.

np.array,
list(
zip(
*[
(
at.altloc, # altlocs
at.serial, # serials
at.name, # names
at.name, # atomtypes
# ------------------
chain.name, # chainids
at.element.name, # elements
at.charge, # formalcharges
at.element.weight, # weights
# ------------------
at.occ, # occupancies
res.het_flag, # record_types
at.b_iso, # tempfactors
res.seqid.num, # residx, later translated into continious repr
)
for chain in model
for res in chain
for at in res
]
)
),
)

(
icodes, # res.seqid.icode
resids, # res.seqid.num
resnames, # res.name
segidx, # chain.name
resnums,
) = map(
np.array,
list(
zip(
*[
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm struggling to follow the logic here, a comment breaking down what this double nested loop iteration into a zip is doing would be nice

(
res.seqid.icode,
res.seqid.num,
res.name,
chain.name,
res.seqid.num,
)
for chain in model
for res in chain
]
)
),
)

segids = [chain.name for chain in model]

# transform *idx into continious numpy arrays
residx = np.array(_into_idx(residx))
segidx = np.array(_into_idx(segidx))

# fill in altlocs
altlocs = ["A" if not elem else elem for elem in altlocs]
record_types = [
"ATOM" if record == "A" else "HETATM" if record == "H" else None
for record in record_types
]
if any((elem is None for elem in record_types)):
raise ValueError("Found an atom that is neither ATOM or HETATM")

attrs = [
# AtomAttr subclasses
AltLocs(altlocs), # at.altloc
Atomids(serials), # at.serial
Atomnames(names), # at.name
Atomtypes(atomtypes), # at.name
# ---------------------------------------
ChainIDs(chainids), # chain.name
Elements(elements), # at.element.name
FormalCharges(formalcharges), # at.charge
Masses(weights), # at.element.weight
# ---------------------------------------
Occupancies(occupancies), # at.occ
RecordTypes(record_types), # res.het_flat
Resnums(resnums), # res.seqid.num
Tempfactors(tempfactors), # at.b_iso
#
# ResidueAttr subclasses
ICodes(icodes), # res.seqid.icode
Resids(resids), # res.seqid.num
Resnames(resnames), # res.name
#
# SegmentAttr subclasses
Segids(segids), # chain.name
]

n_atoms = len(names)
n_residues = len(resids)
n_segments = len(segids)

return Topology(
n_atoms,
n_residues,
n_segments,
attrs=attrs,
atom_resindex=residx,
residue_segindex=segidx,
)
1 change: 1 addition & 0 deletions package/MDAnalysis/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,4 @@
from . import MinimalParser
from . import ITPParser
from . import FHIAIMSParser
from . import MMCIFParser
1 change: 1 addition & 0 deletions package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ extra_formats = [
"pytng>=0.2.3",
"gsd>3.0.0",
"rdkit>=2020.03.1",
"gemmi", # for mmcif format
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will probably be optional, so other imports will have to respect that too

]
analysis = [
"biopython>=1.80",
Expand Down
Loading