Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add explicit_hydrogen parameter #741

Open
wants to merge 2 commits into
base: interfaces
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 7 additions & 13 deletions doc/tutorial/interface/rdkit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,14 @@ For a proper structural formula, we need to compute proper 2D coordinates first.

import biotite.interface.rdkit as rdkit_interface
import biotite.structure.info as struc
import rdkit.Chem.AllChem as Chem
from rdkit.Chem.Draw import MolToImage
from rdkit.Chem.rdDepictor import Compute2DCoords
from rdkit.Chem.rdmolops import RemoveHs

penicillin = struc.residue("PNN")
mol = rdkit_interface.to_mol(penicillin)
# We do not want to include explicit hydrogen atoms in the structural formula
mol = RemoveHs(mol)
Compute2DCoords(mol)
mol = Chem.RemoveHs(mol)
Chem.Compute2DCoords(mol)
image = MolToImage(mol, size=(600, 400))
display(image)

Expand All @@ -49,18 +48,13 @@ One way to to obtain them as :class:`.AtomArray` is passing a *SMILES* string to

.. jupyter-execute::

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdDistGeom import EmbedMolecule
from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule
from rdkit.Chem.rdmolops import AddHs

ERTAPENEM_SMILES = "C[C@@H]1[C@@H]2[C@H](C(=O)N2C(=C1S[C@H]3C[C@H](NC3)C(=O)NC4=CC=CC(=C4)C(=O)O)C(=O)O)[C@@H](C)O"

mol = MolFromSmiles(ERTAPENEM_SMILES)
mol = Chem.MolFromSmiles(ERTAPENEM_SMILES)
# RDKit uses implicit hydrogen atoms by default, but Biotite requires explicit ones
mol = AddHs(mol)
mol = Chem.AddHs(mol)
# Create a 3D conformer
conformer_id = EmbedMolecule(mol)
UFFOptimizeMolecule(mol)
conformer_id = Chem.EmbedMolecule(mol)
Chem.UFFOptimizeMolecule(mol)
ertapenem = rdkit_interface.from_mol(mol, conformer_id)
print(ertapenem)
93 changes: 52 additions & 41 deletions src/biotite/interface/rdkit/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,8 @@
import warnings
from collections import defaultdict
import numpy as np
from rdkit.Chem.rdchem import (
Atom,
AtomPDBResidueInfo,
Conformer,
EditableMol,
KekulizeException,
Mol,
)
from rdkit.Chem.rdchem import BondType as RDKitBondType
from rdkit.Chem.rdmolops import AddHs, Kekulize, SanitizeFlags, SanitizeMol
import rdkit.Chem.AllChem as Chem
from rdkit.Chem import SanitizeFlags
from biotite.interface.version import requires_version
from biotite.interface.warning import LossyConversionWarning
from biotite.structure.atoms import AtomArray, AtomArrayStack
Expand All @@ -31,26 +23,26 @@
BondType.TRIPLE: BondType.AROMATIC_TRIPLE,
}
_BIOTITE_TO_RDKIT_BOND_TYPE = {
BondType.ANY: RDKitBondType.UNSPECIFIED,
BondType.SINGLE: RDKitBondType.SINGLE,
BondType.DOUBLE: RDKitBondType.DOUBLE,
BondType.TRIPLE: RDKitBondType.TRIPLE,
BondType.QUADRUPLE: RDKitBondType.QUADRUPLE,
BondType.AROMATIC_SINGLE: RDKitBondType.AROMATIC,
BondType.AROMATIC_DOUBLE: RDKitBondType.AROMATIC,
BondType.AROMATIC_TRIPLE: RDKitBondType.AROMATIC,
BondType.AROMATIC: RDKitBondType.AROMATIC,
BondType.ANY: Chem.BondType.UNSPECIFIED,
BondType.SINGLE: Chem.BondType.SINGLE,
BondType.DOUBLE: Chem.BondType.DOUBLE,
BondType.TRIPLE: Chem.BondType.TRIPLE,
BondType.QUADRUPLE: Chem.BondType.QUADRUPLE,
BondType.AROMATIC_SINGLE: Chem.BondType.AROMATIC,
BondType.AROMATIC_DOUBLE: Chem.BondType.AROMATIC,
BondType.AROMATIC_TRIPLE: Chem.BondType.AROMATIC,
BondType.AROMATIC: Chem.BondType.AROMATIC,
# Dative bonds may lead to a KekulizeException and may potentially be deprecated
# in the future (https://github.com/rdkit/rdkit/discussions/6995)
BondType.COORDINATION: RDKitBondType.SINGLE,
BondType.COORDINATION: Chem.BondType.SINGLE,
}
_RDKIT_TO_BIOTITE_BOND_TYPE = {
RDKitBondType.UNSPECIFIED: BondType.ANY,
RDKitBondType.SINGLE: BondType.SINGLE,
RDKitBondType.DOUBLE: BondType.DOUBLE,
RDKitBondType.TRIPLE: BondType.TRIPLE,
RDKitBondType.QUADRUPLE: BondType.QUADRUPLE,
RDKitBondType.DATIVE: BondType.COORDINATION,
Chem.BondType.UNSPECIFIED: BondType.ANY,
Chem.BondType.SINGLE: BondType.SINGLE,
Chem.BondType.DOUBLE: BondType.DOUBLE,
Chem.BondType.TRIPLE: BondType.TRIPLE,
Chem.BondType.QUADRUPLE: BondType.QUADRUPLE,
Chem.BondType.DATIVE: BondType.COORDINATION,
}
_STANDARD_ANNOTATIONS = frozenset(
{
Expand All @@ -76,6 +68,7 @@ def to_mol(
kekulize=False,
use_dative_bonds=False,
include_extra_annotations=(),
explicit_hydrogen=True,
):
"""
Convert an :class:`.AtomArray` or :class:`.AtomArrayStack` into a
Expand Down Expand Up @@ -105,6 +98,11 @@ def to_mol(
are always included per default. These standard annotations can be accessed
with :meth:`rdkit.Chem.rdchem.Atom.GetPDBResidueInfo()` for each atom in the
returned :class:`rdkit.Chem.rdchem.Mol`.
explicit_hydrogen : bool, optional
If set to true, the conversion process expects that all hydrogen atoms are
explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`.
If set to false, the conversion process treats all hydrogen atoms as implicit
and all hydrogen atoms in the :class:`AtomArray` are removed.

Returns
-------
Expand Down Expand Up @@ -141,17 +139,29 @@ def to_mol(
HB3
HXT
"""
mol = EditableMol(Mol())
hydrogen_mask = atoms.element == "H"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I just remembered that I've seen the very occasional structures with deuterium as well (e.g. 1wq2). How would we want to deal with these cases? Do we want to treat this isotope as hydrogen as well (probably chemically most appropriate) or not? If so, the PDB represents deuterium as the element "D"

if explicit_hydrogen:
if not hydrogen_mask.any():
warnings.warn(
"No hydrogen found in the input, although 'explicit_hydrogen' is 'True'"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"No hydrogen found in the input, although 'explicit_hydrogen' is 'True'"
"No hydrogen found in the input, although 'explicit_hydrogen' is 'True'. "
"This may lead to atoms wrongly interpreted as radicals. Be careful."

)
else:
atoms = atoms[..., ~hydrogen_mask]

mol = Chem.EditableMol(Chem.Mol())

has_annot = frozenset(atoms.get_annotation_categories())
extra_annot = set(include_extra_annotations) - _STANDARD_ANNOTATIONS

for i in range(atoms.array_length()):
rdkit_atom = Atom(atoms.element[i].capitalize())
rdkit_atom = Chem.Atom(atoms.element[i].capitalize())
if explicit_hydrogen:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if explicit_hydrogen:
if explicit_hydrogen:
# ... tell RDKit to not assume any implicit hydrogens

rdkit_atom.SetNoImplicit(True)
Copy link
Contributor

@Croydon-Brixton Croydon-Brixton Jan 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the base case where explicity_hydrogen=True, what would this mean if I just call to_mol for a typical crystal structure that won't have any hydrogens resolved? Will RDKit infer charges / valences in that case? If so this might lead to broken molecules -- if that were the case, should we check that if explicit hydrogen is true there have to be hydrogens in the structure? (I could see this being sth users will try -- at least I would have tried it :D )

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Charges would also not be inferred automatically before this PR. But I am not sure what this means for valence: As the bond types are also explicitly set I guess RDKit assumes a radical, if explicit_hydrogen=True, but hydrogen atoms are actually missing. Probably, I should test this.

Having a check there sounds like a good idea, especially as I would agree that this could be a common mistake. However, I also think strictly checking for the simple presence of hydrogen atoms might not be sensible enough, as there are valid molecules without hydrogen atoms, although they appear rarely. What do you think about raising a warning as a 'reminder' to the user to check the input?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm to my mind this option is still problematic, at least given the defaults of the to_mol function:
By default most PDBs won't have any hydrogens present, but the default in to_mol here is explicit_hydrogen=True. This will lead to radicals upon calling Chem.Sanitize, which is undesirable almost always and will probably catch a lot of users (especially those less well versed in RDKit) off guard.

I would suggest:

  1. Changing the default to not expect explicit hydrogens.
  2. Emitting a warning if a user says they're having explicit hydrogens, but no hydrogens are found, with a note that this can lead to radicals. After that, they're on their own ^^

if "charge" in has_annot:
rdkit_atom.SetFormalCharge(atoms.charge[i].item())

# add standard pdb annotations
rdkit_atom_res_info = AtomPDBResidueInfo(
rdkit_atom_res_info = Chem.AtomPDBResidueInfo(
atomName=atoms.atom_name[i].item(),
residueName=atoms.res_name[i].item(),
chainId=atoms.chain_id[i].item(),
Expand All @@ -176,11 +186,12 @@ def to_mol(

if atoms.bonds is None:
raise BadStructureError("An AtomArray with associated BondList is required")
bonds = atoms.bonds.as_array()
if kekulize:
bonds = bonds.copy()
bonds = atoms.bonds.copy()
bonds.remove_aromaticity()
for atom_i, atom_j, bond_type in atoms.bonds.as_array():
else:
bonds = atoms.bonds
for atom_i, atom_j, bond_type in bonds.as_array():
if not use_dative_bonds and bond_type == BondType.COORDINATION:
bond_type = BondType.SINGLE
mol.AddBond(
Expand All @@ -194,7 +205,7 @@ def to_mol(
# Handle AtomArray and AtomArrayStack consistently
coord = coord[None, :, :]
for model_coord in coord:
conformer = Conformer(mol.GetNumAtoms())
conformer = Chem.Conformer(mol.GetNumAtoms())
# RDKit silently expects the data to be in C-contiguous order
# Otherwise the coordinates would be completely misassigned
# (https://github.com/rdkit/rdkit/issues/8221)
Expand Down Expand Up @@ -271,8 +282,8 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None):
if add_hydrogen is None:
add_hydrogen = not _has_explicit_hydrogen(mol)
if add_hydrogen:
SanitizeMol(mol, SanitizeFlags.SANITIZE_ADJUSTHS)
mol = AddHs(mol, addCoords=False, addResidueInfo=False)
Chem.SanitizeMol(mol, SanitizeFlags.SANITIZE_ADJUSTHS)
mol = Chem.AddHs(mol, addCoords=False, addResidueInfo=False)

rdkit_atoms = mol.GetAtoms()
if rdkit_atoms is None:
Expand Down Expand Up @@ -309,7 +320,7 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None):
residue_info = rdkit_atom.GetPDBResidueInfo()
if residue_info is None:
# ... default values for atoms with missing residue information
residue_info = AtomPDBResidueInfo(
residue_info = Chem.AtomPDBResidueInfo(
atomName="",
occupancy=0.0,
tempFactor=float("nan"),
Expand Down Expand Up @@ -356,18 +367,18 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None):

rdkit_bonds = list(mol.GetBonds())
is_aromatic = np.array(
[bond.GetBondType() == RDKitBondType.AROMATIC for bond in rdkit_bonds]
[bond.GetBondType() == Chem.BondType.AROMATIC for bond in rdkit_bonds]
)
if np.any(is_aromatic):
# Determine the kekulized order of aromatic bonds
# Copy as 'Kekulize()' modifies the molecule in-place
mol = Mol(mol)
mol = Chem.Mol(mol)
try:
Kekulize(mol)
except KekulizeException:
Chem.Kekulize(mol)
except Chem.KekulizeException:
warnings.warn(
"Kekulization failed, "
"using 'BondType.ANY' instead for aromatic bonds instead",
"using 'BondType.AROMATIC' instead for aromatic bonds instead",
LossyConversionWarning,
)
rdkit_bonds = list(mol.GetBonds())
Expand Down
10 changes: 4 additions & 6 deletions tests/interface/test_rdkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,9 @@ def _load_smiles():
return file.read().splitlines()


@pytest.mark.filterwarnings(
"ignore:"
"The coordinates are missing for some atoms. "
"The fallback coordinates will be used instead"
)
@pytest.mark.filterwarnings("ignore:Missing coordinates.*")
@pytest.mark.filterwarnings("ignore:.*coordinates are missing.*")
@pytest.mark.filterwarnings("ignore::biotite.interface.LossyConversionWarning")
@pytest.mark.parametrize(
"res_name", np.random.default_rng(0).choice(info.all_residues(), size=200).tolist()
)
Expand All @@ -46,7 +44,7 @@ def test_conversion_from_biotite(res_name):
# Some compounds in the CCD have missing coordinates
assert np.allclose(test_atoms.coord, ref_atoms.coord, equal_nan=True)

# There should be now undefined bonds
# There should be no undefined bonds
assert (test_atoms.bonds.as_array()[:, 2] != struc.BondType.ANY).all()
# Kekulization returns one of multiple resonance structures, so the returned one
# might not be the same as the input
Expand Down
Loading