Skip to content

Commit

Permalink
improve handling of esters, amides, etc. in the v2 tautomer hash (rdk…
Browse files Browse the repository at this point in the history
…it#8173)

* improve handling of esters, amides, etc. in the v2 tautomer hash

Fixes rdkit#8090

* cleanup

* extend that logic to aromatic systems

* documentation in response to review
  • Loading branch information
greglandrum authored Jan 24, 2025
1 parent 80f27d7 commit 6867286
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 29 deletions.
2 changes: 1 addition & 1 deletion Code/GraphMol/MolHash/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

rdkit_library(MolHash
hashfunctions.cpp normalize.cpp
LINK_LIBRARIES Depictor GraphMol)
LINK_LIBRARIES SubstructMatch Depictor GraphMol)
target_compile_definitions(MolHash PRIVATE RDKIT_MOLHASH_BUILD)

rdkit_headers(MolHash.h nmmolhash.h
Expand Down
74 changes: 66 additions & 8 deletions Code/GraphMol/MolHash/catch_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -432,13 +432,25 @@ TEST_CASE("tautomer v2") {
{}},
{{"CC(=O)C=CC", "C=C(O)C=CC"}, {"CC(=O)CC=C"}},
{{"N=C1N=CN(C)C2N=CNC=21", "NC1N=CN(C)C2=NC=NC2=1"}, {}},
{{"S=C1N=CN=C2NC=NC12", "S=C1NC=NC2N=CNC1=2"}, {}},
{{"CC1=CN=CN1", "CC1CN=CN=1"}, {}},
{{"N1C(=O)NC(=O)C2C=NNC=21", "N1C(=O)NC(=O)C2=CNNC2=1",
"N1C(=O)NC(=O)C2=CNNC2=1", "N1C(=O)NC(=O)C2CN=NC2=1"},
{{
"S=C1N=CN=C2NC=NC12",
"S=C2C1N=CN=C1NC=N2",

},
{
"N1C(=O)NC(=O)C2CN=NC=21",
"S=C1NC=NC2N=CNC1=2",
"S=C1N=CN=C2N=CNC12",
"S=C2C1NC=NC1=NC=N2",
}},
{{"S=C1NC=NC2N=CNC1=2", "S=C1NC=NC2NC=NC1=2", "SC1=NC=NC2N=CNC1=2"},
{}},
{{"CC1=CN=CN1", "CC1CN=CN=1"}, {}},
{{
"N1C(=O)NC(=O)C2C=NNC=21",
"N1C(=O)NC(=O)C2=CNNC2=1",
"N1C(=O)NC(=O)C2=CNNC2=1",
},
{"N1C(=O)NC(=O)C2CN=NC=21", "N1C(=O)NC(=O)C2CN=NC2=1"}},
// ---------------------------
// more stereochemistry
// ---------------------------
Expand All @@ -447,7 +459,13 @@ TEST_CASE("tautomer v2") {
{{"C/C=C/O", "C/C=C\\O", "CC=CO"}, {}},
{{"C/C=C/C=O", "C/C=C\\C=O", "CC=CC=O"}, {}},
{{"C/C=C/CC=O"}, {"C/C=C\\CC=O", "CC=CCC=O"}},
{{"C1C=CC=C2C=C(O)NC=12", "C1C=CC=C2CC(=O)NC=12"}, {}},
{{"C1C=CC=C2CC(=O)NC=12", "C2=C1N=C(CC1=CC=C2)O",
"C1C=CC=C2CC(O)=NC=12"

},
{
"C1C=CC=C2C=C(O)NC=12",
}},
// ---------------------------
// some areas for potential improvement
// ---------------------------
Expand Down Expand Up @@ -500,7 +518,7 @@ TEST_CASE("tautomer v2") {
{"CC(C)(C)C=O", "CC(C)(C)[CH][O]_0_0",
"[CH3]-[C](-[CH3])(-[CH3])-[CH]=[O]_0_0"},
{"CC(C)=CO", "C[C](C)[CH][O]_1_0", "[CH3]-[C](-[CH3]):[C]:[O]_2_0"},
{"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]:[C]:[O]_1_0"},
{"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]-[CH]=[O]_0_0"},
{"CNC=O", "C[N][CH][O]_1_0", "[CH3]-[N]:[C]:[O]_2_0"},
{"CN(C)C=O", "CN(C)[CH][O]_0_0", "[CH3]-[N](-[CH3]):[C]:[O]_1_0"},
{"CC(C)(C)NC=O", "CC(C)(C)[N][CH][O]_1_0",
Expand All @@ -519,7 +537,7 @@ TEST_CASE("tautomer v2") {
"[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"},
{"C=CCC(O)C", "C=CCC(C)[O]_1_0",
"[CH2]=[CH]-[CH2]-[CH](-[CH3])-[OH]_0_0"},
{"C=NC(=O)C", "[CH2][N][C](C)[O]_0_0", "[C]:[N]:[C](:[C]):[O]_5_0"},
{"C=NC(=O)C", "[CH2][N][C](C)[O]_0_0", "[C]:[N]:[C](-[CH3]):[O]_2_0"},
{"C=NC(O)=C", "[CH2][N][C]([CH2])[O]_1_0", "[C]:[N]:[C](:[C]):[O]_5_0"},
{"CC(=O)CC(=O)C", "C[C]([O])C[C](C)[O]_0_0",
"[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"},
Expand Down Expand Up @@ -874,4 +892,44 @@ TEST_CASE("overreach with v2 tautomer hashes and imines") {
"[CH3]-[C@H](-[F])-[N]:[C]1:[C]-[CH2]-[CH2]-[CH2]-[C]:1_4_0");
}
}
}

TEST_CASE("v2 tautomers, carboxylic acids, amids, and related structures") {
SECTION("basics") {
std::vector<std::pair<std::string, std::string>> data = {
{"CC(=O)O", "[CH3]-[C](:[O]):[O]_1_0"},
{"CC(=O)OCC", "[CH3]-[CH2]-[O]-[C](-[CH3])=[O]_0_0"},
{"CC(=N)O", "[CH3]-[C](:[N]):[O]_2_0"},
{"CC(=O)NCC", "[CH3]-[CH2]-[N]:[C](-[CH3]):[O]_1_0"},
{"CC(=N)N", "[C]:[C](:[N]):[N]_6_0"},
};
for (const auto &[smiles, ref] : data) {
INFO(smiles);
auto m = v2::SmilesParse::MolFromSmiles(smiles);
REQUIRE(m);
auto hsh =
MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh == ref);
}
}
SECTION("specific problems") {
std::vector<std::pair<std::string, std::string>> data = {
// losing amino acid chirality:
{"CC(C)C[C@@H](C(=O)O)N",
"[CH3]-[CH](-[CH3])-[CH2]-[C@H](-[NH2])-[C](:[O]):[O]_1_0"},
// github #8090 (carboxylate)
{"O=C(O)CCC", "[CH3]-[CH2]-[CH2]-[C](:[O]):[O]_1_0"},
// aromatic "imine"
{"CC[C@@H](N)C1=NC=CN1",
"[CH3]-[CH2]-[C@@H](-[NH2])-[C]1:[N]:[C]:[C]:[N]:1_3_0"},
};
for (const auto &[smiles, ref] : data) {
INFO(smiles);
auto m = v2::SmilesParse::MolFromSmiles(smiles);
REQUIRE(m);
auto hsh =
MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh == ref);
}
}
}
100 changes: 80 additions & 20 deletions Code/GraphMol/MolHash/hashfunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Substruct/SubstructMatch.h>

#include "nmmolhash.h"
#include "mf.h"
Expand Down Expand Up @@ -378,14 +380,50 @@ std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles,
return result;
}

namespace details {

constexpr std::uint64_t bondFlagCarboxyl =
1; //*< bond involved in in carboxyl, amide, etc.
std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
// FIX: oversimplified, but should work for now
static std::vector<std::string> patterns{
"[C;!$(C-C(=[NH])-[NH2])]-[C;!$(C(-C)(=[NH])-[NH2])](=[O,N,S])-[O,N,S]", //< one side of the "amide", with an ugly exclusion for amidine
"[C;!$(C=[O,N,S])]-[O,N,S]-C=[O,N,S]", //< the other side
"[OH0,SH0]-C=[O,N,S]", //< "esters" and "carboxyls"
"[C]-[c](:[o,n,s]):[o,n,s]", //< a limited version of handling this for
// aromatic systems
};
static std::vector<std::unique_ptr<RDKit::RWMol>> queries;
if (queries.empty()) {
for (const auto &pattern : patterns) {
queries.emplace_back(SmartsToMol(pattern));
}
}

std::vector<std::uint64_t> bondFlags(mol.getNumBonds(), 0);
for (const auto &qry : queries) {
auto matches = SubstructMatch(mol, *qry);
for (const auto &match : matches) {
const auto bnd =
mol.getBondBetweenAtoms(match[0].second, match[1].second);
bondFlags[bnd->getIdx()] |= bondFlagCarboxyl;
}
}
return bondFlags;
}

} // namespace details

namespace {
// candidate atoms are either unsaturated or have implicit Hs
// NOTE that being aromatic is not sufficient. The molecule
// Cn1cccc1 is a good example of this:
// - it should not be a tautomeric system
// - the N is not unsaturated, but it is aromatic
bool isCandidateAtom(const Atom *aptr) {
return aptr->getTotalNumHs() || queryAtomUnsaturated(aptr);
bool isCandidateAtom(const Atom *aptr,
const std::vector<std::uint64_t> &atomFlags) {
return !atomFlags[aptr->getIdx()] &&
(aptr->getTotalNumHs() || queryAtomUnsaturated(aptr));
}

// atomic number > 1, not carbon
Expand All @@ -403,18 +441,26 @@ bool isUnsaturatedBond(const Bond *bptr) {
}

// potential tautomeric bonds are unsaturated and both atoms are candidates
bool isPossibleTautomericBond(const Bond *bptr) {
return isUnsaturatedBond(bptr) && isCandidateAtom(bptr->getBeginAtom()) &&
isCandidateAtom(bptr->getEndAtom());
bool isPossibleTautomericBond(const Bond *bptr,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
return !bondFlags[bptr->getIdx()] && isUnsaturatedBond(bptr) &&
isCandidateAtom(bptr->getBeginAtom(), atomFlags) &&
isCandidateAtom(bptr->getEndAtom(), atomFlags);
}

// a bond is a possible starting bond if it involves a candidate hetereoatom
// (definition above) and an unsaturated atom
bool isPossibleStartingBond(const Bond *bptr) {
bool isPossibleStartingBond(const Bond *bptr,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
if (bondFlags[bptr->getIdx()]) {
return false;
}
auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) &&
isCandidateAtom(bptr->getBeginAtom());
auto heteroEnd =
isHeteroAtom(bptr->getEndAtom()) && isCandidateAtom(bptr->getEndAtom());
isCandidateAtom(bptr->getBeginAtom(), atomFlags);
auto heteroEnd = isHeteroAtom(bptr->getEndAtom()) &&
isCandidateAtom(bptr->getEndAtom(), atomFlags);
// at least one atom has to be an eligible heteroatom:
if (!heteroBeg && !heteroEnd) {
return false;
Expand All @@ -425,7 +471,8 @@ bool isPossibleStartingBond(const Bond *bptr) {
auto unsatBeg = queryAtomUnsaturated(bptr->getBeginAtom());
auto unsatEnd = queryAtomUnsaturated(bptr->getEndAtom());

// both we need a heteroatom on one side and an unsaturated atom on the other:
// both we need a heteroatom on one side and an unsaturated atom on the
// other:
if (!((heteroBeg && unsatEnd) || (heteroEnd && unsatBeg))) {
return false;
}
Expand All @@ -448,11 +495,14 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
// doesn't have a start bond
bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
const Bond *nbrBond,
const boost::dynamic_bitset<> &startBonds) {
return (
(!isCandidateAtom(otherAtom) && !hasStartBond(otherAtom, startBonds)) ||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
!hasStartBond(atom, startBonds)));
const boost::dynamic_bitset<> &startBonds,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
return (bondFlags[nbrBond->getIdx()] ||
((!isCandidateAtom(otherAtom, atomFlags) &&
!hasStartBond(otherAtom, startBonds)) ||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
!hasStartBond(atom, startBonds))));
}
} // namespace

Expand All @@ -463,12 +513,18 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
unsigned int hcount = 0;
int charge = 0;

// we aren't current doing anything with atomFlags, but we have added them in
// analogy to the bondFlags as a kind of future proofing.
std::vector<std::uint64_t> atomFlags(mol->getNumAtoms(), 0);
auto bondFlags = details::getBondFlags(*mol);

boost::dynamic_bitset<> bondsToModify(mol->getNumBonds());
boost::dynamic_bitset<> bondsConsidered(mol->getNumBonds());

boost::dynamic_bitset<> startBonds(mol->getNumBonds());
for (const auto bnd : mol->bonds()) {
startBonds.set(bnd->getIdx(), isPossibleStartingBond(bnd));
startBonds.set(bnd->getIdx(),
isPossibleStartingBond(bnd, atomFlags, bondFlags));
}
#ifdef VERBOSE_HASH
std::cerr << " START BONDS: " << startBonds << std::endl;
Expand Down Expand Up @@ -526,8 +582,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
}

// if both bonds are not eligible, then we can skip this neighbor
if (skipNeighborBond(atm, oatom, nbrBond, startBonds) &&
skipNeighborBond(oatom, atm, nbrBond, startBonds)) {
if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags,
bondFlags) &&
skipNeighborBond(oatom, atm, nbrBond, startBonds, atomFlags,
bondFlags)) {
continue;
}

Expand Down Expand Up @@ -591,8 +649,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
<< hasStartBond(atm, startBonds) << std::endl;
#endif
if (bondsConsidered[nbrBnd->getIdx()] ||
(skipNeighborBond(atm, oatom, nbrBnd, startBonds) &&
skipNeighborBond(oatom, atm, nbrBnd, startBonds))) {
(skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags,
bondFlags) &&
skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags,
bondFlags))) {
// we won't add this bond for further traversal, but if both atoms
// are already in this system, then we should add the bond to the
// system
Expand Down

0 comments on commit 6867286

Please sign in to comment.