improve handling of esters, amides, etc. in the v2 tautomer hash (rdk…

…it#8173) * improve handling of esters, amides, etc. in the v2 tautomer hash Fixes rdkit#8090 * cleanup * extend that logic to aromatic systems * documentation in response to review
e-kwsm · Jan 24, 2025 · 6867286 · 6867286
1 parent 80f27d7
commit 6867286
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 29 deletions.
diff --git a/Code/GraphMol/MolHash/CMakeLists.txt b/Code/GraphMol/MolHash/CMakeLists.txt
@@ -1,7 +1,7 @@
 
 rdkit_library(MolHash
               hashfunctions.cpp normalize.cpp
-              LINK_LIBRARIES Depictor GraphMol)
+              LINK_LIBRARIES SubstructMatch Depictor GraphMol)
 target_compile_definitions(MolHash PRIVATE RDKIT_MOLHASH_BUILD)
 
 rdkit_headers(MolHash.h nmmolhash.h           

diff --git a/Code/GraphMol/MolHash/catch_tests.cpp b/Code/GraphMol/MolHash/catch_tests.cpp
@@ -432,13 +432,25 @@ TEST_CASE("tautomer v2") {
              {}},
             {{"CC(=O)C=CC", "C=C(O)C=CC"}, {"CC(=O)CC=C"}},
             {{"N=C1N=CN(C)C2N=CNC=21", "NC1N=CN(C)C2=NC=NC2=1"}, {}},
-            {{"S=C1N=CN=C2NC=NC12", "S=C1NC=NC2N=CNC1=2"}, {}},
-            {{"CC1=CN=CN1", "CC1CN=CN=1"}, {}},
-            {{"N1C(=O)NC(=O)C2C=NNC=21", "N1C(=O)NC(=O)C2=CNNC2=1",
-              "N1C(=O)NC(=O)C2=CNNC2=1", "N1C(=O)NC(=O)C2CN=NC2=1"},
+            {{
+                 "S=C1N=CN=C2NC=NC12",
+                 "S=C2C1N=CN=C1NC=N2",
+
+             },
              {
-                 "N1C(=O)NC(=O)C2CN=NC=21",
+                 "S=C1NC=NC2N=CNC1=2",
+                 "S=C1N=CN=C2N=CNC12",
+                 "S=C2C1NC=NC1=NC=N2",
              }},
+            {{"S=C1NC=NC2N=CNC1=2", "S=C1NC=NC2NC=NC1=2", "SC1=NC=NC2N=CNC1=2"},
+             {}},
+            {{"CC1=CN=CN1", "CC1CN=CN=1"}, {}},
+            {{
+                 "N1C(=O)NC(=O)C2C=NNC=21",
+                 "N1C(=O)NC(=O)C2=CNNC2=1",
+                 "N1C(=O)NC(=O)C2=CNNC2=1",
+             },
+             {"N1C(=O)NC(=O)C2CN=NC=21", "N1C(=O)NC(=O)C2CN=NC2=1"}},
             // ---------------------------
             // more stereochemistry
             // ---------------------------
@@ -447,7 +459,13 @@ TEST_CASE("tautomer v2") {
             {{"C/C=C/O", "C/C=C\\O", "CC=CO"}, {}},
             {{"C/C=C/C=O", "C/C=C\\C=O", "CC=CC=O"}, {}},
             {{"C/C=C/CC=O"}, {"C/C=C\\CC=O", "CC=CCC=O"}},
-            {{"C1C=CC=C2C=C(O)NC=12", "C1C=CC=C2CC(=O)NC=12"}, {}},
+            {{"C1C=CC=C2CC(=O)NC=12", "C2=C1N=C(CC1=CC=C2)O",
+              "C1C=CC=C2CC(O)=NC=12"
+
+             },
+             {
+                 "C1C=CC=C2C=C(O)NC=12",
+             }},
             // ---------------------------
             // some areas for potential improvement
             // ---------------------------
@@ -500,7 +518,7 @@ TEST_CASE("tautomer v2") {
         {"CC(C)(C)C=O", "CC(C)(C)[CH][O]_0_0",
          "[CH3]-[C](-[CH3])(-[CH3])-[CH]=[O]_0_0"},
         {"CC(C)=CO", "C[C](C)[CH][O]_1_0", "[CH3]-[C](-[CH3]):[C]:[O]_2_0"},
-        {"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]:[C]:[O]_1_0"},
+        {"COC=O", "CO[CH][O]_0_0", "[CH3]-[O]-[CH]=[O]_0_0"},
         {"CNC=O", "C[N][CH][O]_1_0", "[CH3]-[N]:[C]:[O]_2_0"},
         {"CN(C)C=O", "CN(C)[CH][O]_0_0", "[CH3]-[N](-[CH3]):[C]:[O]_1_0"},
         {"CC(C)(C)NC=O", "CC(C)(C)[N][CH][O]_1_0",
@@ -519,7 +537,7 @@ TEST_CASE("tautomer v2") {
          "[C]:[C](:[O]):[C]-[CH2]-[CH3]_5_0"},
         {"C=CCC(O)C", "C=CCC(C)[O]_1_0",
          "[CH2]=[CH]-[CH2]-[CH](-[CH3])-[OH]_0_0"},
-        {"C=NC(=O)C", "[CH2][N][C](C)[O]_0_0", "[C]:[N]:[C](:[C]):[O]_5_0"},
+        {"C=NC(=O)C", "[CH2][N][C](C)[O]_0_0", "[C]:[N]:[C](-[CH3]):[O]_2_0"},
         {"C=NC(O)=C", "[CH2][N][C]([CH2])[O]_1_0", "[C]:[N]:[C](:[C]):[O]_5_0"},
         {"CC(=O)CC(=O)C", "C[C]([O])C[C](C)[O]_0_0",
          "[C]:[C](:[O]):[C]:[C](:[C]):[O]_8_0"},
@@ -874,4 +892,44 @@ TEST_CASE("overreach with v2 tautomer hashes and imines") {
             "[CH3]-[C@H](-[F])-[N]:[C]1:[C]-[CH2]-[CH2]-[CH2]-[C]:1_4_0");
     }
   }
+}
+
+TEST_CASE("v2 tautomers, carboxylic acids, amids, and related structures") {
+  SECTION("basics") {
+    std::vector<std::pair<std::string, std::string>> data = {
+        {"CC(=O)O", "[CH3]-[C](:[O]):[O]_1_0"},
+        {"CC(=O)OCC", "[CH3]-[CH2]-[O]-[C](-[CH3])=[O]_0_0"},
+        {"CC(=N)O", "[CH3]-[C](:[N]):[O]_2_0"},
+        {"CC(=O)NCC", "[CH3]-[CH2]-[N]:[C](-[CH3]):[O]_1_0"},
+        {"CC(=N)N", "[C]:[C](:[N]):[N]_6_0"},
+    };
+    for (const auto &[smiles, ref] : data) {
+      INFO(smiles);
+      auto m = v2::SmilesParse::MolFromSmiles(smiles);
+      REQUIRE(m);
+      auto hsh =
+          MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2);
+      CHECK(hsh == ref);
+    }
+  }
+  SECTION("specific problems") {
+    std::vector<std::pair<std::string, std::string>> data = {
+        // losing amino acid chirality:
+        {"CC(C)C[C@@H](C(=O)O)N",
+         "[CH3]-[CH](-[CH3])-[CH2]-[C@H](-[NH2])-[C](:[O]):[O]_1_0"},
+        // github #8090 (carboxylate)
+        {"O=C(O)CCC", "[CH3]-[CH2]-[CH2]-[C](:[O]):[O]_1_0"},
+        // aromatic "imine"
+        {"CC[C@@H](N)C1=NC=CN1",
+         "[CH3]-[CH2]-[C@@H](-[NH2])-[C]1:[N]:[C]:[C]:[N]:1_3_0"},
+    };
+    for (const auto &[smiles, ref] : data) {
+      INFO(smiles);
+      auto m = v2::SmilesParse::MolFromSmiles(smiles);
+      REQUIRE(m);
+      auto hsh =
+          MolHash::MolHash(m.get(), MolHash::HashFunction::HetAtomTautomerv2);
+      CHECK(hsh == ref);
+    }
+  }
 }
diff --git a/Code/GraphMol/MolHash/hashfunctions.cpp b/Code/GraphMol/MolHash/hashfunctions.cpp
@@ -21,6 +21,8 @@
 #include <GraphMol/RDKitBase.h>
 #include <GraphMol/RDKitQueries.h>
 #include <GraphMol/SmilesParse/SmilesWrite.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <GraphMol/Substruct/SubstructMatch.h>
 
 #include "nmmolhash.h"
 #include "mf.h"
@@ -378,14 +380,50 @@ std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles,
   return result;
 }
 
+namespace details {
+
+constexpr std::uint64_t bondFlagCarboxyl =
+    1;  //*< bond involved in in carboxyl, amide, etc.
+std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
+  // FIX: oversimplified, but should work for now
+  static std::vector<std::string> patterns{
+      "[C;!$(C-C(=[NH])-[NH2])]-[C;!$(C(-C)(=[NH])-[NH2])](=[O,N,S])-[O,N,S]",  //< one side of the "amide", with an ugly exclusion for amidine
+      "[C;!$(C=[O,N,S])]-[O,N,S]-C=[O,N,S]",  //< the other side
+      "[OH0,SH0]-C=[O,N,S]",                  //< "esters" and "carboxyls"
+      "[C]-[c](:[o,n,s]):[o,n,s]",  //< a limited version of handling this for
+                                    // aromatic systems
+  };
+  static std::vector<std::unique_ptr<RDKit::RWMol>> queries;
+  if (queries.empty()) {
+    for (const auto &pattern : patterns) {
+      queries.emplace_back(SmartsToMol(pattern));
+    }
+  }
+
+  std::vector<std::uint64_t> bondFlags(mol.getNumBonds(), 0);
+  for (const auto &qry : queries) {
+    auto matches = SubstructMatch(mol, *qry);
+    for (const auto &match : matches) {
+      const auto bnd =
+          mol.getBondBetweenAtoms(match[0].second, match[1].second);
+      bondFlags[bnd->getIdx()] |= bondFlagCarboxyl;
+    }
+  }
+  return bondFlags;
+}
+
+}  // namespace details
+
 namespace {
 // candidate atoms are either unsaturated or have implicit Hs
 // NOTE that being aromatic is not sufficient. The molecule
 //  Cn1cccc1 is a good example of this:
 //    - it should not be a tautomeric system
 //    - the N is not unsaturated, but it is aromatic
-bool isCandidateAtom(const Atom *aptr) {
-  return aptr->getTotalNumHs() || queryAtomUnsaturated(aptr);
+bool isCandidateAtom(const Atom *aptr,
+                     const std::vector<std::uint64_t> &atomFlags) {
+  return !atomFlags[aptr->getIdx()] &&
+         (aptr->getTotalNumHs() || queryAtomUnsaturated(aptr));
 }
 
 // atomic number > 1, not carbon
@@ -403,18 +441,26 @@ bool isUnsaturatedBond(const Bond *bptr) {
 }
 
 // potential tautomeric bonds are unsaturated and both atoms are candidates
-bool isPossibleTautomericBond(const Bond *bptr) {
-  return isUnsaturatedBond(bptr) && isCandidateAtom(bptr->getBeginAtom()) &&
-         isCandidateAtom(bptr->getEndAtom());
+bool isPossibleTautomericBond(const Bond *bptr,
+                              const std::vector<std::uint64_t> &atomFlags,
+                              const std::vector<std::uint64_t> &bondFlags) {
+  return !bondFlags[bptr->getIdx()] && isUnsaturatedBond(bptr) &&
+         isCandidateAtom(bptr->getBeginAtom(), atomFlags) &&
+         isCandidateAtom(bptr->getEndAtom(), atomFlags);
 }
 
 // a bond is a possible starting bond if it involves a candidate hetereoatom
 // (definition above) and an unsaturated atom
-bool isPossibleStartingBond(const Bond *bptr) {
+bool isPossibleStartingBond(const Bond *bptr,
+                            const std::vector<std::uint64_t> &atomFlags,
+                            const std::vector<std::uint64_t> &bondFlags) {
+  if (bondFlags[bptr->getIdx()]) {
+    return false;
+  }
   auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) &&
-                   isCandidateAtom(bptr->getBeginAtom());
-  auto heteroEnd =
-      isHeteroAtom(bptr->getEndAtom()) && isCandidateAtom(bptr->getEndAtom());
+                   isCandidateAtom(bptr->getBeginAtom(), atomFlags);
+  auto heteroEnd = isHeteroAtom(bptr->getEndAtom()) &&
+                   isCandidateAtom(bptr->getEndAtom(), atomFlags);
   // at least one atom has to be an eligible heteroatom:
   if (!heteroBeg && !heteroEnd) {
     return false;
@@ -425,7 +471,8 @@ bool isPossibleStartingBond(const Bond *bptr) {
   auto unsatBeg = queryAtomUnsaturated(bptr->getBeginAtom());
   auto unsatEnd = queryAtomUnsaturated(bptr->getEndAtom());
 
-  // both we need a heteroatom on one side and an unsaturated atom on the other:
+  // both we need a heteroatom on one side and an unsaturated atom on the
+  // other:
   if (!((heteroBeg && unsatEnd) || (heteroEnd && unsatBeg))) {
     return false;
   }
@@ -448,11 +495,14 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
 // doesn't have a start bond
 bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
                       const Bond *nbrBond,
-                      const boost::dynamic_bitset<> &startBonds) {
-  return (
-      (!isCandidateAtom(otherAtom) && !hasStartBond(otherAtom, startBonds)) ||
-      (!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
-       !hasStartBond(atom, startBonds)));
+                      const boost::dynamic_bitset<> &startBonds,
+                      const std::vector<std::uint64_t> &atomFlags,
+                      const std::vector<std::uint64_t> &bondFlags) {
+  return (bondFlags[nbrBond->getIdx()] ||
+          ((!isCandidateAtom(otherAtom, atomFlags) &&
+            !hasStartBond(otherAtom, startBonds)) ||
+           (!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
+            !hasStartBond(atom, startBonds))));
 }
 }  // namespace
 
@@ -463,12 +513,18 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
   unsigned int hcount = 0;
   int charge = 0;
 
+  // we aren't current doing anything with atomFlags, but we have added them in
+  // analogy to the bondFlags as a kind of future proofing.
+  std::vector<std::uint64_t> atomFlags(mol->getNumAtoms(), 0);
+  auto bondFlags = details::getBondFlags(*mol);
+
   boost::dynamic_bitset<> bondsToModify(mol->getNumBonds());
   boost::dynamic_bitset<> bondsConsidered(mol->getNumBonds());
 
   boost::dynamic_bitset<> startBonds(mol->getNumBonds());
   for (const auto bnd : mol->bonds()) {
-    startBonds.set(bnd->getIdx(), isPossibleStartingBond(bnd));
+    startBonds.set(bnd->getIdx(),
+                   isPossibleStartingBond(bnd, atomFlags, bondFlags));
   }
 #ifdef VERBOSE_HASH
   std::cerr << " START BONDS: " << startBonds << std::endl;
@@ -526,8 +582,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
         }
 
         // if both bonds are not eligible, then we can skip this neighbor
-        if (skipNeighborBond(atm, oatom, nbrBond, startBonds) &&
-            skipNeighborBond(oatom, atm, nbrBond, startBonds)) {
+        if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags,
+                             bondFlags) &&
+            skipNeighborBond(oatom, atm, nbrBond, startBonds, atomFlags,
+                             bondFlags)) {
           continue;
         }
 
@@ -591,8 +649,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
                     << hasStartBond(atm, startBonds) << std::endl;
 #endif
           if (bondsConsidered[nbrBnd->getIdx()] ||
-              (skipNeighborBond(atm, oatom, nbrBnd, startBonds) &&
-               skipNeighborBond(oatom, atm, nbrBnd, startBonds))) {
+              (skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags,
+                                bondFlags) &&
+               skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags,
+                                bondFlags))) {
             // we won't add this bond for further traversal, but if both atoms
             // are already in this system, then we should add the bond to the
             // system