Skip to content

Commit

Permalink
Expose tautomer scoring functions to python (rdkit#7994)
Browse files Browse the repository at this point in the history
* Expose tautomer scoring functions to python

* Add more tests/documentation

* Rename getDefaultTautomerSubstructs to getDefaultTautomerScoreSubstructs

* Remove ROMOL_SPTR

* Add full custom scoring function example

* Run clang format

* Use proper BOOST_PYTHON_FUNCTION_OVERLOADS

* Use default copy constructor
  • Loading branch information
bp-kelley authored Nov 15, 2024
1 parent c2168cd commit 9495dd5
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 35 deletions.
47 changes: 15 additions & 32 deletions Code/GraphMol/MolStandardize/Tautomer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@
#include <algorithm>
#include <limits>

#include <boost/flyweight.hpp>
#include <boost/flyweight/key_value.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <utility>

// #define VERBOSE_ENUMERATION 1
Expand Down Expand Up @@ -75,35 +72,16 @@ int scoreRings(const ROMol &mol) {
return score;
};

struct smarts_mol_holder {
std::string d_smarts;
ROMOL_SPTR dp_mol;
smarts_mol_holder(const std::string &smarts) : d_smarts(smarts) {
dp_mol.reset(SmartsToMol(smarts));
SubstructTerm::SubstructTerm(std::string aname, std::string asmarts, int ascore)
: name(std::move(aname)), smarts(std::move(asmarts)), score(ascore) {
std::unique_ptr<ROMol> pattern(SmartsToMol(smarts));
if (pattern) {
matcher = std::move(*pattern);
}
};

typedef boost::flyweight<
boost::flyweights::key_value<std::string, smarts_mol_holder>,
boost::flyweights::no_tracking>
smarts_mol_flyweight;

struct SubstructTerm {
std::string name;
std::string smarts;
int score;
ROMOL_SPTR matcher;
SubstructTerm(std::string aname, std::string asmarts, int ascore)
: name(std::move(aname)), smarts(std::move(asmarts)), score(ascore) {
matcher = smarts_mol_flyweight(smarts).get().dp_mol;
};
};
}

int scoreSubstructs(const ROMol &mol) {
// a note on efficiency here: we'll construct the SubstructTerm objects here
// repeatedly, but the SMARTS parsing for each entry will only be done once
// since we're using the boost::flyweights above to cache them
const std::vector<SubstructTerm> substructureTerms{
const std::vector<SubstructTerm> &getDefaultTautomerScoreSubstructs() {
static std::vector<SubstructTerm> substructureTerms{
{"benzoquinone", "[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]",
25},
{"oxim", "[#6]=[N][OH]", 4},
Expand All @@ -117,15 +95,20 @@ int scoreSubstructs(const ROMol &mol) {
{"guanidine terminal=N", "[#7]C(=[NR0])[#7H0]", 1},
{"guanidine endocyclic=N", "[#7;R][#6;R]([N])=[#7;R]", 2},
{"aci-nitro", "[#6]=[N+]([O-])[OH]", -4}};
return substructureTerms;
}

int scoreSubstructs(const ROMol &mol,
const std::vector<SubstructTerm> &substructureTerms) {
int score = 0;
for (const auto &term : substructureTerms) {
if (!term.matcher) {
if (!term.matcher.getNumAtoms()) {
BOOST_LOG(rdErrorLog) << " matcher for term " << term.name
<< " is invalid, ignoring it." << std::endl;
continue;
}
SubstructMatchParameters params;
const auto matches = SubstructMatch(mol, *term.matcher, params);
const auto matches = SubstructMatch(mol, term.matcher, params);
// if (!matches.empty()) {
// std::cerr << " " << matches.size() << " matches to " << term.name
// << std::endl;
Expand Down
49 changes: 48 additions & 1 deletion Code/GraphMol/MolStandardize/Tautomer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,55 @@ typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
namespace TautomerScoringFunctions {
const std::string tautomerScoringVersion = "1.0.0";

//! A SubstructTerm is one of the substructure terms used to score tautomers
/// Each term is defined by a name, a SMARTS pattern and a score.
/// For example, the C=O term is defined as
///    SubstructTerm("C=O", "[#6]=,:[#8]", 2)
/// This gets a score of +2 for each carbon doubly or aromatically
/// bonded to an oxygen.
/// For a list of current definitions, see getDefaultTautomerScoreSubstructs
// A single substructure scoring term: a label, the SMARTS text it was built
// from, the integer score attached to it, and a molecule member (`matcher`).
struct RDKIT_MOLSTANDARDIZE_EXPORT SubstructTerm {
// label for the term (e.g. "C=O")
std::string name;
// SMARTS text supplied at construction
std::string smarts;
// integer score attached to this term
int score;
// NOTE(review): presumably the query molecule parsed from `smarts` and left
// empty when parsing fails -- confirm against the constructor definition.
RWMol matcher; // requires assignment

// Declared here without a body; takes the name, SMARTS text and score.
SubstructTerm(std::string aname, std::string asmarts, int ascore);
// Member-wise copy (compiler-generated).
SubstructTerm(const SubstructTerm &rhs) = default;

// Two terms compare equal when name, smarts and score all match; `matcher`
// is not part of the comparison.
bool operator==(const SubstructTerm &rhs) const {
return name == rhs.name && smarts == rhs.smarts && score == rhs.score;
}
};

//! getDefaultTautomerScoreSubstructs returns the SubstructTerms used in scoring
/// tautomer forms. See SubstructTerm for details.
RDKIT_MOLSTANDARDIZE_EXPORT const std::vector<SubstructTerm>
&getDefaultTautomerScoreSubstructs();

//! Score the rings of the current tautomer
/// Aromatic rings score 100, all carbon aromatic rings score 250
/*!
\param mol Molecule to score
\returns integer score for the molecule's rings
*/
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol);
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol);

//! scoreSubstructs scores the molecule based on the substructure definitions
/*!
\param mol Molecule to score
\param terms Substruct Terms used for scoring this particular tautomer form
\returns integer score for the molecule's substructure terms
*/
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(
const ROMol &mol, const std::vector<SubstructTerm> &terms =
getDefaultTautomerScoreSubstructs());
//! scoreHeteroHs score the molecules hydrogens
/// This gives a negative penalty to hydrogens attached to S,P, Se and Te
/*!
\param mol Molecule to score
\returns integer score for the molecule hetero hydrogens
*/
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol);

inline int scoreTautomer(const ROMol &mol) {
Expand Down
63 changes: 63 additions & 0 deletions Code/GraphMol/MolStandardize/Wrap/Tautomer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,24 @@ PyTautomerEnumeratorResult *enumerateHelper(
return new PyTautomerEnumeratorResult(self.enumerate(mol));
}

std::vector<MolStandardize::TautomerScoringFunctions::SubstructTerm>
GetDefaultTautomerSubstructsHelper() {
std::vector<MolStandardize::TautomerScoringFunctions::SubstructTerm> terms;
for (auto term : MolStandardize::TautomerScoringFunctions::
getDefaultTautomerScoreSubstructs()) {
terms.emplace_back(term);
}
return terms;
}


} // namespace

// Generates the scoreSubstructs_overloads helper for scoreSubstructs.
// The trailing "1, 2" says the function takes a minimum of 1 argument and a
// maximum of 2, so it can be called from Python as ScoreSubstructs(mol) or
// ScoreSubstructs(mol, terms).
BOOST_PYTHON_FUNCTION_OVERLOADS(scoreSubstructs_overloads,
RDKit::MolStandardize::TautomerScoringFunctions::scoreSubstructs, 1, 2)

struct tautomer_wrapper {
static void wrap() {
python::enum_<MolStandardize::TautomerEnumeratorStatus>(
Expand Down Expand Up @@ -493,6 +509,53 @@ struct tautomer_wrapper {
MolStandardize::getV1TautomerEnumerator,
"return a TautomerEnumerator using v1 of the enumeration rules",
python::return_value_policy<python::manage_new_object>());

std::string docString =
"scores the ring system of the tautomer for canonicalization\n"
"Aromatic rings score 100, all carbon aromatic rings score 250";
python::def("ScoreRings",
MolStandardize::TautomerScoringFunctions::scoreRings,
python::arg("mol"), docString.c_str());

docString =
"scores the number of heteroHs of the tautomer for canonicalization\n"
"This gives a negative penalty to hydrogens attached to S,P, Se and Te";
python::def("ScoreHeteroHs",
MolStandardize::TautomerScoringFunctions::scoreHeteroHs,
python::arg("mol"), docString.c_str());

// Expose SubstructTerm to Python with a (name, smarts, score) constructor and
// read-only access to those three fields. The docstring describes the term
// itself; the previous second line was the ring-scoring text copied from
// ScoreRings and did not apply to this class.
python::class_<MolStandardize::TautomerScoringFunctions::SubstructTerm>(
    "SubstructTerm",
    "Sets the score of this particular tautomer substructure, higher scores are more preferable\n"
    "Each term is defined by a name, a SMARTS pattern and a score,\n"
    "for example SubstructTerm(\"C=O\", \"[#6]=,:[#8]\", 2)",
    python::init<std::string, std::string, int>(
        python::args("self", "name", "smarts", "score")))
    .def_readonly(
        "name",
        &MolStandardize::TautomerScoringFunctions::SubstructTerm::name)
    .def_readonly(
        "smarts",
        &MolStandardize::TautomerScoringFunctions::SubstructTerm::smarts)
    .def_readonly(
        "score",
        &MolStandardize::TautomerScoringFunctions::SubstructTerm::score);

python::class_<
std::vector<MolStandardize::TautomerScoringFunctions::SubstructTerm>>(
"SubstructTermVector")
.def(python::vector_indexing_suite<std::vector<
MolStandardize::TautomerScoringFunctions::SubstructTerm>>());

docString = "scores the tautomer substructures";
python::def("ScoreSubstructs", &MolStandardize::TautomerScoringFunctions::scoreSubstructs,
scoreSubstructs_overloads((python::arg("mol"), python::arg("terms")),
docString.c_str())
);


python::def("GetDefaultTautomerScoreSubstructs",
GetDefaultTautomerSubstructsHelper,
"Return the default tautomer substructure scoring terms");
}
};

Expand Down
57 changes: 56 additions & 1 deletion Code/GraphMol/MolStandardize/Wrap/testMolStandardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1780,6 +1780,61 @@ def test26PipelineAllowEmptyMoleculesOption(self):
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)


def testCustomScoreFuncs(self):
  """Exercise the tautomer scoring functions exposed to Python.

  Covers the individual scoring helpers, the default substructure terms,
  user-supplied terms, and custom scoring callables passed to Canonicalize.
  """
  smi = "CC\\C=C(/O)[C@@H](C)C(C)=O"
  m = Chem.MolFromSmiles(smi)
  self.assertEqual(rdMolStandardize.ScoreRings(m), 0)
  self.assertEqual(rdMolStandardize.ScoreHeteroHs(m), 0)
  self.assertEqual(rdMolStandardize.ScoreSubstructs(m), 6)

  # check the default terms
  expected_terms = [
    ("benzoquinone", "[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]", 25),
    ("oxim", "[#6]=[N][OH]", 4),
    ("C=O", "[#6]=,:[#8]", 2),
    ("N=O", "[#7]=,:[#8]", 2),
    ("P=O", "[#15]=,:[#8]", 2),
    ("C=hetero", "[C]=[!#1;!#6]", 1),
    ("C(=hetero)-hetero", "[C](=[!#1;!#6])[!#1;!#6]", 2),
    ("aromatic C = exocyclic N", "[c]=!@[N]", -1),
    ("methyl", "[CX4H3]", 1),
    ("guanidine terminal=N", "[#7]C(=[NR0])[#7H0]", 1),
    ("guanidine endocyclic=N", "[#7;R][#6;R]([N])=[#7;R]", 2),
    ("aci-nitro", "[#6]=[N+]([O-])[OH]", -4),
  ]
  terms = rdMolStandardize.GetDefaultTautomerScoreSubstructs()
  # zip() stops at the shorter sequence, so a missing (or extra) default term
  # would go unnoticed without an explicit length check
  self.assertEqual(len(terms), len(expected_terms))
  for term, (name, smarts, score) in zip(terms, expected_terms):
    self.assertEqual((term.name, term.smarts, term.score), (name, smarts, score))

  # make sure we can pass in our own terms
  terms = rdMolStandardize.SubstructTermVector()
  terms.append(rdMolStandardize.SubstructTerm("C=O", "[#6]=,:[#8]", 1000))
  self.assertEqual(rdMolStandardize.ScoreSubstructs(m, terms), 1000)

  self.assertEqual(
    rdMolStandardize.ScoreSubstructs(m, rdMolStandardize.GetDefaultTautomerScoreSubstructs()), 6)

  enumerator = rdMolStandardize.TautomerEnumerator()
  m2 = Chem.MolFromSmiles("C1(=CCCCC1)O")

  ctaut = enumerator.Canonicalize(m2)
  self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1")

  # duplicate the normal scoring function
  def score_func1(mol):
    return (rdMolStandardize.ScoreRings(mol) + rdMolStandardize.ScoreHeteroHs(mol) +
            rdMolStandardize.ScoreSubstructs(mol))

  ctaut = enumerator.Canonicalize(m2, score_func1)
  self.assertEqual(Chem.MolToSmiles(ctaut), "O=C1CCCCC1")

  # pull a single tautomer out of the mix
  def score_func2(mol):
    if Chem.MolToSmiles(mol) == Chem.CanonSmiles("C1(=CCCCC1)O"):
      return 100_000
    return 0

  ctaut = enumerator.Canonicalize(m2, score_func2)
  self.assertEqual(Chem.MolToSmiles(ctaut), Chem.CanonSmiles("C1(=CCCCC1)O"))


if __name__ == "__main__":
unittest.main()
24 changes: 23 additions & 1 deletion Code/GraphMol/MolStandardize/catch_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1730,4 +1730,26 @@ M END)CTAB";
"INFO: [ValenceValidation] Explicit valence for atom # 0 Br") ==
0);
}
}
}

// Checks the tautomer scoring helpers: the individual scoring functions on a
// small keto/enol molecule, the number of default substructure terms, and
// scoring with a caller-supplied term list.
TEST_CASE("Custom Scoring Functions") {
  SECTION("basics") {
    auto mol = "CC\\C=C(/O)[C@@H](C)C(C)=O"_smiles;
    // fail cleanly instead of dereferencing a null molecule below
    REQUIRE(mol);
    REQUIRE(MolStandardize::TautomerScoringFunctions::scoreRings(*mol) == 0);
    REQUIRE(MolStandardize::TautomerScoringFunctions::scoreHeteroHs(*mol) == 0);
    REQUIRE(MolStandardize::TautomerScoringFunctions::scoreSubstructs(*mol) ==
            6);

    // bind by const reference: the function returns a reference to its
    // vector, so plain `auto` would copy every term just to read the size
    const auto &terms = MolStandardize::TautomerScoringFunctions::
        getDefaultTautomerScoreSubstructs();
    REQUIRE(terms.size() == 12);
  }

  SECTION("Override default tautomer scoring functions") {
    auto mol = "CC\\C=C(/O)[C@@H](C)C(C)=O"_smiles;
    REQUIRE(mol);
    std::vector<MolStandardize::TautomerScoringFunctions::SubstructTerm> terms =
        {{"C=O", "[#6]=,:[#8]", 1000}};
    REQUIRE(MolStandardize::TautomerScoringFunctions::scoreSubstructs(
                *mol, terms) == 1000);
  }
}

0 comments on commit 9495dd5

Please sign in to comment.