From 341171e5135aefd1ae8c4d6737d0eaacbe3320f7 Mon Sep 17 00:00:00 2001 From: riesben Date: Fri, 13 Sep 2024 17:12:15 +0200 Subject: [PATCH 1/8] adapting to new rdkit fingerprint generators. --- scikit_mol/fingerprints.py | 499 +++++++++++++++++++++++++++++++++++++ 1 file changed, 499 insertions(+) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index 767bfc6..ef33aba 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -11,6 +11,11 @@ from rdkit.Chem import rdMHFPFingerprint from rdkit.Avalon import pyAvalonTools +from rdkit.Chem.rdFingerprintGenerator import (GetMorganGenerator, GetMorganFeatureAtomInvGen, + GetTopologicalTorsionGenerator, + GetAtomPairGenerator, + GetRDKitFPGenerator) + import numpy as np import pandas as pd from scipy.sparse import lil_matrix @@ -243,6 +248,9 @@ def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = 0, ignoreA self.nBitsPerEntry = nBitsPerEntry self.useCounts = useCounts + raise DeprecationWarning("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFingerprintGeneratorTransformer, due to changes in RDKit!") + + def _mol2fp(self, mol): if self.useCounts: return rdMolDescriptors.GetHashedAtomPairFingerprint(mol, nBits=int(self.nBits), @@ -281,6 +289,8 @@ def __init__(self, targetSize:int = 4, fromAtoms = 0, ignoreAtoms = 0, atomInvar self.nBitsPerEntry = nBitsPerEntry self.nBits = nBits self.useCounts = useCounts + raise DeprecationWarning("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFingerprintGeneratorTransformer, due to changes in RDKit!") + def _mol2fp(self, mol): if self.useCounts: @@ -478,6 +488,8 @@ def __init__(self, nBits=2048, radius=2, useChirality=False, useBondTypes=True, self.useBondTypes = useBondTypes self.useFeatures = useFeatures self.useCounts = useCounts + raise DeprecationWarning("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!") + def _mol2fp(self, mol): if self.useCounts: @@ -541,3 +553,490 @@ def parallel_helper(args): transformer = getattr(fingerprints, classname)(**parameters) return transformer._transform(X_mols) + +class FpsGeneratorTransformer(FpsTransformer): + + + def _fp2array(self, fp): + raise DeprecationWarning("Generators can directly return fingerprints") + + def _mol2fp(self, mol): + raise DeprecationWarning("use _mol2array") + + def __getstate__(self): + # Get the state of the parent class + state = super().__getstate__() + # Remove the unpicklable property from the state + state.pop("_fpgen", None) # fpgen is not picklable + return state + + def __setstate__(self, state): + # Restore the state of the parent class + super().__setstate__(state) + # Re-create the unpicklable property + self._generate_fp_generator() + + @abstractmethod + def _generate_fp_generator(self,*args, **kwargs): + raise NotImplementedError("_generate_fp_generator not implemented") + + @abstractmethod + def _transform_mol(self, mol) -> np.array: + """Generate numpy array descriptor from mol + + MUST BE OVERWRITTEN + """ + raise NotImplementedError("_transform_mol not implemented") + + +class MorganFPGeneratorTransformer(FpsGeneratorTransformer): + def __init__(self, nBits=2048, radius=2, useChirality=False, + useBondTypes=True, useFeatures=False, useCounts=False, + parallel: Union[bool, int] = False,): + """Transform RDKit mols into Count or bit-based hashed MorganFingerprints + + Parameters + ---------- + nBits : int, optional + Size of the hashed fingerprint, by default 2048 + radius : int, optional + Radius of the fingerprint, by default 2 + useChirality : bool, optional + Include chirality in calculation of the fingerprint keys, by default False + useBondTypes : bool, optional + Include bondtypes in calculation of the fingerprint keys, by default True + useFeatures : bool, optional + use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False + useCounts : bool, optional + If toggled will create the count and not bit-based fingerprint, by default False + """ + super().__init__(parallel = parallel) + self._useFeatures = useFeatures + self._useCounts = useCounts + self._useBondTypes = useBondTypes + self._generate_fp_generator(useFeatures=useFeatures, radius=radius, nBits=nBits, + useChirality=useChirality, useBondTypes=useBondTypes) + + + def _generate_fp_generator(self, useFeatures:bool, radius:int, nBits:int, + useChirality:bool, useBondTypes:bool): + + if useFeatures: + atomInvariantsGenerator = GetMorganFeatureAtomInvGen() + else: + atomInvariantsGenerator = None + + self._fpgen = GetMorganGenerator(radius=radius, + fpSize=nBits, + includeChirality=useChirality, + useBondTypes=useBondTypes, + atomInvariantsGenerator=atomInvariantsGenerator, + ) + + @property + def radius(self): + return self._fpgen.GetOptions().radius + + @radius.setter + def radius(self, value:int): + self._fpgen.GetOptions().radius = value + + @property + def nBits(self): + return self._fpgen.GetOptions().fpSize + + @nBits.setter + def nBits(self, value:int): + self._fpgen.GetOptions().fpSize = value + + @property + def useChirality(self): + return self._fpgen.GetOptions().includeChirality + + @useChirality.setter + def useChirality(self, value:bool): + self._fpgen.GetOptions().includeChirality = value + + @property + def useFeatures(self): + return self._useFeatures + + @useFeatures.setter + def useFeatures(self, value:bool): + self._useFeatures = value + self._generate_fp_generator(useFeatures=self.useFeatures, radius=self.radius, nBits=self.nBits, + useChirality=self.useChirality, useBondTypes=self.useBondTypes) + + @property + def useBondTypes(self): + return self._useBondTypes + + @useBondTypes.setter + def useBondTypes(self, value:bool): + self._useBondTypes = value + self._generate_fp_generator(useFeatures=self.useFeatures, radius=self.radius, nBits=self.nBits, + useChirality=self.useChirality, useBondTypes=self.useBondTypes) + + @property + def useCounts(self): + return self._useCounts + + @useCounts.setter + def useCounts(self, value:bool): + self._useCounts = value + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol) + else: + return self._fpgen.GetFingerprintAsNumPy(mol) + + +class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer): + def __init__(self, targetSize:int = 4, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, confId=-1, + includeChirality:bool = False, nBitsPerEntry:int = 4, nBits=2048, + useCounts:bool=False, parallel: Union[bool, int] = False): + + super().__init__(parallel=parallel) + self._fromAtoms = fromAtoms + self._ignoreAtoms = ignoreAtoms + self._atomInvariants = atomInvariants + self._nBitsPerEntry = nBitsPerEntry + self._confId = confId + self._useCounts = useCounts + self._targetSize = targetSize + + self._generate_fp_generator(targetSize=targetSize, includeChirality=includeChirality, + nBits=nBits) + + @property + def useCounts(self): + return self._useCounts + + @useCounts.setter + def useCounts(self, value:bool): + self._useCounts = value + + @property + def confId(self): + return self._confId + + @confId.setter + def confId(self, value: int): + self._confId = value + + @property + def fromAtoms(self): + return self._fromAtoms + + @fromAtoms.setter + def fromAtoms(self, value: int): + self._fromAtoms = value + + @property + def ignoreAtoms(self): + return self._ignoreAtoms + + @ignoreAtoms.setter + def ignoreAtoms(self, value: int): + self._ignoreAtoms = value + + @property + def atomInvariants(self): + return self._atomInvariants + + @atomInvariants.setter + def atomInvariants(self, value: int): + self._atomInvariants = value + + @property + def nBits(self): + return self._fpgen.GetOptions().fpSize + + @nBits.setter + def nBits(self, value: int): + self._fpgen.GetOptions().fpSize = value + + @property + def nBitsPerEntry(self): + return self._nBitsPerEntry + + @nBitsPerEntry.setter + def nBitsPerEntry(self, value: int): + self._nBitsPerEntry = value + + @property + def includeChirality(self): + return self._fpgen.GetOptions().includeChirality + + @includeChirality.setter + def includeChirality(self, value:int): + self._fpgen.GetOptions().includeChirality = value + + @property + def targetSize(self): + return self._targetSize + + @targetSize.setter + def targetSize(self, value:int): + self._targetSize = value + self._generate_fp_generator(targetSize=value, + includeChirality=self.includeChirality, + nBits=self.nBits) + + def _generate_fp_generator(self, targetSize: int, includeChirality: bool, nBits: int): + self._fpgen = GetTopologicalTorsionGenerator(torsionAtomCount=targetSize, includeChirality=includeChirality, + fpSize=nBits) + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + else: + return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + + +class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer): + def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, + includeChirality:bool = False, use2D:bool = True, confId:int = -1, nBits=2048, nBitsPerEntry:int = 4, + useCounts:bool=False, parallel: Union[bool, int] = False,): + super().__init__(parallel = parallel) + self._useCounts= useCounts + self._confId = confId + self._fromAtoms = fromAtoms + self._ignoreAtoms = ignoreAtoms + self._atomInvariants = atomInvariants + self._minLength = minLength + self._maxLength = maxLength + + self._generate_fp_generator(minLength=minLength, maxLength=maxLength, + includeChirality=includeChirality, use2D=use2D, + nBits=nBits, nBitsPerEntry=nBitsPerEntry) + + @property + def useCounts(self): + return self._useCounts + + @useCounts.setter + def useCounts(self, value:bool): + self._useCounts = value + + @property + def confId(self): + return self._confId + + @confId.setter + def confId(self, value:int): + self._confId = value + + @property + def fromAtoms(self): + return self._fromAtoms + + @fromAtoms.setter + def fromAtoms(self, value:int): + self._fromAtoms = value + + @property + def ignoreAtoms(self): + return self._ignoreAtoms + + @ignoreAtoms.setter + def ignoreAtoms(self, value:int): + self._ignoreAtoms = value + + @property + def atomInvariants(self): + return self._atomInvariants + + @atomInvariants.setter + def atomInvariants(self, value:int): + self._atomInvariants = value + + @property + def minLength(self): + return self._minLength + + @minLength.setter + def minDistance(self, value: int): + self._minLength = value + self._generate_fp_generator(minLength=value, maxLength=self.maxLength, + includeChirality=self.includeChirality, use2D=self.use2D, + nBits=self.nBits, nBitsPerEntry=self.nBitsPerEntry) + + @property + def maxLength(self): + return self._maxLength + + @maxLength.setter + def maxLength(self, value: int): + self._maxLength = value + self._generate_fp_generator(minLength=self.minLength, maxLength=value, + includeChirality=self.includeChirality, use2D=self.use2D, + nBits=self.nBits, nBitsPerEntry=self.nBitsPerEntry) + + @property + def includeChirality(self): + return self._fpgen.GetOptions().includeChirality + + @includeChirality.setter + def includeChirality(self, value: bool): + self._fpgen.GetOptions().includeChirality = value + + @property + def use2D(self): + return self._fpgen.GetOptions().use2D + + @use2D.setter + def use2D(self, value: bool): + self._fpgen.GetOptions().use2D = value + + @property + def nBits(self): + return self._fpgen.GetOptions().fpSize + + @nBits.setter + def nBits(self, value: int): + self._fpgen.GetOptions().fpSize = value + + @property + def nBitsPerEntry(self): + return self._fpgen.GetOptions().numBitsPerFeature + + @nBitsPerEntry.setter + def nBitsPerEntry(self, value: int): + self._fpgen.GetOptions().numBitsPerFeature = value + + def _generate_fp_generator(self, minLength, maxLength, includeChirality, use2D, nBits, nBitsPerEntry): + self._fpgen = GetAtomPairGenerator(minDistance=minLength, maxDistance=maxLength, + includeChirality=includeChirality, + use2D=use2D, fpSize=nBits) + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + else: + return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + + +class RDKitFPGeneratorTransformer(FpsGeneratorTransformer): + def __init__(self, minPath:int = 1, maxPath:int =7, useHs:bool = True, branchedPaths:bool = True, + useBondOrder:bool = True, countSimulation:bool = False, countBounds = None, + nBits:int = 2048, numBitsPerFeature:int = 2, + useCounts:bool = False, parallel: Union[bool, int] = False + ): + """Calculates the RDKit fingerprints + + Parameters + ---------- + minPath : int, optional + the minimum path length (in bonds) to be included, by default 1 + maxPath : int, optional + the maximum path length (in bonds) to be included, by default 7 + useHs : bool, optional + toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True + branchedPaths : bool, optional + toggles generation of branched subgraphs, not just linear paths, by default True + useBondOrder : bool, optional + toggles inclusion of bond orders in the path hashes, by default True + countSimulation : bool, optional + if set, use count simulation while generating the fingerprint, by default False + countBounds : _type_, optional + boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None + nBits : int, optional + size of the generated fingerprint, does not affect the sparse versions, by default 2048 + numBitsPerFeature : int, optional + the number of bits set per path/subgraph found, by default 2 + """ + super().__init__(parallel = parallel) + self._useCounts = useCounts + self._countBounds = countBounds + self._generate_fp_generator( minPath=minPath, maxPath=maxPath, useHs=useHs, + branchedPaths=branchedPaths,useBondOrder=useBondOrder, + countSimulation=countSimulation, fpSize=nBits, + countBounds=countBounds, numBitsPerFeature=numBitsPerFeature) + + + @property + def nBits(self): + return self._fpgen.GetOptions().fpSize + @nBits.setter + def nBits(self, value: int): + self._fpgen.GetOptions().fpSize = value + @property + def minPath(self): + return self._fpgen.GetOptions().minPath + @minPath.setter + def minPath(self, value:int): + self._fpgen.GetOptions().minPath = value + @property + def maxPath(self): + return self._fpgen.GetOptions().maxPath + @maxPath.setter + def maxPath(self, value:int): + self._fpgen.GetOptions().maxPath = value + @property + def useHs(self): + return self._fpgen.GetOptions().useHs + @useHs.setter + def useHs(self, value:bool): + self._fpgen.GetOptions().useHs = value + @property + def branchedPaths(self): + return self._fpgen.GetOptions().branchedPaths + @branchedPaths.setter + def branchedPaths(self, value:int): + self._fpgen.GetOptions().branchedPaths = value + @property + def useBondOrder(self): + return self._fpgen.GetOptions().useBondOrder + @useBondOrder.setter + def useBondOrder(self, value:int): + self._fpgen.GetOptions().useBondOrder = value + @property + def numBitsPerFeature(self): + return self._fpgen.GetOptions().numBitsPerFeature + @numBitsPerFeature.setter + def numBitsPerFeature(self, value:int): + self._fpgen.GetOptions().numBitsPerFeature = value + @property + def countBounds(self): + return self._countBounds + @countBounds.setter + def countBounds(self, value:int): + self._countBounds = value + self._generate_fp_generator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, + branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, + countSimulation=self.countSimulation, fpSize=self.nBits, + countBounds=value, numBitsPerFeature=self.numBitsPerFeature) + + @property + def countSimulation(self): + return self._countBounds + @countSimulation.setter + def countSimulation(self, value: bool): + self._countSimulation=value + self._generate_fp_generator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, + branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, + countSimulation=value, fpSize=self.nBits, + countBounds=self.countBounds, numBitsPerFeature=self.numBitsPerFeature) + + @property + def useCounts(self): + return self._useCounts + @useCounts.setter + def useCounts(self, value:bool): + self._useCounts = value + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol) + else: + return self._fpgen.GetFingerprintAsNumPy(mol) + + def _generate_fp_generator(self, minPath, maxPath, useHs, branchedPaths, + useBondOrder, countSimulation, fpSize, countBounds, + numBitsPerFeature): + self._fpgen = GetRDKitFPGenerator(minPath=minPath, maxPath=maxPath, useHs=useHs, + branchedPaths=branchedPaths,useBondOrder=useBondOrder, + countSimulation=countSimulation, fpSize=fpSize, + countBounds=countBounds, numBitsPerFeature=numBitsPerFeature) From 8be105ad1c62247c94d7cd8c09bd92ed572c4c71 Mon Sep 17 00:00:00 2001 From: riesben Date: Sat, 14 Sep 2024 09:51:13 +0200 Subject: [PATCH 2/8] Deprecations warnings in transformers: raise->prints --- scikit_mol/fingerprints.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index ef33aba..4c63f8f 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -248,7 +248,8 @@ def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = 0, ignoreA self.nBitsPerEntry = nBitsPerEntry self.useCounts = useCounts - raise DeprecationWarning("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFingerprintGeneratorTransformer, due to changes in RDKit!") + print("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") + #raise DeprecationWarning("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") def _mol2fp(self, mol): @@ -289,7 +290,8 @@ def __init__(self, targetSize:int = 4, fromAtoms = 0, ignoreAtoms = 0, atomInvar self.nBitsPerEntry = nBitsPerEntry self.nBits = nBits self.useCounts = useCounts - raise DeprecationWarning("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFingerprintGeneratorTransformer, due to changes in RDKit!") + print("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!") + #raise DeprecationWarning("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") def _mol2fp(self, mol): @@ -488,7 +490,9 @@ def __init__(self, nBits=2048, radius=2, useChirality=False, useBondTypes=True, self.useBondTypes = useBondTypes self.useFeatures = useFeatures self.useCounts = useCounts - raise DeprecationWarning("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!") + + print("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!") + #raise DeprecationWarning("MorganFingerprintTransformer will be replace by MorganFPGeneratorTransformer, due to changes in RDKit!") def _mol2fp(self, mol): @@ -574,7 +578,7 @@ def __setstate__(self, state): # Restore the state of the parent class super().__setstate__(state) # Re-create the unpicklable property - self._generate_fp_generator() + self._generate_fp_generator(**state) @abstractmethod def _generate_fp_generator(self,*args, **kwargs): @@ -614,6 +618,7 @@ def __init__(self, nBits=2048, radius=2, useChirality=False, self._useFeatures = useFeatures self._useCounts = useCounts self._useBondTypes = useBondTypes + self._generate_fp_generator(useFeatures=useFeatures, radius=radius, nBits=nBits, useChirality=useChirality, useBondTypes=useBondTypes) From 681a493e74fff8c374cbb984da5a55b1712aff74 Mon Sep 17 00:00:00 2001 From: riesben Date: Sat, 14 Sep 2024 12:40:07 +0200 Subject: [PATCH 3/8] minor class property fixes, most test wun now. Need to look into cloning and pickling. --- scikit_mol/fingerprints.py | 45 +++--- tests/test_fptransformersgenerator.py | 188 ++++++++++++++++++++++++++ 2 files changed, 209 insertions(+), 24 deletions(-) create mode 100644 tests/test_fptransformersgenerator.py diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index 4c63f8f..69ca8fd 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -2,6 +2,8 @@ from multiprocessing import Pool, get_context import multiprocessing import re +import inspect +from typing import Callable from typing import Union from rdkit import Chem from rdkit import DataStructs @@ -571,6 +573,8 @@ def __getstate__(self): # Get the state of the parent class state = super().__getstate__() # Remove the unpicklable property from the state + props = {k:v for k,v in inspect.getmembers(self) if not isinstance(v, Callable) and not k.startswith("_")} + state.update(props) state.pop("_fpgen", None) # fpgen is not picklable return state @@ -578,7 +582,8 @@ def __setstate__(self, state): # Restore the state of the parent class super().__setstate__(state) # Re-create the unpicklable property - self._generate_fp_generator(**state) + generatort_keys = inspect.signature(self._generate_fp_generator).parameters.keys() + self._generate_fp_generator(**{k:state["_"+k] if "_"+k in state else state[k] for k in generatort_keys}) @abstractmethod def _generate_fp_generator(self,*args, **kwargs): @@ -592,6 +597,14 @@ def _transform_mol(self, mol) -> np.array: """ raise NotImplementedError("_transform_mol not implemented") + @property + def fpSize(self): + return self.nBits + + #Scikit-Learn expects to be able to set fpSize directly on object via .set_params(), so this updates nBits used by the abstract class + @fpSize.setter + def fpSize(self, fpSize): + self.nBits = fpSize class MorganFPGeneratorTransformer(FpsGeneratorTransformer): def __init__(self, nBits=2048, radius=2, useChirality=False, @@ -699,18 +712,16 @@ def _transform_mol(self, mol) -> np.array: class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer): def __init__(self, targetSize:int = 4, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, confId=-1, - includeChirality:bool = False, nBitsPerEntry:int = 4, nBits=2048, + includeChirality:bool = False, nBits=2048, useCounts:bool=False, parallel: Union[bool, int] = False): super().__init__(parallel=parallel) self._fromAtoms = fromAtoms self._ignoreAtoms = ignoreAtoms self._atomInvariants = atomInvariants - self._nBitsPerEntry = nBitsPerEntry self._confId = confId self._useCounts = useCounts self._targetSize = targetSize - self._generate_fp_generator(targetSize=targetSize, includeChirality=includeChirality, nBits=nBits) @@ -762,14 +773,6 @@ def nBits(self): def nBits(self, value: int): self._fpgen.GetOptions().fpSize = value - @property - def nBitsPerEntry(self): - return self._nBitsPerEntry - - @nBitsPerEntry.setter - def nBitsPerEntry(self, value: int): - self._nBitsPerEntry = value - @property def includeChirality(self): return self._fpgen.GetOptions().includeChirality @@ -802,7 +805,7 @@ def _transform_mol(self, mol) -> np.array: class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer): def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, - includeChirality:bool = False, use2D:bool = True, confId:int = -1, nBits=2048, nBitsPerEntry:int = 4, + includeChirality:bool = False, use2D:bool = True, confId:int = -1, nBits=2048, useCounts:bool=False, parallel: Union[bool, int] = False,): super().__init__(parallel = parallel) self._useCounts= useCounts @@ -815,7 +818,7 @@ def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = None, igno self._generate_fp_generator(minLength=minLength, maxLength=maxLength, includeChirality=includeChirality, use2D=use2D, - nBits=nBits, nBitsPerEntry=nBitsPerEntry) + nBits=nBits) @property def useCounts(self): @@ -862,11 +865,11 @@ def minLength(self): return self._minLength @minLength.setter - def minDistance(self, value: int): + def minLength(self, value: int): self._minLength = value self._generate_fp_generator(minLength=value, maxLength=self.maxLength, includeChirality=self.includeChirality, use2D=self.use2D, - nBits=self.nBits, nBitsPerEntry=self.nBitsPerEntry) + nBits=self.nBits) @property def maxLength(self): @@ -877,7 +880,7 @@ def maxLength(self, value: int): self._maxLength = value self._generate_fp_generator(minLength=self.minLength, maxLength=value, includeChirality=self.includeChirality, use2D=self.use2D, - nBits=self.nBits, nBitsPerEntry=self.nBitsPerEntry) + nBits=self.nBits) @property def includeChirality(self): @@ -907,11 +910,7 @@ def nBits(self, value: int): def nBitsPerEntry(self): return self._fpgen.GetOptions().numBitsPerFeature - @nBitsPerEntry.setter - def nBitsPerEntry(self, value: int): - self._fpgen.GetOptions().numBitsPerFeature = value - - def _generate_fp_generator(self, minLength, maxLength, includeChirality, use2D, nBits, nBitsPerEntry): + def _generate_fp_generator(self, minLength, maxLength, includeChirality, use2D, nBits): self._fpgen = GetAtomPairGenerator(minDistance=minLength, maxDistance=maxLength, includeChirality=includeChirality, use2D=use2D, fpSize=nBits) @@ -1013,7 +1012,6 @@ def countBounds(self, value:int): branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, countSimulation=self.countSimulation, fpSize=self.nBits, countBounds=value, numBitsPerFeature=self.numBitsPerFeature) - @property def countSimulation(self): return self._countBounds @@ -1024,7 +1022,6 @@ def countSimulation(self, value: bool): branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, countSimulation=value, fpSize=self.nBits, countBounds=self.countBounds, numBitsPerFeature=self.numBitsPerFeature) - @property def useCounts(self): return self._useCounts diff --git a/tests/test_fptransformersgenerator.py b/tests/test_fptransformersgenerator.py new file mode 100644 index 0000000..f11ea95 --- /dev/null +++ b/tests/test_fptransformersgenerator.py @@ -0,0 +1,188 @@ +import pickle +import tempfile +import pytest +import numpy as np +from fixtures import mols_list, smiles_list, mols_container, smiles_container, fingerprint, chiral_smiles_list, chiral_mols_list +from sklearn import clone + +from scikit_mol.fingerprints import (MorganFPGeneratorTransformer, + RDKitFPGeneratorTransformer, + AtomPairFPGeneratorTransformer, + TopologicalTorsionFPGeneatorTransformer, + ) + +test_transformers = [MorganFPGeneratorTransformer, RDKitFPGeneratorTransformer, + AtomPairFPGeneratorTransformer, TopologicalTorsionFPGeneatorTransformer] + + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_fpstransformer_fp2array(transformer_class, fingerprint): + transformer = transformer_class() + + with pytest.raises(DeprecationWarning, match='Generators can directly return fingerprints'): + fp = transformer._fp2array(fingerprint) + + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_fpstransformer_transform_mol(transformer_class, mols_list): + transformer = transformer_class() + + fp = transformer._transform_mol(mols_list[0]) + #See that fp is the correct type, shape and bit count + assert(type(fp) == type(np.array([0]))) + assert(fp.shape == (2048,)) + + if isinstance(transformer, RDKitFPGeneratorTransformer): + assert(fp.sum() == 104) + elif isinstance(transformer, AtomPairFPGeneratorTransformer): + assert (fp.sum() == 32) + elif isinstance(transformer, TopologicalTorsionFPGeneatorTransformer): + assert (fp.sum() == 12) + elif isinstance(transformer, MorganFPGeneratorTransformer): + assert (fp.sum() == 14) + else: + raise NotImplementedError("missing Assert") + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_clonability(transformer_class): + transformer = transformer_class() + + params = transformer.get_params() + t2 = clone(transformer) + params_2 = t2.get_params() + #Parameters of cloned transformers should be the same + assert all([ params[key] == params_2[key] for key in params.keys()]) + #Cloned transformers should not be the same object + assert t2 != transformer + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_set_params(transformer_class): + transformer = transformer_class() + params = transformer.get_params() + #change extracted dictionary + params['nBits'] = 4242 + #change params in transformer + transformer.set_params(nBits = 4242) + # get parameters as dictionary and assert that it is the same + params_2 = transformer.get_params() + assert all([ params[key] == params_2[key] for key in params.keys()]) + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_transform(mols_container, transformer_class): + transformer = transformer_class() + #Test the different transformers + params = transformer.get_params() + fps = transformer.transform(mols_container) + #Assert that the same length of input and output + assert len(fps) == len(mols_container) + + fpsize = params['nBits'] + + assert len(fps[0]) == fpsize + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_transform_parallel(mols_container, transformer_class): + transformer = transformer_class() + #Test the different transformers + transformer.set_params(parallel=True) + params = transformer.get_params() + fps = transformer.transform(mols_container) + #Assert that the same length of input and output + assert len(fps) == len(mols_container) + + fpsize = params['nBits'] + assert len(fps[0]) == fpsize + + +@pytest.mark.parametrize("transformer_class", test_transformers) +def test_picklable(transformer_class): + #Test the different transformers + transformer = transformer_class() + p = transformer.get_params() + + with tempfile.NamedTemporaryFile() as f: + pickle.dump(transformer, f) + f.seek(0) + t2 = pickle.load(f) + print(p) + print(vars(transformer)) + print(vars(t2)) + assert(transformer.get_params() == t2.get_params()) + + +@pytest.mark.parametrize("transfomer", test_transformers) +def assert_transformer_set_params(transfomer, new_params, mols_list): + default_params = transfomer().get_params() + + for key in new_params.keys(): + tr = transfomer() + params = tr.get_params() + params[key] = new_params[key] + + fps_default = tr.transform(mols_list) + + tr.set_params(**params) + new_tr = transfomer(**params) + fps_reset_params = tr.transform(mols_list) + fps_init_new_params = new_tr.transform(mols_list) + + # Now fp_default should not be the same as fp_reset_params + + assert ~np.all([np.array_equal(fp_default, fp_reset_params) for fp_default, fp_reset_params in zip(fps_default, fps_reset_params)]), f"Assertation error, FP appears the same, although the {key} should be changed from {default_params[key]} to {params[key]}" + # fp_reset_params and fp_init_new_params should however be the same + assert np.all([np.array_equal(fp_init_new_params, fp_reset_params) for fp_init_new_params, fp_reset_params in zip(fps_init_new_params, fps_reset_params)]) , f"Assertation error, FP appears to be different, although the {key} should be changed back as well as initialized to {params[key]}" + + +def test_morgan_set_params(chiral_mols_list): + new_params = {'nBits': 1024, + 'radius': 1, + 'useBondTypes': False,# TODO, why doesn't this change the FP? + 'useChirality': True, + 'useCounts': True, + 'useFeatures': True} + + assert_transformer_set_params(MorganFPGeneratorTransformer, new_params, chiral_mols_list) + + +def test_atompairs_set_params(chiral_mols_list): + new_params = { + #'atomInvariants': 1, + #'confId': -1, + #'fromAtoms': 1, + #'ignoreAtoms': 0, + 'includeChirality': True, + 'maxLength': 3, + 'minLength': 3, + 'nBits': 1024, + #'nBitsPerEntry': 3, #Todo: not setable with the generators? + #'use2D': True, #TODO, understand why this can't be set different + 'useCounts': True} + + assert_transformer_set_params(AtomPairFPGeneratorTransformer, new_params, chiral_mols_list) + + +def test_topologicaltorsion_set_params(chiral_mols_list): + new_params = {#'atomInvariants': 0, + #'fromAtoms': 0, + #'ignoreAtoms': 0, + #'includeChirality': True, #TODO, figure out why this setting seems to give same FP wheter toggled or not + 'nBits': 1024, + #'nBitsPerEntry': 3, #Todo: not setable with the generators? + 'targetSize': 5, + 'useCounts': True} + + assert_transformer_set_params(TopologicalTorsionFPGeneatorTransformer, new_params, chiral_mols_list) + +def test_RDKitFPTransformer(chiral_mols_list): + new_params = {#'atomInvariantsGenerator': None, + #'branchedPaths': False, + #'countBounds': 0, #TODO: What does this do? + 'countSimulation': True, + 'nBits': 1024, + 'maxPath': 3, + 'minPath': 2, + 'numBitsPerFeature': 3, + 'useBondOrder': False, #TODO, why doesn't this change the FP? + #'useHs': False, #TODO, why doesn't this change the FP? + } + assert_transformer_set_params(RDKitFPGeneratorTransformer, new_params, chiral_mols_list) From c812214103036b0596f82d15961894e197bfb28d Mon Sep 17 00:00:00 2001 From: riesben Date: Mon, 16 Sep 2024 23:14:01 +0200 Subject: [PATCH 4/8] fixes for bugs --- scikit_mol/fingerprints.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index 69ca8fd..66d6c51 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -572,9 +572,8 @@ def _mol2fp(self, mol): def __getstate__(self): # Get the state of the parent class state = super().__getstate__() + state.update(self.get_params()) # Remove the unpicklable property from the state - props = {k:v for k,v in inspect.getmembers(self) if not isinstance(v, Callable) and not k.startswith("_")} - state.update(props) state.pop("_fpgen", None) # fpgen is not picklable return state @@ -583,7 +582,8 @@ def __setstate__(self, state): super().__setstate__(state) # Re-create the unpicklable property generatort_keys = inspect.signature(self._generate_fp_generator).parameters.keys() - self._generate_fp_generator(**{k:state["_"+k] if "_"+k in state else state[k] for k in generatort_keys}) + params = {k:state["_"+k] if "_"+k in state else state[k] for k in generatort_keys} + self._generate_fp_generator(**params) @abstractmethod def _generate_fp_generator(self,*args, **kwargs): From fdd8624ece60a5fd0a245eb67bed042b17e91ca6 Mon Sep 17 00:00:00 2001 From: riesben Date: Thu, 14 Nov 2024 22:32:53 +0100 Subject: [PATCH 5/8] Remodelling transformers: - nBits->fpSize - remove properties / overwrite setattr - adapt tests. --- scikit_mol/fingerprints.py | 550 +++++++------------------- tests/test_fptransformers.py | 55 +-- tests/test_fptransformersgenerator.py | 16 +- tests/test_safeinferencemode.py | 4 +- tests/test_transformers.py | 7 +- 5 files changed, 173 insertions(+), 459 deletions(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index ace9795..a96e65b 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -56,7 +56,7 @@ def _get_column_prefix(self) -> str: return "fp" def _get_n_digits_column_suffix(self) -> int: - return len(str(self.nBits)) + return len(str(self.fpSize)) def get_display_feature_names_out(self, input_features=None): """Get feature names for display purposes @@ -68,7 +68,7 @@ def get_display_feature_names_out(self, input_features=None): prefix = self._get_column_prefix() n_digits = self._get_n_digits_column_suffix() return np.array( - [f"{prefix}_{str(i).zfill(n_digits)}" for i in range(1, self.nBits + 1)] + [f"{prefix}_{str(i).zfill(n_digits)}" for i in range(1, self.fpSize + 1)] ) def get_feature_names_out(self, input_features=None): @@ -78,7 +78,7 @@ def get_feature_names_out(self, input_features=None): to get the column names of the transformed dataframe. """ prefix = self._get_column_prefix() - return np.array([f"{prefix}_{i}" for i in range(1, self.nBits + 1)]) + return np.array([f"{prefix}_{i}" for i in range(1, self.fpSize + 1)]) @abstractmethod def _mol2fp(self, mol): @@ -90,11 +90,11 @@ def _mol2fp(self, mol): def _fp2array(self, fp): if fp: - arr = np.zeros((self.nBits,), dtype=self.dtype) + arr = np.zeros((self.fpSize,), dtype=self.dtype) DataStructs.ConvertToNumpyArray(fp, arr) return arr else: - return np.ma.masked_all((self.nBits,), dtype=self.dtype) + return np.ma.masked_all((self.fpSize,), dtype=self.dtype) def _transform_mol(self, mol): if not mol and self.safe_inference_mode: @@ -120,16 +120,17 @@ def _transform(self, X): if self.safe_inference_mode: # Use the new method with masked arrays if we're in safe inference mode arrays = [self._transform_mol(mol) for mol in X] + print(arrays) return np.ma.stack(arrays) else: # Use the original, faster method if we're not in safe inference mode - arr = np.zeros((len(X), self.nBits), dtype=self.dtype) + arr = np.zeros((len(X), self.fpSize), dtype=self.dtype) for i, mol in enumerate(X): arr[i, :] = self._transform_mol(mol) return arr def _transform_sparse(self, X): - arr = np.zeros((len(X), self.nBits), dtype=self.dtype) + arr = np.zeros((len(X), self.fpSize), dtype=self.dtype) for i, mol in enumerate(X): arr[i, :] = self._transform_mol(mol) @@ -189,6 +190,7 @@ def __init__( parallel: Union[bool, int] = False, safe_inference_mode: bool = False, dtype: np.dtype = np.int8, + fpSize=167, ): """MACCS keys fingerprinter calculates the 167 fixed MACCS keys @@ -196,19 +198,23 @@ def __init__( super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = 167 + if fpSize != 167: + raise ValueError( + "fpSize can only be 167, matching the number of defined MACCS keys!" + ) + self._fpSize = fpSize @property - def nBits(self): - return self._nBits + def fpSize(self): + return self._fpSize - @nBits.setter - def nBits(self, nBits): - if nBits != 167: + @fpSize.setter + def fpSize(self, fpSize): + if fpSize != 167: raise ValueError( - "nBits can only be 167, matching the number of defined MACCS keys!" + "fpSize can only be 167, matching the number of defined MACCS keys!" ) - self._nBits = nBits + self._fpSize = fpSize def _mol2fp(self, mol): return rdMolDescriptors.GetMACCSKeysFingerprint(mol) @@ -270,14 +276,6 @@ def __init__( self.numBitsPerFeature = numBitsPerFeature self.atomInvariantsGenerator = atomInvariantsGenerator - @property - def fpSize(self): - return self.nBits - - # Scikit-Learn expects to be able to set fpSize directly on object via .set_params(), so this updates nBits used by the abstract class - @fpSize.setter - def fpSize(self, fpSize): - self.nBits = fpSize def _mol2fp(self, mol): generator = rdFingerprintGenerator.GetRDKitFPGenerator( @@ -307,7 +305,7 @@ def __init__( includeChirality: bool = False, use2D: bool = True, confId: int = -1, - nBits=2048, + fpSize=2048, useCounts: bool = False, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, @@ -324,7 +322,7 @@ def __init__( self.includeChirality = includeChirality self.use2D = use2D self.confId = confId - self.nBits = nBits + self.fpSize = fpSize self.nBitsPerEntry = nBitsPerEntry self.useCounts = useCounts @@ -336,7 +334,7 @@ def _mol2fp(self, mol): if self.useCounts: return rdMolDescriptors.GetHashedAtomPairFingerprint( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), minLength=int(self.minLength), maxLength=int(self.maxLength), fromAtoms=self.fromAtoms, @@ -349,7 +347,7 @@ def _mol2fp(self, mol): else: return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), minLength=int(self.minLength), maxLength=int(self.maxLength), fromAtoms=self.fromAtoms, @@ -371,7 +369,7 @@ def __init__( atomInvariants=0, includeChirality: bool = False, nBitsPerEntry: int = 4, - nBits=2048, + fpSize=2048, useCounts: bool = False, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, @@ -386,7 +384,7 @@ def __init__( self.atomInvariants = atomInvariants self.includeChirality = includeChirality self.nBitsPerEntry = nBitsPerEntry - self.nBits = nBits + self.fpSize = fpSize self.useCounts = useCounts print("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!") #raise DeprecationWarning("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") @@ -396,7 +394,7 @@ def _mol2fp(self, mol): if self.useCounts: return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), targetSize=int(self.targetSize), fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, @@ -406,7 +404,7 @@ def _mol2fp(self, mol): else: return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), targetSize=int(self.targetSize), fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, @@ -424,7 +422,7 @@ def __init__( isomeric: bool = False, kekulize: bool = False, min_radius: int = 1, - n_permutations: int = 2048, + fpSize: int = 2048, seed: int = 42, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, @@ -440,7 +438,7 @@ def __init__( isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False. kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0, + fpSize (int, optional): The number of permutations used for hashing. Defaults to 2048, this is effectively the length of the FP seed (int, optional): The value used to seed numpy.random. Defaults to 0. """ @@ -453,7 +451,7 @@ def __init__( self.kekulize = kekulize self.min_radius = min_radius # Set the .n_permutations and .seed without creating the encoder twice - self._n_permutations = n_permutations + self.fpSize = fpSize self._seed = seed # create the encoder instance self._recreate_encoder() @@ -482,7 +480,7 @@ def _fp2array(self, fp): def _recreate_encoder(self): self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder( - self._n_permutations, self._seed + self.fpSize, self._seed ) @property @@ -497,19 +495,14 @@ def seed(self, seed): @property def n_permutations(self): - return self._n_permutations + return self.fpSize @n_permutations.setter def n_permutations(self, n_permutations): - self._n_permutations = n_permutations + self.fpSize = n_permutations # each time the n_permutations parameter is modified refresh an instance of the encoder self._recreate_encoder() - @property - def nBits(self): - # to be compliant with the requirement of the base class - return self._n_permutations - class SECFingerprintTransformer(FpsTransformer): # https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8 @@ -520,7 +513,7 @@ def __init__( isomeric: bool = False, kekulize: bool = False, min_radius: int = 1, - length: int = 2048, + fpSize: int = 2048, n_permutations: int = 0, seed: int = 0, parallel: Union[bool, int] = False, @@ -535,7 +528,7 @@ def __init__( isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False. kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - length (int, optional): The length of the folded fingerprint. Defaults to 2048. + fpSize (int, optional): The length of the folded fingerprint. Defaults to 2048. n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0. seed (int, optional): The value used to seed numpy.random. Defaults to 0. """ @@ -547,7 +540,7 @@ def __init__( self.isomeric = isomeric self.kekulize = kekulize self.min_radius = min_radius - self.length = length + self.fpSize = fpSize # Set the .n_permutations and seed without creating the encoder twice self._n_permutations = n_permutations self._seed = seed @@ -604,15 +597,15 @@ def n_permutations(self, n_permutations): self._recreate_encoder() @property - def nBits(self): + def length(self): # to be compliant with the requirement of the base class - return self.length + return self.fpSize class MorganFingerprintTransformer(FpsTransformer): def __init__( self, - nBits=2048, + fpSize=2048, radius=2, useChirality=False, useBondTypes=True, @@ -626,7 +619,7 @@ def __init__( Parameters ---------- - nBits : int, optional + fpSize : int, optional Size of the hashed fingerprint, by default 2048 radius : int, optional Radius of the fingerprint, by default 2 @@ -642,7 +635,7 @@ def __init__( super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = nBits + self.fpSize = fpSize self.radius = radius self.useChirality = useChirality self.useBondTypes = useBondTypes @@ -658,7 +651,7 @@ def _mol2fp(self, mol): return rdMolDescriptors.GetHashedMorganFingerprint( mol, int(self.radius), - nBits=int(self.nBits), + nBits=int(self.fpSize), useFeatures=bool(self.useFeatures), useChirality=bool(self.useChirality), useBondTypes=bool(self.useBondTypes), @@ -667,7 +660,7 @@ def _mol2fp(self, mol): return rdMolDescriptors.GetMorganFingerprintAsBitVect( mol, int(self.radius), - nBits=int(self.nBits), + nBits=int(self.fpSize), useFeatures=bool(self.useFeatures), useChirality=bool(self.useChirality), useBondTypes=bool(self.useBondTypes), @@ -678,7 +671,7 @@ class AvalonFingerprintTransformer(FpsTransformer): # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p def __init__( self, - nBits: int = 512, + fpSize: int = 512, isQuery: bool = False, resetVect: bool = False, bitFlags: int = 15761407, @@ -691,7 +684,7 @@ def __init__( Parameters ---------- - nBits : int, optional + fpSize : int, optional Size of the fingerprint, by default 512 isQuery : bool, optional use the fingerprint for a query structure, by default False @@ -705,7 +698,7 @@ def __init__( super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = nBits + self.fpSize = fpSize self.isQuery = isQuery self.resetVect = resetVect self.bitFlags = bitFlags @@ -715,14 +708,14 @@ def _mol2fp(self, mol): if self.useCounts: return pyAvalonTools.GetAvalonCountFP( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), isQuery=bool(self.isQuery), bitFlags=int(self.bitFlags), ) else: return pyAvalonTools.GetAvalonFP( mol, - nBits=int(self.nBits), + nBits=int(self.fpSize), isQuery=bool(self.isQuery), resetVect=bool(self.resetVect), bitFlags=int(self.bitFlags), @@ -740,7 +733,7 @@ def parallel_helper(args): return transformer._transform(X_mols) class FpsGeneratorTransformer(FpsTransformer): - + _regenerate_on_properties = () def _fp2array(self, fp): raise DeprecationWarning("Generators can directly return fingerprints") @@ -761,11 +754,19 @@ def __setstate__(self, state): super().__setstate__(state) # Re-create the unpicklable property generatort_keys = inspect.signature(self._generate_fp_generator).parameters.keys() - params = {k:state["_"+k] if "_"+k in state else state[k] for k in generatort_keys} - self._generate_fp_generator(**params) + params = [setattr(self, k, state["_"+k]) if "_"+k in state else setattr(self, k, state[k]) for k in generatort_keys] + self._generate_fp_generator() + + def __setattr__(self, name: str, value): + super().__setattr__(name, value) + if ( + not hasattr(self, "_initializing") + and name in self._regenerate_on_properties + ): + self._generate_fp_generator() @abstractmethod - def _generate_fp_generator(self,*args, **kwargs): + def _generate_fp_generator(self): raise NotImplementedError("_generate_fp_generator not implemented") @abstractmethod @@ -776,24 +777,18 @@ def _transform_mol(self, mol) -> np.array: """ raise NotImplementedError("_transform_mol not implemented") - @property - def fpSize(self): - return self.nBits - - #Scikit-Learn expects to be able to set fpSize directly on object via .set_params(), so this updates nBits used by the abstract class - @fpSize.setter - def fpSize(self, fpSize): - self.nBits = fpSize class MorganFPGeneratorTransformer(FpsGeneratorTransformer): - def __init__(self, nBits=2048, radius=2, useChirality=False, + _regenerate_on_properties = ("radius", "fpSize", "useChirality", "useFeatures", "useBondTypes") + + def __init__(self, fpSize=2048, radius=2, useChirality=False, useBondTypes=True, useFeatures=False, useCounts=False, - parallel: Union[bool, int] = False,): + parallel: Union[bool, int] = False, ): """Transform RDKit mols into Count or bit-based hashed MorganFingerprints Parameters ---------- - nBits : int, optional + fpsize : int, optional Size of the hashed fingerprint, by default 2048 radius : int, optional Radius of the fingerprint, by default 2 @@ -806,82 +801,34 @@ def __init__(self, nBits=2048, radius=2, useChirality=False, useCounts : bool, optional If toggled will create the count and not bit-based fingerprint, by default False """ + + self._initializing = True super().__init__(parallel = parallel) - self._useFeatures = useFeatures - self._useCounts = useCounts - self._useBondTypes = useBondTypes + self.fpSize = fpSize + self.radius = radius + self.useChirality = useChirality + self.useFeatures = useFeatures + self.useCounts = useCounts + self.useBondTypes = useBondTypes - self._generate_fp_generator(useFeatures=useFeatures, radius=radius, nBits=nBits, - useChirality=useChirality, useBondTypes=useBondTypes) + self._generate_fp_generator() + delattr(self, "_initializing") - def _generate_fp_generator(self, useFeatures:bool, radius:int, nBits:int, - useChirality:bool, useBondTypes:bool): + def _generate_fp_generator(self): - if useFeatures: + if self.useFeatures: atomInvariantsGenerator = GetMorganFeatureAtomInvGen() else: atomInvariantsGenerator = None - self._fpgen = GetMorganGenerator(radius=radius, - fpSize=nBits, - includeChirality=useChirality, - useBondTypes=useBondTypes, + self._fpgen = GetMorganGenerator(radius=self.radius, + fpSize=self.fpSize, + includeChirality=self.useChirality, + useBondTypes=self.useBondTypes, atomInvariantsGenerator=atomInvariantsGenerator, ) - @property - def radius(self): - return self._fpgen.GetOptions().radius - - @radius.setter - def radius(self, value:int): - self._fpgen.GetOptions().radius = value - - @property - def nBits(self): - return self._fpgen.GetOptions().fpSize - - @nBits.setter - def nBits(self, value:int): - self._fpgen.GetOptions().fpSize = value - - @property - def useChirality(self): - return self._fpgen.GetOptions().includeChirality - - @useChirality.setter - def useChirality(self, value:bool): - self._fpgen.GetOptions().includeChirality = value - - @property - def useFeatures(self): - return self._useFeatures - - @useFeatures.setter - def useFeatures(self, value:bool): - self._useFeatures = value - self._generate_fp_generator(useFeatures=self.useFeatures, radius=self.radius, nBits=self.nBits, - useChirality=self.useChirality, useBondTypes=self.useBondTypes) - - @property - def useBondTypes(self): - return self._useBondTypes - - @useBondTypes.setter - def useBondTypes(self, value:bool): - self._useBondTypes = value - self._generate_fp_generator(useFeatures=self.useFeatures, radius=self.radius, nBits=self.nBits, - useChirality=self.useChirality, useBondTypes=self.useBondTypes) - - @property - def useCounts(self): - return self._useCounts - - @useCounts.setter - def useCounts(self, value:bool): - self._useCounts = value - def _transform_mol(self, mol) -> np.array: if self.useCounts: return self._fpgen.GetCountFingerprintAsNumPy(mol) @@ -890,221 +837,81 @@ def _transform_mol(self, mol) -> np.array: class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ("fpSize", "includeChirality", "targetSize") + def __init__(self, targetSize:int = 4, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, confId=-1, - includeChirality:bool = False, nBits=2048, + includeChirality:bool = False, fpSize:int=2048, useCounts:bool=False, parallel: Union[bool, int] = False): + self._initializing = True super().__init__(parallel=parallel) - self._fromAtoms = fromAtoms - self._ignoreAtoms = ignoreAtoms - self._atomInvariants = atomInvariants - self._confId = confId - self._useCounts = useCounts - self._targetSize = targetSize - self._generate_fp_generator(targetSize=targetSize, includeChirality=includeChirality, - nBits=nBits) - - @property - def useCounts(self): - return self._useCounts - - @useCounts.setter - def useCounts(self, value:bool): - self._useCounts = value - - @property - def confId(self): - return self._confId - - @confId.setter - def confId(self, value: int): - self._confId = value - - @property - def fromAtoms(self): - return self._fromAtoms - - @fromAtoms.setter - def fromAtoms(self, value: int): - self._fromAtoms = value - - @property - def ignoreAtoms(self): - return self._ignoreAtoms - - @ignoreAtoms.setter - def ignoreAtoms(self, value: int): - self._ignoreAtoms = value - - @property - def atomInvariants(self): - return self._atomInvariants - - @atomInvariants.setter - def atomInvariants(self, value: int): - self._atomInvariants = value - - @property - def nBits(self): - return self._fpgen.GetOptions().fpSize - - @nBits.setter - def nBits(self, value: int): - self._fpgen.GetOptions().fpSize = value - - @property - def includeChirality(self): - return self._fpgen.GetOptions().includeChirality + self.fpSize = fpSize + self.includeChirality = includeChirality + self.targetSize = targetSize - @includeChirality.setter - def includeChirality(self, value:int): - self._fpgen.GetOptions().includeChirality = value + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.confId = confId + self.useCounts = useCounts - @property - def targetSize(self): - return self._targetSize + self._generate_fp_generator() + delattr(self, "_initializing") - @targetSize.setter - def targetSize(self, value:int): - self._targetSize = value - self._generate_fp_generator(targetSize=value, - includeChirality=self.includeChirality, - nBits=self.nBits) - def _generate_fp_generator(self, targetSize: int, includeChirality: bool, nBits: int): - self._fpgen = GetTopologicalTorsionGenerator(torsionAtomCount=targetSize, includeChirality=includeChirality, - fpSize=nBits) + def _generate_fp_generator(self): + self._fpgen = GetTopologicalTorsionGenerator(torsionAtomCount=self.targetSize, includeChirality=self.includeChirality, + fpSize=self.fpSize) def _transform_mol(self, mol) -> np.array: if self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) else: - return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ("fpSize", "includeChirality", "use2D", "minLength", "maxLength") + def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, - includeChirality:bool = False, use2D:bool = True, confId:int = -1, nBits=2048, + includeChirality:bool = False, use2D:bool = True, confId:int = -1, fpSize:int=2048, useCounts:bool=False, parallel: Union[bool, int] = False,): + self._initializing = True super().__init__(parallel = parallel) - self._useCounts= useCounts - self._confId = confId - self._fromAtoms = fromAtoms - self._ignoreAtoms = ignoreAtoms - self._atomInvariants = atomInvariants - self._minLength = minLength - self._maxLength = maxLength - - self._generate_fp_generator(minLength=minLength, maxLength=maxLength, - includeChirality=includeChirality, use2D=use2D, - nBits=nBits) - - @property - def useCounts(self): - return self._useCounts - - @useCounts.setter - def useCounts(self, value:bool): - self._useCounts = value - - @property - def confId(self): - return self._confId - - @confId.setter - def confId(self, value:int): - self._confId = value - - @property - def fromAtoms(self): - return self._fromAtoms - - @fromAtoms.setter - def fromAtoms(self, value:int): - self._fromAtoms = value - - @property - def ignoreAtoms(self): - return self._ignoreAtoms - - @ignoreAtoms.setter - def ignoreAtoms(self, value:int): - self._ignoreAtoms = value - - @property - def atomInvariants(self): - return self._atomInvariants - - @atomInvariants.setter - def atomInvariants(self, value:int): - self._atomInvariants = value - - @property - def minLength(self): - return self._minLength - - @minLength.setter - def minLength(self, value: int): - self._minLength = value - self._generate_fp_generator(minLength=value, maxLength=self.maxLength, - includeChirality=self.includeChirality, use2D=self.use2D, - nBits=self.nBits) - - @property - def maxLength(self): - return self._maxLength - - @maxLength.setter - def maxLength(self, value: int): - self._maxLength = value - self._generate_fp_generator(minLength=self.minLength, maxLength=value, - includeChirality=self.includeChirality, use2D=self.use2D, - nBits=self.nBits) - - @property - def includeChirality(self): - return self._fpgen.GetOptions().includeChirality - - @includeChirality.setter - def includeChirality(self, value: bool): - self._fpgen.GetOptions().includeChirality = value - - @property - def use2D(self): - return self._fpgen.GetOptions().use2D - - @use2D.setter - def use2D(self, value: bool): - self._fpgen.GetOptions().use2D = value - - @property - def nBits(self): - return self._fpgen.GetOptions().fpSize + self.fpSize = fpSize + self.use2D = use2D + self.includeChirality = includeChirality + self.minLength = minLength + self.maxLength = maxLength - @nBits.setter - def nBits(self, value: int): - self._fpgen.GetOptions().fpSize = value + self.useCounts= useCounts + self.confId = confId + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants - @property - def nBitsPerEntry(self): - return self._fpgen.GetOptions().numBitsPerFeature + self._generate_fp_generator() + delattr(self, "_initializing") - def _generate_fp_generator(self, minLength, maxLength, includeChirality, use2D, nBits): - self._fpgen = GetAtomPairGenerator(minDistance=minLength, maxDistance=maxLength, - includeChirality=includeChirality, - use2D=use2D, fpSize=nBits) + def _generate_fp_generator(self): + self._fpgen = GetAtomPairGenerator(minDistance=self.minLength, maxDistance=self.maxLength, + includeChirality=self.includeChirality, + use2D=self.use2D, fpSize=self.fpSize) def _transform_mol(self, mol) -> np.array: if self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) else: - return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self._ignoreAtoms, customAtomInvariants=self._atomInvariants) + return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) class RDKitFPGeneratorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ("minPath", "maxPath", "useHs", "branchedPaths", "useBondOrder", "countSimulation", "fpSize", "countBounds", + "numBitsPerFeature") + def __init__(self, minPath:int = 1, maxPath:int =7, useHs:bool = True, branchedPaths:bool = True, useBondOrder:bool = True, countSimulation:bool = False, countBounds = None, - nBits:int = 2048, numBitsPerFeature:int = 2, + fpSize:int = 2048, numBitsPerFeature:int = 2, useCounts:bool = False, parallel: Union[bool, int] = False ): """Calculates the RDKit fingerprints @@ -1125,88 +932,27 @@ def __init__(self, minPath:int = 1, maxPath:int =7, useHs:bool = True, branchedP if set, use count simulation while generating the fingerprint, by default False countBounds : _type_, optional boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None - nBits : int, optional + fpSize : int, optional size of the generated fingerprint, does not affect the sparse versions, by default 2048 numBitsPerFeature : int, optional the number of bits set per path/subgraph found, by default 2 """ + self._initializing = True super().__init__(parallel = parallel) - self._useCounts = useCounts - self._countBounds = countBounds - self._generate_fp_generator( minPath=minPath, maxPath=maxPath, useHs=useHs, - branchedPaths=branchedPaths,useBondOrder=useBondOrder, - countSimulation=countSimulation, fpSize=nBits, - countBounds=countBounds, numBitsPerFeature=numBitsPerFeature) + self.minPath = minPath + self.maxPath = maxPath + self.useHs = useHs + self.branchedPaths = branchedPaths + self.useBondOrder = useBondOrder + self.countSimulation = countSimulation + self.fpSize = fpSize + self.numBitsPerFeature = numBitsPerFeature + self.countBounds = countBounds + self.useCounts = useCounts - @property - def nBits(self): - return self._fpgen.GetOptions().fpSize - @nBits.setter - def nBits(self, value: int): - self._fpgen.GetOptions().fpSize = value - @property - def minPath(self): - return self._fpgen.GetOptions().minPath - @minPath.setter - def minPath(self, value:int): - self._fpgen.GetOptions().minPath = value - @property - def maxPath(self): - return self._fpgen.GetOptions().maxPath - @maxPath.setter - def maxPath(self, value:int): - self._fpgen.GetOptions().maxPath = value - @property - def useHs(self): - return self._fpgen.GetOptions().useHs - @useHs.setter - def useHs(self, value:bool): - self._fpgen.GetOptions().useHs = value - @property - def branchedPaths(self): - return self._fpgen.GetOptions().branchedPaths - @branchedPaths.setter - def branchedPaths(self, value:int): - self._fpgen.GetOptions().branchedPaths = value - @property - def useBondOrder(self): - return self._fpgen.GetOptions().useBondOrder - @useBondOrder.setter - def useBondOrder(self, value:int): - self._fpgen.GetOptions().useBondOrder = value - @property - def numBitsPerFeature(self): - return self._fpgen.GetOptions().numBitsPerFeature - @numBitsPerFeature.setter - def numBitsPerFeature(self, value:int): - self._fpgen.GetOptions().numBitsPerFeature = value - @property - def countBounds(self): - return self._countBounds - @countBounds.setter - def countBounds(self, value:int): - self._countBounds = value - self._generate_fp_generator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, - branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, - countSimulation=self.countSimulation, fpSize=self.nBits, - countBounds=value, numBitsPerFeature=self.numBitsPerFeature) - @property - def countSimulation(self): - return self._countBounds - @countSimulation.setter - def countSimulation(self, value: bool): - self._countSimulation=value - self._generate_fp_generator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, - branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, - countSimulation=value, fpSize=self.nBits, - countBounds=self.countBounds, numBitsPerFeature=self.numBitsPerFeature) - @property - def useCounts(self): - return self._useCounts - @useCounts.setter - def useCounts(self, value:bool): - self._useCounts = value + self._generate_fp_generator() + delattr(self, "_initializing") def _transform_mol(self, mol) -> np.array: if self.useCounts: @@ -1214,10 +960,8 @@ def _transform_mol(self, mol) -> np.array: else: return self._fpgen.GetFingerprintAsNumPy(mol) - def _generate_fp_generator(self, minPath, maxPath, useHs, branchedPaths, - useBondOrder, countSimulation, fpSize, countBounds, - numBitsPerFeature): - self._fpgen = GetRDKitFPGenerator(minPath=minPath, maxPath=maxPath, useHs=useHs, - branchedPaths=branchedPaths,useBondOrder=useBondOrder, - countSimulation=countSimulation, fpSize=fpSize, - countBounds=countBounds, numBitsPerFeature=numBitsPerFeature) + def _generate_fp_generator(self): + self._fpgen = GetRDKitFPGenerator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, + branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, + countSimulation=self.countSimulation, fpSize=self.fpSize, + countBounds=self.countBounds, numBitsPerFeature=self.numBitsPerFeature) diff --git a/tests/test_fptransformers.py b/tests/test_fptransformers.py index 9a9c27a..4ad1e9d 100644 --- a/tests/test_fptransformers.py +++ b/tests/test_fptransformers.py @@ -131,34 +131,20 @@ def test_set_params( ]: params = t.get_params() # change extracted dictionary - params["nBits"] = 4242 + params["fpSize"] = 4242 # change params in transformer - t.set_params(nBits=4242) + t.set_params(fpSize=4242) # get parameters as dictionary and assert that it is the same params_2 = t.get_params() assert all([params[key] == params_2[key] for key in params.keys()]) - for t in [rdkit_transformer]: + for t in [rdkit_transformer, secfp_transformer, mhfp_transformer]: params = t.get_params() params["fpSize"] = 4242 t.set_params(fpSize=4242) params_2 = t.get_params() assert all([params[key] == params_2[key] for key in params.keys()]) - for t in [secfp_transformer]: - params = t.get_params() - params["length"] = 4242 - t.set_params(length=4242) - params_2 = t.get_params() - assert all([params[key] == params_2[key] for key in params.keys()]) - - for t in [mhfp_transformer]: - params = t.get_params() - params["n_permutations"] = 4242 - t.set_params(n_permutations=4242) - params_2 = t.get_params() - assert all([params[key] == params_2[key] for key in params.keys()]) - def test_transform( mols_container, @@ -183,21 +169,13 @@ def test_transform( avalon_transformer, ]: params = t.get_params() + print(type(t), params) fps = t.transform(mols_container) # Assert that the same length of input and output assert len(fps) == len(mols_container) # assert that the size of the fingerprint is the expected size - if ( - type(t) == type(maccs_transformer) - or type(t) == type(secfp_transformer) - or type(t) == type(mhfp_transformer) - ): - fpsize = t.nBits - elif type(t) == type(rdkit_transformer): - fpsize = params["fpSize"] - else: - fpsize = params["nBits"] + fpsize = params["fpSize"] assert len(fps[0]) == fpsize @@ -231,16 +209,7 @@ def test_transform_parallel( assert len(fps) == len(mols_container) # assert that the size of the fingerprint is the expected size - if ( - type(t) == type(maccs_transformer) - or type(t) == type(secfp_transformer) - or type(t) == type(mhfp_transformer) - ): - fpsize = t.nBits - elif type(t) == type(rdkit_transformer): - fpsize = params["fpSize"] - else: - fpsize = params["nBits"] + fpsize = params["fpSize"] assert len(fps[0]) == fpsize @@ -306,7 +275,7 @@ def assert_transformer_set_params(tr_class, new_params, mols_list): def test_morgan_set_params(chiral_mols_list): new_params = { - "nBits": 1024, + "fpSize": 1024, "radius": 1, "useBondTypes": False, # TODO, why doesn't this change the FP? "useChirality": True, @@ -328,7 +297,7 @@ def test_atompairs_set_params(chiral_mols_list): "includeChirality": True, "maxLength": 3, "minLength": 3, - "nBits": 1024, + "fpSize": 1024, "nBitsPerEntry": 3, #'use2D': True, #TODO, understand why this can't be set different "useCounts": True, @@ -344,7 +313,7 @@ def test_topologicaltorsion_set_params(chiral_mols_list): #'fromAtoms': 0, #'ignoreAtoms': 0, #'includeChirality': True, #TODO, figure out why this setting seems to give same FP wheter toggled or not - "nBits": 1024, + "fpSize": 1024, "nBitsPerEntry": 3, "targetSize": 5, "useCounts": True, @@ -376,7 +345,7 @@ def test_SECFingerprintTransformer(chiral_mols_list): new_params = { "isomeric": True, "kekulize": True, - "length": 1048, + "fpSize": 1048, "min_radius": 2, #'n_permutations': 2, # The SECFp is not using this setting "radius": 2, @@ -395,7 +364,7 @@ def test_MHFingerprintTransformer(chiral_mols_list): "isomeric": True, "kekulize": True, "min_radius": 2, - "n_permutations": 4096, + "fpSize": 4096, "seed": 44, } assert_transformer_set_params( @@ -405,7 +374,7 @@ def test_MHFingerprintTransformer(chiral_mols_list): def test_AvalonFingerprintTransformer(chiral_mols_list): new_params = { - "nBits": 1024, + "fpSize": 1024, "isQuery": True, # 'resetVect': True, #TODO: this doesn't change the FP "bitFlags": 32767, diff --git a/tests/test_fptransformersgenerator.py b/tests/test_fptransformersgenerator.py index f11ea95..81da19c 100644 --- a/tests/test_fptransformersgenerator.py +++ b/tests/test_fptransformersgenerator.py @@ -60,9 +60,9 @@ def test_set_params(transformer_class): transformer = transformer_class() params = transformer.get_params() #change extracted dictionary - params['nBits'] = 4242 + params['fpSize'] = 4242 #change params in transformer - transformer.set_params(nBits = 4242) + transformer.set_params(fpSize = 4242) # get parameters as dictionary and assert that it is the same params_2 = transformer.get_params() assert all([ params[key] == params_2[key] for key in params.keys()]) @@ -76,7 +76,7 @@ def test_transform(mols_container, transformer_class): #Assert that the same length of input and output assert len(fps) == len(mols_container) - fpsize = params['nBits'] + fpsize = params['fpSize'] assert len(fps[0]) == fpsize @@ -90,7 +90,7 @@ def test_transform_parallel(mols_container, transformer_class): #Assert that the same length of input and output assert len(fps) == len(mols_container) - fpsize = params['nBits'] + fpsize = params['fpSize'] assert len(fps[0]) == fpsize @@ -134,7 +134,7 @@ def assert_transformer_set_params(transfomer, new_params, mols_list): def test_morgan_set_params(chiral_mols_list): - new_params = {'nBits': 1024, + new_params = {'fpSize': 1024, 'radius': 1, 'useBondTypes': False,# TODO, why doesn't this change the FP? 'useChirality': True, @@ -153,7 +153,7 @@ def test_atompairs_set_params(chiral_mols_list): 'includeChirality': True, 'maxLength': 3, 'minLength': 3, - 'nBits': 1024, + 'fpSize': 1024, #'nBitsPerEntry': 3, #Todo: not setable with the generators? #'use2D': True, #TODO, understand why this can't be set different 'useCounts': True} @@ -166,7 +166,7 @@ def test_topologicaltorsion_set_params(chiral_mols_list): #'fromAtoms': 0, #'ignoreAtoms': 0, #'includeChirality': True, #TODO, figure out why this setting seems to give same FP wheter toggled or not - 'nBits': 1024, + 'fpSize': 1024, #'nBitsPerEntry': 3, #Todo: not setable with the generators? 'targetSize': 5, 'useCounts': True} @@ -178,7 +178,7 @@ def test_RDKitFPTransformer(chiral_mols_list): #'branchedPaths': False, #'countBounds': 0, #TODO: What does this do? 'countSimulation': True, - 'nBits': 1024, + 'fpSize': 1024, 'maxPath': 3, 'minPath': 2, 'numBitsPerFeature': 3, diff --git a/tests/test_safeinferencemode.py b/tests/test_safeinferencemode.py index 921cc0f..c9b4ca1 100644 --- a/tests/test_safeinferencemode.py +++ b/tests/test_safeinferencemode.py @@ -104,12 +104,12 @@ def test_safeinference_wrapper_pandas_output( result = smiles_pipeline[:-1].fit_transform(X_smiles) assert isinstance(result, pd.DataFrame) assert result.shape[0] == len(X_smiles) - assert result.shape[1] == smiles_pipeline.named_steps["FP"].nBits + assert result.shape[1] == smiles_pipeline.named_steps["FP"].fpSize @skip_pandas_output_test def test_safeinference_wrapper_get_feature_names_out(smiles_pipeline): # Get feature names from the FP step feature_names = smiles_pipeline.named_steps["FP"].get_feature_names_out() - assert len(feature_names) == smiles_pipeline.named_steps["FP"].nBits + assert len(feature_names) == smiles_pipeline.named_steps["FP"].fpSize assert all(isinstance(name, str) for name in feature_names) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index 143ecd3..fa65504 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -96,11 +96,12 @@ def test_transformer_pandas_output(SLC6A4_subset, pandas_output): X_transformed = pipeline.transform(X_smiles) assert isinstance(X_transformed, pd.DataFrame), f"the output of {FP_name} is not a pandas dataframe" assert X_transformed.shape[0] == len(X_smiles), f"the number of rows in the output of {FP_name} is not equal to the number of samples" - assert len(X_transformed.columns) == pipeline.named_steps["FP"].nBits, f"the number of columns in the output of {FP_name} is not equal to the number of bits" + assert len(X_transformed.columns) == pipeline.named_steps["FP"].fpSize, f"the number of columns in the output of {FP_name} is not equal to the number of bits" print(f"\nfitting and transforming completed") - except: + except Exception as err: print(f"\n!!!! FAILED pipeline fitting and transforming for {FP_name} with useCounts={useCounts}") + print("\n".join(err.args)) failed_FP.append(FP_name) pass @@ -136,7 +137,7 @@ def test_combined_transformer_pandas_out(combined_transformer, SLC6A4_subset_wit pipeline_skmol = combined_transformer.named_transformers_["pipeline-1"] featurizer_skmol = pipeline_skmol[-1] if isinstance(featurizer_skmol, FpsTransformer): - n_skmol_features = featurizer_skmol.nBits + n_skmol_features = featurizer_skmol.fpSize elif isinstance(featurizer_skmol, MolecularDescriptorTransformer): n_skmol_features = len(featurizer_skmol.desc_list) else: From d420cbde3ec5222c4ba36a4e3511b567101310a6 Mon Sep 17 00:00:00 2001 From: riesben Date: Thu, 14 Nov 2024 22:48:15 +0100 Subject: [PATCH 6/8] Remodelling transformers: - moving code around for easier oversight - adding nicer dpecrecation warnings. --- scikit_mol/fingerprints.py | 450 ++++++++++++++++++------------------- 1 file changed, 223 insertions(+), 227 deletions(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index a96e65b..a6f90bc 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -2,9 +2,8 @@ import multiprocessing import re import inspect -from typing import Callable +from warnings import warn from typing import Union -from rdkit import Chem from rdkit import DataStructs # from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect @@ -33,7 +32,6 @@ r"^(?P\w+)FingerprintTransformer$" ) - class FpsTransformer(ABC, BaseEstimator, TransformerMixin): def __init__( self, @@ -220,200 +218,6 @@ def _mol2fp(self, mol): return rdMolDescriptors.GetMACCSKeysFingerprint(mol) -class RDKitFingerprintTransformer(FpsTransformer): - def __init__( - self, - minPath: int = 1, - maxPath: int = 7, - useHs: bool = True, - branchedPaths: bool = True, - useBondOrder: bool = True, - countSimulation: bool = False, - countBounds=None, - fpSize: int = 2048, - numBitsPerFeature: int = 2, - atomInvariantsGenerator=None, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Calculates the RDKit fingerprints - - Parameters - ---------- - minPath : int, optional - the minimum path length (in bonds) to be included, by default 1 - maxPath : int, optional - the maximum path length (in bonds) to be included, by default 7 - useHs : bool, optional - toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True - branchedPaths : bool, optional - toggles generation of branched subgraphs, not just linear paths, by default True - useBondOrder : bool, optional - toggles inclusion of bond orders in the path hashes, by default True - countSimulation : bool, optional - if set, use count simulation while generating the fingerprint, by default False - countBounds : _type_, optional - boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None - fpSize : int, optional - size of the generated fingerprint, does not affect the sparse versions, by default 2048 - numBitsPerFeature : int, optional - the number of bits set per path/subgraph found, by default 2 - atomInvariantsGenerator : _type_, optional - atom invariants to be used during fingerprint generation, by default None - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minPath = minPath - self.maxPath = maxPath - self.useHs = useHs - self.branchedPaths = branchedPaths - self.useBondOrder = useBondOrder - self.countSimulation = countSimulation - self.countBounds = countBounds - self.fpSize = fpSize - self.numBitsPerFeature = numBitsPerFeature - self.atomInvariantsGenerator = atomInvariantsGenerator - - - def _mol2fp(self, mol): - generator = rdFingerprintGenerator.GetRDKitFPGenerator( - minPath=int(self.minPath), - maxPath=int(self.maxPath), - useHs=bool(self.useHs), - branchedPaths=bool(self.branchedPaths), - useBondOrder=bool(self.useBondOrder), - countSimulation=bool(self.countSimulation), - countBounds=bool(self.countBounds), - fpSize=int(self.fpSize), - numBitsPerFeature=int(self.numBitsPerFeature), - atomInvariantsGenerator=self.atomInvariantsGenerator, - ) - return generator.GetFingerprint(mol) - - -class AtomPairFingerprintTransformer(FpsTransformer): - def __init__( - self, - minLength: int = 1, - maxLength: int = 30, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - nBitsPerEntry: int = 4, - includeChirality: bool = False, - use2D: bool = True, - confId: int = -1, - fpSize=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minLength = minLength - self.maxLength = maxLength - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.includeChirality = includeChirality - self.use2D = use2D - self.confId = confId - self.fpSize = fpSize - self.nBitsPerEntry = nBitsPerEntry - self.useCounts = useCounts - - print("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") - #raise DeprecationWarning("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") - - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedAtomPairFingerprint( - mol, - nBits=int(self.fpSize), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), - ) - else: - return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( - mol, - nBits=int(self.fpSize), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - nBitsPerEntry=int(self.nBitsPerEntry), - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), - ) - - -class TopologicalTorsionFingerprintTransformer(FpsTransformer): - def __init__( - self, - targetSize: int = 4, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - includeChirality: bool = False, - nBitsPerEntry: int = 4, - fpSize=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.targetSize = targetSize - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.includeChirality = includeChirality - self.nBitsPerEntry = nBitsPerEntry - self.fpSize = fpSize - self.useCounts = useCounts - print("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!") - #raise DeprecationWarning("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!") - - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( - mol, - nBits=int(self.fpSize), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - ) - else: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( - mol, - nBits=int(self.fpSize), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - nBitsPerEntry=int(self.nBitsPerEntry), - ) - - class MHFingerprintTransformer(FpsTransformer): def __init__( self, @@ -602,6 +406,61 @@ def length(self): return self.fpSize +class AvalonFingerprintTransformer(FpsTransformer): + # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p + def __init__( + self, + fpSize: int = 512, + isQuery: bool = False, + resetVect: bool = False, + bitFlags: int = 15761407, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + """Transform RDKit mols into Count or bit-based Avalon Fingerprints + + Parameters + ---------- + fpSize : int, optional + Size of the fingerprint, by default 512 + isQuery : bool, optional + use the fingerprint for a query structure, by default False + resetVect : bool, optional + reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) + bitFlags : int, optional + Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 + useCounts : bool, optional + If toggled will create the count and not bit-based fingerprint, by default False + """ + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.fpSize = fpSize + self.isQuery = isQuery + self.resetVect = resetVect + self.bitFlags = bitFlags + self.useCounts = useCounts + + def _mol2fp(self, mol): + if self.useCounts: + return pyAvalonTools.GetAvalonCountFP( + mol, + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + bitFlags=int(self.bitFlags), + ) + else: + return pyAvalonTools.GetAvalonFP( + mol, + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + resetVect=bool(self.resetVect), + bitFlags=int(self.bitFlags), + ) + + class MorganFingerprintTransformer(FpsTransformer): def __init__( self, @@ -642,9 +501,7 @@ def __init__( self.useFeatures = useFeatures self.useCounts = useCounts - print("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!") - #raise DeprecationWarning("MorganFingerprintTransformer will be replace by MorganFPGeneratorTransformer, due to changes in RDKit!") - + warn("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) def _mol2fp(self, mol): if self.useCounts: @@ -667,58 +524,196 @@ def _mol2fp(self, mol): ) -class AvalonFingerprintTransformer(FpsTransformer): - # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p +class RDKitFingerprintTransformer(FpsTransformer): def __init__( self, - fpSize: int = 512, - isQuery: bool = False, - resetVect: bool = False, - bitFlags: int = 15761407, - useCounts: bool = False, + minPath: int = 1, + maxPath: int = 7, + useHs: bool = True, + branchedPaths: bool = True, + useBondOrder: bool = True, + countSimulation: bool = False, + countBounds=None, + fpSize: int = 2048, + numBitsPerFeature: int = 2, + atomInvariantsGenerator=None, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, dtype: np.dtype = np.int8, ): - """Transform RDKit mols into Count or bit-based Avalon Fingerprints + """Calculates the RDKit fingerprints Parameters ---------- + minPath : int, optional + the minimum path length (in bonds) to be included, by default 1 + maxPath : int, optional + the maximum path length (in bonds) to be included, by default 7 + useHs : bool, optional + toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True + branchedPaths : bool, optional + toggles generation of branched subgraphs, not just linear paths, by default True + useBondOrder : bool, optional + toggles inclusion of bond orders in the path hashes, by default True + countSimulation : bool, optional + if set, use count simulation while generating the fingerprint, by default False + countBounds : _type_, optional + boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None fpSize : int, optional - Size of the fingerprint, by default 512 - isQuery : bool, optional - use the fingerprint for a query structure, by default False - resetVect : bool, optional - reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) - bitFlags : int, optional - Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 - useCounts : bool, optional - If toggled will create the count and not bit-based fingerprint, by default False + size of the generated fingerprint, does not affect the sparse versions, by default 2048 + numBitsPerFeature : int, optional + the number of bits set per path/subgraph found, by default 2 + atomInvariantsGenerator : _type_, optional + atom invariants to be used during fingerprint generation, by default None """ super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) + self.minPath = minPath + self.maxPath = maxPath + self.useHs = useHs + self.branchedPaths = branchedPaths + self.useBondOrder = useBondOrder + self.countSimulation = countSimulation + self.countBounds = countBounds self.fpSize = fpSize - self.isQuery = isQuery - self.resetVect = resetVect - self.bitFlags = bitFlags + self.numBitsPerFeature = numBitsPerFeature + self.atomInvariantsGenerator = atomInvariantsGenerator + + warn("RDKitFingerprintTransformer will be replace by RDKitFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) + + + def _mol2fp(self, mol): + generator = rdFingerprintGenerator.GetRDKitFPGenerator( + minPath=int(self.minPath), + maxPath=int(self.maxPath), + useHs=bool(self.useHs), + branchedPaths=bool(self.branchedPaths), + useBondOrder=bool(self.useBondOrder), + countSimulation=bool(self.countSimulation), + countBounds=bool(self.countBounds), + fpSize=int(self.fpSize), + numBitsPerFeature=int(self.numBitsPerFeature), + atomInvariantsGenerator=self.atomInvariantsGenerator, + ) + return generator.GetFingerprint(mol) + + +class AtomPairFingerprintTransformer(FpsTransformer): + def __init__( + self, + minLength: int = 1, + maxLength: int = 30, + fromAtoms=0, + ignoreAtoms=0, + atomInvariants=0, + nBitsPerEntry: int = 4, + includeChirality: bool = False, + use2D: bool = True, + confId: int = -1, + fpSize=2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.minLength = minLength + self.maxLength = maxLength + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.includeChirality = includeChirality + self.use2D = use2D + self.confId = confId + self.fpSize = fpSize + self.nBitsPerEntry = nBitsPerEntry self.useCounts = useCounts + warn("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) + def _mol2fp(self, mol): if self.useCounts: - return pyAvalonTools.GetAvalonCountFP( + return rdMolDescriptors.GetHashedAtomPairFingerprint( mol, nBits=int(self.fpSize), - isQuery=bool(self.isQuery), - bitFlags=int(self.bitFlags), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), ) else: - return pyAvalonTools.GetAvalonFP( + return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( mol, nBits=int(self.fpSize), - isQuery=bool(self.isQuery), - resetVect=bool(self.resetVect), - bitFlags=int(self.bitFlags), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + nBitsPerEntry=int(self.nBitsPerEntry), + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), + ) + + +class TopologicalTorsionFingerprintTransformer(FpsTransformer): + def __init__( + self, + targetSize: int = 4, + fromAtoms=0, + ignoreAtoms=0, + atomInvariants=0, + includeChirality: bool = False, + nBitsPerEntry: int = 4, + fpSize=2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.targetSize = targetSize + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.includeChirality = includeChirality + self.nBitsPerEntry = nBitsPerEntry + self.fpSize = fpSize + self.useCounts = useCounts + + warn("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!", DeprecationWarning) + + def _mol2fp(self, mol): + if self.useCounts: + return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( + mol, + nBits=int(self.fpSize), + targetSize=int(self.targetSize), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + ) + else: + return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + mol, + nBits=int(self.fpSize), + targetSize=int(self.targetSize), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + nBitsPerEntry=int(self.nBitsPerEntry), ) @@ -732,6 +727,7 @@ def parallel_helper(args): transformer = getattr(fingerprints, classname)(**parameters) return transformer._transform(X_mols) + class FpsGeneratorTransformer(FpsTransformer): _regenerate_on_properties = () From f7d2958b91f24930288f3c5a68c7bdb23a80a633 Mon Sep 17 00:00:00 2001 From: riesben Date: Fri, 15 Nov 2024 07:03:12 +0100 Subject: [PATCH 7/8] Remodelling transformers: - add new generator functions to transformer test --- tests/test_transformers.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/test_transformers.py b/tests/test_transformers.py index fa65504..b96d421 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -15,9 +15,11 @@ from sklearn.ensemble import RandomForestRegressor from scikit_mol.conversions import SmilesToMolTransformer from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT -from scikit_mol.fingerprints import FpsTransformer, MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, \ - TopologicalTorsionFingerprintTransformer, MorganFingerprintTransformer, SECFingerprintTransformer, \ - MHFingerprintTransformer, AvalonFingerprintTransformer +from scikit_mol.fingerprints import (FpsTransformer, MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, + TopologicalTorsionFingerprintTransformer, MorganFingerprintTransformer, SECFingerprintTransformer, + MHFingerprintTransformer, AvalonFingerprintTransformer, MorganFPGeneratorTransformer, + RDKitFPGeneratorTransformer, AtomPairFPGeneratorTransformer, TopologicalTorsionFPGeneatorTransformer) + from scikit_mol.descriptors import MolecularDescriptorTransformer from fixtures import SLC6A4_subset, SLC6A4_subset_with_cddd, skip_pandas_output_test, mols_container, featurizer, combined_transformer @@ -29,6 +31,9 @@ def test_transformer(SLC6A4_subset): X_train, X_test = X_smiles[:128], X_smiles[128:] Y_train, Y_test = Y[:128], Y[128:] + MorganFPGeneratorTransformer, + RDKitFPGeneratorTransformer, AtomPairFPGeneratorTransformer, TopologicalTorsionFPGeneatorTransformer + # run FP with default parameters except when useCounts can be given as an argument FP_dict = {"MACCSTransformer": [MACCSKeysFingerprintTransformer, None], "RDKitFPTransformer": [RDKitFingerprintTransformer, None], @@ -40,7 +45,15 @@ def test_transformer(SLC6A4_subset): "MorganTransformer useCounts": [MorganFingerprintTransformer, True], "SECFingerprintTransformer": [SECFingerprintTransformer, None], "MHFingerprintTransformer": [MHFingerprintTransformer, None], - 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None]} + 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, True], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, False], + 'RDKitFPGeneratorTransformer': [RDKitFPGeneratorTransformer, None], + 'AtomPairFPGeneratorTransformer': [AtomPairFPGeneratorTransformer, True], + 'AtomPairFPGeneratorTransformer': [ AtomPairFPGeneratorTransformer, False], + 'TopologicalTorsionFPGeneatorTransformer': [TopologicalTorsionFPGeneatorTransformer, True], + 'TopologicalTorsionFPGeneatorTransformer': [ TopologicalTorsionFPGeneatorTransformer, False], + } # fit on toy data and print train/test score if successful or collect the failed FP failed_FP = [] @@ -81,7 +94,22 @@ def test_transformer_pandas_output(SLC6A4_subset, pandas_output): "MorganTransformer useCounts": [MorganFingerprintTransformer, True], "SECFingerprintTransformer": [SECFingerprintTransformer, None], "MHFingerprintTransformer": [MHFingerprintTransformer, None], - 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None]} + 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, + True], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, + False], + 'RDKitFPGeneratorTransformer': [RDKitFPGeneratorTransformer, + None], + 'AtomPairFPGeneratorTransformer': [ + AtomPairFPGeneratorTransformer, True], + 'AtomPairFPGeneratorTransformer': [ + AtomPairFPGeneratorTransformer, False], + 'TopologicalTorsionFPGeneatorTransformer': [ + TopologicalTorsionFPGeneatorTransformer, True], + 'TopologicalTorsionFPGeneatorTransformer': [ + TopologicalTorsionFPGeneatorTransformer, False], + } # fit on toy data and check that the output is a pandas dataframe failed_FP = [] From 5ae6a2b326f9a47fcd6174be0b84cae9eeaccab4 Mon Sep 17 00:00:00 2001 From: riesben Date: Fri, 15 Nov 2024 07:16:15 +0100 Subject: [PATCH 8/8] Remodelling transformers: - add DeprecationWarnings to not harmonized fpSize bits. --- scikit_mol/fingerprints.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index a6f90bc..bea43e0 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -45,6 +45,17 @@ def __init__( self.safe_inference_mode = safe_inference_mode self.dtype = dtype + + @property + def nBits(self): + warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + return self.fpSize + + @nBits.setter + def nBits(self, nBits): + warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + self.fpSize = nBits + def _get_column_prefix(self) -> str: matched = _PATTERN_FINGERPRINT_TRANSFORMER.match(type(self).__name__) if matched: @@ -299,10 +310,12 @@ def seed(self, seed): @property def n_permutations(self): + warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) return self.fpSize @n_permutations.setter def n_permutations(self, n_permutations): + warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) self.fpSize = n_permutations # each time the n_permutations parameter is modified refresh an instance of the encoder self._recreate_encoder() @@ -402,7 +415,7 @@ def n_permutations(self, n_permutations): @property def length(self): - # to be compliant with the requirement of the base class + warn("length will be replace by fpSize, due to changes harmonization!", DeprecationWarning) return self.fpSize