diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index f044a06..bea43e0 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -1,8 +1,9 @@ from multiprocessing import Pool, get_context import multiprocessing import re +import inspect +from warnings import warn from typing import Union -from rdkit import Chem from rdkit import DataStructs # from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect @@ -11,6 +12,11 @@ from rdkit.Chem import rdMHFPFingerprint from rdkit.Avalon import pyAvalonTools +from rdkit.Chem.rdFingerprintGenerator import (GetMorganGenerator, GetMorganFeatureAtomInvGen, + GetTopologicalTorsionGenerator, + GetAtomPairGenerator, + GetRDKitFPGenerator) + import numpy as np import pandas as pd from scipy.sparse import lil_matrix @@ -26,7 +32,6 @@ r"^(?P\w+)FingerprintTransformer$" ) - class FpsTransformer(ABC, BaseEstimator, TransformerMixin): def __init__( self, @@ -40,6 +45,17 @@ def __init__( self.safe_inference_mode = safe_inference_mode self.dtype = dtype + + @property + def nBits(self): + warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + return self.fpSize + + @nBits.setter + def nBits(self, nBits): + warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + self.fpSize = nBits + def _get_column_prefix(self) -> str: matched = _PATTERN_FINGERPRINT_TRANSFORMER.match(type(self).__name__) if matched: @@ -49,7 +65,7 @@ def _get_column_prefix(self) -> str: return "fp" def _get_n_digits_column_suffix(self) -> int: - return len(str(self.nBits)) + return len(str(self.fpSize)) def get_display_feature_names_out(self, input_features=None): """Get feature names for display purposes @@ -61,7 +77,7 @@ def get_display_feature_names_out(self, input_features=None): prefix = self._get_column_prefix() n_digits = self._get_n_digits_column_suffix() return np.array( - [f"{prefix}_{str(i).zfill(n_digits)}" for i in range(1, self.nBits + 1)] + 
[f"{prefix}_{str(i).zfill(n_digits)}" for i in range(1, self.fpSize + 1)] ) def get_feature_names_out(self, input_features=None): @@ -71,7 +87,7 @@ def get_feature_names_out(self, input_features=None): to get the column names of the transformed dataframe. """ prefix = self._get_column_prefix() - return np.array([f"{prefix}_{i}" for i in range(1, self.nBits + 1)]) + return np.array([f"{prefix}_{i}" for i in range(1, self.fpSize + 1)]) @abstractmethod def _mol2fp(self, mol): @@ -83,11 +99,11 @@ def _mol2fp(self, mol): def _fp2array(self, fp): if fp: - arr = np.zeros((self.nBits,), dtype=self.dtype) + arr = np.zeros((self.fpSize,), dtype=self.dtype) DataStructs.ConvertToNumpyArray(fp, arr) return arr else: - return np.ma.masked_all((self.nBits,), dtype=self.dtype) + return np.ma.masked_all((self.fpSize,), dtype=self.dtype) def _transform_mol(self, mol): if not mol and self.safe_inference_mode: @@ -113,16 +129,17 @@ def _transform(self, X): if self.safe_inference_mode: # Use the new method with masked arrays if we're in safe inference mode arrays = [self._transform_mol(mol) for mol in X] + print(arrays) return np.ma.stack(arrays) else: # Use the original, faster method if we're not in safe inference mode - arr = np.zeros((len(X), self.nBits), dtype=self.dtype) + arr = np.zeros((len(X), self.fpSize), dtype=self.dtype) for i, mol in enumerate(X): arr[i, :] = self._transform_mol(mol) return arr def _transform_sparse(self, X): - arr = np.zeros((len(X), self.nBits), dtype=self.dtype) + arr = np.zeros((len(X), self.fpSize), dtype=self.dtype) for i, mol in enumerate(X): arr[i, :] = self._transform_mol(mol) @@ -182,6 +199,7 @@ def __init__( parallel: Union[bool, int] = False, safe_inference_mode: bool = False, dtype: np.dtype = np.int8, + fpSize=167, ): """MACCS keys fingerprinter calculates the 167 fixed MACCS keys @@ -189,217 +207,26 @@ def __init__( super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = 167 - - @property - 
def nBits(self): - return self._nBits - - @nBits.setter - def nBits(self, nBits): - if nBits != 167: + if fpSize != 167: raise ValueError( - "nBits can only be 167, matching the number of defined MACCS keys!" + "fpSize can only be 167, matching the number of defined MACCS keys!" ) - self._nBits = nBits - - def _mol2fp(self, mol): - return rdMolDescriptors.GetMACCSKeysFingerprint(mol) - - -class RDKitFingerprintTransformer(FpsTransformer): - def __init__( - self, - minPath: int = 1, - maxPath: int = 7, - useHs: bool = True, - branchedPaths: bool = True, - useBondOrder: bool = True, - countSimulation: bool = False, - countBounds=None, - fpSize: int = 2048, - numBitsPerFeature: int = 2, - atomInvariantsGenerator=None, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Calculates the RDKit fingerprints - - Parameters - ---------- - minPath : int, optional - the minimum path length (in bonds) to be included, by default 1 - maxPath : int, optional - the maximum path length (in bonds) to be included, by default 7 - useHs : bool, optional - toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True - branchedPaths : bool, optional - toggles generation of branched subgraphs, not just linear paths, by default True - useBondOrder : bool, optional - toggles inclusion of bond orders in the path hashes, by default True - countSimulation : bool, optional - if set, use count simulation while generating the fingerprint, by default False - countBounds : _type_, optional - boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None - fpSize : int, optional - size of the generated fingerprint, does not affect the sparse versions, by default 2048 - numBitsPerFeature : int, optional - the number of bits set per path/subgraph found, by default 2 - atomInvariantsGenerator : _type_, optional - atom invariants to be used 
during fingerprint generation, by default None - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minPath = minPath - self.maxPath = maxPath - self.useHs = useHs - self.branchedPaths = branchedPaths - self.useBondOrder = useBondOrder - self.countSimulation = countSimulation - self.countBounds = countBounds - self.fpSize = fpSize - self.numBitsPerFeature = numBitsPerFeature - self.atomInvariantsGenerator = atomInvariantsGenerator + self._fpSize = fpSize @property def fpSize(self): - return self.nBits + return self._fpSize - # Scikit-Learn expects to be able to set fpSize directly on object via .set_params(), so this updates nBits used by the abstract class @fpSize.setter def fpSize(self, fpSize): - self.nBits = fpSize - - def _mol2fp(self, mol): - generator = rdFingerprintGenerator.GetRDKitFPGenerator( - minPath=int(self.minPath), - maxPath=int(self.maxPath), - useHs=bool(self.useHs), - branchedPaths=bool(self.branchedPaths), - useBondOrder=bool(self.useBondOrder), - countSimulation=bool(self.countSimulation), - countBounds=bool(self.countBounds), - fpSize=int(self.fpSize), - numBitsPerFeature=int(self.numBitsPerFeature), - atomInvariantsGenerator=self.atomInvariantsGenerator, - ) - return generator.GetFingerprint(mol) - - -class AtomPairFingerprintTransformer(FpsTransformer): - def __init__( - self, - minLength: int = 1, - maxLength: int = 30, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - nBitsPerEntry: int = 4, - includeChirality: bool = False, - use2D: bool = True, - confId: int = -1, - nBits=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minLength = minLength - self.maxLength = maxLength - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - 
self.includeChirality = includeChirality - self.use2D = use2D - self.confId = confId - self.nBits = nBits - self.nBitsPerEntry = nBitsPerEntry - self.useCounts = useCounts - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedAtomPairFingerprint( - mol, - nBits=int(self.nBits), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), - ) - else: - return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( - mol, - nBits=int(self.nBits), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - nBitsPerEntry=int(self.nBitsPerEntry), - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), + if fpSize != 167: + raise ValueError( + "fpSize can only be 167, matching the number of defined MACCS keys!" 
) - - -class TopologicalTorsionFingerprintTransformer(FpsTransformer): - def __init__( - self, - targetSize: int = 4, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - includeChirality: bool = False, - nBitsPerEntry: int = 4, - nBits=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.targetSize = targetSize - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.includeChirality = includeChirality - self.nBitsPerEntry = nBitsPerEntry - self.nBits = nBits - self.useCounts = useCounts + self._fpSize = fpSize def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( - mol, - nBits=int(self.nBits), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - ) - else: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( - mol, - nBits=int(self.nBits), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - nBitsPerEntry=int(self.nBitsPerEntry), - ) + return rdMolDescriptors.GetMACCSKeysFingerprint(mol) class MHFingerprintTransformer(FpsTransformer): @@ -410,7 +237,7 @@ def __init__( isomeric: bool = False, kekulize: bool = False, min_radius: int = 1, - n_permutations: int = 2048, + fpSize: int = 2048, seed: int = 42, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, @@ -426,7 +253,7 @@ def __init__( isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False. kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. 
min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0, + fpSize (int, optional): The number of permutations used for hashing. Defaults to 2048, this is effectively the length of the FP seed (int, optional): The value used to seed numpy.random. Defaults to 0. """ @@ -439,7 +266,7 @@ def __init__( self.kekulize = kekulize self.min_radius = min_radius # Set the .n_permutations and .seed without creating the encoder twice - self._n_permutations = n_permutations + self.fpSize = fpSize self._seed = seed # create the encoder instance self._recreate_encoder() @@ -468,7 +295,7 @@ def _fp2array(self, fp): def _recreate_encoder(self): self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder( - self._n_permutations, self._seed + self.fpSize, self._seed ) @property @@ -483,19 +310,16 @@ def seed(self, seed): @property def n_permutations(self): - return self._n_permutations + warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + return self.fpSize @n_permutations.setter def n_permutations(self, n_permutations): - self._n_permutations = n_permutations + warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + self.fpSize = n_permutations # each time the n_permutations parameter is modified refresh an instance of the encoder self._recreate_encoder() - @property - def nBits(self): - # to be compliant with the requirement of the base class - return self._n_permutations - class SECFingerprintTransformer(FpsTransformer): # https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8 @@ -506,7 +330,7 @@ def __init__( isomeric: bool = False, kekulize: bool = False, min_radius: int = 1, - length: int = 2048, + fpSize: int = 2048, n_permutations: int = 0, seed: int = 0, parallel: Union[bool, int] = False, @@ -521,7 +345,7 @@ def __init__( isomeric (bool, optional): 
Whether the isomeric SMILES to be considered. Defaults to False. kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - length (int, optional): The length of the folded fingerprint. Defaults to 2048. + fpSize (int, optional): The length of the folded fingerprint. Defaults to 2048. n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0. seed (int, optional): The value used to seed numpy.random. Defaults to 0. """ @@ -533,7 +357,7 @@ def __init__( self.isomeric = isomeric self.kekulize = kekulize self.min_radius = min_radius - self.length = length + self.fpSize = fpSize # Set the .n_permutations and seed without creating the encoder twice self._n_permutations = n_permutations self._seed = seed @@ -590,124 +414,319 @@ def n_permutations(self, n_permutations): self._recreate_encoder() @property - def nBits(self): - # to be compliant with the requirement of the base class - return self.length + def length(self): + warn("length will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + return self.fpSize -class MorganFingerprintTransformer(FpsTransformer): +class AvalonFingerprintTransformer(FpsTransformer): + # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p def __init__( self, - nBits=2048, - radius=2, - useChirality=False, - useBondTypes=True, - useFeatures=False, - useCounts=False, + fpSize: int = 512, + isQuery: bool = False, + resetVect: bool = False, + bitFlags: int = 15761407, + useCounts: bool = False, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, dtype: np.dtype = np.int8, ): - """Transform RDKit mols into Count or bit-based hashed MorganFingerprints + """Transform RDKit mols into Count or bit-based Avalon Fingerprints Parameters ---------- - nBits : int, optional - Size of the hashed fingerprint, by default 2048 - radius 
: int, optional - Radius of the fingerprint, by default 2 - useChirality : bool, optional - Include chirality in calculation of the fingerprint keys, by default False - useBondTypes : bool, optional - Include bondtypes in calculation of the fingerprint keys, by default True - useFeatures : bool, optional - use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False + fpSize : int, optional + Size of the fingerprint, by default 512 + isQuery : bool, optional + use the fingerprint for a query structure, by default False + resetVect : bool, optional + reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) + bitFlags : int, optional + Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 useCounts : bool, optional If toggled will create the count and not bit-based fingerprint, by default False """ super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = nBits - self.radius = radius - self.useChirality = useChirality - self.useBondTypes = useBondTypes - self.useFeatures = useFeatures + self.fpSize = fpSize + self.isQuery = isQuery + self.resetVect = resetVect + self.bitFlags = bitFlags self.useCounts = useCounts def _mol2fp(self, mol): if self.useCounts: - return rdMolDescriptors.GetHashedMorganFingerprint( + return pyAvalonTools.GetAvalonCountFP( mol, - int(self.radius), - nBits=int(self.nBits), - useFeatures=bool(self.useFeatures), - useChirality=bool(self.useChirality), - useBondTypes=bool(self.useBondTypes), + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + bitFlags=int(self.bitFlags), ) else: - return rdMolDescriptors.GetMorganFingerprintAsBitVect( + return pyAvalonTools.GetAvalonFP( + mol, + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + resetVect=bool(self.resetVect), + bitFlags=int(self.bitFlags), + ) + + +class MorganFingerprintTransformer(FpsTransformer): + def __init__( + self, + 
fpSize=2048, + radius=2, + useChirality=False, + useBondTypes=True, + useFeatures=False, + useCounts=False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + """Transform RDKit mols into Count or bit-based hashed MorganFingerprints + + Parameters + ---------- + fpSize : int, optional + Size of the hashed fingerprint, by default 2048 + radius : int, optional + Radius of the fingerprint, by default 2 + useChirality : bool, optional + Include chirality in calculation of the fingerprint keys, by default False + useBondTypes : bool, optional + Include bondtypes in calculation of the fingerprint keys, by default True + useFeatures : bool, optional + use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False + useCounts : bool, optional + If toggled will create the count and not bit-based fingerprint, by default False + """ + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.fpSize = fpSize + self.radius = radius + self.useChirality = useChirality + self.useBondTypes = useBondTypes + self.useFeatures = useFeatures + self.useCounts = useCounts + + warn("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) + + def _mol2fp(self, mol): + if self.useCounts: + return rdMolDescriptors.GetHashedMorganFingerprint( + mol, + int(self.radius), + nBits=int(self.fpSize), + useFeatures=bool(self.useFeatures), + useChirality=bool(self.useChirality), + useBondTypes=bool(self.useBondTypes), + ) + else: + return rdMolDescriptors.GetMorganFingerprintAsBitVect( mol, int(self.radius), - nBits=int(self.nBits), + nBits=int(self.fpSize), useFeatures=bool(self.useFeatures), useChirality=bool(self.useChirality), useBondTypes=bool(self.useBondTypes), ) -class AvalonFingerprintTransformer(FpsTransformer): - # Fingerprint from the Avalon toolkeit, 
https://doi.org/10.1021/ci050413p +class RDKitFingerprintTransformer(FpsTransformer): def __init__( self, - nBits: int = 512, - isQuery: bool = False, - resetVect: bool = False, - bitFlags: int = 15761407, - useCounts: bool = False, + minPath: int = 1, + maxPath: int = 7, + useHs: bool = True, + branchedPaths: bool = True, + useBondOrder: bool = True, + countSimulation: bool = False, + countBounds=None, + fpSize: int = 2048, + numBitsPerFeature: int = 2, + atomInvariantsGenerator=None, parallel: Union[bool, int] = False, safe_inference_mode: bool = False, dtype: np.dtype = np.int8, ): - """Transform RDKit mols into Count or bit-based Avalon Fingerprints + """Calculates the RDKit fingerprints Parameters ---------- - nBits : int, optional - Size of the fingerprint, by default 512 - isQuery : bool, optional - use the fingerprint for a query structure, by default False - resetVect : bool, optional - reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) - bitFlags : int, optional - Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 - useCounts : bool, optional - If toggled will create the count and not bit-based fingerprint, by default False + minPath : int, optional + the minimum path length (in bonds) to be included, by default 1 + maxPath : int, optional + the maximum path length (in bonds) to be included, by default 7 + useHs : bool, optional + toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True + branchedPaths : bool, optional + toggles generation of branched subgraphs, not just linear paths, by default True + useBondOrder : bool, optional + toggles inclusion of bond orders in the path hashes, by default True + countSimulation : bool, optional + if set, use count simulation while generating the fingerprint, by default False + countBounds : _type_, optional + boundaries for count simulation, corresponding bit will be set if the count is higher than the number 
provided for that spot, by default None + fpSize : int, optional + size of the generated fingerprint, does not affect the sparse versions, by default 2048 + numBitsPerFeature : int, optional + the number of bits set per path/subgraph found, by default 2 + atomInvariantsGenerator : _type_, optional + atom invariants to be used during fingerprint generation, by default None """ super().__init__( parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype ) - self.nBits = nBits - self.isQuery = isQuery - self.resetVect = resetVect - self.bitFlags = bitFlags + self.minPath = minPath + self.maxPath = maxPath + self.useHs = useHs + self.branchedPaths = branchedPaths + self.useBondOrder = useBondOrder + self.countSimulation = countSimulation + self.countBounds = countBounds + self.fpSize = fpSize + self.numBitsPerFeature = numBitsPerFeature + self.atomInvariantsGenerator = atomInvariantsGenerator + + warn("RDKitFingerprintTransformer will be replace by RDKitFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) + + + def _mol2fp(self, mol): + generator = rdFingerprintGenerator.GetRDKitFPGenerator( + minPath=int(self.minPath), + maxPath=int(self.maxPath), + useHs=bool(self.useHs), + branchedPaths=bool(self.branchedPaths), + useBondOrder=bool(self.useBondOrder), + countSimulation=bool(self.countSimulation), + countBounds=bool(self.countBounds), + fpSize=int(self.fpSize), + numBitsPerFeature=int(self.numBitsPerFeature), + atomInvariantsGenerator=self.atomInvariantsGenerator, + ) + return generator.GetFingerprint(mol) + + +class AtomPairFingerprintTransformer(FpsTransformer): + def __init__( + self, + minLength: int = 1, + maxLength: int = 30, + fromAtoms=0, + ignoreAtoms=0, + atomInvariants=0, + nBitsPerEntry: int = 4, + includeChirality: bool = False, + use2D: bool = True, + confId: int = -1, + fpSize=2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + 
super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.minLength = minLength + self.maxLength = maxLength + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.includeChirality = includeChirality + self.use2D = use2D + self.confId = confId + self.fpSize = fpSize + self.nBitsPerEntry = nBitsPerEntry self.useCounts = useCounts + warn("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) + def _mol2fp(self, mol): if self.useCounts: - return pyAvalonTools.GetAvalonCountFP( + return rdMolDescriptors.GetHashedAtomPairFingerprint( mol, - nBits=int(self.nBits), - isQuery=bool(self.isQuery), - bitFlags=int(self.bitFlags), + nBits=int(self.fpSize), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), ) else: - return pyAvalonTools.GetAvalonFP( + return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( mol, - nBits=int(self.nBits), - isQuery=bool(self.isQuery), - resetVect=bool(self.resetVect), - bitFlags=int(self.bitFlags), + nBits=int(self.fpSize), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + nBitsPerEntry=int(self.nBitsPerEntry), + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), + ) + + +class TopologicalTorsionFingerprintTransformer(FpsTransformer): + def __init__( + self, + targetSize: int = 4, + fromAtoms=0, + ignoreAtoms=0, + atomInvariants=0, + includeChirality: bool = False, + nBitsPerEntry: int = 4, + fpSize=2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = 
False, + dtype: np.dtype = np.int8, + ): + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.targetSize = targetSize + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.includeChirality = includeChirality + self.nBitsPerEntry = nBitsPerEntry + self.fpSize = fpSize + self.useCounts = useCounts + + warn("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!", DeprecationWarning) + + def _mol2fp(self, mol): + if self.useCounts: + return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( + mol, + nBits=int(self.fpSize), + targetSize=int(self.targetSize), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + ) + else: + return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( + mol, + nBits=int(self.fpSize), + targetSize=int(self.targetSize), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + nBitsPerEntry=int(self.nBitsPerEntry), ) @@ -720,3 +739,238 @@ def parallel_helper(args): transformer = getattr(fingerprints, classname)(**parameters) return transformer._transform(X_mols) + + +class FpsGeneratorTransformer(FpsTransformer): + _regenerate_on_properties = () + + def _fp2array(self, fp): + raise DeprecationWarning("Generators can directly return fingerprints") + + def _mol2fp(self, mol): + raise DeprecationWarning("use _mol2array") + + def __getstate__(self): + # Get the state of the parent class + state = super().__getstate__() + state.update(self.get_params()) + # Remove the unpicklable property from the state + state.pop("_fpgen", None) # fpgen is not picklable + return state + + def __setstate__(self, state): + # Restore the state of the parent class + 
super().__setstate__(state) + # Re-create the unpicklable property + generatort_keys = inspect.signature(self._generate_fp_generator).parameters.keys() + params = [setattr(self, k, state["_"+k]) if "_"+k in state else setattr(self, k, state[k]) for k in generatort_keys] + self._generate_fp_generator() + + def __setattr__(self, name: str, value): + super().__setattr__(name, value) + if ( + not hasattr(self, "_initializing") + and name in self._regenerate_on_properties + ): + self._generate_fp_generator() + + @abstractmethod + def _generate_fp_generator(self): + raise NotImplementedError("_generate_fp_generator not implemented") + + @abstractmethod + def _transform_mol(self, mol) -> np.array: + """Generate numpy array descriptor from mol + + MUST BE OVERWRITTEN + """ + raise NotImplementedError("_transform_mol not implemented") + + +class MorganFPGeneratorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ("radius", "fpSize", "useChirality", "useFeatures", "useBondTypes") + + def __init__(self, fpSize=2048, radius=2, useChirality=False, + useBondTypes=True, useFeatures=False, useCounts=False, + parallel: Union[bool, int] = False, ): + """Transform RDKit mols into Count or bit-based hashed MorganFingerprints + + Parameters + ---------- + fpsize : int, optional + Size of the hashed fingerprint, by default 2048 + radius : int, optional + Radius of the fingerprint, by default 2 + useChirality : bool, optional + Include chirality in calculation of the fingerprint keys, by default False + useBondTypes : bool, optional + Include bondtypes in calculation of the fingerprint keys, by default True + useFeatures : bool, optional + use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False + useCounts : bool, optional + If toggled will create the count and not bit-based fingerprint, by default False + """ + + self._initializing = True + super().__init__(parallel = parallel) + self.fpSize = fpSize + self.radius = radius + 
self.useChirality = useChirality + self.useFeatures = useFeatures + self.useCounts = useCounts + self.useBondTypes = useBondTypes + + self._generate_fp_generator() + delattr(self, "_initializing") + + + def _generate_fp_generator(self): + + if self.useFeatures: + atomInvariantsGenerator = GetMorganFeatureAtomInvGen() + else: + atomInvariantsGenerator = None + + self._fpgen = GetMorganGenerator(radius=self.radius, + fpSize=self.fpSize, + includeChirality=self.useChirality, + useBondTypes=self.useBondTypes, + atomInvariantsGenerator=atomInvariantsGenerator, + ) + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol) + else: + return self._fpgen.GetFingerprintAsNumPy(mol) + + +class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ("fpSize", "includeChirality", "targetSize") + + def __init__(self, targetSize:int = 4, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, confId=-1, + includeChirality:bool = False, fpSize:int=2048, + useCounts:bool=False, parallel: Union[bool, int] = False): + + self._initializing = True + super().__init__(parallel=parallel) + self.fpSize = fpSize + self.includeChirality = includeChirality + self.targetSize = targetSize + + self.fromAtoms = fromAtoms + self.ignoreAtoms = ignoreAtoms + self.atomInvariants = atomInvariants + self.confId = confId + self.useCounts = useCounts + + self._generate_fp_generator() + delattr(self, "_initializing") + + + def _generate_fp_generator(self): + self._fpgen = GetTopologicalTorsionGenerator(torsionAtomCount=self.targetSize, includeChirality=self.includeChirality, + fpSize=self.fpSize) + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) + else: + return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, 
class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer):
    """Atom-pair fingerprints computed through RDKit's ``GetAtomPairGenerator``.

    The generator object is built once from the constructor arguments listed
    in ``_regenerate_on_properties``; the remaining arguments are applied per
    molecule when the fingerprint is requested.
    """

    # Attributes that are baked into the RDKit generator object.
    # NOTE(review): presumably FpsGeneratorTransformer rebuilds ``_fpgen`` when
    # one of these is reassigned -- confirm against the base class.
    _regenerate_on_properties = ("fpSize", "includeChirality", "use2D", "minLength", "maxLength")

    def __init__(self, minLength: int = 1, maxLength: int = 30, fromAtoms=None, ignoreAtoms=None,
                 atomInvariants=None, includeChirality: bool = False, use2D: bool = True,
                 confId: int = -1, fpSize: int = 2048, useCounts: bool = False,
                 parallel: Union[bool, int] = False):
        """Configure the atom-pair fingerprint generator.

        Parameters
        ----------
        minLength : int, optional
            passed to the generator as ``minDistance``, by default 1
        maxLength : int, optional
            passed to the generator as ``maxDistance``, by default 30
        fromAtoms : optional
            forwarded as ``fromAtoms`` on every fingerprint call, by default None
        ignoreAtoms : optional
            forwarded as ``ignoreAtoms`` on every fingerprint call, by default None
        atomInvariants : optional
            forwarded as ``customAtomInvariants`` on every fingerprint call, by default None
        includeChirality : bool, optional
            passed to the generator as ``includeChirality``, by default False
        use2D : bool, optional
            passed to the generator as ``use2D``, by default True
        confId : int, optional
            forwarded as ``confId`` on every fingerprint call, by default -1
        fpSize : int, optional
            size of the generated fingerprint, by default 2048
        useCounts : bool, optional
            return the count fingerprint instead of the bit fingerprint, by default False
        parallel : Union[bool, int], optional
            forwarded unchanged to ``FpsGeneratorTransformer.__init__``, by default False
        """
        # NOTE(review): this flag looks like it tells the base class not to
        # rebuild the generator while attributes are still being populated --
        # confirm against FpsGeneratorTransformer.
        self._initializing = True
        super().__init__(parallel=parallel)
        self.fpSize = fpSize
        self.use2D = use2D
        self.includeChirality = includeChirality
        self.minLength = minLength
        self.maxLength = maxLength

        self.useCounts = useCounts
        self.confId = confId
        self.fromAtoms = fromAtoms
        self.ignoreAtoms = ignoreAtoms
        self.atomInvariants = atomInvariants

        self._generate_fp_generator()
        delattr(self, "_initializing")

    def _generate_fp_generator(self):
        """(Re)create the RDKit atom-pair generator from the current settings."""
        self._fpgen = GetAtomPairGenerator(
            minDistance=self.minLength,
            maxDistance=self.maxLength,
            includeChirality=self.includeChirality,
            use2D=self.use2D,
            fpSize=self.fpSize,
        )

    def _transform_mol(self, mol) -> np.ndarray:
        """Return the fingerprint of ``mol`` as a NumPy array.

        ``useCounts`` selects between the count and the bit fingerprint.
        """
        # confId used to be stored but never handed to RDKit, so a non-default
        # conformer id was silently ignored; the default (-1) matches RDKit's
        # own default, so existing callers see no change.
        call_kwargs = {
            "fromAtoms": self.fromAtoms,
            "ignoreAtoms": self.ignoreAtoms,
            "confId": self.confId,
            "customAtomInvariants": self.atomInvariants,
        }
        if self.useCounts:
            return self._fpgen.GetCountFingerprintAsNumPy(mol, **call_kwargs)
        return self._fpgen.GetFingerprintAsNumPy(mol, **call_kwargs)


class RDKitFPGeneratorTransformer(FpsGeneratorTransformer):
    """RDKit path-based fingerprints computed through ``GetRDKitFPGenerator``."""

    # Attributes that are baked into the RDKit generator object.
    # NOTE(review): presumably FpsGeneratorTransformer rebuilds ``_fpgen`` when
    # one of these is reassigned -- confirm against the base class.
    _regenerate_on_properties = ("minPath", "maxPath", "useHs", "branchedPaths", "useBondOrder",
                                 "countSimulation", "fpSize", "countBounds", "numBitsPerFeature")

    def __init__(self, minPath: int = 1, maxPath: int = 7, useHs: bool = True, branchedPaths: bool = True,
                 useBondOrder: bool = True, countSimulation: bool = False, countBounds=None,
                 fpSize: int = 2048, numBitsPerFeature: int = 2,
                 useCounts: bool = False, parallel: Union[bool, int] = False):
        """Calculates the RDKit fingerprints

        Parameters
        ----------
        minPath : int, optional
            the minimum path length (in bonds) to be included, by default 1
        maxPath : int, optional
            the maximum path length (in bonds) to be included, by default 7
        useHs : bool, optional
            toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True
        branchedPaths : bool, optional
            toggles generation of branched subgraphs, not just linear paths, by default True
        useBondOrder : bool, optional
            toggles inclusion of bond orders in the path hashes, by default True
        countSimulation : bool, optional
            if set, use count simulation while generating the fingerprint, by default False
        countBounds : optional
            boundaries for count simulation, corresponding bit will be set if the count is higher
            than the number provided for that spot, by default None
        fpSize : int, optional
            size of the generated fingerprint, does not affect the sparse versions, by default 2048
        numBitsPerFeature : int, optional
            the number of bits set per path/subgraph found, by default 2
        useCounts : bool, optional
            return the count fingerprint instead of the bit fingerprint, by default False
        parallel : Union[bool, int], optional
            forwarded unchanged to ``FpsGeneratorTransformer.__init__``, by default False
        """
        # NOTE(review): this flag looks like it tells the base class not to
        # rebuild the generator while attributes are still being populated --
        # confirm against FpsGeneratorTransformer.
        self._initializing = True
        super().__init__(parallel=parallel)
        self.minPath = minPath
        self.maxPath = maxPath
        self.useHs = useHs
        self.branchedPaths = branchedPaths
        self.useBondOrder = useBondOrder
        self.countSimulation = countSimulation
        self.fpSize = fpSize
        self.numBitsPerFeature = numBitsPerFeature
        self.countBounds = countBounds

        self.useCounts = useCounts

        self._generate_fp_generator()
        delattr(self, "_initializing")

    def _transform_mol(self, mol) -> np.ndarray:
        """Return the fingerprint of ``mol`` as a NumPy array.

        ``useCounts`` selects between the count and the bit fingerprint.
        """
        if self.useCounts:
            return self._fpgen.GetCountFingerprintAsNumPy(mol)
        return self._fpgen.GetFingerprintAsNumPy(mol)

    def _generate_fp_generator(self):
        """(Re)create the RDKit fingerprint generator from the current settings."""
        self._fpgen = GetRDKitFPGenerator(
            minPath=self.minPath,
            maxPath=self.maxPath,
            useHs=self.useHs,
            branchedPaths=self.branchedPaths,
            useBondOrder=self.useBondOrder,
            countSimulation=self.countSimulation,
            fpSize=self.fpSize,
            countBounds=self.countBounds,
            numBitsPerFeature=self.numBitsPerFeature,
        )
fpsize = params["fpSize"] assert len(fps[0]) == fpsize @@ -231,16 +209,7 @@ def test_transform_parallel( assert len(fps) == len(mols_container) # assert that the size of the fingerprint is the expected size - if ( - type(t) == type(maccs_transformer) - or type(t) == type(secfp_transformer) - or type(t) == type(mhfp_transformer) - ): - fpsize = t.nBits - elif type(t) == type(rdkit_transformer): - fpsize = params["fpSize"] - else: - fpsize = params["nBits"] + fpsize = params["fpSize"] assert len(fps[0]) == fpsize @@ -306,7 +275,7 @@ def assert_transformer_set_params(tr_class, new_params, mols_list): def test_morgan_set_params(chiral_mols_list): new_params = { - "nBits": 1024, + "fpSize": 1024, "radius": 1, "useBondTypes": False, # TODO, why doesn't this change the FP? "useChirality": True, @@ -328,7 +297,7 @@ def test_atompairs_set_params(chiral_mols_list): "includeChirality": True, "maxLength": 3, "minLength": 3, - "nBits": 1024, + "fpSize": 1024, "nBitsPerEntry": 3, #'use2D': True, #TODO, understand why this can't be set different "useCounts": True, @@ -344,7 +313,7 @@ def test_topologicaltorsion_set_params(chiral_mols_list): #'fromAtoms': 0, #'ignoreAtoms': 0, #'includeChirality': True, #TODO, figure out why this setting seems to give same FP wheter toggled or not - "nBits": 1024, + "fpSize": 1024, "nBitsPerEntry": 3, "targetSize": 5, "useCounts": True, @@ -376,7 +345,7 @@ def test_SECFingerprintTransformer(chiral_mols_list): new_params = { "isomeric": True, "kekulize": True, - "length": 1048, + "fpSize": 1048, "min_radius": 2, #'n_permutations': 2, # The SECFp is not using this setting "radius": 2, @@ -395,7 +364,7 @@ def test_MHFingerprintTransformer(chiral_mols_list): "isomeric": True, "kekulize": True, "min_radius": 2, - "n_permutations": 4096, + "fpSize": 4096, "seed": 44, } assert_transformer_set_params( @@ -405,7 +374,7 @@ def test_MHFingerprintTransformer(chiral_mols_list): def test_AvalonFingerprintTransformer(chiral_mols_list): new_params = { - "nBits": 
import pickle
import tempfile
import pytest
import numpy as np
from fixtures import mols_list, smiles_list, mols_container, smiles_container, fingerprint, chiral_smiles_list, chiral_mols_list
from sklearn import clone

from scikit_mol.fingerprints import (
    MorganFPGeneratorTransformer,
    RDKitFPGeneratorTransformer,
    AtomPairFPGeneratorTransformer,
    TopologicalTorsionFPGeneatorTransformer,
)

# Every generator-based transformer exercised by the parametrized tests below.
test_transformers = [
    MorganFPGeneratorTransformer,
    RDKitFPGeneratorTransformer,
    AtomPairFPGeneratorTransformer,
    TopologicalTorsionFPGeneatorTransformer,
]


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_fpstransformer_fp2array(transformer_class, fingerprint):
    """``_fp2array`` on a generator transformer must raise a DeprecationWarning."""
    instance = transformer_class()

    with pytest.raises(DeprecationWarning, match="Generators can directly return fingerprints"):
        instance._fp2array(fingerprint)


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_fpstransformer_transform_mol(transformer_class, mols_list):
    """Check type, shape and bit count of the fingerprint of the first molecule."""
    instance = transformer_class()
    first_fp = instance._transform_mol(mols_list[0])

    # The result has to be a plain ndarray of the default fingerprint size.
    assert type(first_fp) == np.ndarray
    assert first_fp.shape == (2048,)

    # Expected number of set bits per transformer, checked in this order.
    expected_bit_sums = (
        (RDKitFPGeneratorTransformer, 104),
        (AtomPairFPGeneratorTransformer, 32),
        (TopologicalTorsionFPGeneatorTransformer, 12),
        (MorganFPGeneratorTransformer, 14),
    )
    for candidate_class, bit_sum in expected_bit_sums:
        if isinstance(instance, candidate_class):
            assert first_fp.sum() == bit_sum
            break
    else:
        raise NotImplementedError("missing Assert")
@pytest.mark.parametrize("transformer_class", test_transformers)
def test_clonability(transformer_class):
    """A sklearn ``clone`` carries the same parameters but is a distinct object."""
    transformer = transformer_class()

    params = transformer.get_params()
    t2 = clone(transformer)
    params_2 = t2.get_params()
    # Parameters of cloned transformers should be the same
    assert all(params[key] == params_2[key] for key in params)
    # Cloned transformers should not be the same object
    assert t2 is not transformer


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_set_params(transformer_class):
    """``set_params`` must be reflected by a subsequent ``get_params``."""
    transformer = transformer_class()
    params = transformer.get_params()
    # change extracted dictionary
    params['fpSize'] = 4242
    # change params in transformer
    transformer.set_params(fpSize=4242)
    # get parameters as dictionary and assert that it is the same
    params_2 = transformer.get_params()
    assert all(params[key] == params_2[key] for key in params)


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_transform(mols_container, transformer_class):
    """``transform`` returns one fingerprint of length ``fpSize`` per molecule."""
    transformer = transformer_class()
    params = transformer.get_params()
    fps = transformer.transform(mols_container)
    # Assert that the same length of input and output
    assert len(fps) == len(mols_container)

    fpsize = params['fpSize']
    assert len(fps[0]) == fpsize


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_transform_parallel(mols_container, transformer_class):
    """Same as ``test_transform`` but with ``parallel=True``."""
    transformer = transformer_class()
    transformer.set_params(parallel=True)
    params = transformer.get_params()
    fps = transformer.transform(mols_container)
    # Assert that the same length of input and output
    assert len(fps) == len(mols_container)

    fpsize = params['fpSize']
    assert len(fps[0]) == fpsize


@pytest.mark.parametrize("transformer_class", test_transformers)
def test_picklable(transformer_class):
    """A pickle round trip preserves the transformer's parameters."""
    transformer = transformer_class()

    # An in-memory round trip is enough; no temporary file is needed.
    t2 = pickle.loads(pickle.dumps(transformer))

    assert transformer.get_params() == t2.get_params()


def assert_transformer_set_params(transfomer, new_params, mols_list):
    """Check that every entry of ``new_params`` changes the fingerprints.

    For each key, a default transformer is compared against (a) the same
    instance after ``set_params`` and (b) a fresh instance constructed with the
    modified parameters.

    This is a helper called directly by the tests below, so it carries no
    pytest mark.

    Parameters
    ----------
    transfomer : type
        transformer class to instantiate
    new_params : dict
        parameter name -> non-default value; each key is tested on its own
    mols_list : sequence
        molecules passed to ``transform``

    Raises
    ------
    AssertionError
        if changing a parameter leaves all fingerprints unchanged, or if
        ``set_params`` and construction with the same parameters disagree
    """
    default_params = transfomer().get_params()

    for key, new_value in new_params.items():
        tr = transfomer()
        params = tr.get_params()
        params[key] = new_value

        fps_default = tr.transform(mols_list)

        tr.set_params(**params)
        new_tr = transfomer(**params)
        fps_reset_params = tr.transform(mols_list)
        fps_init_new_params = new_tr.transform(mols_list)

        # Now fp_default should not be the same as fp_reset_params
        # (at least one molecule must get a different fingerprint).
        assert not all(
            np.array_equal(fp_default, fp_reset_params)
            for fp_default, fp_reset_params in zip(fps_default, fps_reset_params)
        ), f"Assertation error, FP appears the same, although the {key} should be changed from {default_params[key]} to {params[key]}"
        # fp_reset_params and fp_init_new_params should however be the same
        assert all(
            np.array_equal(fp_init_new_params, fp_reset_params)
            for fp_init_new_params, fp_reset_params in zip(fps_init_new_params, fps_reset_params)
        ), f"Assertation error, FP appears to be different, although the {key} should be changed back as well as initialized to {params[key]}"


def test_morgan_set_params(chiral_mols_list):
    new_params = {
        'fpSize': 1024,
        'radius': 1,
        'useBondTypes': False,  # TODO, why doesn't this change the FP?
        'useChirality': True,
        'useCounts': True,
        'useFeatures': True,
    }

    assert_transformer_set_params(MorganFPGeneratorTransformer, new_params, chiral_mols_list)


def test_atompairs_set_params(chiral_mols_list):
    new_params = {
        #'atomInvariants': 1,
        #'confId': -1,
        #'fromAtoms': 1,
        #'ignoreAtoms': 0,
        'includeChirality': True,
        'maxLength': 3,
        'minLength': 3,
        'fpSize': 1024,
        #'nBitsPerEntry': 3, #Todo: not setable with the generators?
        #'use2D': True, #TODO, understand why this can't be set different
        'useCounts': True,
    }

    assert_transformer_set_params(AtomPairFPGeneratorTransformer, new_params, chiral_mols_list)


def test_topologicaltorsion_set_params(chiral_mols_list):
    new_params = {
        #'atomInvariants': 0,
        #'fromAtoms': 0,
        #'ignoreAtoms': 0,
        #'includeChirality': True, #TODO, figure out why this setting seems to give same FP wheter toggled or not
        'fpSize': 1024,
        #'nBitsPerEntry': 3, #Todo: not setable with the generators?
        'targetSize': 5,
        'useCounts': True,
    }

    assert_transformer_set_params(TopologicalTorsionFPGeneatorTransformer, new_params, chiral_mols_list)


def test_RDKitFPTransformer(chiral_mols_list):
    new_params = {
        #'atomInvariantsGenerator': None,
        #'branchedPaths': False,
        #'countBounds': 0, #TODO: What does this do?
        'countSimulation': True,
        'fpSize': 1024,
        'maxPath': 3,
        'minPath': 2,
        'numBitsPerFeature': 3,
        'useBondOrder': False,  # TODO, why doesn't this change the FP?
        #'useHs': False, #TODO, why doesn't this change the FP?
    }
    assert_transformer_set_params(RDKitFPGeneratorTransformer, new_params, chiral_mols_list)
MorganFPGeneratorTransformer, + RDKitFPGeneratorTransformer, AtomPairFPGeneratorTransformer, TopologicalTorsionFPGeneatorTransformer) + from scikit_mol.descriptors import MolecularDescriptorTransformer from fixtures import SLC6A4_subset, SLC6A4_subset_with_cddd, skip_pandas_output_test, mols_container, featurizer, combined_transformer @@ -29,6 +31,9 @@ def test_transformer(SLC6A4_subset): X_train, X_test = X_smiles[:128], X_smiles[128:] Y_train, Y_test = Y[:128], Y[128:] + MorganFPGeneratorTransformer, + RDKitFPGeneratorTransformer, AtomPairFPGeneratorTransformer, TopologicalTorsionFPGeneatorTransformer + # run FP with default parameters except when useCounts can be given as an argument FP_dict = {"MACCSTransformer": [MACCSKeysFingerprintTransformer, None], "RDKitFPTransformer": [RDKitFingerprintTransformer, None], @@ -40,7 +45,15 @@ def test_transformer(SLC6A4_subset): "MorganTransformer useCounts": [MorganFingerprintTransformer, True], "SECFingerprintTransformer": [SECFingerprintTransformer, None], "MHFingerprintTransformer": [MHFingerprintTransformer, None], - 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None]} + 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, True], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, False], + 'RDKitFPGeneratorTransformer': [RDKitFPGeneratorTransformer, None], + 'AtomPairFPGeneratorTransformer': [AtomPairFPGeneratorTransformer, True], + 'AtomPairFPGeneratorTransformer': [ AtomPairFPGeneratorTransformer, False], + 'TopologicalTorsionFPGeneatorTransformer': [TopologicalTorsionFPGeneatorTransformer, True], + 'TopologicalTorsionFPGeneatorTransformer': [ TopologicalTorsionFPGeneatorTransformer, False], + } # fit on toy data and print train/test score if successful or collect the failed FP failed_FP = [] @@ -81,7 +94,22 @@ def test_transformer_pandas_output(SLC6A4_subset, pandas_output): "MorganTransformer useCounts": 
[MorganFingerprintTransformer, True], "SECFingerprintTransformer": [SECFingerprintTransformer, None], "MHFingerprintTransformer": [MHFingerprintTransformer, None], - 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None]} + 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, + True], + 'MorganFPGeneratorTransformer': [MorganFPGeneratorTransformer, + False], + 'RDKitFPGeneratorTransformer': [RDKitFPGeneratorTransformer, + None], + 'AtomPairFPGeneratorTransformer': [ + AtomPairFPGeneratorTransformer, True], + 'AtomPairFPGeneratorTransformer': [ + AtomPairFPGeneratorTransformer, False], + 'TopologicalTorsionFPGeneatorTransformer': [ + TopologicalTorsionFPGeneatorTransformer, True], + 'TopologicalTorsionFPGeneatorTransformer': [ + TopologicalTorsionFPGeneatorTransformer, False], + } # fit on toy data and check that the output is a pandas dataframe failed_FP = [] @@ -96,11 +124,12 @@ def test_transformer_pandas_output(SLC6A4_subset, pandas_output): X_transformed = pipeline.transform(X_smiles) assert isinstance(X_transformed, pd.DataFrame), f"the output of {FP_name} is not a pandas dataframe" assert X_transformed.shape[0] == len(X_smiles), f"the number of rows in the output of {FP_name} is not equal to the number of samples" - assert len(X_transformed.columns) == pipeline.named_steps["FP"].nBits, f"the number of columns in the output of {FP_name} is not equal to the number of bits" + assert len(X_transformed.columns) == pipeline.named_steps["FP"].fpSize, f"the number of columns in the output of {FP_name} is not equal to the number of bits" print(f"\nfitting and transforming completed") - except: + except Exception as err: print(f"\n!!!! 
FAILED pipeline fitting and transforming for {FP_name} with useCounts={useCounts}") + print("\n".join(err.args)) failed_FP.append(FP_name) pass @@ -136,7 +165,7 @@ def test_combined_transformer_pandas_out(combined_transformer, SLC6A4_subset_wit pipeline_skmol = combined_transformer.named_transformers_["pipeline-1"] featurizer_skmol = pipeline_skmol[-1] if isinstance(featurizer_skmol, FpsTransformer): - n_skmol_features = featurizer_skmol.nBits + n_skmol_features = featurizer_skmol.fpSize elif isinstance(featurizer_skmol, MolecularDescriptorTransformer): n_skmol_features = len(featurizer_skmol.desc_list) else: