diff --git a/scikit_mol/fingerprints/__init__.py b/scikit_mol/fingerprints/__init__.py new file mode 100644 index 0000000..5ed655d --- /dev/null +++ b/scikit_mol/fingerprints/__init__.py @@ -0,0 +1,15 @@ +from .baseclasses import ( + FpsTransformer, + FpsGeneratorTransformer, +) # TODO, for backwards compatibility with tests, needs to be removed + +from .atompair import AtomPairFingerprintTransformer, AtomPairFPGeneratorTransformer +from .avalon import AvalonFingerprintTransformer +from .maccs import MACCSKeysFingerprintTransformer +from .minhash import MHFingerprintTransformer, SECFingerprintTransformer +from .morgan import MorganFingerprintTransformer, MorganFPGeneratorTransformer +from .rdkitfp import RDKitFingerprintTransformer, RDKitFPGeneratorTransformer +from .topologicaltorsion import ( + TopologicalTorsionFingerprintTransformer, + TopologicalTorsionFPGeneatorTransformer, +) diff --git a/scikit_mol/fingerprints/atompair.py b/scikit_mol/fingerprints/atompair.py new file mode 100644 index 0000000..aff8f9f --- /dev/null +++ b/scikit_mol/fingerprints/atompair.py @@ -0,0 +1,144 @@ +from typing import Union + +import numpy as np + +from warnings import warn + +from .baseclasses import FpsTransformer, FpsGeneratorTransformer + +from rdkit.Chem.rdFingerprintGenerator import GetAtomPairGenerator +from rdkit.Chem import rdMolDescriptors + + +class AtomPairFingerprintTransformer(FpsTransformer): + def __init__( + self, + minLength: int = 1, + maxLength: int = 30, + fromAtoms=0, + ignoreAtoms=0, + atomInvariants=0, + nBitsPerEntry: int = 4, + includeChirality: bool = False, + use2D: bool = True, + confId: int = -1, + fpSize=2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.minLength = minLength + self.maxLength = maxLength + self.fromAtoms = fromAtoms + self.ignoreAtoms = 
ignoreAtoms + self.atomInvariants = atomInvariants + self.includeChirality = includeChirality + self.use2D = use2D + self.confId = confId + self.fpSize = fpSize + self.nBitsPerEntry = nBitsPerEntry + self.useCounts = useCounts + + warn( + "AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!", + DeprecationWarning, + ) + + def _mol2fp(self, mol): + if self.useCounts: + return rdMolDescriptors.GetHashedAtomPairFingerprint( + mol, + nBits=int(self.fpSize), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), + ) + else: + return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( + mol, + nBits=int(self.fpSize), + minLength=int(self.minLength), + maxLength=int(self.maxLength), + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + atomInvariants=self.atomInvariants, + nBitsPerEntry=int(self.nBitsPerEntry), + includeChirality=bool(self.includeChirality), + use2D=bool(self.use2D), + confId=int(self.confId), + ) + + +class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer): + _regenerate_on_properties = ( + "fpSize", + "includeChirality", + "use2D", + "minLength", + "maxLength", + ) + + def __init__( + self, + minLength: int = 1, + maxLength: int = 30, + fromAtoms=None, + ignoreAtoms=None, + atomInvariants=None, + includeChirality: bool = False, + use2D: bool = True, + confId: int = -1, + fpSize: int = 2048, + useCounts: bool = False, + parallel: Union[bool, int] = False, + ): + self._initializing = True + super().__init__(parallel=parallel) + self.fpSize = fpSize + self.use2D = use2D + self.includeChirality = includeChirality + self.minLength = minLength + self.maxLength = maxLength + + self.useCounts = useCounts + self.confId = confId + self.fromAtoms = fromAtoms + self.ignoreAtoms = 
ignoreAtoms + self.atomInvariants = atomInvariants + + self._generate_fp_generator() + delattr(self, "_initializing") + + def _generate_fp_generator(self): + self._fpgen = GetAtomPairGenerator( + minDistance=self.minLength, + maxDistance=self.maxLength, + includeChirality=self.includeChirality, + use2D=self.use2D, + fpSize=self.fpSize, + ) + + def _transform_mol(self, mol) -> np.array: + if self.useCounts: + return self._fpgen.GetCountFingerprintAsNumPy( + mol, + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + customAtomInvariants=self.atomInvariants, + ) + else: + return self._fpgen.GetFingerprintAsNumPy( + mol, + fromAtoms=self.fromAtoms, + ignoreAtoms=self.ignoreAtoms, + customAtomInvariants=self.atomInvariants, + ) diff --git a/scikit_mol/fingerprints/avalon.py b/scikit_mol/fingerprints/avalon.py new file mode 100644 index 0000000..074632d --- /dev/null +++ b/scikit_mol/fingerprints/avalon.py @@ -0,0 +1,62 @@ +from typing import Union + +import numpy as np + +from .baseclasses import FpsTransformer + +from rdkit.Avalon import pyAvalonTools + + +class AvalonFingerprintTransformer(FpsTransformer): + # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p + def __init__( + self, + fpSize: int = 512, + isQuery: bool = False, + resetVect: bool = False, + bitFlags: int = 15761407, + useCounts: bool = False, + parallel: Union[bool, int] = False, + safe_inference_mode: bool = False, + dtype: np.dtype = np.int8, + ): + """Transform RDKit mols into Count or bit-based Avalon Fingerprints + + Parameters + ---------- + fpSize : int, optional + Size of the fingerprint, by default 512 + isQuery : bool, optional + use the fingerprint for a query structure, by default False + resetVect : bool, optional + reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) + bitFlags : int, optional + Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 + useCounts : bool, optional + If toggled 
will create the count and not bit-based fingerprint, by default False + """ + super().__init__( + parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype + ) + self.fpSize = fpSize + self.isQuery = isQuery + self.resetVect = resetVect + self.bitFlags = bitFlags + self.useCounts = useCounts + + def _mol2fp(self, mol): + if self.useCounts: + return pyAvalonTools.GetAvalonCountFP( + mol, + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + bitFlags=int(self.bitFlags), + ) + else: + return pyAvalonTools.GetAvalonFP( + mol, + nBits=int(self.fpSize), + isQuery=bool(self.isQuery), + resetVect=bool(self.resetVect), + bitFlags=int(self.bitFlags), + ) diff --git a/scikit_mol/fingerprints/baseclasses.py b/scikit_mol/fingerprints/baseclasses.py index bea43e0..ce28e18 100644 --- a/scikit_mol/fingerprints/baseclasses.py +++ b/scikit_mol/fingerprints/baseclasses.py @@ -10,12 +10,15 @@ from rdkit.Chem import rdMolDescriptors from rdkit.Chem import rdFingerprintGenerator from rdkit.Chem import rdMHFPFingerprint -from rdkit.Avalon import pyAvalonTools -from rdkit.Chem.rdFingerprintGenerator import (GetMorganGenerator, GetMorganFeatureAtomInvGen, - GetTopologicalTorsionGenerator, - GetAtomPairGenerator, - GetRDKitFPGenerator) + +from rdkit.Chem.rdFingerprintGenerator import ( + GetMorganGenerator, + GetMorganFeatureAtomInvGen, + GetTopologicalTorsionGenerator, + GetAtomPairGenerator, + GetRDKitFPGenerator, +) import numpy as np import pandas as pd @@ -32,6 +35,7 @@ r"^(?P\w+)FingerprintTransformer$" ) + class FpsTransformer(ABC, BaseEstimator, TransformerMixin): def __init__( self, @@ -45,15 +49,20 @@ def __init__( self.safe_inference_mode = safe_inference_mode self.dtype = dtype - @property def nBits(self): - warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + warn( + "nBits will be replace by fpSize, due to changes harmonization!", + DeprecationWarning, + ) return self.fpSize @nBits.setter def nBits(self, nBits): - 
warn("nBits will be replace by fpSize, due to changes harmonization!", DeprecationWarning) + warn( + "nBits will be replace by fpSize, due to changes harmonization!", + DeprecationWarning, + ) self.fpSize = nBits def _get_column_prefix(self) -> str: @@ -193,554 +202,6 @@ def transform(self, X, y=None): return arr -class MACCSKeysFingerprintTransformer(FpsTransformer): - def __init__( - self, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - fpSize=167, - ): - """MACCS keys fingerprinter - calculates the 167 fixed MACCS keys - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - if fpSize != 167: - raise ValueError( - "fpSize can only be 167, matching the number of defined MACCS keys!" - ) - self._fpSize = fpSize - - @property - def fpSize(self): - return self._fpSize - - @fpSize.setter - def fpSize(self, fpSize): - if fpSize != 167: - raise ValueError( - "fpSize can only be 167, matching the number of defined MACCS keys!" - ) - self._fpSize = fpSize - - def _mol2fp(self, mol): - return rdMolDescriptors.GetMACCSKeysFingerprint(mol) - - -class MHFingerprintTransformer(FpsTransformer): - def __init__( - self, - radius: int = 3, - rings: bool = True, - isomeric: bool = False, - kekulize: bool = False, - min_radius: int = 1, - fpSize: int = 2048, - seed: int = 42, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int32, - ): - """Transforms the RDKit mol into the MinHash fingerprint (MHFP) - - https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8 - - Args: - radius (int, optional): The MHFP radius. Defaults to 3. - rings (bool, optional): Whether or not to include rings in the shingling. Defaults to True. - isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False. - kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. 
- min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - fpSize (int, optional): The number of permutations used for hashing. Defaults to 2048, - this is effectively the length of the FP - seed (int, optional): The value used to seed numpy.random. Defaults to 0. - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.radius = radius - self.rings = rings - self.isomeric = isomeric - self.kekulize = kekulize - self.min_radius = min_radius - # Set the .n_permutations and .seed without creating the encoder twice - self.fpSize = fpSize - self._seed = seed - # create the encoder instance - self._recreate_encoder() - - def __getstate__(self): - # Get the state of the parent class - state = super().__getstate__() - # Remove the unpicklable property from the state - state.pop("mhfp_encoder", None) # mhfp_encoder is not picklable - return state - - def __setstate__(self, state): - # Restore the state of the parent class - super().__setstate__(state) - # Re-create the unpicklable property - self._recreate_encoder() - - def _mol2fp(self, mol): - fp = self.mhfp_encoder.EncodeMol( - mol, self.radius, self.rings, self.isomeric, self.kekulize, self.min_radius - ) - return fp - - def _fp2array(self, fp): - return np.array(fp) - - def _recreate_encoder(self): - self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder( - self.fpSize, self._seed - ) - - @property - def seed(self): - return self._seed - - @seed.setter - def seed(self, seed): - self._seed = seed - # each time the seed parameter is modified refresh an instance of the encoder - self._recreate_encoder() - - @property - def n_permutations(self): - warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) - return self.fpSize - - @n_permutations.setter - def n_permutations(self, n_permutations): - warn("n_permutations will be replace by fpSize, due to changes harmonization!", DeprecationWarning) 
- self.fpSize = n_permutations - # each time the n_permutations parameter is modified refresh an instance of the encoder - self._recreate_encoder() - - -class SECFingerprintTransformer(FpsTransformer): - # https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8 - def __init__( - self, - radius: int = 3, - rings: bool = True, - isomeric: bool = False, - kekulize: bool = False, - min_radius: int = 1, - fpSize: int = 2048, - n_permutations: int = 0, - seed: int = 0, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Transforms the RDKit mol into the SMILES extended connectivity fingerprint (SECFP) - - Args: - radius (int, optional): The MHFP radius. Defaults to 3. - rings (bool, optional): Whether or not to include rings in the shingling. Defaults to True. - isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False. - kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False. - min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1. - fpSize (int, optional): The length of the folded fingerprint. Defaults to 2048. - n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0. - seed (int, optional): The value used to seed numpy.random. Defaults to 0. 
- """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.radius = radius - self.rings = rings - self.isomeric = isomeric - self.kekulize = kekulize - self.min_radius = min_radius - self.fpSize = fpSize - # Set the .n_permutations and seed without creating the encoder twice - self._n_permutations = n_permutations - self._seed = seed - # create the encoder instance - self._recreate_encoder() - - def __getstate__(self): - # Get the state of the parent class - state = super().__getstate__() - # Remove the unpicklable property from the state - state.pop("mhfp_encoder", None) # mhfp_encoder is not picklable - return state - - def __setstate__(self, state): - # Restore the state of the parent class - super().__setstate__(state) - # Re-create the unpicklable property - self._recreate_encoder() - - def _mol2fp(self, mol): - return self.mhfp_encoder.EncodeSECFPMol( - mol, - self.radius, - self.rings, - self.isomeric, - self.kekulize, - self.min_radius, - self.length, - ) - - def _recreate_encoder(self): - self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder( - self._n_permutations, self._seed - ) - - @property - def seed(self): - return self._seed - - @seed.setter - def seed(self, seed): - self._seed = seed - # each time the seed parameter is modified refresh an instace of the encoder - self._recreate_encoder() - - @property - def n_permutations(self): - return self._n_permutations - - @n_permutations.setter - def n_permutations(self, n_permutations): - self._n_permutations = n_permutations - # each time the n_permutations parameter is modified refresh an instace of the encoder - self._recreate_encoder() - - @property - def length(self): - warn("length will be replace by fpSize, due to changes harmonization!", DeprecationWarning) - return self.fpSize - - -class AvalonFingerprintTransformer(FpsTransformer): - # Fingerprint from the Avalon toolkeit, https://doi.org/10.1021/ci050413p - def __init__( - self, - fpSize: int = 512, 
- isQuery: bool = False, - resetVect: bool = False, - bitFlags: int = 15761407, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Transform RDKit mols into Count or bit-based Avalon Fingerprints - - Parameters - ---------- - fpSize : int, optional - Size of the fingerprint, by default 512 - isQuery : bool, optional - use the fingerprint for a query structure, by default False - resetVect : bool, optional - reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) - bitFlags : int, optional - Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 - useCounts : bool, optional - If toggled will create the count and not bit-based fingerprint, by default False - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.fpSize = fpSize - self.isQuery = isQuery - self.resetVect = resetVect - self.bitFlags = bitFlags - self.useCounts = useCounts - - def _mol2fp(self, mol): - if self.useCounts: - return pyAvalonTools.GetAvalonCountFP( - mol, - nBits=int(self.fpSize), - isQuery=bool(self.isQuery), - bitFlags=int(self.bitFlags), - ) - else: - return pyAvalonTools.GetAvalonFP( - mol, - nBits=int(self.fpSize), - isQuery=bool(self.isQuery), - resetVect=bool(self.resetVect), - bitFlags=int(self.bitFlags), - ) - - -class MorganFingerprintTransformer(FpsTransformer): - def __init__( - self, - fpSize=2048, - radius=2, - useChirality=False, - useBondTypes=True, - useFeatures=False, - useCounts=False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Transform RDKit mols into Count or bit-based hashed MorganFingerprints - - Parameters - ---------- - fpSize : int, optional - Size of the hashed fingerprint, by default 2048 - radius : int, optional - Radius of the fingerprint, by default 2 - useChirality : bool, optional - 
Include chirality in calculation of the fingerprint keys, by default False - useBondTypes : bool, optional - Include bondtypes in calculation of the fingerprint keys, by default True - useFeatures : bool, optional - use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False - useCounts : bool, optional - If toggled will create the count and not bit-based fingerprint, by default False - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.fpSize = fpSize - self.radius = radius - self.useChirality = useChirality - self.useBondTypes = useBondTypes - self.useFeatures = useFeatures - self.useCounts = useCounts - - warn("MorganFingerprintTransformer will be replace by MorganGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedMorganFingerprint( - mol, - int(self.radius), - nBits=int(self.fpSize), - useFeatures=bool(self.useFeatures), - useChirality=bool(self.useChirality), - useBondTypes=bool(self.useBondTypes), - ) - else: - return rdMolDescriptors.GetMorganFingerprintAsBitVect( - mol, - int(self.radius), - nBits=int(self.fpSize), - useFeatures=bool(self.useFeatures), - useChirality=bool(self.useChirality), - useBondTypes=bool(self.useBondTypes), - ) - - -class RDKitFingerprintTransformer(FpsTransformer): - def __init__( - self, - minPath: int = 1, - maxPath: int = 7, - useHs: bool = True, - branchedPaths: bool = True, - useBondOrder: bool = True, - countSimulation: bool = False, - countBounds=None, - fpSize: int = 2048, - numBitsPerFeature: int = 2, - atomInvariantsGenerator=None, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - """Calculates the RDKit fingerprints - - Parameters - ---------- - minPath : int, optional - the minimum path length (in bonds) to be included, by default 1 - maxPath : int, optional - 
the maximum path length (in bonds) to be included, by default 7 - useHs : bool, optional - toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True - branchedPaths : bool, optional - toggles generation of branched subgraphs, not just linear paths, by default True - useBondOrder : bool, optional - toggles inclusion of bond orders in the path hashes, by default True - countSimulation : bool, optional - if set, use count simulation while generating the fingerprint, by default False - countBounds : _type_, optional - boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None - fpSize : int, optional - size of the generated fingerprint, does not affect the sparse versions, by default 2048 - numBitsPerFeature : int, optional - the number of bits set per path/subgraph found, by default 2 - atomInvariantsGenerator : _type_, optional - atom invariants to be used during fingerprint generation, by default None - """ - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minPath = minPath - self.maxPath = maxPath - self.useHs = useHs - self.branchedPaths = branchedPaths - self.useBondOrder = useBondOrder - self.countSimulation = countSimulation - self.countBounds = countBounds - self.fpSize = fpSize - self.numBitsPerFeature = numBitsPerFeature - self.atomInvariantsGenerator = atomInvariantsGenerator - - warn("RDKitFingerprintTransformer will be replace by RDKitFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) - - - def _mol2fp(self, mol): - generator = rdFingerprintGenerator.GetRDKitFPGenerator( - minPath=int(self.minPath), - maxPath=int(self.maxPath), - useHs=bool(self.useHs), - branchedPaths=bool(self.branchedPaths), - useBondOrder=bool(self.useBondOrder), - countSimulation=bool(self.countSimulation), - countBounds=bool(self.countBounds), - fpSize=int(self.fpSize), - 
numBitsPerFeature=int(self.numBitsPerFeature), - atomInvariantsGenerator=self.atomInvariantsGenerator, - ) - return generator.GetFingerprint(mol) - - -class AtomPairFingerprintTransformer(FpsTransformer): - def __init__( - self, - minLength: int = 1, - maxLength: int = 30, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - nBitsPerEntry: int = 4, - includeChirality: bool = False, - use2D: bool = True, - confId: int = -1, - fpSize=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.minLength = minLength - self.maxLength = maxLength - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.includeChirality = includeChirality - self.use2D = use2D - self.confId = confId - self.fpSize = fpSize - self.nBitsPerEntry = nBitsPerEntry - self.useCounts = useCounts - - warn("AtomPairFingerprintTransformer will be replace by AtomPairFPGeneratorTransformer, due to changes in RDKit!", DeprecationWarning) - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedAtomPairFingerprint( - mol, - nBits=int(self.fpSize), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), - ) - else: - return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( - mol, - nBits=int(self.fpSize), - minLength=int(self.minLength), - maxLength=int(self.maxLength), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - nBitsPerEntry=int(self.nBitsPerEntry), - includeChirality=bool(self.includeChirality), - use2D=bool(self.use2D), - confId=int(self.confId), - ) - - -class 
TopologicalTorsionFingerprintTransformer(FpsTransformer): - def __init__( - self, - targetSize: int = 4, - fromAtoms=0, - ignoreAtoms=0, - atomInvariants=0, - includeChirality: bool = False, - nBitsPerEntry: int = 4, - fpSize=2048, - useCounts: bool = False, - parallel: Union[bool, int] = False, - safe_inference_mode: bool = False, - dtype: np.dtype = np.int8, - ): - super().__init__( - parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype - ) - self.targetSize = targetSize - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.includeChirality = includeChirality - self.nBitsPerEntry = nBitsPerEntry - self.fpSize = fpSize - self.useCounts = useCounts - - warn("TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!", DeprecationWarning) - - def _mol2fp(self, mol): - if self.useCounts: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint( - mol, - nBits=int(self.fpSize), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - ) - else: - return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( - mol, - nBits=int(self.fpSize), - targetSize=int(self.targetSize), - fromAtoms=self.fromAtoms, - ignoreAtoms=self.ignoreAtoms, - atomInvariants=self.atomInvariants, - includeChirality=bool(self.includeChirality), - nBitsPerEntry=int(self.nBitsPerEntry), - ) - - -def parallel_helper(args): - """Parallel_helper takes a tuple with classname, the objects parameters and the mols to process. - Then instantiates the class with the parameters and processes the mol. 
- Intention is to be able to do this in child processes as some classes can't be pickled""" - classname, parameters, X_mols = args - from scikit_mol import fingerprints - - transformer = getattr(fingerprints, classname)(**parameters) - return transformer._transform(X_mols) - - class FpsGeneratorTransformer(FpsTransformer): _regenerate_on_properties = () @@ -755,22 +216,29 @@ def __getstate__(self): state = super().__getstate__() state.update(self.get_params()) # Remove the unpicklable property from the state - state.pop("_fpgen", None) # fpgen is not picklable + state.pop("_fpgen", None) # fpgen is not picklable return state def __setstate__(self, state): # Restore the state of the parent class super().__setstate__(state) # Re-create the unpicklable property - generatort_keys = inspect.signature(self._generate_fp_generator).parameters.keys() - params = [setattr(self, k, state["_"+k]) if "_"+k in state else setattr(self, k, state[k]) for k in generatort_keys] + generatort_keys = inspect.signature( + self._generate_fp_generator + ).parameters.keys() + params = [ + setattr(self, k, state["_" + k]) + if "_" + k in state + else setattr(self, k, state[k]) + for k in generatort_keys + ] self._generate_fp_generator() def __setattr__(self, name: str, value): super().__setattr__(name, value) if ( - not hasattr(self, "_initializing") - and name in self._regenerate_on_properties + not hasattr(self, "_initializing") + and name in self._regenerate_on_properties ): self._generate_fp_generator() @@ -787,190 +255,12 @@ def _transform_mol(self, mol) -> np.array: raise NotImplementedError("_transform_mol not implemented") -class MorganFPGeneratorTransformer(FpsGeneratorTransformer): - _regenerate_on_properties = ("radius", "fpSize", "useChirality", "useFeatures", "useBondTypes") - - def __init__(self, fpSize=2048, radius=2, useChirality=False, - useBondTypes=True, useFeatures=False, useCounts=False, - parallel: Union[bool, int] = False, ): - """Transform RDKit mols into Count or 
bit-based hashed MorganFingerprints - - Parameters - ---------- - fpsize : int, optional - Size of the hashed fingerprint, by default 2048 - radius : int, optional - Radius of the fingerprint, by default 2 - useChirality : bool, optional - Include chirality in calculation of the fingerprint keys, by default False - useBondTypes : bool, optional - Include bondtypes in calculation of the fingerprint keys, by default True - useFeatures : bool, optional - use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False - useCounts : bool, optional - If toggled will create the count and not bit-based fingerprint, by default False - """ - - self._initializing = True - super().__init__(parallel = parallel) - self.fpSize = fpSize - self.radius = radius - self.useChirality = useChirality - self.useFeatures = useFeatures - self.useCounts = useCounts - self.useBondTypes = useBondTypes - - self._generate_fp_generator() - delattr(self, "_initializing") - - - def _generate_fp_generator(self): - - if self.useFeatures: - atomInvariantsGenerator = GetMorganFeatureAtomInvGen() - else: - atomInvariantsGenerator = None - - self._fpgen = GetMorganGenerator(radius=self.radius, - fpSize=self.fpSize, - includeChirality=self.useChirality, - useBondTypes=self.useBondTypes, - atomInvariantsGenerator=atomInvariantsGenerator, - ) - - def _transform_mol(self, mol) -> np.array: - if self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol) - else: - return self._fpgen.GetFingerprintAsNumPy(mol) - - -class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer): - _regenerate_on_properties = ("fpSize", "includeChirality", "targetSize") - - def __init__(self, targetSize:int = 4, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, confId=-1, - includeChirality:bool = False, fpSize:int=2048, - useCounts:bool=False, parallel: Union[bool, int] = False): - - self._initializing = True - super().__init__(parallel=parallel) - self.fpSize = 
fpSize - self.includeChirality = includeChirality - self.targetSize = targetSize - - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - self.confId = confId - self.useCounts = useCounts - - self._generate_fp_generator() - delattr(self, "_initializing") - - - def _generate_fp_generator(self): - self._fpgen = GetTopologicalTorsionGenerator(torsionAtomCount=self.targetSize, includeChirality=self.includeChirality, - fpSize=self.fpSize) - - def _transform_mol(self, mol) -> np.array: - if self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) - else: - return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) - - -class AtomPairFPGeneratorTransformer(FpsGeneratorTransformer): - _regenerate_on_properties = ("fpSize", "includeChirality", "use2D", "minLength", "maxLength") - - def __init__(self, minLength:int = 1, maxLength:int = 30, fromAtoms = None, ignoreAtoms = None, atomInvariants = None, - includeChirality:bool = False, use2D:bool = True, confId:int = -1, fpSize:int=2048, - useCounts:bool=False, parallel: Union[bool, int] = False,): - self._initializing = True - super().__init__(parallel = parallel) - self.fpSize = fpSize - self.use2D = use2D - self.includeChirality = includeChirality - self.minLength = minLength - self.maxLength = maxLength - - self.useCounts= useCounts - self.confId = confId - self.fromAtoms = fromAtoms - self.ignoreAtoms = ignoreAtoms - self.atomInvariants = atomInvariants - - self._generate_fp_generator() - delattr(self, "_initializing") - - def _generate_fp_generator(self): - self._fpgen = GetAtomPairGenerator(minDistance=self.minLength, maxDistance=self.maxLength, - includeChirality=self.includeChirality, - use2D=self.use2D, fpSize=self.fpSize) - - def _transform_mol(self, mol) -> np.array: - if 
self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) - else: - return self._fpgen.GetFingerprintAsNumPy(mol, fromAtoms=self.fromAtoms, ignoreAtoms=self.ignoreAtoms, customAtomInvariants=self.atomInvariants) - - -class RDKitFPGeneratorTransformer(FpsGeneratorTransformer): - _regenerate_on_properties = ("minPath", "maxPath", "useHs", "branchedPaths", "useBondOrder", "countSimulation", "fpSize", "countBounds", - "numBitsPerFeature") - - def __init__(self, minPath:int = 1, maxPath:int =7, useHs:bool = True, branchedPaths:bool = True, - useBondOrder:bool = True, countSimulation:bool = False, countBounds = None, - fpSize:int = 2048, numBitsPerFeature:int = 2, - useCounts:bool = False, parallel: Union[bool, int] = False - ): - """Calculates the RDKit fingerprints - - Parameters - ---------- - minPath : int, optional - the minimum path length (in bonds) to be included, by default 1 - maxPath : int, optional - the maximum path length (in bonds) to be included, by default 7 - useHs : bool, optional - toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True - branchedPaths : bool, optional - toggles generation of branched subgraphs, not just linear paths, by default True - useBondOrder : bool, optional - toggles inclusion of bond orders in the path hashes, by default True - countSimulation : bool, optional - if set, use count simulation while generating the fingerprint, by default False - countBounds : _type_, optional - boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None - fpSize : int, optional - size of the generated fingerprint, does not affect the sparse versions, by default 2048 - numBitsPerFeature : int, optional - the number of bits set per path/subgraph found, by default 2 - """ - self._initializing = True - super().__init__(parallel = 
parallel) - self.minPath = minPath - self.maxPath = maxPath - self.useHs = useHs - self.branchedPaths = branchedPaths - self.useBondOrder = useBondOrder - self.countSimulation = countSimulation - self.fpSize = fpSize - self.numBitsPerFeature = numBitsPerFeature - self.countBounds = countBounds - - self.useCounts = useCounts - - self._generate_fp_generator() - delattr(self, "_initializing") - - def _transform_mol(self, mol) -> np.array: - if self.useCounts: - return self._fpgen.GetCountFingerprintAsNumPy(mol) - else: - return self._fpgen.GetFingerprintAsNumPy(mol) +def parallel_helper(args): + """Parallel_helper takes a tuple with classname, the objects parameters and the mols to process. + Then instantiates the class with the parameters and processes the mol. + Intention is to be able to do this in child processes as some classes can't be pickled""" + classname, parameters, X_mols = args + from scikit_mol import fingerprints - def _generate_fp_generator(self): - self._fpgen = GetRDKitFPGenerator(minPath=self.minPath, maxPath=self.maxPath, useHs=self.useHs, - branchedPaths=self.branchedPaths,useBondOrder=self.useBondOrder, - countSimulation=self.countSimulation, fpSize=self.fpSize, - countBounds=self.countBounds, numBitsPerFeature=self.numBitsPerFeature) + transformer = getattr(fingerprints, classname)(**parameters) + return transformer._transform(X_mols)
diff --git a/scikit_mol/fingerprints/maccs.py b/scikit_mol/fingerprints/maccs.py
new file mode 100644
index 0000000..ca38966
--- /dev/null
+++ b/scikit_mol/fingerprints/maccs.py
@@ -0,0 +1,43 @@
+from typing import Union
+from rdkit.Chem import rdMolDescriptors
+import numpy as np
+
+from .baseclasses import FpsTransformer
+
+
+class MACCSKeysFingerprintTransformer(FpsTransformer):
+    def __init__(
+        self,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int8,
+        fpSize=167,
+    ):
+        """MACCS keys fingerprinter
+        calculates the 167 fixed MACCS keys
+
+        fpSize must be 167, the number of defined MACCS keys; any other value
+        raises ValueError.
+        """
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        # Assign through the property so the 167 check lives in one place
+        self.fpSize = fpSize
+
+    @property
+    def fpSize(self):
+        """Length of the fingerprint; the setter only accepts 167."""
+        return self._fpSize
+
+    @fpSize.setter
+    def fpSize(self, fpSize):
+        if fpSize != 167:
+            raise ValueError(
+                "fpSize can only be 167, matching the number of defined MACCS keys!"
+            )
+        self._fpSize = fpSize
+
+    def _mol2fp(self, mol):
+        """Return the RDKit MACCS keys fingerprint of mol."""
+        return rdMolDescriptors.GetMACCSKeysFingerprint(mol)
diff --git a/scikit_mol/fingerprints/minhash.py b/scikit_mol/fingerprints/minhash.py
new file mode 100644
index 0000000..1c7e62a
--- /dev/null
+++ b/scikit_mol/fingerprints/minhash.py
@@ -0,0 +1,219 @@
+from typing import Union
+
+import numpy as np
+
+from warnings import warn
+
+from .baseclasses import FpsTransformer
+
+from rdkit.Chem import rdMHFPFingerprint
+
+
+class MHFingerprintTransformer(FpsTransformer):
+    def __init__(
+        self,
+        radius: int = 3,
+        rings: bool = True,
+        isomeric: bool = False,
+        kekulize: bool = False,
+        min_radius: int = 1,
+        fpSize: int = 2048,
+        seed: int = 42,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int32,
+    ):
+        """Transforms the RDKit mol into the MinHash fingerprint (MHFP)
+
+        https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8
+
+        Args:
+            radius (int, optional): The MHFP radius. Defaults to 3.
+            rings (bool, optional): Whether or not to include rings in the shingling. Defaults to True.
+            isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False.
+            kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False.
+            min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1.
+            fpSize (int, optional): The number of permutations used for hashing. Defaults to 2048,
+                this is effectively the length of the FP
+            seed (int, optional): The value used to seed numpy.random. Defaults to 42.
+        """
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        self.radius = radius
+        self.rings = rings
+        self.isomeric = isomeric
+        self.kekulize = kekulize
+        self.min_radius = min_radius
+        # Set the backing fields directly so the encoder is created once, not once per setter
+        self._fpSize = fpSize
+        self._seed = seed
+        # create the encoder instance
+        self._recreate_encoder()
+
+    def __getstate__(self):
+        # Get the state of the parent class
+        state = super().__getstate__()
+        # Remove the unpicklable property from the state
+        state.pop("mhfp_encoder", None)  # mhfp_encoder is not picklable
+        return state
+
+    def __setstate__(self, state):
+        # A state saved while fpSize was a plain attribute keeps it under "fpSize"
+        if "_fpSize" not in state and "fpSize" in state:
+            state = dict(state)
+            state["_fpSize"] = state.pop("fpSize")
+        # Restore the state of the parent class
+        super().__setstate__(state)
+        # Re-create the unpicklable property
+        self._recreate_encoder()
+
+    def _mol2fp(self, mol):
+        fp = self.mhfp_encoder.EncodeMol(
+            mol, self.radius, self.rings, self.isomeric, self.kekulize, self.min_radius
+        )
+        return fp
+
+    def _fp2array(self, fp):
+        return np.array(fp)
+
+    def _recreate_encoder(self):
+        self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder(self.fpSize, self._seed)
+
+    @property
+    def fpSize(self):
+        return self._fpSize
+
+    @fpSize.setter
+    def fpSize(self, fpSize):
+        self._fpSize = fpSize
+        # the encoder is constructed with fpSize, so it has to be rebuilt when fpSize changes
+        self._recreate_encoder()
+
+    @property
+    def seed(self):
+        return self._seed
+
+    @seed.setter
+    def seed(self, seed):
+        self._seed = seed
+        # each time the seed parameter is modified refresh an instance of the encoder
+        self._recreate_encoder()
+
+    @property
+    def n_permutations(self):
+        warn(
+            "n_permutations will be replace by fpSize, due to changes harmonization!",
+            DeprecationWarning,
+        )
+        return self.fpSize
+
+    @n_permutations.setter
+    def n_permutations(self, n_permutations):
+        warn(
+            "n_permutations will be replace by fpSize, due to changes harmonization!",
+            DeprecationWarning,
+        )
+        # the fpSize setter refreshes the encoder instance
+        self.fpSize = n_permutations
+
+
+class SECFingerprintTransformer(FpsTransformer):
+    # https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0321-8
+    def __init__(
+        self,
+        radius: int = 3,
+        rings: bool = True,
+        isomeric: bool = False,
+        kekulize: bool = False,
+        min_radius: int = 1,
+        fpSize: int = 2048,
+        n_permutations: int = 0,
+        seed: int = 0,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int8,
+    ):
+        """Transforms the RDKit mol into the SMILES extended connectivity fingerprint (SECFP)
+
+        Args:
+            radius (int, optional): The MHFP radius. Defaults to 3.
+            rings (bool, optional): Whether or not to include rings in the shingling. Defaults to True.
+            isomeric (bool, optional): Whether the isomeric SMILES to be considered. Defaults to False.
+            kekulize (bool, optional): Whether or not to kekulize the extracted SMILES. Defaults to False.
+            min_radius (int, optional): The minimum radius that is used to extract n-gram. Defaults to 1.
+            fpSize (int, optional): The length of the folded fingerprint. Defaults to 2048.
+            n_permutations (int, optional): The number of permutations used for hashing. Defaults to 0.
+            seed (int, optional): The value used to seed numpy.random. Defaults to 0.
+        """
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        self.radius = radius
+        self.rings = rings
+        self.isomeric = isomeric
+        self.kekulize = kekulize
+        self.min_radius = min_radius
+        self.fpSize = fpSize
+        # Set the backing fields of n_permutations and seed without creating the encoder twice
+        self._n_permutations = n_permutations
+        self._seed = seed
+        # create the encoder instance
+        self._recreate_encoder()
+
+    def __getstate__(self):
+        # Get the state of the parent class
+        state = super().__getstate__()
+        # Remove the unpicklable property from the state
+        state.pop("mhfp_encoder", None)  # mhfp_encoder is not picklable
+        return state
+
+    def __setstate__(self, state):
+        # Restore the state of the parent class
+        super().__setstate__(state)
+        # Re-create the unpicklable property
+        self._recreate_encoder()
+
+    def _mol2fp(self, mol):
+        return self.mhfp_encoder.EncodeSECFPMol(
+            mol,
+            self.radius,
+            self.rings,
+            self.isomeric,
+            self.kekulize,
+            self.min_radius,
+            self.fpSize,  # not self.length: that deprecated alias warns on every call
+        )
+
+    def _recreate_encoder(self):
+        self.mhfp_encoder = rdMHFPFingerprint.MHFPEncoder(
+            self._n_permutations, self._seed
+        )
+
+    @property
+    def seed(self):
+        return self._seed
+
+    @seed.setter
+    def seed(self, seed):
+        self._seed = seed
+        # each time the seed parameter is modified refresh an instance of the encoder
+        self._recreate_encoder()
+
+    @property
+    def n_permutations(self):
+        return self._n_permutations
+
+    @n_permutations.setter
+    def n_permutations(self, n_permutations):
+        self._n_permutations = n_permutations
+        # each time the n_permutations parameter is modified refresh an instance of the encoder
+        self._recreate_encoder()
+
+    @property
+    def length(self):
+        warn(
+            "length will be replace by fpSize, due to changes harmonization!",
+            DeprecationWarning,
+        )
+        return self.fpSize
diff --git a/scikit_mol/fingerprints/morgan.py b/scikit_mol/fingerprints/morgan.py
new file mode 100644
index 0000000..37d7cf8
--- /dev/null
+++ 
b/scikit_mol/fingerprints/morgan.py
@@ -0,0 +1,153 @@
+from typing import Union
+
+from rdkit.Chem import rdMolDescriptors
+
+import numpy as np
+
+from warnings import warn
+
+from rdkit.Chem.rdFingerprintGenerator import (
+    GetMorganGenerator,
+    GetMorganFeatureAtomInvGen,
+)
+
+from .baseclasses import FpsTransformer, FpsGeneratorTransformer
+
+
+class MorganFingerprintTransformer(FpsTransformer):
+    def __init__(
+        self,
+        fpSize=2048,
+        radius=2,
+        useChirality=False,
+        useBondTypes=True,
+        useFeatures=False,
+        useCounts=False,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int8,
+    ):
+        """Transform RDKit mols into Count or bit-based hashed MorganFingerprints
+
+        Parameters
+        ----------
+        fpSize : int, optional
+            Size of the hashed fingerprint, by default 2048
+        radius : int, optional
+            Radius of the fingerprint, by default 2
+        useChirality : bool, optional
+            Include chirality in calculation of the fingerprint keys, by default False
+        useBondTypes : bool, optional
+            Include bondtypes in calculation of the fingerprint keys, by default True
+        useFeatures : bool, optional
+            use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False
+        useCounts : bool, optional
+            If toggled will create the count and not bit-based fingerprint, by default False
+        """
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        self.fpSize = fpSize
+        self.radius = radius
+        self.useChirality = useChirality
+        self.useBondTypes = useBondTypes
+        self.useFeatures = useFeatures
+        self.useCounts = useCounts
+
+        warn(
+            "MorganFingerprintTransformer will be replace by MorganFPGeneratorTransformer, due to changes in RDKit!",
+            DeprecationWarning,
+        )
+
+    def _mol2fp(self, mol):
+        """Return the hashed Morgan count fingerprint if useCounts, else the bit vector."""
+        if self.useCounts:
+            return rdMolDescriptors.GetHashedMorganFingerprint(
+                mol,
+                int(self.radius),
+                nBits=int(self.fpSize),
+                useFeatures=bool(self.useFeatures),
+                useChirality=bool(self.useChirality),
+                useBondTypes=bool(self.useBondTypes),
+            )
+        else:
+            return rdMolDescriptors.GetMorganFingerprintAsBitVect(
+                mol,
+                int(self.radius),
+                nBits=int(self.fpSize),
+                useFeatures=bool(self.useFeatures),
+                useChirality=bool(self.useChirality),
+                useBondTypes=bool(self.useBondTypes),
+            )
+
+
+class MorganFPGeneratorTransformer(FpsGeneratorTransformer):
+    _regenerate_on_properties = (
+        "radius",
+        "fpSize",
+        "useChirality",
+        "useFeatures",
+        "useBondTypes",
+    )
+
+    def __init__(
+        self,
+        fpSize=2048,
+        radius=2,
+        useChirality=False,
+        useBondTypes=True,
+        useFeatures=False,
+        useCounts=False,
+        parallel: Union[bool, int] = False,
+    ):
+        """Transform RDKit mols into Count or bit-based hashed MorganFingerprints
+
+        Parameters
+        ----------
+        fpSize : int, optional
+            Size of the hashed fingerprint, by default 2048
+        radius : int, optional
+            Radius of the fingerprint, by default 2
+        useChirality : bool, optional
+            Include chirality in calculation of the fingerprint keys, by default False
+        useBondTypes : bool, optional
+            Include bondtypes in calculation of the fingerprint keys, by default True
+        useFeatures : bool, optional
+            use chemical features, rather than atom-type in calculation of the fingerprint keys, by default False
+        useCounts : bool, optional
+            If toggled will create the count and not bit-based fingerprint, by default False
+        """
+
+        self._initializing = True
+        super().__init__(parallel=parallel)
+        self.fpSize = fpSize
+        self.radius = radius
+        self.useChirality = useChirality
+        self.useFeatures = useFeatures
+        self.useCounts = useCounts
+        self.useBondTypes = useBondTypes
+
+        self._generate_fp_generator()
+        delattr(self, "_initializing")
+
+    def _generate_fp_generator(self):
+        """Build self._fpgen from the current parameter values."""
+        if self.useFeatures:
+            atomInvariantsGenerator = GetMorganFeatureAtomInvGen()
+        else:
+            atomInvariantsGenerator = None
+
+        self._fpgen = GetMorganGenerator(
+            radius=self.radius,
+            fpSize=self.fpSize,
+            includeChirality=self.useChirality,
+            useBondTypes=self.useBondTypes,
+            atomInvariantsGenerator=atomInvariantsGenerator,
+        )
+
+    def _transform_mol(self, mol) -> np.ndarray:
+        """Return the count (useCounts) or bit fingerprint of mol as a NumPy array."""
+        if self.useCounts:
+            return self._fpgen.GetCountFingerprintAsNumPy(mol)
+        else:
+            return self._fpgen.GetFingerprintAsNumPy(mol)
diff --git a/scikit_mol/fingerprints/rdkitfp.py b/scikit_mol/fingerprints/rdkitfp.py
new file mode 100644
index 0000000..28ce0a8
--- /dev/null
+++ b/scikit_mol/fingerprints/rdkitfp.py
@@ -0,0 +1,179 @@
+from typing import Union
+
+import numpy as np
+
+from warnings import warn
+
+from .baseclasses import FpsTransformer, FpsGeneratorTransformer
+
+from rdkit.Chem.rdFingerprintGenerator import GetRDKitFPGenerator
+
+from rdkit.Chem import rdFingerprintGenerator
+
+
+class RDKitFingerprintTransformer(FpsTransformer):
+    def __init__(
+        self,
+        minPath: int = 1,
+        maxPath: int = 7,
+        useHs: bool = True,
+        branchedPaths: bool = True,
+        useBondOrder: bool = True,
+        countSimulation: bool = False,
+        countBounds=None,
+        fpSize: int = 2048,
+        numBitsPerFeature: int = 2,
+        atomInvariantsGenerator=None,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int8,
+    ):
+        """Calculates the RDKit fingerprints
+
+        Parameters
+        ----------
+        minPath : int, optional
+            the minimum path length (in bonds) to be included, by default 1
+        maxPath : int, optional
+            the maximum path length (in bonds) to be included, by default 7
+        useHs : bool, optional
+            toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True
+        branchedPaths : bool, optional
+            toggles generation of branched subgraphs, not just linear paths, by default True
+        useBondOrder : bool, optional
+            toggles inclusion of bond orders in the path hashes, by default True
+        countSimulation : bool, optional
+            if set, use count simulation while generating the fingerprint, by default False
+        countBounds : _type_, optional
+            boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None
+        fpSize : int, optional
+            size of the generated fingerprint, does not affect the sparse versions, by default 2048
+        numBitsPerFeature : int, optional
+            the number of bits set per path/subgraph found, by default 2
+        atomInvariantsGenerator : _type_, optional
+            atom invariants to be used during fingerprint generation, by default None
+        """
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        self.minPath = minPath
+        self.maxPath = maxPath
+        self.useHs = useHs
+        self.branchedPaths = branchedPaths
+        self.useBondOrder = useBondOrder
+        self.countSimulation = countSimulation
+        self.countBounds = countBounds
+        self.fpSize = fpSize
+        self.numBitsPerFeature = numBitsPerFeature
+        self.atomInvariantsGenerator = atomInvariantsGenerator
+
+        warn(
+            "RDKitFingerprintTransformer will be replace by RDKitFPGeneratorTransformer, due to changes in RDKit!",
+            DeprecationWarning,
+        )
+
+    def _mol2fp(self, mol):
+        """Build a generator from the current parameters and fingerprint mol."""
+        generator = rdFingerprintGenerator.GetRDKitFPGenerator(
+            minPath=int(self.minPath),
+            maxPath=int(self.maxPath),
+            useHs=bool(self.useHs),
+            branchedPaths=bool(self.branchedPaths),
+            useBondOrder=bool(self.useBondOrder),
+            countSimulation=bool(self.countSimulation),
+            # pass the bounds through unchanged; bool() would collapse a list of bounds to True
+            countBounds=self.countBounds,
+            fpSize=int(self.fpSize),
+            numBitsPerFeature=int(self.numBitsPerFeature),
+            atomInvariantsGenerator=self.atomInvariantsGenerator,
+        )
+        return generator.GetFingerprint(mol)
+
+
+class RDKitFPGeneratorTransformer(FpsGeneratorTransformer):
+    _regenerate_on_properties = (
+        "minPath",
+        "maxPath",
+        "useHs",
+        "branchedPaths",
+        "useBondOrder",
+        "countSimulation",
+        "fpSize",
+        "countBounds",
+        "numBitsPerFeature",
+    )
+
+    def __init__(
+        self,
+        minPath: int = 1,
+        maxPath: int = 7,
+        useHs: bool = True,
+        branchedPaths: bool = True,
+        useBondOrder: bool = True,
+        countSimulation: bool = False,
+        countBounds=None,
+        fpSize: int = 2048,
+        numBitsPerFeature: int = 2,
+        useCounts: bool = False,
+        parallel: Union[bool, int] = False,
+    ):
+        """Calculates the RDKit fingerprints
+
+        Parameters
+        ----------
+        minPath : int, optional
+            the minimum path length (in bonds) to be included, by default 1
+        maxPath : int, optional
+            the maximum path length (in bonds) to be included, by default 7
+        useHs : bool, optional
+            toggles inclusion of Hs in paths (if the molecule has explicit Hs), by default True
+        branchedPaths : bool, optional
+            toggles generation of branched subgraphs, not just linear paths, by default True
+        useBondOrder : bool, optional
+            toggles inclusion of bond orders in the path hashes, by default True
+        countSimulation : bool, optional
+            if set, use count simulation while generating the fingerprint, by default False
+        countBounds : _type_, optional
+            boundaries for count simulation, corresponding bit will be set if the count is higher than the number provided for that spot, by default None
+        fpSize : int, optional
+            size of the generated fingerprint, does not affect the sparse versions, by default 2048
+        numBitsPerFeature : int, optional
+            the number of bits set per path/subgraph found, by default 2
+        """
+        self._initializing = True
+        super().__init__(parallel=parallel)
+        self.minPath = minPath
+        self.maxPath = maxPath
+        self.useHs = useHs
+        self.branchedPaths = branchedPaths
+        self.useBondOrder = useBondOrder
+        self.countSimulation = countSimulation
+        self.fpSize = fpSize
+        self.numBitsPerFeature = numBitsPerFeature
+        self.countBounds = countBounds
+
+        self.useCounts = useCounts
+
+        self._generate_fp_generator()
+        delattr(self, "_initializing")
+
+    def _transform_mol(self, mol) -> np.ndarray:
+        """Return the count (useCounts) or bit fingerprint of mol as a NumPy array."""
+        if self.useCounts:
+            return self._fpgen.GetCountFingerprintAsNumPy(mol)
+        else:
+            return self._fpgen.GetFingerprintAsNumPy(mol)
+
+    def _generate_fp_generator(self):
+        """Build self._fpgen from the current parameter values."""
+        self._fpgen = GetRDKitFPGenerator(
+            minPath=self.minPath,
+            maxPath=self.maxPath,
+            useHs=self.useHs,
+            branchedPaths=self.branchedPaths,
+            useBondOrder=self.useBondOrder,
+            countSimulation=self.countSimulation,
+            fpSize=self.fpSize,
+            countBounds=self.countBounds,
+            numBitsPerFeature=self.numBitsPerFeature,
+        )
diff --git a/scikit_mol/fingerprints/topologicaltorsion.py b/scikit_mol/fingerprints/topologicaltorsion.py
new file mode 100644
index 0000000..0b6640d
--- /dev/null
+++ b/scikit_mol/fingerprints/topologicaltorsion.py
@@ -0,0 +1,125 @@
+from typing import Union
+
+import numpy as np
+
+from warnings import warn
+
+from .baseclasses import FpsTransformer, FpsGeneratorTransformer
+
+from rdkit.Chem import rdMolDescriptors
+from rdkit.Chem.rdFingerprintGenerator import GetTopologicalTorsionGenerator
+
+
+class TopologicalTorsionFingerprintTransformer(FpsTransformer):
+    def __init__(
+        self,
+        targetSize: int = 4,
+        fromAtoms=0,
+        ignoreAtoms=0,
+        atomInvariants=0,
+        includeChirality: bool = False,
+        nBitsPerEntry: int = 4,
+        fpSize=2048,
+        useCounts: bool = False,
+        parallel: Union[bool, int] = False,
+        safe_inference_mode: bool = False,
+        dtype: np.dtype = np.int8,
+    ):
+        super().__init__(
+            parallel=parallel, safe_inference_mode=safe_inference_mode, dtype=dtype
+        )
+        self.targetSize = targetSize
+        self.fromAtoms = fromAtoms
+        self.ignoreAtoms = ignoreAtoms
+        self.atomInvariants = atomInvariants
+        self.includeChirality = includeChirality
+        self.nBitsPerEntry = nBitsPerEntry
+        self.fpSize = fpSize
+        self.useCounts = useCounts
+
+        warn(
+            "TopologicalTorsionFingerprintTransformer will be replace by TopologicalTorsionFPGeneatorTransformer, due to changes in RDKit!",
+            DeprecationWarning,
+        )
+
+    def _mol2fp(self, mol):
+        """Return the hashed torsion count fingerprint if useCounts, else the bit vector."""
+        if self.useCounts:
+            return rdMolDescriptors.GetHashedTopologicalTorsionFingerprint(
+                mol,
+                nBits=int(self.fpSize),
+                targetSize=int(self.targetSize),
+                fromAtoms=self.fromAtoms,
+                ignoreAtoms=self.ignoreAtoms,
+                atomInvariants=self.atomInvariants,
+                includeChirality=bool(self.includeChirality),
+            )
+        else:
+            return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
+                mol,
+                nBits=int(self.fpSize),
+                targetSize=int(self.targetSize),
+                fromAtoms=self.fromAtoms,
+                ignoreAtoms=self.ignoreAtoms,
+                atomInvariants=self.atomInvariants,
+                includeChirality=bool(self.includeChirality),
+                nBitsPerEntry=int(self.nBitsPerEntry),
+            )
+
+
+# NOTE: "Geneator" is a typo for "Generator"; kept because the package __init__ imports this exact name
+class TopologicalTorsionFPGeneatorTransformer(FpsGeneratorTransformer):
+    _regenerate_on_properties = ("fpSize", "includeChirality", "targetSize")
+
+    def __init__(
+        self,
+        targetSize: int = 4,
+        fromAtoms=None,
+        ignoreAtoms=None,
+        atomInvariants=None,
+        confId=-1,
+        includeChirality: bool = False,
+        fpSize: int = 2048,
+        useCounts: bool = False,
+        parallel: Union[bool, int] = False,
+    ):
+        self._initializing = True
+        super().__init__(parallel=parallel)
+        self.fpSize = fpSize
+        self.includeChirality = includeChirality
+        self.targetSize = targetSize
+
+        self.fromAtoms = fromAtoms
+        self.ignoreAtoms = ignoreAtoms
+        self.atomInvariants = atomInvariants
+        # confId is stored but not passed to the generator or the fingerprint calls in this class
+        self.confId = confId
+        self.useCounts = useCounts
+
+        self._generate_fp_generator()
+        delattr(self, "_initializing")
+
+    def _generate_fp_generator(self):
+        """Build self._fpgen from the current parameter values."""
+        self._fpgen = GetTopologicalTorsionGenerator(
+            torsionAtomCount=self.targetSize,
+            includeChirality=self.includeChirality,
+            fpSize=self.fpSize,
+        )
+
+    def _transform_mol(self, mol) -> np.ndarray:
+        """Return the count (useCounts) or bit fingerprint of mol as a NumPy array."""
+        if self.useCounts:
+            return self._fpgen.GetCountFingerprintAsNumPy(
+                mol,
+                fromAtoms=self.fromAtoms,
+                ignoreAtoms=self.ignoreAtoms,
+                customAtomInvariants=self.atomInvariants,
+            )
+        else:
+            return self._fpgen.GetFingerprintAsNumPy(
+                mol,
+                fromAtoms=self.fromAtoms,
+                ignoreAtoms=self.ignoreAtoms,
+                customAtomInvariants=self.atomInvariants,
+            )