From 40821b985469afc548ed8530954095b1f5e4643f Mon Sep 17 00:00:00 2001 From: Ya Chen Date: Tue, 2 May 2023 19:21:19 +0200 Subject: [PATCH 1/2] avalonFP --- scikit_mol/fingerprints.py | 39 +++++++++++++++++++++++++++++++++++ tests/test_fptransformers.py | 33 ++++++++++++++++++----------- tests/test_parameter_types.py | 6 +++--- tests/test_transformers.py | 5 +++-- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index 719e236..65f0256 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -8,6 +8,7 @@ from rdkit.Chem import rdMolDescriptors from rdkit.Chem import rdFingerprintGenerator from rdkit.Chem import rdMHFPFingerprint +from rdkit.Avalon import pyAvalonTools import numpy as np import pandas as pd @@ -448,7 +449,45 @@ def _mol2fp(self, mol): useChirality=bool(self.useChirality), useBondTypes=bool(self.useBondTypes) ) +class AvalonFingerprintTransformer(FpsTransformer): + # Fingerprint from the Avalon toolkit, https://doi.org/10.1021/ci050413p + def __init__(self, nBits:int = 512, isQuery:bool = False, resetVect:bool = False, bitFlags:int = 15761407, useCounts:bool = False, parallel: Union[bool, int] = False,): + """ Transform RDKit mols into Count or bit-based Avalon Fingerprints + Parameters + ---------- + nBits : int, optional + Size of the fingerprint, by default 512 + isQuery : bool, optional + use the fingerprint for a query structure, by default False + resetVect : bool, optional + reset vector, by default False #TODO: only used in GetAvalonFP (not for GetAvalonCountFP) and seems doesn't make difference + bitFlags : int, optional + Substructure fingerprint (32767) or similarity fingerpirnt (15761407) by default 15761407 #TODO: don't understand this parameter, it can also be other integers and result in different fingerprints. 
+ useCounts : bool, optional + If toggled will create the count and not bit-based fingerprint, by default False + """ + super().__init__(parallel = parallel) + self.nBits = nBits + self.isQuery = isQuery + self.resetVect = resetVect + self.bitFlags = bitFlags + self.useCounts = useCounts + + def _mol2fp(self, mol): + if self.useCounts: + return pyAvalonTools.GetAvalonCountFP(mol, + nBits=int(self.nBits), + isQuery=bool(self.isQuery), + bitFlags=int(self.bitFlags) + ) + else: + return pyAvalonTools.GetAvalonFP(mol, + nBits=int(self.nBits), + isQuery=bool(self.isQuery), + resetVect=bool(self.resetVect), + bitFlags=int(self.bitFlags) + ) def parallel_helper(args): diff --git a/tests/test_fptransformers.py b/tests/test_fptransformers.py index 1975e86..90000b5 100644 --- a/tests/test_fptransformers.py +++ b/tests/test_fptransformers.py @@ -7,7 +7,7 @@ from fixtures import mols_list, smiles_list, fingerprint, chiral_smiles_list, chiral_mols_list from sklearn import clone -from scikit_mol.fingerprints import MorganFingerprintTransformer, MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, TopologicalTorsionFingerprintTransformer, SECFingerprintTransformer, MHFingerprintTransformer +from scikit_mol.fingerprints import MorganFingerprintTransformer, MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, TopologicalTorsionFingerprintTransformer, SECFingerprintTransformer, MHFingerprintTransformer, AvalonFingerprintTransformer @@ -39,6 +39,9 @@ def secfp_transformer(): def mhfp_transformer(): return MHFingerprintTransformer() +@pytest.fixture +def avalon_transformer(): + return AvalonFingerprintTransformer() def test_fpstransformer_fp2array(morgan_transformer, fingerprint): fp = morgan_transformer._fp2array(fingerprint) @@ -54,8 +57,8 @@ def test_fpstransformer_transform_mol(morgan_transformer, mols_list): assert(fp.shape == (2048,)) assert(fp.sum() == 14) -def test_clonability(maccs_transformer, 
morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer): - for t in [maccs_transformer, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer]: +def test_clonability(maccs_transformer, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer, avalon_transformer): + for t in [maccs_transformer, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer, avalon_transformer]: params = t.get_params() t2 = clone(t) params_2 = t2.get_params() @@ -64,8 +67,8 @@ def test_clonability(maccs_transformer, morgan_transformer, rdkit_transformer, a #Cloned transformers should not be the same object assert t2 != t -def test_set_params(morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer): - for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer]: +def test_set_params(morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, secfp_transformer, mhfp_transformer, avalon_transformer): + for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, avalon_transformer]: params = t.get_params() #change extracted dictionary params['nBits'] = 4242 @@ -96,11 +99,11 @@ def test_set_params(morgan_transformer, rdkit_transformer, atompair_transformer, params_2 = t.get_params() assert all([ params[key] == params_2[key] for key in params.keys()]) -def test_transform(mols_list, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, secfp_transformer, mhfp_transformer): +def test_transform(mols_list, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, 
maccs_transformer, secfp_transformer, mhfp_transformer, avalon_transformer): #Test different types of input for mols in [mols_list, np.array(mols_list), pd.Series(mols_list)]: #Test the different transformers - for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer, mhfp_transformer]: + for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer, mhfp_transformer, avalon_transformer]: params = t.get_params() fps = t.transform(mols) #Assert that the same length of input and output @@ -116,11 +119,11 @@ def test_transform(mols_list, morgan_transformer, rdkit_transformer, atompair_tr assert len(fps[0]) == fpsize -def test_transform_parallel(mols_list, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, secfp_transformer, mhfp_transformer): +def test_transform_parallel(mols_list, morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, secfp_transformer, mhfp_transformer, avalon_transformer): #Test different types of input for mols in [mols_list, np.array(mols_list), pd.Series(mols_list)]: #Test the different transformers - for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer, mhfp_transformer]: + for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer, mhfp_transformer, avalon_transformer]: t.set_params(parallel=True) params = t.get_params() fps = t.transform(mols) @@ -138,9 +141,9 @@ def test_transform_parallel(mols_list, morgan_transformer, rdkit_transformer, at assert len(fps[0]) == fpsize -def test_picklable(morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, 
secfp_transformer): +def test_picklable(morgan_transformer, rdkit_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, secfp_transformer, avalon_transformer): #Test the different transformers - for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer]: + for t in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer, maccs_transformer, rdkit_transformer, secfp_transformer, avalon_transformer]: with tempfile.NamedTemporaryFile() as f: pickle.dump(t, f) f.seek(0) @@ -248,4 +251,10 @@ def test_MHFingerprintTransformer(chiral_mols_list): } assert_transformer_set_params(MHFingerprintTransformer, new_params, chiral_mols_list) - +def test_AvalonFingerprintTransformer(chiral_mols_list): + new_params = {'nBits': 1024, + 'isQuery': True, + # 'resetVect': True, #TODO: this doesn't change the FP + 'bitFlags': 32767 + } + assert_transformer_set_params(AvalonFingerprintTransformer, new_params, chiral_mols_list) diff --git a/tests/test_parameter_types.py b/tests/test_parameter_types.py index 6867dbb..f175c87 100644 --- a/tests/test_parameter_types.py +++ b/tests/test_parameter_types.py @@ -2,11 +2,11 @@ import numpy as np from rdkit import Chem from fixtures import mols_list, smiles_list -from test_fptransformers import morgan_transformer, atompair_transformer, topologicaltorsion_transformer, rdkit_transformer +from test_fptransformers import morgan_transformer, atompair_transformer, topologicaltorsion_transformer, rdkit_transformer, avalon_transformer -def test_Transformer_exotic_types(mols_list, morgan_transformer,atompair_transformer, topologicaltorsion_transformer): - for transformer in [morgan_transformer, atompair_transformer, topologicaltorsion_transformer]: +def test_Transformer_exotic_types(mols_list, morgan_transformer,atompair_transformer, topologicaltorsion_transformer, avalon_transformer): + for transformer in [morgan_transformer, 
atompair_transformer, topologicaltorsion_transformer, avalon_transformer]: params = transformer.get_params() for useCounts in [np.bool_(True), np.bool_(False)]: diff --git a/tests/test_transformers.py b/tests/test_transformers.py index 17e0a94..a45acb1 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -13,7 +13,7 @@ from scikit_mol.conversions import SmilesToMolTransformer from scikit_mol.fingerprints import MACCSKeysFingerprintTransformer, RDKitFingerprintTransformer, AtomPairFingerprintTransformer, \ TopologicalTorsionFingerprintTransformer, MorganFingerprintTransformer, SECFingerprintTransformer, \ - MHFingerprintTransformer + MHFingerprintTransformer, AvalonFingerprintTransformer from fixtures import SLC6A4_subset @@ -34,7 +34,8 @@ def test_transformer(SLC6A4_subset): "MorganTransformer": [MorganFingerprintTransformer, False], "MorganTransformer useCounts": [MorganFingerprintTransformer, True], "SECFingerprintTransformer": [SECFingerprintTransformer, None], - "MHFingerprintTransformer": [MHFingerprintTransformer, None]} + "MHFingerprintTransformer": [MHFingerprintTransformer, None], + 'AvalonFingerprintTransformer': [AvalonFingerprintTransformer, None]} # fit on toy data and print train/test score if successful or collect the failed FP failed_FP = [] From 0151a9151e2dc642f8c64f9629d98f387adee093 Mon Sep 17 00:00:00 2001 From: EBjerrum Date: Sat, 6 May 2023 07:24:40 +0200 Subject: [PATCH 2/2] Updated fingerprints docstring and README --- README.md | 25 +++++++++++++++++-------- scikit_mol/fingerprints.py | 4 ++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8b80801..ba30f26 100644 --- a/README.md +++ b/README.md @@ -20,20 +20,29 @@ The first draft for the project was created at the [RDKIT UGM 2022 hackathon](ht ## Implemented -* Transformer Classes - * SmilesToMol - * Desc2DTransformer - * MACCSTransformer - * RDKitFPTransformer +* descriptors + * MolecularDescriptorTransformer +
+* fingerprints + * MorganFingerprintTransformer + * MACCSKeysFingerprintTransformer + * RDKitFingerprintTransformer * AtomPairFingerprintTransformer * TopologicalTorsionFingerprintTransformer - * MorganTransformer + * MHFingerprintTransformer * SECFingerprintTransformer + * AvalonFingerprintTransformer
- -* Utilities +* conversions + * SmilesToMolTransformer +
+* standardizer + * Standardizer +
+* utilities * CheckSmilesSanitazion + ## Installation Users can install latest tagged release from pip diff --git a/scikit_mol/fingerprints.py b/scikit_mol/fingerprints.py index 65f0256..81bc43b 100644 --- a/scikit_mol/fingerprints.py +++ b/scikit_mol/fingerprints.py @@ -461,9 +461,9 @@ def __init__(self, nBits:int = 512, isQuery:bool = False, resetVect:bool = False isQuery : bool, optional use the fingerprint for a query structure, by default False resetVect : bool, optional - reset vector, by default False #TODO: only used in GetAvalonFP (not for GetAvalonCountFP) and seems doesn't make difference + reset vector, by default False NB: only used in GetAvalonFP (not for GetAvalonCountFP) bitFlags : int, optional - Substructure fingerprint (32767) or similarity fingerpirnt (15761407) by default 15761407 #TODO: don't understand this parameter, it can also be other integers and result in different fingerprints. + Substructure fingerprint (32767) or similarity fingerprint (15761407) by default 15761407 useCounts : bool, optional If toggled will create the count and not bit-based fingerprint, by default False """