From ebb19e2e9684516c25777d82f96db16e4c31ee6e Mon Sep 17 00:00:00 2001 From: EBjerrum Date: Sun, 16 Oct 2022 20:45:00 +0200 Subject: [PATCH] Updated README and cleaned up some files --- README.md | 84 +++++++++++++++++++++++++++------------ notebooks/sandbox.py | 29 +------------- scikit_mol/smilestomol.py | 27 ------------- standardizer.py | 32 --------------- 4 files changed, 60 insertions(+), 112 deletions(-) delete mode 100644 scikit_mol/smilestomol.py delete mode 100644 standardizer.py diff --git a/README.md b/README.md index faa4577..001608d 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,74 @@ # scikit-mol -scikit-learn classes for molecular vectorization using RDKit +Scikit-Learn classes for molecular vectorization using RDKit -TODO: - Expand number of fingerprint classes and featurizers - AtomPairs - TopologicalTorsions - RDKit - Descriptors - LINGOS - ... +The intended usage is to be able to add molecular vectorization directly into scikit-learn pipelines, so that the final model directly predicts on RDKit molecules or SMILES strings - Make dictionary based FP class - No Hashing, .fit() learns the keys of the dataset +As an example with the needed scikit-learn and -mol imports and RDKit mol objects in the mol_list_train and _test lists: - Make a basic standardarizer transformer class + pipe = Pipeline([('mol_transformer', MorganTransformer()), ('Regressor', Ridge())]) + pipe.fit(mol_list_train, y_train) + pipe.score(mol_list_test, y_test) + pipe.predict([Chem.MolFromSmiles('c1ccccc1C(=O)C')]) - Make a SMILES to Mol transformer class + >>> array([4.93858815]) -Make Notebook with examples - Standalone usage - Inclusion in pipeline - Can transformers be used in parallel (e.g. to use both FP features and Descriptors at the same time?) - Hyperparameter optimization via native Scikit-Classes - Hyperparameter optimization via external optimizer e.g. 
https://scikit-optimize.github.io/stable/ +The scikit-learn compatibility should also make it easier to include the fingerprinting step in hyperparameter tuning with scikit-learn's utilities +The first draft for the project was created at the [RDKIT UGM 2022 hackathon](https://github.com/rdkit/UGM_2022) 2022-October-14 -Make basic unit-tests +## Implemented +* Transformer Classes + * SmilesToMol + * Desc2DTransformer + * MACCSTransformer + * RDKitFPTransformer + * AtomPairFingerprintTransformer + * TopologicalTorsionFingerprintTransformer + * MorganTransformer +
+
+* Utilities + * CheckSmilesSanitazion + +## Installation +Users can install the latest tagged release from pip + + pip install scikit-mol + +Bleeding edge + + pip install git+https://github.com/EBjerrum/scikit-mol.git + +Developers -Installation git clone git@github.com:EBjerrum/scikit-mol.git pip install -e . +## Documentation +None yet, but there are some # %% delimited examples in the notebooks directory that have some demonstrations + +## BUGS +Probably still + + +## TODO +* Make standardizer less 'chatty' +* Unit test coverage of classes +* Make further example notebooks + * Standalone usage (not in pipeline) + * Advanced pipelining + * Hyperparameter optimization via external optimizer e.g. https://scikit-optimize.github.io/stable/ + +## Ideas +* LINGOS transformer -Contributers: - Esben Bjerrum, esben@cheminformania.com - Son Ha, sonha@uni-mainz.de - Oh-hyeon Choung, ohhyeon.choung@gmail.com - Please add yourself here, we'll properly markdown it later +## Contributors: +* Esben Bjerrum, esben@cheminformania.com +* Carmen Esposito https://github.com/cespos +* Son Ha, sonha@uni-mainz.de +* Oh-hyeon Choung, ohhyeon.choung@gmail.com +* Andreas Poehlmann, https://github.com/ap-- +* Ya Chen, https://github.com/anya-chen diff --git a/notebooks/sandbox.py b/notebooks/sandbox.py index a76f3e8..b677552 100644 --- a/notebooks/sandbox.py +++ b/notebooks/sandbox.py @@ -6,7 +6,7 @@ #%% -from scikit_mol.smilestomol import SmilesToMol +from scikit_mol.transformers import SmilesToMol smiles_list = ['c1ccccc1'] * 10 smilestomol = SmilesToMol() mols = smilestomol.fit_transform(smiles_list) @@ -14,7 +14,7 @@ #%% -from scikit_mol.smilestomol import SmilesToMol +from scikit_mol.transformers import SmilesToMol smiles_list = ['c1ccccc1'] * 10 y = list(range(10)) y.append(1000) @@ -42,31 +42,6 @@ mols[0] - -#%% -y_out = [] -X_out = [] -y_error = [] -X_error = [] - -for smiles, y_value in zip(smiles_list, y): - mol = Chem.MolFromSmiles(smiles) - if mol: - X_out.append(mol) - 
y_out.append(y_value) - else: - print(f'Logging: Error in parsing {smiles}') - X_error.append(smiles) - y_error.append(y_value) - -print(X_out) -print(y_out) -print(X_error) -print(y_error) - - - - #%% X= [Chem.MolFromSmiles('c1ccccc1')]*10 t = MorganTransformer(useCounts=True) diff --git a/scikit_mol/smilestomol.py b/scikit_mol/smilestomol.py deleted file mode 100644 index 94c4741..0000000 --- a/scikit_mol/smilestomol.py +++ /dev/null @@ -1,27 +0,0 @@ -from rdkit import Chem -from sklearn.base import BaseEstimator, TransformerMixin - - -class SmilesToMol(BaseEstimator, TransformerMixin): - def __init__(self): - pass - - def fit(self, X=None, y=None): - #Nothing to do here - return self - - def transform(self, X_smiles_list): - # Unfortunately, transform is only X to X in Scikit-learn, so can't filter at this level - # TODO: Return same type as put in (e.g. List to list, numpy to numpy, pandas Series to pandas series) - X_out = [] - - for smiles in X_smiles_list: - mol = Chem.MolFromSmiles(smiles) - if mol: - X_out.append(mol) - else: - raise ValueError(f'Issue with parsing SMILES {smiles}\nYou probably should use the scikit-mol.sanitizer.CheckSmilesSanitazion first') - - return X_out - - \ No newline at end of file diff --git a/standardizer.py b/standardizer.py deleted file mode 100644 index 038fe71..0000000 --- a/standardizer.py +++ /dev/null @@ -1,32 +0,0 @@ -#TODO Implement a scikit-learn compatible molecule standardizer -# Author: Son Ha - -from rdkit import Chem -from sklearn.base import BaseEstimator, TransformerMixin -from rdkit.Chem.MolStandardize import rdMolStandardize - -class Standardizer(BaseEstimator, TransformerMixin): - """ Input a list of rdkit mols, output the same list but standardised - """ - def __init__(self, neutralize=True): - self.neutralize = neutralize - None - - def transform(self, X): - arr = [] - for mol in X: - # Normalizing functional groups - # https://molvs.readthedocs.io/en/latest/guide/standardize.html - clean_mol = 
rdMolStandardize.Cleanup(mol) - # Get parents fragments - parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol) - # Neutralise - if self.neutralize: - uncharger = rdMolStandardize.Uncharger() - uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol) - # Add to final list - arr.append(uncharged_parent_clean_mol) - return(arr) - - def fit(self, X, y=None): - return self