From ebb19e2e9684516c25777d82f96db16e4c31ee6e Mon Sep 17 00:00:00 2001
From: EBjerrum <esbenbjerrum+github@gmail.com>
Date: Sun, 16 Oct 2022 20:45:00 +0200
Subject: [PATCH] Updated README and cleaned up some files

---
 README.md                 | 84 +++++++++++++++++++++++++++------------
 notebooks/sandbox.py      | 29 +-------------
 scikit_mol/smilestomol.py | 27 -------------
 standardizer.py           | 32 ---------------
 4 files changed, 60 insertions(+), 112 deletions(-)
 delete mode 100644 scikit_mol/smilestomol.py
 delete mode 100644 standardizer.py
diff --git a/README.md b/README.md
index faa4577..001608d 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,74 @@
 # scikit-mol
-scikit-learn classes for molecular vectorization using RDKit
 
+Scikit-Learn classes for molecular vectorization using RDKit
 
-TODO:
-    Expand number of fingerprint classes and featurizers
-        AtomPairs
-        TopologicalTorsions
-        RDKit
-        Descriptors
-        LINGOS
-        ...
+The intended usage is to add molecular vectorization directly into scikit-learn pipelines, so that the final model predicts directly on RDKit molecules or SMILES strings.
 
-    Make dictionary based FP class
-        No Hashing, .fit() learns the keys of the dataset
+As an example, with the needed scikit-learn and scikit-mol imports and RDKit mol objects in the mol_list_train and mol_list_test lists:
 
-    Make a basic standardarizer transformer class
+    pipe = Pipeline([('mol_transformer', MorganTransformer()), ('Regressor', Ridge())])
+    pipe.fit(mol_list_train, y_train)
+    pipe.score(mol_list_test, y_test)
+    pipe.predict([Chem.MolFromSmiles('c1ccccc1C(=O)C')])
 
-    Make a SMILES to Mol transformer class
+    >>> array([4.93858815])
 
-Make Notebook with examples
-    Standalone usage
-    Inclusion in pipeline
-        Can transformers be used in parallel (e.g. to use both FP features and Descriptors at the same time?)
-    Hyperparameter optimization via native Scikit-Classes
-    Hyperparameter optimization via external optimizer e.g. https://scikit-optimize.github.io/stable/
+The scikit-learn compatibility should also make it easier to include the fingerprinting step in hyperparameter tuning with scikit-learn's utilities.
 
+The first draft of the project was created at the [RDKit UGM 2022 hackathon](https://github.com/rdkit/UGM_2022) on 2022-October-14.
 
-Make basic unit-tests
 
+## Implemented
+* Transformer Classes
+    * SmilesToMol
+    * Desc2DTransformer
+    * MACCSTransformer
+    * RDKitFPTransformer
+    * AtomPairFingerprintTransformer
+    * TopologicalTorsionFingerprintTransformer
+    * MorganTransformer
+<br>
+<br>
+* Utilities
+    * CheckSmilesSanitazion
+
+## Installation
+Users can install the latest tagged release with pip
+
+    pip install scikit-mol
+
+Bleeding edge
+
+    pip install git+https://github.com/EBjerrum/scikit-mol.git
+
+Developers 
 
-Installation
     git clone git@github.com:EBjerrum/scikit-mol.git
     pip install -e .
 
+## Documentation
+None yet, but the notebooks directory contains some `# %%`-delimited example scripts with demonstrations.
+
+## BUGS
+Probably still a few.
+
+
+## TODO
+* Make standardizer less 'chatty'
+* Unit test coverage of classes
+* Make further example notebooks
+    * Standalone usage (not in pipeline)
+    * Advanced pipelining
+    * Hyperparameter optimization via external optimizer e.g. https://scikit-optimize.github.io/stable/
+
+## Ideas
+* LINGOS transformer
 
 
-Contributers:
-    Esben Bjerrum, esben@cheminformania.com
-    Son Ha, sonha@uni-mainz.de
-    Oh-hyeon Choung, ohhyeon.choung@gmail.com
-    Please add yourself here, we'll properly markdown it later
+## Contributors:
+* Esben Bjerrum, esben@cheminformania.com
+* Carmen Esposito, https://github.com/cespos
+* Son Ha, sonha@uni-mainz.de
+* Oh-hyeon Choung, ohhyeon.choung@gmail.com
+* Andreas Poehlmann, https://github.com/ap--
+* Ya Chen, https://github.com/anya-chen
diff --git a/notebooks/sandbox.py b/notebooks/sandbox.py
index a76f3e8..b677552 100644
--- a/notebooks/sandbox.py
+++ b/notebooks/sandbox.py
@@ -6,7 +6,7 @@
 
 
 #%%
-from scikit_mol.smilestomol import SmilesToMol
+from scikit_mol.transformers import SmilesToMol
 smiles_list = ['c1ccccc1'] * 10
 smilestomol = SmilesToMol()
 mols = smilestomol.fit_transform(smiles_list)
@@ -14,7 +14,7 @@
 
 
 #%%
-from scikit_mol.smilestomol import SmilesToMol
+from scikit_mol.transformers import SmilesToMol
 smiles_list = ['c1ccccc1'] * 10
 y = list(range(10))
 y.append(1000)
@@ -42,31 +42,6 @@
 mols[0]
 
 
-
-#%%
-y_out = []
-X_out = []
-y_error = []
-X_error = []
-
-for smiles, y_value in zip(smiles_list, y):
-    mol = Chem.MolFromSmiles(smiles)
-    if mol:
-        X_out.append(mol)
-        y_out.append(y_value)
-    else:
-        print(f'Logging: Error in parsing {smiles}')
-        X_error.append(smiles)
-        y_error.append(y_value)
-
-print(X_out)
-print(y_out)
-print(X_error)
-print(y_error)
-
-
-
-
 #%%
 X= [Chem.MolFromSmiles('c1ccccc1')]*10
 t = MorganTransformer(useCounts=True)
diff --git a/scikit_mol/smilestomol.py b/scikit_mol/smilestomol.py
deleted file mode 100644
index 94c4741..0000000
--- a/scikit_mol/smilestomol.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from rdkit import Chem
-from sklearn.base import BaseEstimator, TransformerMixin
-
-
-class SmilesToMol(BaseEstimator, TransformerMixin):
-    def __init__(self):
-        pass
-
-    def fit(self, X=None, y=None):
-        #Nothing to do here
-        return self
-
-    def transform(self, X_smiles_list):
-        # Unfortunately, transform is only X to X in Scikit-learn, so can't filter at this level
-        # TODO: Return same type as put in (e.g. List to list, numpy to numpy, pandas Series to pandas series)
-        X_out = []
-
-        for smiles in X_smiles_list:
-            mol = Chem.MolFromSmiles(smiles)
-            if mol:
-                X_out.append(mol)
-            else:
-                raise ValueError(f'Issue with parsing SMILES {smiles}\nYou probably should use the scikit-mol.sanitizer.CheckSmilesSanitazion first')
-
-        return X_out
-
-        
\ No newline at end of file
diff --git a/standardizer.py b/standardizer.py
deleted file mode 100644
index 038fe71..0000000
--- a/standardizer.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#TODO Implement a scikit-learn compatible molecule standardizer
-# Author: Son Ha
-
-from rdkit import Chem
-from sklearn.base import BaseEstimator, TransformerMixin
-from rdkit.Chem.MolStandardize import rdMolStandardize
-
-class Standardizer(BaseEstimator, TransformerMixin):
-    """ Input a list of rdkit mols, output the same list but standardised 
-    """
-    def __init__(self, neutralize=True):
-        self.neutralize = neutralize
-        None
-
-    def transform(self, X):
-        arr = []
-        for mol in X:
-            # Normalizing functional groups
-            # https://molvs.readthedocs.io/en/latest/guide/standardize.html
-            clean_mol = rdMolStandardize.Cleanup(mol) 
-            # Get parents fragments
-            parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
-            # Neutralise
-            if self.neutralize:
-                uncharger = rdMolStandardize.Uncharger()
-                uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
-            # Add to final list
-            arr.append(uncharged_parent_clean_mol)
-        return(arr)
-
-    def fit(self, X, y=None):
-        return self