diff --git a/notebooks/09_Combinatorial_Method_Usage_with_FingerPrint_Transformers.ipynb b/notebooks/09_Combinatorial_Method_Usage_with_FingerPrint_Transformers.ipynb
new file mode 100644
index 0000000..08030d8
--- /dev/null
+++ b/notebooks/09_Combinatorial_Method_Usage_with_FingerPrint_Transformers.ipynb
@@ -0,0 +1,699 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Example: Using Multiple Different Fingerprint Transformers\n",
+ "\n",
+ "In this notebook we will explore how the performance of machine learning models depends on the choice of fingerprint transformer (featurization technique). This is an example that you can easily adapt to many different combinations of featurizers, optimization and other modelling techniques.\n",
+ "\n",
+ "The following steps will be performed:\n",
+ "* Data Parsing\n",
+ "* Pipeline Building\n",
+ "* Training Phase\n",
+ "* Analysis\n",
+ "\n",
+ "Authors: @VincentAlexanderScholz, @RiesBen \n",
+ "\n",
+ "## Imports:\n",
+ "First we will import all the stuff that we will need for our work.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from time import time\n",
+ "from matplotlib import pyplot as plt\n",
+ "\n",
+ "from rdkit.Chem import PandasTools\n",
+ "\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.pipeline import Pipeline, make_pipeline\n",
+ "from sklearn.linear_model import Ridge\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "from scikit_mol import fingerprints\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get Data:\n",
+ "In this step we will check if the SLC6A4 data set is already present or needs to be downloaded.\n",
+ "\n",
+ "\n",
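+ "**WARNING:** By default (`full_set = False`) only a small and very well selected subset of the data set is used, so the results shown here are for illustration only."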
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 out of 200 SMILES failed in conversion\n"
+ ]
+ }
+ ],
+ "source": [
+ "full_set = False\n",
+ "\n",
+ "# if not present download example data\n",
+ "if full_set:\n",
+ " csv_file = \"SLC6A4_active_excape_export.csv\"\n",
+ " if not os.path.exists(csv_file):\n",
+ " import urllib.request\n",
+ " url = \"https://ndownloader.figshare.com/files/25747817\"\n",
+ " urllib.request.urlretrieve(url, csv_file)\n",
+ "else:\n",
+ " csv_file = '../tests/data/SLC6A4_active_excapedb_subset.csv'\n",
+ "\n",
+ "#Parse Database\n",
+ "data = pd.read_csv(csv_file)\n",
+ "\n",
+ "PandasTools.AddMoleculeColumnToFrame(data, smilesCol=\"SMILES\")\n",
+ "print(f\"{data.ROMol.isna().sum()} out of {len(data)} SMILES failed in conversion\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Build Pipeline:\n",
+ "In this stage we will build the Pipeline consisting of the featurization part (fingerprint transformers) and the model part (Ridge Regression).\n",
+ "\n",
+ "Note that the featurization in this section is a hyperparameter defined in `param_grid`; the `\"fp_transformer\"` string is just a placeholder that is replaced during pipeline execution.\n",
+ "\n",
+ "This way we can define multiple different scenarios in `param_grid`, which allows us to rapidly explore different combinations of settings and methodologies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-22T11:29:15.949644Z",
+ "start_time": "2023-09-22T11:29:15.461010Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'fp_transformer': [MorganFingerprintTransformer(),\n",
+ " AvalonFingerprintTransformer()],\n",
+ " 'fp_transformer__nBits': [256, 512, 1024, 2048, 4096],\n",
+ " 'regressor__alpha': array([0.1 , 0.325, 0.55 , 0.775, 1. ])},\n",
+ " {'fp_transformer': [RDKitFingerprintTransformer(),\n",
+ " AtomPairFingerprintTransformer(),\n",
+ " MACCSKeysFingerprintTransformer()],\n",
+ " 'regressor__alpha': array([0.1 , 0.325, 0.55 , 0.775, 1. ])}]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "regressor = Ridge()\n",
+ "optimization_pipe = Pipeline([(\"fp_transformer\", \"fp_transformer\"), # this is a placeholder for different transformers\n",
+ " (\"regressor\", regressor)])\n",
+ "\n",
+ "param_grid = [ # Here pass different Options and Approaches\n",
+ " {\n",
+ " \"fp_transformer\": [fingerprints.MorganFingerprintTransformer(),\n",
+ " fingerprints.AvalonFingerprintTransformer()],\n",
+ " \"fp_transformer__nBits\": [2**x for x in range(8,13)],\n",
+ " },\n",
+ " {\n",
+ " \"fp_transformer\": [fingerprints.RDKitFingerprintTransformer(),\n",
+ " fingerprints.AtomPairFingerprintTransformer(),\n",
+ " fingerprints.MACCSKeysFingerprintTransformer()], \n",
+ " },\n",
+ "]\n",
+ "\n",
+ "global_options = {\n",
+ " \"regressor__alpha\": np.linspace(0.1,1,5),\n",
+ "}\n",
+ "\n",
+ "[params.update(global_options) for params in param_grid]\n",
+ "\n",
+ "param_grid"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train Model\n",
+ "In this section, the combinatorial approaches are trained."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-09-22T11:29:15.960939Z",
+ "start_time": "2023-09-22T11:29:15.461078Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Runtime: 21.90\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Split Data\n",
+ "mol_list_train, mol_list_test, y_train, y_test = train_test_split(data.ROMol, data.pXC50, random_state=0)\n",
+ "\n",
+ "# Define Search Process\n",
+ "grid = GridSearchCV(optimization_pipe, n_jobs=1,\n",
+ " param_grid=param_grid)\n",
+ "\n",
+ "# Train\n",
+ "t0 = time()\n",
+ "grid.fit(mol_list_train, y_train.values)\n",
+ "t1 = time()\n",
+ "\n",
+ "print(f'Runtime: {t1-t0:0.2F}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis\n",
+ "\n",
+ "Now let's investigate our results from the training stage. Which fingerprint method is the best for this data set? Which parameters are optimal?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " mean_fit_time | \n",
+ " std_fit_time | \n",
+ " mean_score_time | \n",
+ " std_score_time | \n",
+ " param_fp_transformer | \n",
+ " param_fp_transformer__nBits | \n",
+ " param_regressor__alpha | \n",
+ " params | \n",
+ " split0_test_score | \n",
+ " split1_test_score | \n",
+ " split2_test_score | \n",
+ " split3_test_score | \n",
+ " split4_test_score | \n",
+ " mean_test_score | \n",
+ " std_test_score | \n",
+ " rank_test_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.008671 | \n",
+ " 0.000448 | \n",
+ " 0.002286 | \n",
+ " 0.000069 | \n",
+ " MorganFingerprintTransformer(nBits=1024) | \n",
+ " 256 | \n",
+ " 0.1 | \n",
+ " {'fp_transformer': MorganFingerprintTransforme... | \n",
+ " 0.017975 | \n",
+ " 0.394682 | \n",
+ " 0.524598 | \n",
+ " 0.542116 | \n",
+ " 0.310238 | \n",
+ " 0.357922 | \n",
+ " 0.190209 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.008333 | \n",
+ " 0.000125 | \n",
+ " 0.002222 | \n",
+ " 0.000054 | \n",
+ " MorganFingerprintTransformer(nBits=1024) | \n",
+ " 256 | \n",
+ " 0.325 | \n",
+ " {'fp_transformer': MorganFingerprintTransforme... | \n",
+ " 0.078758 | \n",
+ " 0.449548 | \n",
+ " 0.554241 | \n",
+ " 0.572363 | \n",
+ " 0.330543 | \n",
+ " 0.397090 | \n",
+ " 0.181071 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.008217 | \n",
+ " 0.000048 | \n",
+ " 0.002193 | \n",
+ " 0.000059 | \n",
+ " MorganFingerprintTransformer(nBits=1024) | \n",
+ " 256 | \n",
+ " 0.55 | \n",
+ " {'fp_transformer': MorganFingerprintTransforme... | \n",
+ " 0.128221 | \n",
+ " 0.490253 | \n",
+ " 0.575230 | \n",
+ " 0.593237 | \n",
+ " 0.344076 | \n",
+ " 0.426203 | \n",
+ " 0.173061 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.008227 | \n",
+ " 0.000063 | \n",
+ " 0.002188 | \n",
+ " 0.000054 | \n",
+ " MorganFingerprintTransformer(nBits=1024) | \n",
+ " 256 | \n",
+ " 0.775 | \n",
+ " {'fp_transformer': MorganFingerprintTransforme... | \n",
+ " 0.169585 | \n",
+ " 0.521723 | \n",
+ " 0.590890 | \n",
+ " 0.608380 | \n",
+ " 0.353866 | \n",
+ " 0.448889 | \n",
+ " 0.166100 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.008226 | \n",
+ " 0.000055 | \n",
+ " 0.002255 | \n",
+ " 0.000130 | \n",
+ " MorganFingerprintTransformer(nBits=1024) | \n",
+ " 256 | \n",
+ " 1.0 | \n",
+ " {'fp_transformer': MorganFingerprintTransforme... | \n",
+ " 0.204831 | \n",
+ " 0.546774 | \n",
+ " 0.603010 | \n",
+ " 0.619752 | \n",
+ " 0.361324 | \n",
+ " 0.467138 | \n",
+ " 0.160060 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " 0.085913 | \n",
+ " 0.001511 | \n",
+ " 0.021645 | \n",
+ " 0.001469 | \n",
+ " MACCSKeysFingerprintTransformer() | \n",
+ " NaN | \n",
+ " 0.1 | \n",
+ " {'fp_transformer': MACCSKeysFingerprintTransfo... | \n",
+ " -1.649022 | \n",
+ " -1.943461 | \n",
+ " -0.602509 | \n",
+ " -0.418328 | \n",
+ " -0.752525 | \n",
+ " -1.073169 | \n",
+ " 0.606987 | \n",
+ " 65 | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " 0.085902 | \n",
+ " 0.001496 | \n",
+ " 0.021606 | \n",
+ " 0.001491 | \n",
+ " MACCSKeysFingerprintTransformer() | \n",
+ " NaN | \n",
+ " 0.325 | \n",
+ " {'fp_transformer': MACCSKeysFingerprintTransfo... | \n",
+ " -0.969593 | \n",
+ " -0.813087 | \n",
+ " -0.188690 | \n",
+ " 0.003831 | \n",
+ " -0.314764 | \n",
+ " -0.456461 | \n",
+ " 0.372595 | \n",
+ " 64 | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " 0.085937 | \n",
+ " 0.001397 | \n",
+ " 0.021608 | \n",
+ " 0.001495 | \n",
+ " MACCSKeysFingerprintTransformer() | \n",
+ " NaN | \n",
+ " 0.55 | \n",
+ " {'fp_transformer': MACCSKeysFingerprintTransfo... | \n",
+ " -0.657588 | \n",
+ " -0.505782 | \n",
+ " -0.045940 | \n",
+ " 0.124510 | \n",
+ " -0.171340 | \n",
+ " -0.251228 | \n",
+ " 0.289700 | \n",
+ " 62 | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " 0.086048 | \n",
+ " 0.001313 | \n",
+ " 0.021615 | \n",
+ " 0.001478 | \n",
+ " MACCSKeysFingerprintTransformer() | \n",
+ " NaN | \n",
+ " 0.775 | \n",
+ " {'fp_transformer': MACCSKeysFingerprintTransfo... | \n",
+ " -0.468371 | \n",
+ " -0.356825 | \n",
+ " 0.036642 | \n",
+ " 0.182939 | \n",
+ " -0.087318 | \n",
+ " -0.138587 | \n",
+ " 0.242115 | \n",
+ " 59 | \n",
+ "
\n",
+ " \n",
+ " 64 | \n",
+ " 0.085954 | \n",
+ " 0.001460 | \n",
+ " 0.021591 | \n",
+ " 0.001460 | \n",
+ " MACCSKeysFingerprintTransformer() | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " {'fp_transformer': MACCSKeysFingerprintTransfo... | \n",
+ " -0.339715 | \n",
+ " -0.266652 | \n",
+ " 0.092180 | \n",
+ " 0.218357 | \n",
+ " -0.028878 | \n",
+ " -0.064942 | \n",
+ " 0.210919 | \n",
+ " 57 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
65 rows × 16 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
+ "0 0.008671 0.000448 0.002286 0.000069 \n",
+ "1 0.008333 0.000125 0.002222 0.000054 \n",
+ "2 0.008217 0.000048 0.002193 0.000059 \n",
+ "3 0.008227 0.000063 0.002188 0.000054 \n",
+ "4 0.008226 0.000055 0.002255 0.000130 \n",
+ ".. ... ... ... ... \n",
+ "60 0.085913 0.001511 0.021645 0.001469 \n",
+ "61 0.085902 0.001496 0.021606 0.001491 \n",
+ "62 0.085937 0.001397 0.021608 0.001495 \n",
+ "63 0.086048 0.001313 0.021615 0.001478 \n",
+ "64 0.085954 0.001460 0.021591 0.001460 \n",
+ "\n",
+ " param_fp_transformer param_fp_transformer__nBits \\\n",
+ "0 MorganFingerprintTransformer(nBits=1024) 256 \n",
+ "1 MorganFingerprintTransformer(nBits=1024) 256 \n",
+ "2 MorganFingerprintTransformer(nBits=1024) 256 \n",
+ "3 MorganFingerprintTransformer(nBits=1024) 256 \n",
+ "4 MorganFingerprintTransformer(nBits=1024) 256 \n",
+ ".. ... ... \n",
+ "60 MACCSKeysFingerprintTransformer() NaN \n",
+ "61 MACCSKeysFingerprintTransformer() NaN \n",
+ "62 MACCSKeysFingerprintTransformer() NaN \n",
+ "63 MACCSKeysFingerprintTransformer() NaN \n",
+ "64 MACCSKeysFingerprintTransformer() NaN \n",
+ "\n",
+ " param_regressor__alpha params \\\n",
+ "0 0.1 {'fp_transformer': MorganFingerprintTransforme... \n",
+ "1 0.325 {'fp_transformer': MorganFingerprintTransforme... \n",
+ "2 0.55 {'fp_transformer': MorganFingerprintTransforme... \n",
+ "3 0.775 {'fp_transformer': MorganFingerprintTransforme... \n",
+ "4 1.0 {'fp_transformer': MorganFingerprintTransforme... \n",
+ ".. ... ... \n",
+ "60 0.1 {'fp_transformer': MACCSKeysFingerprintTransfo... \n",
+ "61 0.325 {'fp_transformer': MACCSKeysFingerprintTransfo... \n",
+ "62 0.55 {'fp_transformer': MACCSKeysFingerprintTransfo... \n",
+ "63 0.775 {'fp_transformer': MACCSKeysFingerprintTransfo... \n",
+ "64 1.0 {'fp_transformer': MACCSKeysFingerprintTransfo... \n",
+ "\n",
+ " split0_test_score split1_test_score split2_test_score \\\n",
+ "0 0.017975 0.394682 0.524598 \n",
+ "1 0.078758 0.449548 0.554241 \n",
+ "2 0.128221 0.490253 0.575230 \n",
+ "3 0.169585 0.521723 0.590890 \n",
+ "4 0.204831 0.546774 0.603010 \n",
+ ".. ... ... ... \n",
+ "60 -1.649022 -1.943461 -0.602509 \n",
+ "61 -0.969593 -0.813087 -0.188690 \n",
+ "62 -0.657588 -0.505782 -0.045940 \n",
+ "63 -0.468371 -0.356825 0.036642 \n",
+ "64 -0.339715 -0.266652 0.092180 \n",
+ "\n",
+ " split3_test_score split4_test_score mean_test_score std_test_score \\\n",
+ "0 0.542116 0.310238 0.357922 0.190209 \n",
+ "1 0.572363 0.330543 0.397090 0.181071 \n",
+ "2 0.593237 0.344076 0.426203 0.173061 \n",
+ "3 0.608380 0.353866 0.448889 0.166100 \n",
+ "4 0.619752 0.361324 0.467138 0.160060 \n",
+ ".. ... ... ... ... \n",
+ "60 -0.418328 -0.752525 -1.073169 0.606987 \n",
+ "61 0.003831 -0.314764 -0.456461 0.372595 \n",
+ "62 0.124510 -0.171340 -0.251228 0.289700 \n",
+ "63 0.182939 -0.087318 -0.138587 0.242115 \n",
+ "64 0.218357 -0.028878 -0.064942 0.210919 \n",
+ "\n",
+ " rank_test_score \n",
+ "0 25 \n",
+ "1 24 \n",
+ "2 23 \n",
+ "3 22 \n",
+ "4 21 \n",
+ ".. ... \n",
+ "60 65 \n",
+ "61 64 \n",
+ "62 62 \n",
+ "63 59 \n",
+ "64 57 \n",
+ "\n",
+ "[65 rows x 16 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_training_stats = pd.DataFrame(grid.cv_results_)\n",
+ "df_training_stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "