diff --git a/Example pipeline.ipynb b/Example pipeline.ipynb deleted file mode 100644 index ba448d5..0000000 --- a/Example pipeline.ipynb +++ /dev/null @@ -1,313 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "dec96c58", - "metadata": {}, - "source": [ - "# Example pipeline\n", - "This example pipeline trains a model that can be used for challenge." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee09a12f", - "metadata": {}, - "outputs": [], - "source": [ - "# Classifier imports\n", - "from sklearn.model_selection import GridSearchCV, train_test_split\n", - "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.compose import make_column_selector as selector\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.metrics import RocCurveDisplay\n", - "\n", - "import os.path\n", - "import pylab as plt\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from joblib import dump, load" - ] - }, - { - "cell_type": "markdown", - "id": "e5228901", - "metadata": {}, - "source": [ - "# 1. Read data\n", - "First download the data files by following [the instructions](https://github.com/eyra/fertility-prediction-challenge#preparation)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64b02af0", - "metadata": {}, - "outputs": [], - "source": [ - "# Data paths; the LISS data files that you've downloaded should be placed in the `data` folder\n", - "# (where the `fake_data.csv` is located).\n", - "data_folder_path = os.path.join(\"..\", \"data\")\n", - "path_train = os.path.join(data_folder_path, \"LISS_example_input_data.csv\")\n", - "path_outcome = os.path.join(data_folder_path, \"LISS_example_groundtruth_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "407c83dd", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Read data\n", - "data = pd.read_csv(path_train, encoding=\"cp1252\", low_memory=False)\n", - "outcome = pd.read_csv(path_outcome, encoding=\"cp1252\", low_memory=False)\n", - "\n", - "# Drop observations where the outcome is missing\n", - "y_isna = outcome['new_child'].isnull()\n", - "data = data.loc[~y_isna]\n", - "outcome = outcome.loc[~y_isna]" - ] - }, - { - "cell_type": "markdown", - "id": "673cdba7", - "metadata": {}, - "source": [ - "# 2. Split data into train and test\n", - "First thing always, otherwise you risk overfitting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f4252fa", - "metadata": {}, - "outputs": [], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(\n", - " data,\n", - " outcome,\n", - " test_size=0.1, random_state=2023)\n", - "y_train = y_train[\"new_child\"]\n", - "y_test = y_test[\"new_child\"]" - ] - }, - { - "cell_type": "markdown", - "id": "54490bf1", - "metadata": {}, - "source": [ - "# 3. Pre-process and model\n", - "You may not want to include the preprocessing in the pipeline if it becomes too cumbersome.\n", - "\n", - "Make sure to use the scoring that you want to optimize in the search space." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a23c548", - "metadata": {}, - "outputs": [], - "source": [ - "# Pre-pre process (keep some columns)\n", - "keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']\n", - "data = data.loc[:, keepcols]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84e16fa8", - "metadata": {}, - "outputs": [], - "source": [ - "# Create transformers\n", - "categorical_transformer = Pipeline(steps=[\n", - " ('imputer', SimpleImputer(strategy='most_frequent')),\n", - " ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist'))])\n", - "\n", - "numerical_transformer = Pipeline(steps=[\n", - " ('imputer', SimpleImputer(strategy='mean')),\n", - " ('scaler', StandardScaler())])\n", - "\n", - "# Use ColumnTransformer to apply the transformations to the correct columns in the dataframe\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('num', numerical_transformer, selector(dtype_exclude=object)(data)),\n", - " ('cat', categorical_transformer, selector(dtype_include=object)(data))])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49d788e0", - "metadata": {}, - "outputs": [], - "source": [ - "# Create pipeline\n", - "model = Pipeline([\n", - " (\"preprocess\", preprocessor),\n", - " (\"classifier\", LogisticRegression(max_iter=500))\n", - " ]) \n", - " \n", - "# Define the hyperparameters\n", - "parameters = [\n", - " {\n", - " 'classifier': [LogisticRegression(max_iter=500)],\n", - " 'classifier__C': np.logspace(-1, 5, 50)\n", - " },\n", - "\n", - "]\n", - "\n", - "# Perform hyperparameter tuning using cross-validation\n", - "grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=-1, scoring=\"f1\", verbose=3)\n", - "grid_search.fit(X_train, y_train)\n", - "\n", - "# Keep best model\n", - "best_model = grid_search.best_estimator_\n", - "\n", - "best_model\n" - ] - }, - { - "cell_type": "markdown", - "id": "25c849dc", - "metadata": {}, - "source": [ - "# Evaluate the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "befcf23b", - "metadata": {}, - "outputs": [], - "source": [ - "RocCurveDisplay.from_predictions(\n", - " y_test,\n", - " best_model.predict_proba(X_test)[:, 1],\n", - " color=\"cornflowerblue\",\n", - ")\n", - "plt.plot([0, 1], [0, 1], \"k--\", label=\"chance level (AUC = 0.5)\")\n", - "plt.xlabel(\"False Positive Rate\")\n", - "plt.ylabel(\"True Positive Rate\")\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cbe6f34", - "metadata": {}, - "outputs": [], - "source": [ - "# Create predictions\n", - "y_pred = best_model.predict(X_test)\n", - "\n", - "# Report\n", - "print(classification_report(y_test, y_pred))" - ] - }, - { - "cell_type": "markdown", - "id": "5e7dbff4", - "metadata": {}, - "source": [ - "# Save models\n", - "The model needs to be saved to disk. This allows the benchmarking script to load it and use it for predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0414eb51", - "metadata": {}, - "outputs": [], - "source": [ - "models_path = \"models\"\n", - "os.makedirs(models_path, exist_ok=True)\n", - "\n", - "# Dump model (don't change the name)\n", - "dump(best_model, os.path.join(models_path, \"model.joblib\"))" - ] - }, - { - "cell_type": "markdown", - "id": "d0fd27b9", - "metadata": {}, - "source": [ - "# How the submission would look like\n", - "The snippet below is taken from the file `submission.py`. It shows how the prediction code needs to work. The function will be called with a dataframe containing the full dataset. This dataset is similar to the data downloaded but also includes the holdout data.\n", - "\n", - "It then does the preprocessing in the same way that was used to train the model. If you make any adjustments to the pre-processing they should also be copied to the `submission.py` script (**the code below is just an excerpt**).\n", - "\n", - "Finally the script loads the model that was saved in the step above and does the prediction." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba11c24c", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_outcomes(df):\n", - " # Keep \n", - " keepcols = ['burgstat2019', 'leeftijd2019', 'woonvorm2019', 'oplmet2019', 'aantalki2019']\n", - " nomem_encr = df[\"nomem_encr\"]\n", - " \n", - " df = df.loc[:, keepcols]\n", - " \n", - " # Load your trained model from the models directory\n", - " model_path = os.path.join(os.path.dirname(__file__), \"models\", \"model.joblib\")\n", - " model = load(model_path)\n", - "\n", - " # Use your trained model for prediction\n", - " predictions = model.predict(df)\n", - " # Return the result as a Pandas DataFrame with the columns \"nomem_encr\" and \"prediction\"\n", - " return pd.concat([nomem_encr, pd.Series(predictions, name=\"prediction\")], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86ec3da1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}