diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 60546e68..90bc589f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.9.0 +current_version = 0.9.1 commit = True tag = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 73ca55b7..4859e253 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,3 +38,4 @@ repos: - types-python-dateutil - types-click - sqlmodel + - types-requests diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b5e2500..f88fd2dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.9.1] - 2023-07-12 + +### Changed + +- Updated Mendable client library version to deal with styling overrides in the RTD documentation theme +- Removed superfluous limits for confidence bands in the CBPE class (these are present in the metric classes instead) +- Threshold value limiting behaviour (e.g. overriding a value and emitting a warning) will be triggered not only when +the value crosses the threshold but also when it is equal to the threshold value. This is because we interpret the +threshold as a theoretical maximum. + +### Added + +- Added a new example notebook walking through a full use case using the NYC Green Taxi dataset, based on the blog of [@santiviquez](https://github.com/santiviquez) + +### Fixed + +- Fixed broken Docker container build due to changes in public Poetry installation procedure +- Fixed broken image source link in the README, thanks [@NeoKish](https://github.com/NeoKish)! 
+ ## [0.9.0] - 2023-06-26 ### Changed diff --git a/Dockerfile b/Dockerfile index 6e9f0701..564e66af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,7 +36,7 @@ RUN apt-get update && \ build-essential # Install Poetry - respects $POETRY_VERSION & $POETRY_HOME -RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python +RUN curl -sSL https://install.python-poetry.org | python3 - --version $POETRY_VERSION ENV PATH="$POETRY_HOME/bin:$PATH" # Import our project files diff --git a/README.md b/README.md index ccb94b28..5947f39b 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Allowing you to have the following benefits: | 🔬 **[Technical reference]** | Monitor the performance of your ML models. | | 🔎 **[Blog]** | Thoughts on post-deployment data science from the NannyML team. | | 📬 **[Newsletter]** | All things post-deployment data science. Subscribe to see the latest papers and blogs. | -| 💎 **[New in v0.9.0]** | New features, bug fixes. | +| 💎 **[New in v0.9.1]** | New features, bug fixes. | | 🧑‍💻 **[Contribute]** | How to contribute to the NannyML project and codebase. | | **[Join slack]** | Need help with your specific use case? Say hi on slack! 
| @@ -79,7 +79,7 @@ Allowing you to have the following benefits: [performance estimation]: https://nannyml.readthedocs.io/en/stable/how_it_works/performance_estimation.html [key concepts]: https://nannyml.readthedocs.io/en/stable/glossary.html [technical reference]: https://nannyml.readthedocs.io/en/stable/nannyml/modules.html -[new in v0.9.0]: https://github.com/NannyML/nannyml/releases/latest/ +[new in v0.9.1]: https://github.com/NannyML/nannyml/releases/latest/ [real world example]: https://nannyml.readthedocs.io/en/stable/examples/california_housing.html [blog]: https://www.nannyml.com/blog [newsletter]: https://mailchi.mp/022c62281d13/postdeploymentnewsletter @@ -100,7 +100,7 @@ NannyML can also **track the realised performance** of your machine learning mod To detect **multivariate feature drift** NannyML uses [PCA-based data reconstruction](https://nannyml.readthedocs.io/en/main/how_it_works/data_reconstruction.html). Changes in the resulting reconstruction error are monitored over time and data drift alerts are logged when the reconstruction error in a certain period exceeds a threshold. This threshold is calculated based on the reconstruction error observed in the reference period. -

+

NannyML utilises statistical tests to detect **univariate feature drift**. We have just added a bunch of new univariate tests including Jensen-Shannon Distance and L-Infinity Distance, check out the [comprehensive list](https://nannyml.readthedocs.io/en/stable/how_it_works/univariate_drift_detection.html#methods-for-continuous-features). The results of these tests are tracked over time, properly corrected to counteract multiplicity and overlayed on the temporal feature distributions. (It is also possible to visualise the test-statistics over time, to get a notion of the drift magnitude.) @@ -264,11 +264,11 @@ Curious what we are working on next? Have a look at our [roadmap](https://bit.ly To cite NannyML in academic papers, please use the following BibTeX entry. -### Version 0.9.0 +### Version 0.9.1 ``` @misc{nannyml, - title = {{N}anny{ML} (release 0.9.0)}, + title = {{N}anny{ML} (release 0.9.1)}, howpublished = {\url{https://github.com/NannyML/nannyml}}, month = mar, year = 2023, diff --git a/docs/_static/example_green_taxi_all_udc.svg b/docs/_static/example_green_taxi_all_udc.svg new file mode 100644 index 00000000..e934f8f4 --- /dev/null +++ b/docs/_static/example_green_taxi_all_udc.svg @@ -0,0 +1 @@ +Dec 112016Dec 18Dec 25Jan 120170100200300400500600700Dec 112016Dec 18Dec 25Jan 12017−50510152025Dec 112016Dec 18Dec 25Jan 12017050100150Dec 112016Dec 18Dec 25Jan 1201700.20.40.60.81Dec 112016Dec 18Dec 25Jan 1201700.20.40.60.81Dec 112016Dec 18Dec 25Jan 1201700.20.40.60.81DOLocationIDOther1817441427PULocationIDOther255754174166VendorID21Column distributionsTimeTimeTimeTimeTimeTimeValuesValuesValuesValuesValuesValuesfare_amount distribution (alerts for Jensen-Shannon distance)pickup_time distribution (alerts for Jensen-Shannon distance)trip_distance distribution (alerts for Jensen-Shannon distance)DOLocationID distribution (alerts for Jensen-Shannon distance)PULocationID distribution (alerts for Jensen-Shannon distance)VendorID distribution (alerts for Jensen-Shannon 
distance)ReferenceAnalysisReferenceAnalysisReferenceAnalysis \ No newline at end of file diff --git a/docs/_static/example_green_taxi_dle.svg b/docs/_static/example_green_taxi_dle.svg new file mode 100644 index 00000000..0669a292 --- /dev/null +++ b/docs/_static/example_green_taxi_dle.svg @@ -0,0 +1 @@ +Dec 112016Dec 18Dec 25Jan 120170.9511.051.11.151.2MetricAlertThresholdConfidence bandEstimated performance (DLE)TimeMAEEstimated MAEReferenceAnalysis \ No newline at end of file diff --git a/docs/_static/example_green_taxi_dle_vs_realized.svg b/docs/_static/example_green_taxi_dle_vs_realized.svg new file mode 100644 index 00000000..47f0f0ba --- /dev/null +++ b/docs/_static/example_green_taxi_dle_vs_realized.svg @@ -0,0 +1 @@ +Dec 182016Dec 21Dec 24Dec 27Dec 300.9511.051.11.151.21.25MAE (estimated MAE)Confidence bandrealized MAE, MAE, maeAlertEstimated performance (DLE) vs. Realized performanceChunkMAE (estimated MAE) vs. realized MAE, MAE, mae \ No newline at end of file diff --git a/docs/_static/example_green_taxi_feature_importance.svg b/docs/_static/example_green_taxi_feature_importance.svg new file mode 100644 index 00000000..01ea59f3 --- /dev/null +++ b/docs/_static/example_green_taxi_feature_importance.svg @@ -0,0 +1,1286 @@ + + + + + + + + 2023-07-03T17:30:44.653290 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.orgdiff --git a/docs/_static/example_green_taxi_location_udc.svg b/docs/_static/example_green_taxi_location_udc.svg new file mode 100644 index 00000000..24eda5c4 --- /dev/null +++ b/docs/_static/example_green_taxi_location_udc.svg @@ -0,0 +1 @@ +Dec 112016Dec 18Dec 25Jan 1201700.20.40.60.81DOLocationIDOther1817441427Column distributionsTimeValuesDOLocationID distribution (alerts for Jensen-Shannon distance)ReferenceAnalysis \ No newline at end of file diff --git a/docs/_static/example_green_taxi_model_val.png b/docs/_static/example_green_taxi_model_val.png new file mode 100644 index 00000000..baa854e9 Binary files /dev/null and 
b/docs/_static/example_green_taxi_model_val.png differ diff --git a/docs/_static/example_green_taxi_pca_error.svg b/docs/_static/example_green_taxi_pca_error.svg new file mode 100644 index 00000000..1cb90a64 --- /dev/null +++ b/docs/_static/example_green_taxi_pca_error.svg @@ -0,0 +1 @@ +Dec 112016Dec 18Dec 25Jan 1201711.11.21.31.4MetricConfidence bandMultivariate drift (PCA reconstruction error)TimeData reconstruction driftReferenceAnalysis \ No newline at end of file diff --git a/docs/_static/example_green_taxi_pickup_udc.svg b/docs/_static/example_green_taxi_pickup_udc.svg new file mode 100644 index 00000000..db3bd741 --- /dev/null +++ b/docs/_static/example_green_taxi_pickup_udc.svg @@ -0,0 +1 @@ +Dec 112016Dec 18Dec 25−50510152025Column distributionsTimeValuespickup_time distribution (alerts for Jensen-Shannon distance) \ No newline at end of file diff --git a/docs/_static/example_green_taxi_tip_amount_boxplot.svg b/docs/_static/example_green_taxi_tip_amount_boxplot.svg new file mode 100644 index 00000000..2d2b58ba --- /dev/null +++ b/docs/_static/example_green_taxi_tip_amount_boxplot.svg @@ -0,0 +1,8459 @@ + + + + + + + + 2023-07-03T18:22:52.669705 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.orgdiff --git a/docs/_static/example_green_taxi_tip_amount_distribution.svg b/docs/_static/example_green_taxi_tip_amount_distribution.svg new file mode 100644 index 00000000..5ff769dd --- /dev/null +++ b/docs/_static/example_green_taxi_tip_amount_distribution.svg @@ -0,0 +1,834 @@ + + + + + + + + 2023-07-03T18:22:53.112697 + image/svg+xml + + + Matplotlib v3.7.1, https://matplotlib.orgdiff --git a/docs/_static/js/mendablesearch.js b/docs/_static/js/mendablesearch.js index f0b2487e..40193c73 100644 --- a/docs/_static/js/mendablesearch.js +++ b/docs/_static/js/mendablesearch.js @@ -66,7 +66,7 @@ document.addEventListener("DOMContentLoaded", () => { "https://unpkg.com/react-dom@17/umd/react-dom.production.min.js", () => { loadScript( - 
"https://unpkg.com/@mendable/search@0.0.102/dist/umd/mendable.min.js", + "https://unpkg.com/@mendable/search@0.0.114/dist/umd/mendable.min.js", initializeMendable ); } diff --git a/docs/_static/pca_reconstruction_error.svg b/docs/_static/pca_reconstruction_error.svg new file mode 100644 index 00000000..58c3ad82 --- /dev/null +++ b/docs/_static/pca_reconstruction_error.svgdiff --git a/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/pca-reconstruction-error.svg b/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/pca-reconstruction-error.svg index 78ed7f79..7cf8738d 100644 --- a/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/pca-reconstruction-error.svg +++ b/docs/_static/tutorials/detecting_data_drift/multivariate_drift_detection/pca-reconstruction-error.svg @@ -1 +1 @@ -Jan 2018Jul 2018Jan 2019Jul 20191.11.151.21.25MetricAlertConfidence bandMultivariate drift (PCA reconstruction error)TimeData reconstruction driftReferenceAnalysis \ No newline at end of file +Jan 2018Jul 2018Jan 2019Jul 20191.11.151.21.25MetricAlertConfidence bandMultivariate Drift (PCA Reconstruction Error)TimeReconstruction ErrorReferenceAnalysis \ No newline at end of file diff --git a/docs/example_notebooks/Examples Green Taxi.ipynb b/docs/example_notebooks/Examples Green Taxi.ipynb new file mode 100644 index 00000000..8eaf721d --- /dev/null +++ b/docs/example_notebooks/Examples Green Taxi.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "89298ce0", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment if you are runnning this on Google Colab\n", + "# !pip install nannyml\n", + "# !pip install numpy==1.22" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3c0635e7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3c0635e7", + "outputId": "30d1af78-5f35-4a98-a9f7-86472a947f0d" + }, + "outputs": [], + "source": [ 
+ "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.metrics import mean_absolute_error\n", + "from lightgbm import LGBMRegressor, plot_importance\n", + "\n", + "import nannyml as nml" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6e90071a", + "metadata": { + "id": "6e90071a" + }, + "outputs": [], + "source": [ + "# Read data from url\n", + "url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2016-12.parquet\"\n", + "columns = ['lpep_pickup_datetime', 'PULocationID', 'DOLocationID', 'trip_distance', 'VendorID', 'payment_type', 'fare_amount', 'tip_amount']\n", + "data = pd.read_parquet(url, columns=columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9843aa09", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "9843aa09", + "outputId": "6a2c8bdd-0a25-4ab2-abbe-6852d905d784" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+------------------------+----------------+----------------+-----------------+------------+----------------+---------------+--------------+\n", + "| | lpep_pickup_datetime | PULocationID | DOLocationID | trip_distance | VendorID | payment_type | fare_amount | tip_amount |\n", + "+====+========================+================+================+=================+============+================+===============+==============+\n", + "| 0 | 2016-12-01 00:13:25 | 225 | 65 | 2.79 | 2 | 2 | 11 | 0 |\n", + "+----+------------------------+----------------+----------------+-----------------+------------+----------------+---------------+--------------+\n", + "| 1 | 2016-12-01 00:06:47 | 255 | 255 | 0.45 | 2 | 1 | 3.5 | 0.96 |\n", + "+----+------------------------+----------------+----------------+-----------------+------------+----------------+---------------+--------------+\n", + "| 2 | 2016-12-01 00:29:45 | 41 | 42 | 1.2 | 1 | 3 | 6 | 0 |\n", 
+ "+----+------------------------+----------------+----------------+-----------------+------------+----------------+---------------+--------------+\n" + ] + } + ], + "source": [ + "print(data.head(3).to_markdown(tablefmt=\"grid\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a678285e", + "metadata": { + "id": "a678285e" + }, + "outputs": [], + "source": [ + "# Choose only payments from Credit Cards\n", + "data = data.loc[data['payment_type'] == 1,].drop(columns='payment_type') # Credit card\n", + "# Choose only positive tip amounts\n", + "data = data[data['tip_amount'] >= 0]\n", + "\n", + "# Sort data by pick up date\n", + "data = data.sort_values('lpep_pickup_datetime').reset_index(drop=True)\n", + "# Flag categoric columns as categoric\n", + "categoric_columns = ['PULocationID', 'DOLocationID', 'VendorID']\n", + "data[categoric_columns] = data[categoric_columns].astype('category')\n", + "\n", + "# Create column with pick up time\n", + "data['pickup_time'] = data['lpep_pickup_datetime'].dt.hour" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0e1af9ee", + "metadata": { + "id": "0e1af9ee" + }, + "outputs": [], + "source": [ + "# Create data partition\n", + "data['partition'] = pd.cut(\n", + " data['lpep_pickup_datetime'],\n", + " bins= [pd.to_datetime('2016-12-01'),\n", + " pd.to_datetime('2016-12-08'),\n", + " pd.to_datetime('2016-12-16'),\n", + " pd.to_datetime('2017-01-01')],\n", + " right=False,\n", + " labels= ['train', 'test', 'prod']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "B2-H3Ra8GCYl", + "metadata": { + "id": "B2-H3Ra8GCYl" + }, + "outputs": [], + "source": [ + "# Set target and features\n", + "target = 'tip_amount'\n", + "features = [col for col in data.columns if col not in [target, 'lpep_pickup_datetime', 'partition']]\n", + "\n", + "# Split the data\n", + "X_train = data.loc[data['partition'] == 'train', features]\n", + "y_train = data.loc[data['partition'] == 
'train', target]\n", + "\n", + "X_test = data.loc[data['partition'] == 'test', features]\n", + "y_test = data.loc[data['partition'] == 'test', target]\n", + "\n", + "X_prod = data.loc[data['partition'] == 'prod', features]\n", + "y_prod = data.loc[data['partition'] == 'prod', target]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d68c2328", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "d68c2328", + "outputId": "ff8bdcf6-8083-4882-dc02-4342210b5023", + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tip_amount
count141568.000000
mean2.363484
std2.817078
min0.000000
25%1.060000
50%1.960000
75%3.000000
max250.700000
\n", + "
" + ], + "text/plain": [ + " tip_amount\n", + "count 141568.000000\n", + "mean 2.363484\n", + "std 2.817078\n", + "min 0.000000\n", + "25% 1.060000\n", + "50% 1.960000\n", + "75% 3.000000\n", + "max 250.700000" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(y_train.describe().to_frame())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fa4dac07", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 865 + }, + "id": "fa4dac07", + "outputId": "3aef959b-5c61-4cbd-9b55-eb007b20c826" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "y_train.plot(kind='box')\n", + "plt.savefig('../_static/example_green_taxi_tip_amount_boxplot.svg', format='svg')\n", + "plt.show()\n", + "\n", + "y_train.clip(lower=0, upper=y_train.quantile(0.8)).to_frame().hist()\n", + "plt.savefig('../_static/example_green_taxi_tip_amount_distribution.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "528fbf0f", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 75 + }, + "id": "528fbf0f", + "outputId": "0ecfe8b0-9917-47e5-f7cc-1b3c8f7174e2" + }, + "outputs": [], + "source": [ + "# Fit the model\n", + "model = LGBMRegressor(random_state=111)\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Make predictions\n", + "y_pred_train = model.predict(X_train)\n", + "y_pred_test = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4fd0fe9b", + "metadata": { + "id": "4fd0fe9b" + }, + "outputs": [], + "source": [ + "# Make baseline predictions\n", + "y_pred_train_baseline = np.ones_like(y_train) * y_train.mean()\n", + "y_pred_test_baseline = np.ones_like(y_test) * y_train.mean()\n", + "\n", + "# Measure train, test and baseline performance\n", + "mae_train = mean_absolute_error(y_train, y_pred_train).round(4)\n", + "mae_test = mean_absolute_error(y_test, y_pred_test).round(4)\n", + "\n", + "mae_train_baseline = mean_absolute_error(y_train, y_pred_train_baseline).round(4)\n", + "mae_test_baseline = mean_absolute_error(y_test, y_pred_test_baseline).round(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "beb7b032", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 410 + }, + "id": "beb7b032", + "outputId": "f750cf58-636f-4f80-aee9-c01dc30c87e7", + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create performance report\n", + "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,4))\n", + "\n", + "title1 = 'Train MAE: {} (<> {})'.format(mae_train, mae_train_baseline)\n", + "ax1.set(title=title1, xlabel='y_train', ylabel='y_pred')\n", + "ax1.plot(y_train, y_train, color='red', linestyle=':')\n", + "ax1.scatter(y_train, y_pred_train, alpha=0.1)\n", + "\n", + "title2 = 'Test MAE: {} (<> {})'.format(mae_test, mae_test_baseline)\n", + "ax2.set(title=title2, xlabel='y_test', ylabel='y_pred')\n", + "ax2.plot(y_test, y_test, color='red', linestyle=':')\n", + "ax2.scatter(y_test, y_pred_test, alpha=0.1)\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5BlWsneHW_eY", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 472 + }, + "id": "5BlWsneHW_eY", + "outputId": "eb12f11d-000e-48b0-c812-302f21200669" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot the feature importance\n", + "fig, ax = plt.subplots()\n", + "plot_importance(model, ax=ax)\n", + "plt.savefig('../_static/example_green_taxi_feature_importance.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a84a4a2d", + "metadata": { + "id": "a84a4a2d" + }, + "outputs": [], + "source": [ + "y_pred_prod = model.predict(X_prod)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "dd5e6ef3", + "metadata": { + "id": "dd5e6ef3" + }, + "outputs": [], + "source": [ + "reference = X_test.copy() # using the test set as a reference\n", + "reference['y_pred'] = y_pred_test # reference predictions\n", + "reference['tip_amount'] = y_test # ground truth (currect targets)\n", + "reference = reference.join(data['lpep_pickup_datetime']) # date\n", + "\n", + "analysis = X_prod.copy() # features\n", + "analysis['y_pred'] = y_pred_prod # prod predictions\n", + "analysis = analysis.join(data['lpep_pickup_datetime']) # date" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3dfbd7a4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3dfbd7a4", + "outputId": "8d3b7567-66e8-4c09-8654-096ca2c5fa90", + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mvds/nannyml/repos/nannyml/.venv/lib/python3.9/site-packages/lightgbm/basic.py:2065: UserWarning: Using categorical_feature in Dataset.\n", + " _log_warning('Using categorical_feature in Dataset.')\n" + ] + } + ], + "source": [ + "dle = nml.DLE(\n", + " metrics=['mae'],\n", + " y_true='tip_amount',\n", + " y_pred='y_pred',\n", + " feature_column_names=features,\n", + " timestamp_column_name='lpep_pickup_datetime',\n", + " chunk_period='d' # perform an estimation daily\n", + ")\n", + "\n", + "dle.fit(reference) # fit on the reference (test) data\n", + 
"estimated_performance = dle.estimate(analysis) # estimate on the prod data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e4b34a9a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 517 + }, + "id": "e4b34a9a", + "outputId": "7ee26cfd-7bfb-4500-8aa6-42ece6766ce5" + }, + "outputs": [], + "source": [ + "figure = estimated_performance.plot()\n", + "figure.write_image(f'../_static/example_green_taxi_dle.svg')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3a7b6877", + "metadata": { + "id": "3a7b6877", + "scrolled": false + }, + "outputs": [], + "source": [ + "drdc = nml.DataReconstructionDriftCalculator(\n", + " column_names=features,\n", + " timestamp_column_name='lpep_pickup_datetime',\n", + " chunk_period='d',\n", + ")\n", + "\n", + "drdc.fit(reference)\n", + "multivariate_data_drift = drdc.calculate(analysis)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "74a8f9c4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "id": "74a8f9c4", + "outputId": "5ebcc144-07e7-4cdb-c136-2b6e20d4aa1e" + }, + "outputs": [], + "source": [ + "figure = multivariate_data_drift.plot()\n", + "figure.write_image(f'../_static/example_green_taxi_pca_error.svg')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "79fed665", + "metadata": { + "id": "79fed665" + }, + "outputs": [], + "source": [ + "udc = nml.UnivariateDriftCalculator(\n", + " column_names=features,\n", + " timestamp_column_name='lpep_pickup_datetime',\n", + " chunk_period='d',\n", + ")\n", + "\n", + "udc.fit(reference)\n", + "univariate_data_drift = udc.calculate(analysis)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "GnGnV5v0d7Fp", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 517 + }, + "id": "GnGnV5v0d7Fp", + "outputId": "8b4933d6-c799-4f70-9d38-fe138e26d588" + }, + "outputs": [], + "source": [ + 
"figure = univariate_data_drift.filter(period='all', metrics='jensen_shannon', column_names=['DOLocationID']).plot(kind='distribution')\n", + "figure.write_image(f'../_static/example_green_taxi_location_udc.svg')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ofutS6MwgFEd", + "metadata": { + "colab": { + "background_save": true + }, + "id": "ofutS6MwgFEd", + "outputId": "5c9f19f2-6452-422e-8620-8bf38065a6c3" + }, + "outputs": [], + "source": [ + "figure = univariate_data_drift.filter(period='all', metrics='jensen_shannon', column_names=['pickup_time']).plot(kind='distribution')\n", + "figure.write_image(f'../_static/example_green_taxi_pickup_udc.svg')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "QCIMHtwkhG9K", + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QCIMHtwkhG9K", + "outputId": "f63aa06d-d290-469d-858c-4ee0fd98b168" + }, + "outputs": [], + "source": [ + "figure = univariate_data_drift.filter(period='all', metrics='jensen_shannon').plot(kind='distribution')\n", + "\n", + "figure.write_image(f'../_static/example_green_taxi_all_udc.svg')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "DdFamecl4JPi", + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/", + "height": 517 + }, + "id": "DdFamecl4JPi", + "outputId": "5aa21588-5335-46e6-9760-0b954e988267" + }, + "outputs": [], + "source": [ + "perfc = nml.PerformanceCalculator(\n", + " metrics=['mae'],\n", + " y_true='tip_amount',\n", + " y_pred='y_pred',\n", + " problem_type='regression',\n", + " timestamp_column_name='lpep_pickup_datetime',\n", + " chunk_period='d'\n", + ")\n", + "\n", + "perfc.fit(reference)\n", + "realized_performance = perfc.calculate(analysis.assign(tip_amount = y_prod))\n", + "\n", + "figure = estimated_performance.filter(period='analysis').compare(realized_performance).plot()\n", + 
"figure.write_image(f'../_static/example_green_taxi_dle_vs_realized.svg')" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/example_notebooks/Tutorial - Drift - Multivariate.ipynb b/docs/example_notebooks/Tutorial - Drift - Multivariate.ipynb index e58b6bf4..58c8e610 100644 --- a/docs/example_notebooks/Tutorial - Drift - Multivariate.ipynb +++ b/docs/example_notebooks/Tutorial - Drift - Multivariate.ipynb @@ -923,7 +923,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/examples.rst b/docs/examples.rst index dd6db8cf..83435e52 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -7,3 +7,4 @@ Examples :maxdepth: 2 examples/california_housing + examples/green_taxi \ No newline at end of file diff --git a/docs/examples/green_taxi.rst b/docs/examples/green_taxi.rst new file mode 100644 index 00000000..37ea4473 --- /dev/null +++ b/docs/examples/green_taxi.rst @@ -0,0 +1,315 @@ +============================================================= +Full Monitoring Workflow - Regression: NYC Green Taxi Dataset +============================================================= +.. raw:: html + + + Open In Colab + + +In this tutorial, we will use the `NYC Green Taxi Dataset `_ to build a machine-learning model that predicts the tip amount a passenger +will leave after a taxi ride. Later, we will use NannyML to monitor this model and measure its performance with unseen production data. 
Additionally, +we will investigate plausible reasons for the performance drop using data drift detection methods. + + +Import libraries +================ + +The following cell will import the necessary libraries plus install NannyML. NannyML is an open-source library to do post-deployment data science. +We will use it to estimate the model's performance with unseen data and run multivariate and univariate drift tests. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 2 + +Load the data +============= + +We will be using the following columns from the NYC Taxi Dataset: + +* lpep_pickup_datetime: pick-up datetime. +* PULocationID: ID of the pick-up location. +* DOLocationID: ID of the drop-out location. +* trip_distance: Trip distance in Miles. +* VendorID: Vendor ID. +* payment_type: Payment Type. We will be using only credit cards. +* fare_amount: Total fare amount in USD. +* tip_amount: Tip amount in USD. This column will be the target. + +Other columns were omitted because of having multiple missing values, having the same value for every record, or being directly associated with the target variable. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 3 + +.. nbtable:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cell: 4 + + +Preprocessing the data +====================== + +Before modeling, we will do some preprocessing: + +1. We'll only consider trips paid with a credit card as a payment type because they are the only ones with a tip amount in the dataset. +2. We will keep only examples with non-negative tip amounts, since negative tip amounts are not relevant for this use case: they may be related to chargebacks or possible errors in the data quality pipeline. +3. We will sort the data by pick-up date. This will be helpful later on when we have to partition our dataset into train, test, and production sets. +4. 
We will create an extra feature containing only the information about the pick-up time. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 5 + +Now, let's split the data. When training an ML model, we often split the data into 2 (train, test) or 3 (train, validation, test) sets. But, since the final goal of +this tutorial is to learn how to monitor an ML model with unseen "production" data, we will split the original data into three parts: + +- train: data from the **first week** of December 2016 +- test: data from the **second week** of December 2016 +- prod: data from **the third and fourth weeks** of December 2016 + +The production dataset will help us simulate a real-case scenario where a trained model is used in a production environment. Typically, production data don't contain targets. +This is why monitoring the model performance on it is a challenging task. + +But let's not worry too much about it (yet). We will return later to this when learning how to estimate model performance. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 6 + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 7 + +Exploring the training data +=========================== + +Let's quickly explore the train data to ensure we understand it and check that everything makes sense. Since we are building a model that can predict the tip amount +that the customers will leave at the end of the ride, it is essential that we look at how that amount is distributed. + +The table below shows that the typical tip amount is close to \$2. However, we also observe a high max value of \$250, meaning there are probably some outliers. +So, let's take a closer look by plotting a box plot and a histogram of the tip amount column. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 8 + :show_output: + +.. image:: ../_static/example_green_taxi_tip_amount_boxplot.svg + +.. 
image:: ../_static/example_green_taxi_tip_amount_distribution.svg + +Indeed we see some outliers. There are several tip amounts bigger than \$50. We are still going to consider them since these are completely reasonable amounts. +Maybe some clients are very generous! + +Looking at the histogram below, we see that many passengers don't tip. This is something that we would expect in this kind of scenario. +A big group of people does not leave tips, and another one does. We can see a gap between both groups, meaning tipping very low is uncommon. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 9 + :show_output: + +Training a model +================ + +We will train an LGBMRegressor with its default parameters. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 10 + +Evaluating the model +==================== + +To evaluate the model, we will compare its train and test Mean Absolute Error with a baseline model that always predicts the mean of the training tip amount. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 11 + +Below we plotted two scatter plots, one with the actual and predicted values for training and a similar one with the predicted values for the testing data. +Both mean absolute errors are relatively low, meaning the model performs well enough for this use case. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 12 + +.. image:: ../_static/example_green_taxi_model_val.png + +It makes sense that the most relevant feature is the fare amount since the tip is often a percentage of it. +Interestingly, the drop-out location is more important than the pick-up location. Let's try to reason why. + +People often pick up a taxi in crowded places like cities and business centers. So, pick-up locations tend to be similar and less variable. 
+In contrast, drop-out locations can be very variable since people often take a taxi to their houses, restaurants, offices, etc. One could argue that +the drop-out location contains/encodes some information about the economic and social status of the passenger. Explaining why the drop-out location is more relevant +to predict the tip amount than the pick-up location. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 13 + +.. image:: ../_static/example_green_taxi_feature_importance.svg + +Deploying the model +=================== + +To simulate that we are in a production environment, we will use the trained model to make predictions on unseen production data. + +We will later use NannyML to check how well the model performs on this data. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 14 + +Analysing ML model performance in production +============================================ + +We need to create a reference and analysis set to properly analyze the model performance in production. + +* **Reference dataset:** The reference dataset should be one where the model behaves as expected. Ideally, one that the model did not see during training, but we know the correct targets and the model's predictions. This dataset allows us to establish a baseline for every metric we want to monitor. Ideally, we use the test set as a reference set, which is what we use in the code cell below. +* **Analysis dataset:** The analysis dataset is typically the latest production data up to a desired point in the past, which should be after the reference period ends. The analysis period is not required to have targets available. The analysis period is where NannyML analyzes/monitors the model's performance and data drift of the model using the knowledge gained from the reference set. + +.. 
nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 15 + +Estimating the model's performance +================================== + +Once an ML model is in production, we would like to get a view of how the model is performing. The tricky part is that we cannot always measure the actual performance. +To measure it, we need the correct targets, in this case, the tip amounts. But these targets may take a while before they are updated in the system. +The tip goes straight to the taxi drivers, so we will only know the actual values when they report it. + +The good news is that we can leverage probabilistic methods to *estimate* the model performance. So instead of waiting for data to have targets, we will use a method +called `DLE `_, short for Direct Loss Estimation, to *estimate* the model +performance. + +The idea behind DLE is to train an extra ML model whose task is to estimate the value of the loss function of the monitored model. This can be later used to estimate +the original model's performance. DLE works for regression tasks like the one we are working on in this tutorial. But if you are interested in estimating the model +performance for a classification task, +check out `Estimating Performance for Classification `_. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 16 + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 17 + +.. image:: ../_static/example_green_taxi_dle.svg + +The plot above shows that the estimated performance exceeded the threshold during some days of the last week of December, which means that the model failed to make +reliable predictions during those days. + +The next step is to go down the rabbit hole and figure out what went wrong during those days and see if we can find the root cause of these issues. + +We will use multivariate and univariate data drift detection methods to achieve this. 
They will allow us to check if a drift in the data caused the performance issue. + +Detecting multivariate data drift +================================= + +Multivariate data drift detection gives us a general overview of changes across the entire feature space. It detects if there is a drift in the general distribution of all +the features. So, instead of looking at the distribution of each feature independently, it looks at all features at once. + +This method allows us to look for more subtle changes in the data structure that univariate approaches cannot detect, such as changes in the linear relationships between +features. + +.. image:: ../_static/pca_reconstruction_error.svg + +To do this, we use the method `DataReconstructionDriftCalculator` which compresses the **reference feature space** to a latent space using a PCA algorithm. +The algorithm later decompresses the latent space data and reconstructs it with some error. This error is called the reconstruction error. + +We can later use the learned compressor/decompressor to transform the **production** +set and measure its reconstruction error. If the reconstruction error is bigger than a threshold, the structure learned by PCA no longer +accurately resembles the underlying structure of the analysis data. This indicates that there is data drift in the analysis/production data. + +To learn more about how this works, check out our +documentation `Data Reconstruction with PCA Deep Dive `_. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 18 + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 19 + +.. image:: ../_static/example_green_taxi_pca_error.svg + +We don't see any multivariate drift happening. This may occur because the linear relationships between features did not change much, even though some features may have changed. 
+ +Imagine the points moving from an area with an average reconstruction error of 1.2 to another that is ≈1.2 instead of one that is 2 x 1.2. +In this case, the reconstruction error wouldn't change. `DataReconstructionDriftCalculator` is not expected to always capture the drift. We need both multivariate and +univariate drift detection to have the full picture. + +Let's analyze it at a feature level and run the univariate drift detection methods. + +Detecting univariate data drift +=============================== + +Univariate drift detection allows us to perform a more granular investigation. This time we will look at each feature individually and compare the reference and +analysis periods in search of drift in any relevant feature. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 20 + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 21 + +.. image:: ../_static/example_green_taxi_location_udc.svg + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 22 + +.. image:: ../_static/example_green_taxi_pickup_udc.svg + +On the plots above, we see some drift happening for the `DOLocationID` and the `pickup_time` columns around Dec 18th and the week of Christmas. + +Looking back at the performance estimation plot, we see that the performance did not drop on Dec 18th. This means that the drift on this date is a false alarm. + +What is more interesting is the week of the 25th. Again, we see a drift in the drop-out location and pick-up time that correlates with the dates of the performance drop. + +For this example, we picked the plots of the `DOLocationID` and the `pickup_time` since they are the two most important features showing data drift. + +But if you want to check if the other features drifted, you can run the following code and analyze each column distribution. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 23 + +.. 
image:: ../_static/example_green_taxi_all_udc.svg + +Bonus: Comparing realized and estimated performance +=================================================== + +When targets become available, we can calculate the actual model performance on production data. Also called realized performance. +In the cell below, we calculate the realized performance and compare it with NannyML's estimation. + +.. nbimport:: + :path: ./example_notebooks/Examples Green Taxi.ipynb + :cells: 24 + +.. image:: ../_static/example_green_taxi_dle_vs_realized.svg + + +In the plot above, the estimated performance is usually close to the realized one. Except for some points during the holidays where the performance degradation is bigger +than estimated. + +This may be because we have less than a year of data, so the model has no notion of what a holiday is and what it looks like. This is a sign of concept drift. +Currently, NannyML's algorithms don't support concept drift. But, the good news is that concept drift often coincides with data drift, +so in this case, `DLE `_ was able to pick up some of the degradation +issues during the holidays. + +Conclusion +========== + +We built an ML model to predict the tip amount a passenger will leave after a taxi ride. Then, we used this model to make predictions on actual production data. +And we applied NannyML's performance estimation to spot performance degradation patterns. We also used data drift detection methods to explain these performance issues. + +After finding what is causing the performance degradation issues, we need to figure out how to fix it. +Check out our previous blog post to learn six ways `to address data distribution shift `_. diff --git a/docs/quick.rst b/docs/quick.rst index 87d6d36d..53a1abed 100644 --- a/docs/quick.rst +++ b/docs/quick.rst @@ -8,6 +8,12 @@ Quickstart What is NannyML? ---------------- +.. raw:: html + + + Open In Colab + + .. 
include:: ./common/quickstart_what_is_nannyml.rst diff --git a/docs/tutorials/compare_estimated_and_realized_performance.rst b/docs/tutorials/compare_estimated_and_realized_performance.rst index cb388a8c..b2549469 100644 --- a/docs/tutorials/compare_estimated_and_realized_performance.rst +++ b/docs/tutorials/compare_estimated_and_realized_performance.rst @@ -20,6 +20,8 @@ When the :term:`targets` become available, the quality of estimations pr The beginning of the code below is similar to the one in :ref:`tutorial on performance calculation with binary classification data`. +While this tutorial uses the **roc_auc** metric, any metric estimated and calculated by NannyML can +be used for comparison. For simplicity this guide is based on a synthetic dataset included in the library, where the monitored model predicts whether a customer will repay a loan to buy a car. diff --git a/docs/tutorials/data_quality/missing.rst b/docs/tutorials/data_quality/missing.rst index 586e33cc..7cbf608a 100644 --- a/docs/tutorials/data_quality/missing.rst +++ b/docs/tutorials/data_quality/missing.rst @@ -38,14 +38,22 @@ The :class:`~nannyml.data_quality.missing.calculator.MissingValuesCalculator` cl the functionality needed for missing values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. -- Optionally, a boolean option indicating whether we want the absolute count of the missing +- **column_names:** A list with the names of columns to be evaluated. +- **normalize (Optional):** Optionally, a boolean option indicating whether we want the absolute count of the missing value instances or their relative ratio. By default it is set to true. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. 
See available threshold options - :ref:`here`. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation + of the provided data in order to create chunks. +- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Missing Values.ipynb diff --git a/docs/tutorials/data_quality/unseen.rst b/docs/tutorials/data_quality/unseen.rst index de9fd104..a5127bf3 100644 --- a/docs/tutorials/data_quality/unseen.rst +++ b/docs/tutorials/data_quality/unseen.rst @@ -40,14 +40,22 @@ The :class:`~nannyml.data_quality.unseen.calculator.UnseenValuesCalculator` clas the functionality needed for unseen values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. They need to be categorical columns. -- Optionally, a boolean option indicating whether we want the absolute count of the unseen +- **column_names:** A list with the names of columns to be evaluated. They need to be categorical columns. +- **normalize (Optional):** Optionally, a boolean option indicating whether we want the absolute count of the unseen value instances or their relative ratio. By default it is set to true. 
-- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation + of the provided data in order to create chunks. +- **thresholds (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. warning:: diff --git a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst index 39d28cff..19f40366 100644 --- a/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst +++ b/docs/tutorials/detecting_data_drift/multivariate_drift_detection.rst @@ -50,12 +50,30 @@ Let's start by loading some synthetic data provided by the NannyML package and s :cell: 2 The :class:`~nannyml.drift.multivariate.data_reconstruction.calculator.DataReconstructionDriftCalculator` -module implements this functionality. 
We need to instantiate it with appropriate parameters - the column names of the features we want to run drift detection on, -and the timestamp column name. The features can be passed in as a simple list of strings. Alternatively, we can create a list by excluding the columns in the dataframe that are not features, -and pass them into the argument. - -Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data, which the results will be based on. -Then the +module implements this functionality. We need to instantiate it with appropriate parameters: + +- **column_names:** A list with the column names of the features we want to run drift detection on. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **n_components (Optional):** The n_components parameter as passed to the sklearn `PCA constructor`_. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation + of the provided data in order to create chunks. +- **imputer_categorical (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy + for imputing missing values for categorical features. +- **imputer_continuous (Optional):** An sklearn `SimpleImputer`_ object specifying an appropriate strategy + for imputing missing values for continuous features. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. 
+ For more information about thresholds, check out the :ref:`thresholds tutorial`. + +Next, the :meth:`~nannyml.base.AbstractCalculator.fit` method needs to be called on the reference data, +which the results will be based on. Then the :meth:`~nannyml.base.AbstractCalculator.calculate` method will calculate the multivariate drift results on the provided data. @@ -101,11 +119,8 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co * The purple step plot shows the reconstruction error in each chunk of the analysis period. Thick squared point markers indicate the middle of these chunks. - * The low-saturated purple area around the reconstruction error indicates the :ref:`sampling error`. - * The red horizontal dashed lines show upper and lower thresholds for alerting purposes. - * If the reconstruction error crosses the upper or lower threshold an alert is raised which is indicated with a red, low-saturated background across the whole width of the relevant chunk. A red, diamond-shaped point marker additionally indicates this in the middle of the chunk. @@ -118,9 +133,6 @@ NannyML can also visualize the multivariate drift results in a plot. Our plot co The multivariate drift results provide a concise summary of where data drift is happening in our input data. -.. _SimpleImputer: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html - - Insights -------- @@ -137,3 +149,6 @@ estimate the impact of the observed changes. For more information on how multivariate drift detection works, the :ref:`Data Reconstruction with PCA` explanation page gives more details. + +.. _`PCA constructor`: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html +.. 
_`SimpleImputer`: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html diff --git a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst index c210433e..fd1e3a4b 100644 --- a/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst +++ b/docs/tutorials/detecting_data_drift/univariate_drift_detection.rst @@ -49,14 +49,34 @@ We begin by loading some synthetic data provided in the NannyML package. This is The :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator` class implements the functionality needed for univariate drift detection. First, we need to instantiate it with the appropriate parameters: -- The names of the columns to be evaluated. -- A list of methods to use on continuous columns. You can chose from :ref:`kolmogorov_smirnov`, - :ref:`jensen_shannon`, :ref:`wasserstein` - and :ref:`hellinger`. -- A list of methods to use on categorical columns. You can choose from :ref:`chi2`, :ref:`jensen_shannon`, +- **column_names:** A list with the names of columns to be evaluated. +- **treat_as_categorical (Optional):** A list of column names to treat as categorical columns. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **categorical_methods (Optional):** A list of methods to use on categorical columns. + You can choose from :ref:`chi2`, :ref:`jensen_shannon`, :ref:`l_infinity`, and :ref:`hellinger`. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default chunker creating 10 chunks will be used. +- **continuous_methods (Optional):** A list of methods to use on continuous columns. + You can choose from :ref:`kolmogorov_smirnov`, + :ref:`jensen_shannon`, + :ref:`wasserstein` + and :ref:`hellinger`. 
+- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation + of the provided data in order to create chunks. +- **thresholds (Optional):** A dictionary allowing users to set a custom threshold strategy for each method. + It links a `Threshold` subclass to a method name. + For more information about thresholds, check out the :ref:`thresholds tutorial`. +- **computation_params (Optional):** A dictionary which allows users to specify whether they want drift calculated on + the exact reference data or an estimated distribution of the reference data obtained + using binning techniques. Applicable only to Kolmogorov-Smirnov and Wasserstein. For more information look at + :class:`~nannyml.drift.univariate.calculator.UnivariateDriftCalculator`. .. nbimport:: :path: ./example_notebooks/Tutorial - Drift - Univariate.ipynb diff --git a/docs/tutorials/performance_estimation/binary_performance_estimation/business_value_estimation.rst b/docs/tutorials/performance_estimation/binary_performance_estimation/business_value_estimation.rst index d9303c59..beff3271 100644 --- a/docs/tutorials/performance_estimation/binary_performance_estimation/business_value_estimation.rst +++ b/docs/tutorials/performance_estimation/binary_performance_estimation/business_value_estimation.rst @@ -160,8 +160,5 @@ What's next ----------- The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. 
-When the target values become available they can be :ref:`compared with the estimated -results`. - -You can learn more about the Confidence Based Performance Estimation and its limitations in the -:ref:`How it Works page`. +When the target values become available we can +:ref:`compare realized and estimated business value results`. diff --git a/docs/tutorials/performance_estimation/binary_performance_estimation/confusion_matrix_estimation.rst b/docs/tutorials/performance_estimation/binary_performance_estimation/confusion_matrix_estimation.rst index a7174261..e1d95710 100644 --- a/docs/tutorials/performance_estimation/binary_performance_estimation/confusion_matrix_estimation.rst +++ b/docs/tutorials/performance_estimation/binary_performance_estimation/confusion_matrix_estimation.rst @@ -161,8 +161,5 @@ What's next ----------- The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. -When the target values become available they can be :ref:`compared with the estimated -results`. - -You can learn more about the Confidence Based Performance Estimation and its limitations in the -:ref:`How it Works page`. +When the target values become available we can +:ref:`compare realized and estimated confusion matrix results`. diff --git a/docs/tutorials/performance_estimation/binary_performance_estimation/custom_metric_estimation.rst b/docs/tutorials/performance_estimation/binary_performance_estimation/custom_metric_estimation.rst index bf7b6b2c..ed918cd6 100644 --- a/docs/tutorials/performance_estimation/binary_performance_estimation/custom_metric_estimation.rst +++ b/docs/tutorials/performance_estimation/binary_performance_estimation/custom_metric_estimation.rst @@ -135,11 +135,5 @@ What's next ----------- The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. 
-When the target values become available they can be :ref:`compared with the estimated -results`. - -You can learn more about the Confidence Based Performance Estimation and its limitations in the -:ref:`How it Works page`. - -And if targets are available or become available, you can learn more about *calculating* confusion -matrix elements in the :ref:`confusion-matrix-calculation` tutorial. +When the target values become available we can +:ref:`compare realized and estimated custom performance metric results`. diff --git a/docs/tutorials/performance_estimation/binary_performance_estimation/standard_metric_estimation.rst b/docs/tutorials/performance_estimation/binary_performance_estimation/standard_metric_estimation.rst index 283ec9da..f6acf359 100644 --- a/docs/tutorials/performance_estimation/binary_performance_estimation/standard_metric_estimation.rst +++ b/docs/tutorials/performance_estimation/binary_performance_estimation/standard_metric_estimation.rst @@ -157,8 +157,5 @@ What's next ----------- The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. -When the target values become available they can be :ref:`compared with the estimated -results`. - -You can learn more about the Confidence Based Performance Estimation and its limitations in the -:ref:`How it Works page`. +When the target values become available we can +:ref:`compare realized and estimated performance results`. 
diff --git a/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst b/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst index 704ed504..49e575b0 100644 --- a/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst +++ b/docs/tutorials/performance_estimation/multiclass_performance_estimation.rst @@ -4,26 +4,141 @@ Estimating Performance for Multiclass Classification ==================================================== -We currently support the following **standard** metrics for multiclass classification performance estimation: +This tutorial explains how to use NannyML to estimate the performance of multiclass classification +models in the absence of target data. To find out how :class:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE` estimates performance, read the :ref:`explanation of Confidence-based +Performance Estimation`. - * **roc_auc** - one-vs-the-rest, macro-averaged - * **f1** - macro-averaged - * **precision** - macro-averaged - * **recall** - macro-averaged - * **specificity** - macro-averaged - * **accuracy** +.. note:: + The following example uses :term:`timestamps`. + These are optional but have an impact on the way data is chunked and results are plotted. + You can read more about them in the :ref:`data requirements`. -For more information about estimating these metrics, refer to the :ref:`standard-metric-estimation` section. -We also support the following *complex* metrics for multiclass classification performance estimation: +Just The Code +------------- - * **confusion_matrix** +.. nbimport:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cells: 1 3 4 6 -For more information about estimating the confusion matrix for multiclass problems, -refer to the :ref:`multiclass-confusion-matrix-estimation` section. +.. admonition:: **Advanced configuration** + :class: hint -.. 
toctree:: - :maxdepth: 2 + - To learn how :class:`~nannyml.chunk.Chunk` works and to set up custom chunkings check out the :ref:`chunking tutorial ` + - To learn how :class:`~nannyml.thresholds.ConstantThreshold` works and to set up custom thresholds check out the :ref:`thresholds tutorial ` - multiclass_performance_estimation/standard_metric_estimation - multiclass_performance_estimation/confusion_matrix_estimation \ No newline at end of file +Walkthrough +----------- + + +For simplicity this guide is based on a synthetic dataset where the monitored model predicts +which type of credit card product new customers should be assigned to. +Check out :ref:`Credit Card Dataset` to learn more about this dataset. + +In order to monitor a model, NannyML needs to learn about it and set expectations from a reference dataset. +Then it can monitor the data that is subject to actual analysis, provided as the analysis dataset. +You can read more about this in our section on :ref:`data periods`. + +.. nbimport:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cells: 1 + +.. nbtable:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cell: 2 + +Next we create the Confidence-based Performance Estimation (:class:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE`) +estimator with a list of metrics, and an optional :term:`chunking` specification. For more information about +chunking check out the :ref:`chunking tutorial` and its :ref:`advanced guide`. + +.. note:: + The list of metrics specifies which performance metrics of the monitored model will be estimated. + The following metrics are currently supported: + + - ``roc_auc`` - one-vs-the-rest, macro-averaged + - ``f1`` - macro-averaged + - ``precision`` - macro-averaged + - ``recall`` - macro-averaged + - ``specificity`` - macro-averaged + - ``accuracy`` + + +.. 
nbimport:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cells: 3 + +The :class:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE` +estimator is then fitted using the +:meth:`~nannyml.performance_estimation.confidence_based.cbpe.CBPE.fit` method on the reference data. + +The fitted ``estimator`` can be used to estimate performance on other data, for which performance cannot be calculated. +Typically, this would be used on the latest production data where target is missing. In our example this is +the ``analysis_df`` data. + +NannyML can then output a dataframe that contains all the results. Let's have a look at the results for analysis period +only. + +.. nbimport:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cells: 4 + +.. nbtable:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cell: 5 + +Apart from chunk-related data, the results data have the following columns for each metric +that was estimated: + + - **value** - the estimate of a metric for a specific chunk. + - **sampling_error** - the estimate of the :term:`Sampling Error`. + - **realized** - when **target** values are available for a chunk, the realized performance metric will also + be calculated and included within the results. + - **upper_confidence_boundary** and **lower_confidence_boundary** - These values show the :term:`Confidence Band` of the relevant metric + and are equal to estimated value +/- 3 times the estimated :term:`Sampling Error`. + - **upper_threshold** and **lower_threshold** - crossing these thresholds will raise an alert on significant + performance change. The thresholds are calculated based on the actual performance of the monitored model on chunks in + the reference partition. By default, the thresholds are 3 standard deviations away from the mean performance calculated on + chunks. 
They are calculated during ``fit`` phase. You can also set up custom thresholds using constant or standard deviations thresholds, + to learn more about it check out our :ref:`tutorial on thresholds`. + - **alert** - flag indicating potentially significant performance change. ``True`` if estimated performance crosses + upper or lower threshold. + + +These results can also be plotted. Our plot contains several key elements. + +* The purple dashed step plot shows the estimated performance in each chunk of the analysis period. Thick squared point + markers indicate the middle of these chunks. + +* The black vertical line splits the reference and analysis periods. + +* The low-saturated colored area around the estimated performance indicates the :ref:`sampling error`. + +* The red horizontal dashed lines show upper and lower thresholds for alerting purposes. + +* If the estimated performance crosses the upper or lower threshold an alert is raised which is indicated with a red + diamond-shaped point marker in the middle of the chunk. + +Description of tabular results above explains how the +:term:`confidence bands` and thresholds are calculated. Additional information is shown in the hover (these are +interactive plots, though only static views are included here). + + +.. nbimport:: + :path: ./example_notebooks/Tutorial - Estimating Performance - Multiclass Classification.ipynb + :cells: 6 + +.. image:: ../../_static/tutorials/performance_estimation/multiclass_synthetic.svg + +Insights +-------- + +After reviewing the performance estimation results, we should be able to see any indications of performance change that +NannyML has detected based upon the model's inputs and outputs alone. + + +What's next +----------- + +The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. +When the target values become available we can +:ref:`compare realized and estimated performance results`. 
diff --git a/docs/tutorials/performance_estimation/regression_performance_estimation.rst b/docs/tutorials/performance_estimation/regression_performance_estimation.rst index 3747c5b3..d7f5d33f 100644 --- a/docs/tutorials/performance_estimation/regression_performance_estimation.rst +++ b/docs/tutorials/performance_estimation/regression_performance_estimation.rst @@ -164,11 +164,8 @@ What's next ----------- The :ref:`Data Drift` functionality can help us to understand whether data drift is causing the performance problem. -When the target values become available they can be :ref:`compared with the estimated -results`. - -You can learn more about Direct Error Estimation and its limitations in the -:ref:`How it Works page`. +When the target values become available we can +:ref:`compare realized and estimated performance results`. .. _LGBMRegressor defaults: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html diff --git a/docs/tutorials/summary_stats/avg.rst b/docs/tutorials/summary_stats/avg.rst index 5f4882fa..88874f14 100644 --- a/docs/tutorials/summary_stats/avg.rst +++ b/docs/tutorials/summary_stats/avg.rst @@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsAvgCalculator` class imple the functionality needed for mean values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **column_names:** A list with the names of columns to be evaluated. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. 
+- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation of + provided data in order to create chunks. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Stats - Avg.ipynb diff --git a/docs/tutorials/summary_stats/count.rst b/docs/tutorials/summary_stats/count.rst index ae2194c2..b3e22fbd 100644 --- a/docs/tutorials/summary_stats/count.rst +++ b/docs/tutorials/summary_stats/count.rst @@ -34,11 +34,19 @@ The :class:`~nannyml.stats.count.calculator.SummaryStatsRowCountCalculator` clas the functionality needed for row count calculations. We need to instantiate it with appropriate *optional* parameters: -- The name of the column containing the observation timestamps. -- A chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- A threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. 
+- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation of + provided data in order to create chunks. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Stats - Count.ipynb diff --git a/docs/tutorials/summary_stats/median.rst b/docs/tutorials/summary_stats/median.rst index d13fd454..c9d96d48 100644 --- a/docs/tutorials/summary_stats/median.rst +++ b/docs/tutorials/summary_stats/median.rst @@ -36,12 +36,20 @@ The :class:`~nannyml.stats.avg.calculator.SummaryStatsMedianCalculator` class im the functionality needed for median values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **column_names:** A list with the names of columns to be evaluated. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. 
+- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. +- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation of + provided data in order to create chunks. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Stats - Median.ipynb diff --git a/docs/tutorials/summary_stats/std.rst b/docs/tutorials/summary_stats/std.rst index 849a8e41..142bd8e1 100644 --- a/docs/tutorials/summary_stats/std.rst +++ b/docs/tutorials/summary_stats/std.rst @@ -36,12 +36,20 @@ The :class:`~nannyml.stats.std.calculator.SummaryStatsStdCalculator` class imple the functionality needed for standard deviation values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **column_names:** A list with the names of columns to be evaluated. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. 
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation of + provided data in order to create chunks. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Stats - Std.ipynb diff --git a/docs/tutorials/summary_stats/sum.rst b/docs/tutorials/summary_stats/sum.rst index da2c4179..634d9062 100644 --- a/docs/tutorials/summary_stats/sum.rst +++ b/docs/tutorials/summary_stats/sum.rst @@ -36,12 +36,20 @@ The :class:`~nannyml.stats.sum.calculator.SummaryStatsSumCalculator` class imple the functionality needed for sum values calculations. We need to instantiate it with appropriate parameters: -- The names of the columns to be evaluated. -- Optionally, the name of the column containing the observation timestamps. -- Optionally, a chunking approach or a predefined chunker. If neither is provided, the default - chunker creating 10 chunks will be used. -- Optionally, a threshold strategy to modify the default one. See available threshold options - :ref:`here`. +- **column_names:** A list with the names of columns to be evaluated. +- **timestamp_column_name (Optional):** The name of the column in the reference data that + contains timestamps. +- **chunk_size (Optional):** The number of observations in each chunk of data + used. Only one chunking argument needs to be provided. For more information about + :term:`chunking` configurations check out the :ref:`chunking tutorial`. +- **chunk_number (Optional):** The number of chunks to be created out of data provided for each + :ref:`period`. +- **chunk_period (Optional):** The time period based on which we aggregate the provided data in + order to create chunks. 
+- **chunker (Optional):** A NannyML :class:`~nannyml.chunk.Chunker` object that will handle the aggregation of + provided data in order to create chunks. +- **threshold (Optional):** The threshold strategy used to calculate the alert threshold limits. + For more information about thresholds, check out the :ref:`thresholds tutorial`. .. nbimport:: :path: ./example_notebooks/Tutorial - Stats - Sum.ipynb diff --git a/docs/tutorials/thresholds.rst b/docs/tutorials/thresholds.rst index 3070c6e5..81e30496 100644 --- a/docs/tutorials/thresholds.rst +++ b/docs/tutorials/thresholds.rst @@ -46,7 +46,6 @@ This snippet shows how to create an instance of the :class:`~nannyml.thresholds. .. nbimport:: :path: ./example_notebooks/Tutorial - Thresholds.ipynb :cells: 2 - :show_output: .. _thresholds_std: @@ -72,7 +71,6 @@ This snippet shows how to create an instance of the :class:`~nannyml.thresholds. .. nbimport:: :path: ./example_notebooks/Tutorial - Thresholds.ipynb :cells: 3 - :show_output: Setting custom thresholds for calculators and estimators diff --git a/docs/tutorials/working_with_results.rst b/docs/tutorials/working_with_results.rst index b59d4ed4..57ecd978 100644 --- a/docs/tutorials/working_with_results.rst +++ b/docs/tutorials/working_with_results.rst @@ -208,16 +208,16 @@ the database, in this case, an `SQLite` database. .. nbimport:: :path: ./example_notebooks/Tutorial - Working with results.ipynb - :cells: 11 + :cells: 20 A quick inspection shows that the database was populated and contains the univariate drift calculation results. .. nbimport:: :path: ./example_notebooks/Tutorial - Working with results.ipynb - :cells: 12 + :cells: 21 :show_output: .. 
nbimport:: :path: ./example_notebooks/Tutorial - Working with results.ipynb - :cells: 13 + :cells: 22 :show_output: diff --git a/nannyml/__init__.py b/nannyml/__init__.py index 0bcc5a7d..c2b5ca97 100644 --- a/nannyml/__init__.py +++ b/nannyml/__init__.py @@ -31,7 +31,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.9.0' +__version__ = '0.9.1' import logging @@ -64,6 +64,13 @@ ) from .usage_logging import UsageEvent, disable_usage_logging, enable_usage_logging, log_usage +try: + import nannyml_premium + + logging.getLogger().debug('loaded "nannyml_premium" package') +except Exception: + pass + # read any .env files to import environment variables load_dotenv() diff --git a/nannyml/config.py b/nannyml/config.py index a1103b42..adc43b45 100644 --- a/nannyml/config.py +++ b/nannyml/config.py @@ -66,6 +66,7 @@ class StoreConfig(BaseModel): path: str credentials: Optional[Dict[str, Any]] filename: Optional[str] + invalidate: bool = False class CalculatorConfig(BaseModel): diff --git a/nannyml/drift/multivariate/data_reconstruction/result.py b/nannyml/drift/multivariate/data_reconstruction/result.py index 00cd4154..399fbd38 100644 --- a/nannyml/drift/multivariate/data_reconstruction/result.py +++ b/nannyml/drift/multivariate/data_reconstruction/result.py @@ -105,8 +105,8 @@ def plot(self, kind: str = 'drift', *args, **kwargs) -> go.Figure: if kind == 'drift': return plot_metric( self, - title='Multivariate drift (PCA reconstruction error)', - metric_display_name='Data reconstruction drift', + title='Multivariate Drift (PCA Reconstruction Error)', + metric_display_name='Reconstruction Error', metric_column_name='reconstruction_error', ) else: diff --git a/nannyml/io/db/entities.py b/nannyml/io/db/entities.py index 4417ed0b..f282b31e 100644 --- a/nannyml/io/db/entities.py +++ b/nannyml/io/db/entities.py @@ -161,3 +161,29 @@ class DLEPerformanceMetric(Metric, table=True): # 
type: ignore[call-arg] #: The lower alerting threshold value lower_threshold: Optional[float] + + +class UnseenValuesMetric(Metric, table=True): + __tablename__ = "unseen_values_metrics" + + #: The name of the column this metric belongs to + column_name: str + + #: The upper alerting threshold value + upper_threshold: Optional[float] + + #: The lower alerting threshold value + lower_threshold: Optional[float] + + +class MissingValuesMetric(Metric, table=True): + __tablename__ = "missing_values_metrics" + + #: The name of the column this metric belongs to + column_name: str + + #: The upper alerting threshold value + upper_threshold: Optional[float] + + #: The lower alerting threshold value + lower_threshold: Optional[float] diff --git a/nannyml/io/db/mappers.py b/nannyml/io/db/mappers.py index f558915d..2254124d 100644 --- a/nannyml/io/db/mappers.py +++ b/nannyml/io/db/mappers.py @@ -6,13 +6,20 @@ from datetime import datetime from typing import Any, Callable, Dict, List, Optional, Type +from nannyml.data_quality.missing.result import Result as MissingValuesResult +from nannyml.data_quality.unseen.result import Result as UnseenValuesResult from nannyml.drift.multivariate.data_reconstruction.result import Result as DataReconstructionDriftResult from nannyml.drift.univariate import Result as UnivariateDriftResult from nannyml.exceptions import InvalidArgumentsException from nannyml.io.db.entities import CBPEPerformanceMetric, DataReconstructionFeatureDriftMetric, DLEPerformanceMetric from nannyml.io.db.entities import Metric from nannyml.io.db.entities import Metric as DbMetric -from nannyml.io.db.entities import RealizedPerformanceMetric, UnivariateDriftMetric +from nannyml.io.db.entities import ( + MissingValuesMetric, + RealizedPerformanceMetric, + UnivariateDriftMetric, + UnseenValuesMetric, +) from nannyml.performance_calculation.result import Result as RealizedPerformanceResult from nannyml.performance_estimation.confidence_based.results import Result as 
CBPEResult from nannyml.performance_estimation.direct_loss_estimation.result import Result as DLEResult @@ -225,7 +232,7 @@ def _parse( 'timestamp column to be specified and present' ) - res: List[Metric] = [] + res: List[DbMetric] = [] for metric in [metric.column_name for metric in result.metrics]: res += ( @@ -333,7 +340,7 @@ def _parse( 'timestamp column to be specified and present' ) - res: List[Metric] = [] + res: List[DbMetric] = [] for metric in [metric.column_name for metric in result.metrics]: res += ( @@ -353,3 +360,121 @@ def _parse( ) return res + + +@MapperFactory.register(UnseenValuesResult) +class UnseenValuesResultMapper: + def map_to_entity(self, result, **metric_args) -> List[DbMetric]: + def _parse( + column_name: str, + start_date: datetime, + end_date: datetime, + value, + upper_threshold, + lower_threshold, + alert: bool, + ) -> UnseenValuesMetric: + timestamp = start_date + (end_date - start_date) / 2 + + return UnseenValuesMetric( + column_name=column_name, + metric_name="count", + start_timestamp=start_date, + end_timestamp=end_date, + timestamp=timestamp, + value=value, + upper_threshold=upper_threshold, + lower_threshold=lower_threshold, + alert=alert, + **metric_args, + ) + + if result.timestamp_column_name is None: + raise NotImplementedError( + 'no timestamp column was specified. 
Listing metrics currently requires a ' + 'timestamp column to be specified and present' + ) + + columns: List[str] = list( + filter(lambda col: col != 'chunk', result.to_df().columns.get_level_values(0).drop_duplicates()) + ) + + res: List[DbMetric] = [] + + for column in columns: + res += ( + result.filter(period='analysis') + .to_df()[ + [ + ('chunk', 'start_date'), + ('chunk', 'end_date'), + (column, 'value'), + (column, 'upper_threshold'), + (column, 'lower_threshold'), + (column, 'alert'), + ] + ] + .apply(lambda r: _parse(column, *r), axis=1) + .to_list() + ) + + return res + + +@MapperFactory.register(MissingValuesResult) +class MissingValuesResultMapper: + def map_to_entity(self, result, **metric_args) -> List[DbMetric]: + def _parse( + column_name: str, + start_date: datetime, + end_date: datetime, + value, + upper_threshold, + lower_threshold, + alert: bool, + ) -> MissingValuesMetric: + timestamp = start_date + (end_date - start_date) / 2 + + return MissingValuesMetric( + column_name=column_name, + metric_name="count", + start_timestamp=start_date, + end_timestamp=end_date, + timestamp=timestamp, + value=value, + upper_threshold=upper_threshold, + lower_threshold=lower_threshold, + alert=alert, + **metric_args, + ) + + if result.timestamp_column_name is None: + raise NotImplementedError( + 'no timestamp column was specified. 
Listing metrics currently requires a ' + 'timestamp column to be specified and present' + ) + + columns: List[str] = list( + filter(lambda col: col != 'chunk', result.to_df().columns.get_level_values(0).drop_duplicates()) + ) + + res: List[DbMetric] = [] + + for column in columns: + res += ( + result.filter(period='analysis') + .to_df()[ + [ + ('chunk', 'start_date'), + ('chunk', 'end_date'), + (column, 'value'), + (column, 'upper_threshold'), + (column, 'lower_threshold'), + (column, 'alert'), + ] + ] + .apply(lambda r: _parse(column, *r), axis=1) + .to_list() + ) + + return res diff --git a/nannyml/performance_calculation/metrics/base.py b/nannyml/performance_calculation/metrics/base.py index c917ed8e..49da27ff 100644 --- a/nannyml/performance_calculation/metrics/base.py +++ b/nannyml/performance_calculation/metrics/base.py @@ -272,4 +272,8 @@ def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFra y_pred = y_pred[~y_true.isna()] y_true.dropna(inplace=True) + # NaN values have been dropped. 
Try to infer types again + y_pred = y_pred.infer_objects() + y_true = y_true.infer_objects() + return y_true, y_pred diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 298bb244..2b4d97bc 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score +import warnings from nannyml._typing import ProblemType from nannyml.base import _list_missing @@ -99,6 +100,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.nunique() <= 1: + warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan else: return roc_auc_score(y_true, y_pred) @@ -167,6 +169,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated F1-score contains NaN values.") return np.nan else: return f1_score(y_true, y_pred) @@ -234,6 +237,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Precision score contains NaN values.") return np.nan else: return precision_score(y_true, y_pred) @@ -301,6 +305,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Recall score contains NaN values.") return np.nan else: return recall_score(y_true, y_pred) @@ -373,6 +378,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + 
warnings.warn("Calculated Specificity score contains NaN values.") return np.nan else: tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() @@ -446,6 +452,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan else: tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() @@ -564,6 +571,7 @@ def _calculate(self, data: pd.DataFrame): business_value = num_tp * tp_value + num_tn * tn_value + num_fp * fp_value + num_fn * fn_value if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Business Value contains NaN values.") return np.nan else: if self.normalize_business_value is None: @@ -745,6 +753,7 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: + warnings.warn("Calculated true_positives contain NaN values.") return np.nan num_tp = np.sum(np.logical_and(y_pred, y_true)) @@ -773,6 +782,7 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: + warnings.warn("Calculated true_negatives contain NaN values.") return np.nan num_tn = np.sum(np.logical_and(np.logical_not(y_pred), np.logical_not(y_true))) @@ -801,6 +811,7 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: + warnings.warn("Calculated false_positives contain NaN values.") return np.nan num_fp = np.sum(np.logical_and(y_pred, np.logical_not(y_true))) @@ -829,6 +840,7 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.empty or y_pred.empty: + warnings.warn("Calculated false_negatives contain NaN values.") 
return np.nan num_fn = np.sum(np.logical_and(np.logical_not(y_pred), y_true)) diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 0fd2840c..30b067c2 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import warnings from sklearn.metrics import ( accuracy_score, f1_score, @@ -131,6 +132,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if y_true.nunique() <= 1: + warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan else: return roc_auc_score(y_true, y_pred, multi_class='ovr', average='macro', labels=labels) @@ -218,6 +220,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated F1-score contains NaN values.") return np.nan else: return f1_score(y_true, y_pred, average='macro', labels=labels) @@ -305,6 +308,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Precision score contains NaN values.") return np.nan else: return precision_score(y_true, y_pred, average='macro', labels=labels) @@ -392,6 +396,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Recall score contains NaN values.") return np.nan else: return recall_score(y_true, y_pred, average='macro', labels=labels) @@ -479,6 +484,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 
1): + warnings.warn("Calculated Specificity score contains NaN values.") return np.nan else: MCM = multilabel_confusion_matrix(y_true, y_pred, labels=labels) @@ -562,6 +568,7 @@ def _calculate(self, data: pd.DataFrame): y_true, y_pred = _common_data_cleaning(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): + warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan else: return accuracy_score(y_true, y_pred) diff --git a/nannyml/performance_estimation/confidence_based/cbpe.py b/nannyml/performance_estimation/confidence_based/cbpe.py index 27da22ff..cd7909b4 100644 --- a/nannyml/performance_estimation/confidence_based/cbpe.py +++ b/nannyml/performance_estimation/confidence_based/cbpe.py @@ -284,8 +284,6 @@ def __init__( ) ) - self.confidence_upper_bound = 1 - self.confidence_lower_bound = 0 self.needs_calibration: bool = False if calibrator is None: diff --git a/nannyml/performance_estimation/confidence_based/metrics.py b/nannyml/performance_estimation/confidence_based/metrics.py index 02a85a00..d52c9044 100644 --- a/nannyml/performance_estimation/confidence_based/metrics.py +++ b/nannyml/performance_estimation/confidence_based/metrics.py @@ -237,17 +237,24 @@ def _common_cleaning( y_pred_proba = data[y_pred_proba_column_name] y_pred = data[self.y_pred] + y_true = data[self.y_true] if clean_targets else None - y_pred_proba.dropna(inplace=True) + # Build a keep-mask: True only for rows where every required column is non-NaN + mask = ~(y_pred.isna() | y_pred_proba.isna()) + if clean_targets: + mask = mask & ~(y_true.isna()) + # Drop missing values (NaN/None) + y_pred_proba = y_pred_proba[mask] + y_pred = y_pred[mask] if clean_targets: - y_true = data[self.y_true] - y_true = y_true[~y_pred_proba.isna()] - y_pred_proba = y_pred_proba[~y_true.isna()] - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - else: - y_true = None + y_true = y_true[mask] + + # NaN values have been dropped. 
Try to infer types again + y_pred_proba = y_pred_proba.infer_objects() + y_pred = y_pred.infer_objects() + if clean_targets: + y_true = y_true.infer_objects() return y_pred_proba, y_pred, y_true diff --git a/nannyml/runner.py b/nannyml/runner.py index d014eeae..6f7d2f23 100644 --- a/nannyml/runner.py +++ b/nannyml/runner.py @@ -7,12 +7,12 @@ import logging from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import pandas as pd from rich.console import Console -from nannyml._typing import Result +from nannyml._typing import Calculator, Estimator, Result from nannyml.config import Config, InputDataConfig, StoreConfig, WriterConfig from nannyml.data_quality.missing import MissingValuesCalculator from nannyml.data_quality.unseen import UnseenValuesCalculator @@ -59,18 +59,27 @@ def run_context(config: Config): ) -_registry: Dict[str, Type] = { - 'univariate_drift': UnivariateDriftCalculator, - 'multivariate_drift': DataReconstructionDriftCalculator, - 'performance': PerformanceCalculator, - 'cbpe': CBPE, - 'dle': DLE, - 'missing_values': MissingValuesCalculator, - 'unseen_values': UnseenValuesCalculator, -} _logger = logging.getLogger(__name__) +class CalculatorFactory: + """Registry mapping calculator/estimator type names to their classes; ``register`` adds or replaces an entry.""" + + registry: Dict[str, Type] = { + 'univariate_drift': UnivariateDriftCalculator, + 'multivariate_drift': DataReconstructionDriftCalculator, + 'performance': PerformanceCalculator, + 'cbpe': CBPE, + 'dle': DLE, + 'missing_values': MissingValuesCalculator, + 'unseen_values': UnseenValuesCalculator, + } + + @classmethod + def register(cls, name: str, calculator_type: Union[Type[Calculator], Type[Estimator]]): + cls.registry[name] = calculator_type + + class RunnerLogger: def __init__(self, logger: logging.Logger, console: 
Optional[Console] = None): self.logger = logger @@ -138,21 +147,26 @@ def run( # noqa: C901 store = get_store(calculator_config.store, run_logger) - if calculator_config.type not in _registry: + if calculator_config.type not in CalculatorFactory.registry: raise InvalidArgumentsException(f"unknown calculator type '{calculator_config.type}'") # first step: load or (create + fit) calculator context.increase_step() - calc_cls = _registry[calculator_config.type] + calc_cls = CalculatorFactory.registry[calculator_config.type] if store and calculator_config.store: run_logger.log( f"[{context.current_step}/{context.total_steps}] '{context.current_calculator}': " f"loading calculator from store" ) - calc = store.load(filename=calculator_config.store.filename, as_type=calc_cls) + if calculator_config.store.invalidate: + calc = None + else: + calc = store.load(filename=calculator_config.store.filename, as_type=calc_cls) + if calc is None: + reason = 'invalidated' if calculator_config.store.invalidate else 'not found in store' run_logger.log( - f"calculator '{context.current_calculator}' not found in store. " + f"calculator '{context.current_calculator}' {reason}. 
" f"Creating, fitting and storing new instance", log_level=logging.DEBUG, ) diff --git a/nannyml/thresholds.py b/nannyml/thresholds.py index 16b748ac..68cf1af8 100644 --- a/nannyml/thresholds.py +++ b/nannyml/thresholds.py @@ -265,7 +265,7 @@ def calculate_threshold_values( if ( lower_threshold_value_limit is not None and lower_threshold_value is not None - and lower_threshold_value < lower_threshold_value_limit + and lower_threshold_value <= lower_threshold_value_limit ): override_value = None if override_using_none else lower_threshold_value_limit if logger: @@ -278,7 +278,7 @@ def calculate_threshold_values( if ( upper_threshold_value_limit is not None and upper_threshold_value is not None - and upper_threshold_value > upper_threshold_value_limit + and upper_threshold_value >= upper_threshold_value_limit ): override_value = None if override_using_none else upper_threshold_value_limit if logger: diff --git a/nannyml/usage_logging.py b/nannyml/usage_logging.py index 54bd52ef..d242c387 100644 --- a/nannyml/usage_logging.py +++ b/nannyml/usage_logging.py @@ -243,11 +243,22 @@ def _get_system_information() -> Dict[str, Any]: "runtime_environment": _get_runtime_environment(), "python_version": platform.python_version(), "nannyml_version": __version__, + "nannyml_cloud": _is_nannyml_cloud(), } +def _is_nannyml_cloud(): + return 'NML_CLOUD' in os.environ + + def _get_runtime_environment(): - if _is_running_in_docker(): + if _is_running_in_aks(): + return 'aks' + if _is_running_in_eks(): + return 'eks' + if _is_running_in_kubernetes(): + return 'kubernetes' + elif _is_running_in_docker(): return 'docker' elif _is_running_in_notebook(): return 'notebook' @@ -266,6 +277,36 @@ def _is_running_in_docker(): return False +def _is_running_in_kubernetes(): + return Path('/var/run/secrets/kubernetes.io/').exists() + + +def _is_running_in_aks(): + import requests + + try: + metadata = requests.get( + 'http://169.254.169.254/metadata/instance?api-version=2021-02-01', 
headers={'Metadata': 'true'} + ) + return metadata.status_code == 200 + except Exception: + return False + + +def _is_running_in_eks(): + import requests + + try: + token = requests.put( + 'http://169.254.169.254/latest/api/token', headers={'X-aws-ec2-metadata-token-ttl-seconds': 21600} + ).raw() + + metadata = requests.get('http://169.254.169.254/latest/meta-data/', headers={'X-aws-ec2-metadata-token': token}) + return metadata.status_code == 200 + except Exception: + return False + + # Inspired by # https://github.com/zenml-io/zenml/blob/275109da08b783d5d2cd508b5f703aed0c66e485/src/zenml/environment.py#L182 # and https://stackoverflow.com/a/39662359 diff --git a/pyproject.toml b/pyproject.toml index 113ad8bb..ce0ed419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool] [tool.poetry] name = "nannyml" -version = "0.9.0" +version = "0.9.1" homepage = "https://github.com/nannyml/nannyml" description = "NannyML, Your library for monitoring model performance." authors = ["Niels Nuyttens "] @@ -103,6 +103,8 @@ pytest-lazy-fixture = "^0.6.3" types-click = "^7.1.8" types-python-dateutil = "^2.8.19.6" types-PyYAML = "^6.0" +types-requests = "^2.31.0.3" + [tool.black] line-length = 120 diff --git a/setup.cfg b/setup.cfg index e0143fc6..af714661 100644 --- a/setup.cfg +++ b/setup.cfg @@ -98,6 +98,7 @@ deps = types-click types-python-dateutil types-PyYAML + types-requests commands = flake8 nannyml tests mypy nannyml tests diff --git a/tests/io/test_writers.py b/tests/io/test_writers.py index 0f39244c..b05379ef 100644 --- a/tests/io/test_writers.py +++ b/tests/io/test_writers.py @@ -1,13 +1,17 @@ # Author: Niels Nuyttens # # License: Apache Software License 2.0 +import os import tempfile import pytest from pytest_lazyfixture import lazy_fixture +from nannyml.data_quality.missing import MissingValuesCalculator +from nannyml.data_quality.unseen import UnseenValuesCalculator from nannyml.datasets import ( load_synthetic_binary_classification_dataset, + 
load_synthetic_car_loan_data_quality_dataset, load_synthetic_car_price_dataset, load_synthetic_multiclass_classification_dataset, ) @@ -195,6 +199,30 @@ def dle_estimated_performance_for_regression_result(): return result +@pytest.fixture(scope='module') +def missing_values_for_binary_classification_result(): + reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_data_quality_dataset() + calc = MissingValuesCalculator( + column_names=[col for col in reference_df if col not in ['timestamp', 'y_pred', 'y_true']], + timestamp_column_name='timestamp', + ).fit(reference_df) + result = calc.calculate(analysis_df.join(analysis_targets_df)) + return result + + +@pytest.fixture(scope='module') +def unseen_values_for_binary_classification_result(): + reference_df, analysis_df, analysis_targets_df = load_synthetic_car_loan_data_quality_dataset() + calc = UnseenValuesCalculator( + # categorical features as described in + # https://nannyml.readthedocs.io/en/stable/datasets/binary_car_loan.html#dataset-description + column_names=['salary_range', 'repaid_loan_on_prev_car', 'size_of_downpayment'], + timestamp_column_name='timestamp', + ).fit(reference_df) + result = calc.calculate(analysis_df.join(analysis_targets_df)) + return result + + @pytest.mark.parametrize( 'result', [ @@ -210,6 +238,8 @@ def dle_estimated_performance_for_regression_result(): lazy_fixture('cbpe_estimated_performance_for_binary_classification_result'), lazy_fixture('cbpe_estimated_performance_for_multiclass_classification_result'), lazy_fixture('dle_estimated_performance_for_regression_result'), + lazy_fixture('missing_values_for_binary_classification_result'), + lazy_fixture('unseen_values_for_binary_classification_result'), ], ) def test_raw_files_writer_raises_no_exceptions_when_writing_to_parquet(result): @@ -236,6 +266,8 @@ def test_raw_files_writer_raises_no_exceptions_when_writing_to_parquet(result): lazy_fixture('cbpe_estimated_performance_for_binary_classification_result'), 
lazy_fixture('cbpe_estimated_performance_for_multiclass_classification_result'), lazy_fixture('dle_estimated_performance_for_regression_result'), + lazy_fixture('missing_values_for_binary_classification_result'), + lazy_fixture('unseen_values_for_binary_classification_result'), ], ) def test_raw_files_writer_raises_no_exceptions_when_writing_to_csv(result): @@ -262,6 +294,8 @@ def test_raw_files_writer_raises_no_exceptions_when_writing_to_csv(result): lazy_fixture('cbpe_estimated_performance_for_binary_classification_result'), lazy_fixture('cbpe_estimated_performance_for_multiclass_classification_result'), lazy_fixture('dle_estimated_performance_for_regression_result'), + lazy_fixture('missing_values_for_binary_classification_result'), + lazy_fixture('unseen_values_for_binary_classification_result'), ], ) def test_database_writer_raises_no_exceptions_when_writing(result): @@ -287,6 +321,8 @@ def test_database_writer_raises_no_exceptions_when_writing(result): lazy_fixture('cbpe_estimated_performance_for_binary_classification_result'), lazy_fixture('cbpe_estimated_performance_for_multiclass_classification_result'), lazy_fixture('dle_estimated_performance_for_regression_result'), + lazy_fixture('missing_values_for_binary_classification_result'), + lazy_fixture('unseen_values_for_binary_classification_result'), ], ) def test_pickle_file_writer_raises_no_exceptions_when_writing(result): @@ -296,3 +332,60 @@ def test_pickle_file_writer_raises_no_exceptions_when_writing(result): writer.write(result, filename='export.pkl') except Exception as exc: pytest.fail(f"an unexpected exception occurred: {exc}") + + +@pytest.mark.parametrize( + 'result, table_name, expected_row_count', + [ + (lazy_fixture('univariate_drift_for_binary_classification_result'), 'univariate_drift_metrics', 110), + (lazy_fixture('univariate_drift_for_multiclass_classification_result'), 'univariate_drift_metrics', 110), + (lazy_fixture('univariate_drift_for_regression_result'), 'univariate_drift_metrics', 
80), + ( + lazy_fixture('data_reconstruction_drift_for_binary_classification_result'), + 'data_reconstruction_feature_drift_metrics', + 10, + ), + ( + lazy_fixture('data_reconstruction_drift_for_multiclass_classification_result'), + 'data_reconstruction_feature_drift_metrics', + 10, + ), + ( + lazy_fixture('data_reconstruction_drift_for_regression_result'), + 'data_reconstruction_feature_drift_metrics', + 10, + ), + (lazy_fixture('realized_performance_for_binary_classification_result'), 'realized_performance_metrics', 40), + ( + lazy_fixture('realized_performance_for_multiclass_classification_result'), + 'realized_performance_metrics', + 40, + ), + (lazy_fixture('realized_performance_for_regression_result'), 'realized_performance_metrics', 40), + (lazy_fixture('cbpe_estimated_performance_for_binary_classification_result'), 'cbpe_performance_metrics', 20), + ( + lazy_fixture('cbpe_estimated_performance_for_multiclass_classification_result'), + 'cbpe_performance_metrics', + 20, + ), + (lazy_fixture('dle_estimated_performance_for_regression_result'), 'dle_performance_metrics', 20), + (lazy_fixture('missing_values_for_binary_classification_result'), 'missing_values_metrics', 90), + (lazy_fixture('unseen_values_for_binary_classification_result'), 'unseen_values_metrics', 30), + ], +) +def test_database_writer_exports_correctly(result, table_name, expected_row_count): + try: + writer = DatabaseWriter(connection_string='sqlite:///test.db', model_name='test') + writer.write(result) + + import sqlite3 + + with sqlite3.connect("test.db", uri=True) as db: + res = db.cursor().execute(f"SELECT COUNT(*) FROM {table_name}").fetchone() + assert res[0] == expected_row_count + + except Exception as exc: + pytest.fail(f"an unexpected exception occurred: {exc}") + + finally: + os.remove('test.db') diff --git a/tests/performance_calculation/test_performance_calculator.py b/tests/performance_calculation/test_performance_calculator.py index 23d72cb6..2d7e887b 100644 --- 
a/tests/performance_calculation/test_performance_calculator.py +++ b/tests/performance_calculation/test_performance_calculator.py @@ -180,6 +180,27 @@ def test_calculator_calculate_should_include_target_completeness_rate(data): # assert sut.loc[1, ('chunk', 'targets_missing_rate')] == 0.9 +def test_calculator_calculate_should_support_partial_bool_targets(data, performance_calculator): + """Test that the calculator supports partial bool targets. + + Pandas converts bool columns to object dtype when they contain NaN values. This previously resulted in problems + when calculating the performance metrics. This test ensures that the calculator supports partial bool targets. + """ + ref_data = data[0] + analysis_data = data[1].merge(data[2], on='identifier') + + # Convert target column to bool dtype + analysis_data = analysis_data.astype({'work_home_actual': 'bool'}) + + # Drop 10% of the target values in the first chunk + analysis_data.loc[0:499, 'work_home_actual'] = np.NAN + + performance_calculator.fit(reference_data=ref_data) + performance_calculator.calculate(analysis_data) + + # No further checks needed, if the above code runs without errors, the test passes. + + @pytest.mark.parametrize( 'custom_thresholds', [