diff --git a/analytics/tools/find_invalid_runs.ipynb b/analytics/tools/find_invalid_runs.ipynb
new file mode 100644
index 000000000..e76e226c6
--- /dev/null
+++ b/analytics/tools/find_invalid_runs.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Motivation\n",
+    "\n",
+    "This notebook can be used to find pipeline runs where we have empty evaluation responses despite expecting some. Older versions of Modyn have lest robustness in the evaluation handling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import MultiEvaluationInfo, PipelineLogs, SingleEvaluationInfo\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "log_dir = Path(\"/Users/mboether/phd/dynamic-data/sigmod-data/yearbook/debug/logs\")\n",
+    "logfiles = [logfile for logfile in log_dir.glob(\"**/pipeline.log\")]\n",
+    "logfiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def metrics_valid(logfile: Path):\n",
+    "    logs = PipelineLogs.model_validate_json(logfile.read_text())\n",
+    "    for eval_log in logs.supervisor_logs.stage_runs:\n",
+    "        if eval_log.id == PipelineStage.EVALUATE_MULTI.name:\n",
+    "            multiinfo = eval_log.info\n",
+    "            assert isinstance(multiinfo, MultiEvaluationInfo)\n",
+    "\n",
+    "            for info in multiinfo.interval_results:\n",
+    "                assert isinstance(info, SingleEvaluationInfo)\n",
+    "                res = info.results\n",
+    "\n",
+    "                if len(res[\"metrics\"]) == 0:\n",
+    "                    if res[\"dataset_size\"] == 0:\n",
+    "                        print(\n",
+    "                            f\"Warning: Empty metrics but empty dataset in {logfile}: {info}\"\n",
+    "                        )  # Might want to remove this - not sure if needed.\n",
+    "                    else:\n",
+    "                        return False\n",
+    "\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "invalid_pipelines = []\n",
+    "for logfile in tqdm(logfiles):\n",
+    "    if not metrics_valid(logfile):\n",
+    "        invalid_pipelines.append(logfile)\n",
+    "\n",
+    "invalid_pipelines\n",
+    "\n",
+    "# Typically, you'd want to delete those directories because they are invalid (see next cell)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Commented out for safety\n",
+    "\n",
+    "\"\"\"\n",
+    "import shutil\n",
+    "parent_dirs = {file_path.parent for file_path in invalid_pipelines}\n",
+    "\n",
+    "for directory in parent_dirs:\n",
+    "    try:\n",
+    "        shutil.rmtree(directory)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to delete {directory}: {e}\")\n",
+    "\"\"\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}