More plots

eth-easl · Sep 22, 2024 · bec88be · bec88be
1 parent 85e88a1
commit bec88be
Show file tree

Hide file tree

Showing 3 changed files with 806 additions and 12 deletions.
diff --git a/analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb b/analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb
@@ -0,0 +1,244 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines\n",
+    "from analytics.plotting.common.common import init_plot\n",
+    "from analytics.plotting.common.font import setup_font\n",
+    "from modyn.supervisor.internal.grpc.enums import PipelineStage\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import StageLog\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INPUTS\n",
+    "\n",
+    "# Dataset whose \"11_baselines_amount\" runs are plotted; \"huffpost\" and \"yearbook\" are the alternatives.\n",
+    "dataset = \"arxiv\"\n",
+    "\n",
+    "data_root = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering\")\n",
+    "pipelines_dir = data_root / dataset / \"11_baselines_amount\"\n",
+    "output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots\")\n",
+    "\n",
+    "# Fail early, and say which directory is missing instead of raising a bare AssertionError.\n",
+    "assert pipelines_dir.exists(), f\"pipelines_dir does not exist: {pipelines_dir}\"\n",
+    "assert output_dir.exists(), f\"output_dir does not exist: {output_dir}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Discover the pipeline runs stored below `pipelines_dir`; the mapping is keyed by pipeline id.\n",
+    "pipelines = list_pipelines(pipelines_dir)\n",
+    "max_pipeline_id = max(pipelines)\n",
+    "pipelines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.app.data.load import load_pipeline_logs\n",
+    "\n",
+    "# Load the logs of every discovered pipeline, keyed by pipeline id.\n",
+    "# Only the ids are needed here, so iterate over the keys rather than unpacking the unused values.\n",
+    "pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect one frame of TRAIN stage logs per pipeline and tag every row with the first entry of the\n",
+    "# pipeline's metadata tuple (presumably its name, which the name-cleaning cell rewrites -- TODO confirm).\n",
+    "list_df_train: list[pd.DataFrame] = []\n",
+    "\n",
+    "for pipeline_id, pipeline_info in pipelines.items():\n",
+    "    stage_runs = pipeline_logs[pipeline_id].supervisor_logs.stage_runs\n",
+    "    train_logs = [run for run in stage_runs if run.id == PipelineStage.TRAIN.name]\n",
+    "    df_pipeline = StageLog.df(stage_logs=train_logs, extended=True)\n",
+    "    df_pipeline[\"pipeline_id\"] = pipeline_info[0]\n",
+    "    list_df_train.append(df_pipeline)\n",
+    "\n",
+    "df_train = pd.concat(list_df_train)\n",
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Conversion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clean pipeline name\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "\n",
+    "def pipeline_name_cleaner(name: str):\n",
+    "    \"\"\"Rewrite everything up to and including ``_dataamount_<digits>`` as ``trigger every <digits> samples``.\n",
+    "\n",
+    "    Text after the digits is kept, and names without a ``_dataamount_<digits>`` part are returned unchanged.\n",
+    "    \"\"\"\n",
+    "    pattern = r\".*_dataamount_(\\d+)\"\n",
+    "    replacement = \"trigger every \\\\1 samples\"\n",
+    "    return re.sub(pattern, replacement, name)\n",
+    "\n",
+    "\n",
+    "# Replace the raw names in the `pipeline_id` column with their human-readable form.\n",
+    "df_train[\"pipeline_id\"] = df_train[\"pipeline_id\"].map(pipeline_name_cleaner)\n",
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert both time columns to minutes (drop the `/ 60` to keep seconds instead).\n",
+    "# The dtype check makes the cell safe to re-run: once `duration` holds plain numbers the\n",
+    "# conversion is skipped, so neither column is ever divided twice.\n",
+    "if pd.api.types.is_timedelta64_dtype(df_train[\"duration\"]):\n",
+    "    df_train[\"duration\"] = df_train[\"duration\"].dt.total_seconds() / 60\n",
+    "    df_train[\"train_time_at_trainer\"] = df_train[\"train_time_at_trainer\"] / 1_000 / 60  # millis to minutes\n",
+    "df_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order the rows by ascending sample count.\n",
+    "df_train = df_train.sort_values(\"num_samples\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create Plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from analytics.plotting.common.color import discrete_colors, main_color\n",
+    "\n",
+    "sns.set_style(\"whitegrid\")\n",
+    "\n",
+    "init_plot()\n",
+    "setup_font(small_label=True, small_title=True)\n",
+    "\n",
+    "# Base figure dimensions handed to `figsize`, scaled by the two factors below.\n",
+    "DOUBLE_FIG_WIDTH = 10\n",
+    "DOUBLE_FIG_HEIGHT = 3.5\n",
+    "\n",
+    "width_factor = 0.5\n",
+    "height_factor = 0.5\n",
+    "\n",
+    "fig = plt.figure(\n",
+    "    edgecolor=\"black\",\n",
+    "    frameon=True,\n",
+    "    figsize=(\n",
+    "        DOUBLE_FIG_WIDTH * width_factor,\n",
+    "        2 * DOUBLE_FIG_HEIGHT * height_factor,\n",
+    "    ),\n",
+    "    dpi=300,\n",
+    ")\n",
+    "\n",
+    "# Regression of trainer-side training time over the number of samples.\n",
+    "# No `ax` is passed, so seaborn draws on the current axes of the figure created above.\n",
+    "ax1 = sns.regplot(\n",
+    "    df_train,\n",
+    "    x=\"num_samples\",\n",
+    "    y=\"train_time_at_trainer\",  # duration\n",
+    "    color=main_color(0),\n",
+    ")\n",
+    "\n",
+    "# Hand-picked palette slices: yearbook takes entries 0-4 and 9-13 of a 14-color palette, every other\n",
+    "# dataset entries 0-2 and 6-7 of an 8-color one. huffpost shares the fallback slices, so it needs no\n",
+    "# branch of its own.\n",
+    "if \"yearbook\" in str(pipelines_dir):\n",
+    "    palette = discrete_colors(14)[0:5] + discrete_colors(14)[9:14]\n",
+    "else:\n",
+    "    palette = discrete_colors(8)[0:3] + discrete_colors(8)[6:8]\n",
+    "\n",
+    "# Overlay the individual trainings on the same axes, colored per pipeline.\n",
+    "ax2 = sns.scatterplot(\n",
+    "    df_train,\n",
+    "    x=\"num_samples\",\n",
+    "    y=\"train_time_at_trainer\",  # duration\n",
+    "    hue=\"pipeline_id\",\n",
+    "    palette=palette,\n",
+    "    s=200,\n",
+    "    legend=True,\n",
+    "    marker=\"X\",\n",
+    ")\n",
+    "\n",
+    "# Display the plot\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: run more variants in less dense areas\n",
+    "# TODO: plot / add number of datapoints to thesis so that the significance of the regression line is clear\n",
+    "# State in thesis that there are no outliers to be expected!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}