how many plots can one do in this space...

eth-easl · Sep 26, 2024 · 7529234 · 7529234
1 parent e735bce
commit 7529234
Show file tree

Hide file tree

Showing 7 changed files with 1,423 additions and 3 deletions.
diff --git a/analytics/plotting/common/cost_matrix.py b/analytics/plotting/common/cost_matrix.py
@@ -77,7 +77,14 @@ def plot_cost_matrix(
     hue_col = "id"
 
     palette = sns.color_palette("RdBu", 10)
-    new_palette = [palette[0], palette[1], palette[2], palette[-3], palette[-1]]
+    new_palette = {
+        "train": palette[0],
+        "inform remaining data": palette[-2],
+        "evaluate trigger policy": palette[2],
+        "inform trigger": palette[-1],
+        "store trained model": palette[1],
+    }
+    # [palette[0], palette[-2], palette[1], palette[-1], palette[2]]
 
     # use sum of all pipelines to determine the order of the bars that is consistent across subplots
     df_agg = df_costs.groupby([hue_col]).agg({y_col: "sum"}).reset_index()
@@ -97,7 +104,7 @@ def plot_cost_matrix(
             elif not cumulative and y_minutes:
                 df_final[y_col] = df_final[y_col] / 60
 
-            ax = axs[row, int(cumulative)]
+            ax = axs[row, int(cumulative)] if len(pipeline_ids) > 1 else axs[int(cumulative)]
             h = sns.histplot(
                 df_final,
                 x=x_col,

diff --git a/analytics/plotting/common/heatmap.py b/analytics/plotting/common/heatmap.py
@@ -37,6 +37,9 @@ def build_heatmap(
     cmap: Any | None = None,
     linewidth: int = 2,
     grid_alpha: float = 0.0,
+    disable_horizontal_grid: bool = False,
+    df_logs_models: pd.DataFrame | None = None,
+    triggers: dict[int, list[pd.Timestamp]] = {},
 ) -> Figure | Axes:
     init_plot()
     setup_font(small_label=True, small_title=True)
@@ -112,7 +115,12 @@ def build_heatmap(
             )
     ax.invert_yaxis()
 
-    ax.grid(axis="y", linestyle="--", alpha=grid_alpha, color="white")
+    ax.grid(
+        axis="y",
+        linestyle="--",
+        alpha=0 if disable_horizontal_grid else grid_alpha,
+        color="white",
+    )
     ax.grid(axis="x", linestyle="--", alpha=grid_alpha, color="white")
 
     if y_ticks is not None:
@@ -138,6 +146,7 @@ def build_heatmap(
     if title_label:
         ax.set_title(title_label)
 
+    # mainly for offline exploration
     previous_y = 0
     for x_start, x_end, y in policy:
         # main box
@@ -171,6 +180,49 @@ def build_heatmap(
         ax.add_patch(connector)
         previous_y = y
 
+    # for post factum evaluation
+    if df_logs_models is not None:
+        for type_, dashed in [("train", False), ("usage", False), ("train", True)]:
+            for active_ in df_logs_models.iterrows():
+                x_start = active_[1][f"{type_}_start"].year - 1930
+                x_end = active_[1][f"{type_}_end"].year - 1930
+                y = active_[1]["model_idx"]
+                rect = plt.Rectangle(
+                    (x_start, y - 1),  # y: 0 based index, model_idx: 1 based index
+                    x_end - x_start,
+                    1,
+                    edgecolor="White" if type_ == "train" else "Black",
+                    facecolor="none",
+                    linewidth=1.5,
+                    linestyle="dotted" if dashed else "solid",
+                    hatch="/",
+                    joinstyle="bevel",
+                    # capstyle="round",
+                )
+                ax.add_patch(rect)
+
+    if triggers:
+        for y, triggers_df in triggers.items():
+            for row in triggers_df.iterrows():
+                type_ = "usage"
+                # for y, x_list in triggers.items():
+                x_start = row[1][f"{type_}_start"].year - 1930
+                x_end = row[1][f"{type_}_end"].year - 1930
+                # for x in x_list:
+                rect = plt.Rectangle(
+                    (x_start, y),  # y: 0 based index (the `triggers` dict key), used without offset
+                    x_end - x_start,
+                    1,
+                    edgecolor="black",
+                    facecolor="none",
+                    linewidth=1,
+                    # linestyle="dotted",
+                    # hatch="/",
+                    # joinstyle="bevel",
+                    # capstyle="round",
+                )
+                ax.add_patch(rect)
+
     # Display the plot
     plt.tight_layout()
     # plt.show()

diff --git a/analytics/plotting/rh_thesis/TODO.md b/analytics/plotting/rh_thesis/TODO.md
@@ -0,0 +1,3 @@
+drift:
+
+- plot arxiv / huffpost
diff --git a/analytics/plotting/rh_thesis/drift/yb_cost.ipynb b/analytics/plotting/rh_thesis/drift/yb_cost.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
+    "from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
+    "from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
+    "from analytics.plotting.common.save import save_plot\n",
+    "from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipelines_dirs = [\n",
+    "    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
+    "]\n",
+    "\n",
+    "pipeline_logs: dict[int, PipelineLogs] = {}\n",
+    "pipelines: dict[int, tuple[str, Path]] = {}\n",
+    "\n",
+    "for dir in pipelines_dirs:\n",
+    "    dir_pipelines = list_pipelines(dir)\n",
+    "    pipelines.update(dir_pipelines)\n",
+    "    max_pipeline_id = max(dir_pipelines.keys())\n",
+    "    print(pipelines)\n",
+    "    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
+    "    assert dir.exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mode: time + amount\n",
+    "pipeline_ids = [107]  # yb drift mmd 0.06 250 4d\n",
+    "\n",
+    "# doesn't do anything unless include_composite_model = True\n",
+    "composite_model_variant = \"currently_active_model\"\n",
+    "\n",
+    "patch_yearbook = True\n",
+    "dataset_id = \"yearbook_test\"\n",
+    "eval_handler = \"periodic-delta+-1y\"\n",
+    "metric = \"Accuracy\"\n",
+    "include_composite_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Wrangle data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_leaf_list = []\n",
+    "for pipeline_id in pipeline_ids:\n",
+    "    logs = pipeline_logs[pipeline_id]\n",
+    "    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
+    "    df_leaf_list.append(df_leaf_single)\n",
+    "\n",
+    "df_leaf = pd.concat(df_leaf_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted = df_leaf.copy()\n",
+    "\n",
+    "# coloring in order of decreasing avg. duration\n",
+    "avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
+    "df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
+    "    \"duration_avg\", ascending=False\n",
+    ")\n",
+    "\n",
+    "# Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
+    "if patch_yearbook:\n",
+    "    patch_yearbook_time(df_adjusted, \"sample_time\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_adjusted\n",
+    "df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
+    "df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = df_adjusted[\n",
+    "    (\n",
+    "        df_adjusted[\"id\"].isin(\n",
+    "            [\n",
+    "                \"TRAIN\",\n",
+    "                \"STORE_TRAINED_MODEL\",\n",
+    "                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
+    "                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
+    "                \"EVALUATE_TRIGGER_POLICY\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
+    "df_new = df_new.sort_values(\"sample_time_year\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_rename = {\n",
+    "    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
+    "    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
+    "}\n",
+    "\n",
+    "df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plot_cost_matrix(\n",
+    "    df_new,\n",
+    "    [107],\n",
+    "    grid_alpha=0.75,\n",
+    "    title_map={\n",
+    "        107: \"static MMD=0.07 threshold\",\n",
+    "    },\n",
+    "    height_factor=0.8,\n",
+    "    width_factor=1.0,\n",
+    "    duration_ylabel=\"Duration (sec)\",\n",
+    "    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
+    "    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
+    "    y_ticks_cumulative=[x for x in range(0, 60 + 1, 20)],\n",
+    "    y_lim_cumulative=(0, 70),\n",
+    "    y_minutes=False,\n",
+    "    y_minutes_cumulative=True,\n",
+    ")\n",
+    "\n",
+    "save_plot(fig, \"yearbook_drift-trigger-cost-matrix\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}