Skip to content

Commit

Permalink
how many plots can one do in this space...
Browse files Browse the repository at this point in the history
  • Loading branch information
robinholzi committed Sep 26, 2024
1 parent e735bce commit 7529234
Show file tree
Hide file tree
Showing 7 changed files with 1,423 additions and 3 deletions.
11 changes: 9 additions & 2 deletions analytics/plotting/common/cost_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,14 @@ def plot_cost_matrix(
hue_col = "id"

palette = sns.color_palette("RdBu", 10)
new_palette = [palette[0], palette[1], palette[2], palette[-3], palette[-1]]
new_palette = {
"train": palette[0],
"inform remaining data": palette[-2],
"evaluate trigger policy": palette[2],
"inform trigger": palette[-1],
"store trained model": palette[1],
}
# [palette[0], palette[-2], palette[1], palette[-1], palette[2]]

# use sum of all pipelines to determine the order of the bars that is consistent across subplots
df_agg = df_costs.groupby([hue_col]).agg({y_col: "sum"}).reset_index()
Expand All @@ -97,7 +104,7 @@ def plot_cost_matrix(
elif not cumulative and y_minutes:
df_final[y_col] = df_final[y_col] / 60

ax = axs[row, int(cumulative)]
ax = axs[row, int(cumulative)] if len(pipeline_ids) > 1 else axs[int(cumulative)]
h = sns.histplot(
df_final,
x=x_col,
Expand Down
54 changes: 53 additions & 1 deletion analytics/plotting/common/heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ def build_heatmap(
cmap: Any | None = None,
linewidth: int = 2,
grid_alpha: float = 0.0,
disable_horizontal_grid: bool = False,
df_logs_models: pd.DataFrame | None = None,
triggers: dict[int, list[pd.Timestamp]] = {},
) -> Figure | Axes:
init_plot()
setup_font(small_label=True, small_title=True)
Expand Down Expand Up @@ -112,7 +115,12 @@ def build_heatmap(
)
ax.invert_yaxis()

ax.grid(axis="y", linestyle="--", alpha=grid_alpha, color="white")
ax.grid(
axis="y",
linestyle="--",
alpha=0 if disable_horizontal_grid else grid_alpha,
color="white",
)
ax.grid(axis="x", linestyle="--", alpha=grid_alpha, color="white")

if y_ticks is not None:
Expand All @@ -138,6 +146,7 @@ def build_heatmap(
if title_label:
ax.set_title(title_label)

# mainly for offline exploration
previous_y = 0
for x_start, x_end, y in policy:
# main box
Expand Down Expand Up @@ -171,6 +180,49 @@ def build_heatmap(
ax.add_patch(connector)
previous_y = y

# for post factum evaluation
if df_logs_models is not None:
for type_, dashed in [("train", False), ("usage", False), ("train", True)]:
for active_ in df_logs_models.iterrows():
x_start = active_[1][f"{type_}_start"].year - 1930
x_end = active_[1][f"{type_}_end"].year - 1930
y = active_[1]["model_idx"]
rect = plt.Rectangle(
(x_start, y - 1), # y: 0 based index, model_idx: 1 based index
x_end - x_start,
1,
edgecolor="White" if type_ == "train" else "Black",
facecolor="none",
linewidth=1.5,
linestyle="dotted" if dashed else "solid",
hatch="/",
joinstyle="bevel",
# capstyle="round",
)
ax.add_patch(rect)

if triggers:
for y, triggers_df in triggers.items():
for row in triggers_df.iterrows():
type_ = "usage"
# for y, x_list in triggers.items():
x_start = row[1][f"{type_}_start"].year - 1930
x_end = row[1][f"{type_}_end"].year - 1930
# for x in x_list:
rect = plt.Rectangle(
(x_start, y), # y: 0 based index, model_idx: 1 based index
x_end - x_start,
1,
edgecolor="black",
facecolor="none",
linewidth=1,
# linestyle="dotted",
# hatch="/",
# joinstyle="bevel",
# capstyle="round",
)
ax.add_patch(rect)

# Display the plot
plt.tight_layout()
# plt.show()
Expand Down
3 changes: 3 additions & 0 deletions analytics/plotting/rh_thesis/TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
drift:

- plot arxiv / huffpost
201 changes: 201 additions & 0 deletions analytics/plotting/rh_thesis/drift/yb_cost.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
"from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
"from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
"from analytics.plotting.common.save import save_plot\n",
"from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipelines_dirs = [\n",
" Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
"]\n",
"\n",
"pipeline_logs: dict[int, PipelineLogs] = {}\n",
"pipelines: dict[int, tuple[str, Path]] = {}\n",
"\n",
"for dir in pipelines_dirs:\n",
" dir_pipelines = list_pipelines(dir)\n",
" pipelines.update(dir_pipelines)\n",
" max_pipeline_id = max(dir_pipelines.keys())\n",
" print(pipelines)\n",
" pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})\n",
" assert dir.exists()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mode: time + amount\n",
"pipeline_ids = [107] # yb drift mmd 0.06 250 4d\n",
"\n",
"# doesn't do anything unless include_composite_model = True\n",
"composite_model_variant = \"currently_active_model\"\n",
"\n",
"patch_yearbook = True\n",
"dataset_id = \"yearbook_test\"\n",
"eval_handler = \"periodic-delta+-1y\"\n",
"metric = \"Accuracy\"\n",
"include_composite_model = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wrangle data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_leaf_list = []\n",
"for pipeline_id in pipeline_ids:\n",
" logs = pipeline_logs[pipeline_id]\n",
" df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
" df_leaf_list.append(df_leaf_single)\n",
"\n",
"df_leaf = pd.concat(df_leaf_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_adjusted = df_leaf.copy()\n",
"\n",
"# coloring in order of decreasing avg. duration\n",
"avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
"df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
" \"duration_avg\", ascending=False\n",
")\n",
"\n",
"# Yearbook has a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
"if patch_yearbook:\n",
" patch_yearbook_time(df_adjusted, \"sample_time\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_adjusted\n",
"df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
"df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_new = df_adjusted[\n",
" (\n",
" df_adjusted[\"id\"].isin(\n",
" [\n",
" \"TRAIN\",\n",
" \"STORE_TRAINED_MODEL\",\n",
" \"INFORM_SELECTOR_REMAINING_DATA\",\n",
" \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
" \"EVALUATE_TRIGGER_POLICY\",\n",
" ]\n",
" )\n",
" )\n",
"][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
"df_new = df_new.sort_values(\"sample_time_year\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state_rename = {\n",
" \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
" \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
"}\n",
"\n",
"df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plot_cost_matrix(\n",
" df_new,\n",
" [107],\n",
" grid_alpha=0.75,\n",
" title_map={\n",
" 107: \"static MMD=0.07 threshold\",\n",
" },\n",
" height_factor=0.8,\n",
" width_factor=1.0,\n",
" duration_ylabel=\"Duration (sec)\",\n",
" cumulative_ylabel=\"Cumulative Duration (min)\",\n",
" x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
" y_ticks_cumulative=[x for x in range(0, 60 + 1, 20)],\n",
" y_lim_cumulative=(0, 70),\n",
" y_minutes=False,\n",
" y_minutes_cumulative=True,\n",
")\n",
"\n",
"save_plot(fig, \"yearbook_drift-trigger-cost-matrix\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 7529234

Please sign in to comment.