Commit
how many plots can one do in this space...
1 parent e735bce · commit 7529234
Showing 7 changed files with 1,423 additions and 3 deletions.
@@ -0,0 +1,3 @@
drift:

- plot arxiv / huffpost
@@ -0,0 +1,201 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from analytics.app.data.load import list_pipelines, load_pipeline_logs\n",
"from analytics.app.data.transform import patch_yearbook_time, pipeline_leaf_times_df\n",
"from analytics.plotting.common.cost_matrix import plot_cost_matrix\n",
"from analytics.plotting.common.save import save_plot\n",
"from modyn.supervisor.internal.pipeline_executor.models import PipelineLogs\n",
"\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipelines_dirs = [\n",
"    Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering/yearbook/20_datadrift_static\"),\n",
"]\n",
"\n",
"pipeline_logs: dict[int, PipelineLogs] = {}\n",
"pipelines: dict[int, tuple[str, Path]] = {}\n",
"\n",
"# collect all pipelines (and their logs) found in the experiment directories\n",
"for dir in pipelines_dirs:\n",
"    assert dir.exists()\n",
"    dir_pipelines = list_pipelines(dir)\n",
"    pipelines.update(dir_pipelines)\n",
"    max_pipeline_id = max(dir_pipelines.keys())\n",
"    print(pipelines)\n",
"    pipeline_logs.update({p_id: load_pipeline_logs(p_id, dir) for (p_id, (_, p_path)) in dir_pipelines.items()})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# mode: time + amount\n",
"pipeline_ids = [107]  # yb drift mmd 0.06 250 4d\n",
"\n",
"# only has an effect when include_composite_model = True\n",
"composite_model_variant = \"currently_active_model\"\n",
"\n",
"patch_yearbook = True\n",
"dataset_id = \"yearbook_test\"\n",
"eval_handler = \"periodic-delta+-1y\"\n",
"metric = \"Accuracy\"\n",
"include_composite_model = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wrangle data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_leaf_list = []\n",
"for pipeline_id in pipeline_ids:\n",
"    logs = pipeline_logs[pipeline_id]\n",
"    df_leaf_single = pipeline_leaf_times_df(logs, use_traintime_patch_at_trainer=True, pipeline_id=pipeline_id)\n",
"    df_leaf_list.append(df_leaf_single)\n",
"\n",
"df_leaf = pd.concat(df_leaf_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_adjusted = df_leaf.copy()\n",
"\n",
"# coloring in order of decreasing avg. duration\n",
"avg_duration_per_stage = df_adjusted.groupby([\"pipeline_ref\", \"id\"])[\"duration\"].mean().sort_values(ascending=False)\n",
"df_adjusted = df_adjusted.merge(avg_duration_per_stage, on=[\"pipeline_ref\", \"id\"], suffixes=(\"\", \"_avg\")).sort_values(\n",
"    \"duration_avg\", ascending=False\n",
")\n",
"\n",
"# Yearbook as a mapped time dimension (to display the correct timestamps we need to convert back from days to years)\n",
"if patch_yearbook:\n",
"    patch_yearbook_time(df_adjusted, \"sample_time\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_adjusted\n",
"df_adjusted[\"sample_time_year\"] = df_adjusted[\"sample_time\"].dt.year\n",
"df_adjusted[\"sample_time_year_bin\"] = pd.cut(df_adjusted[\"sample_time_year\"], bins=10, labels=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_new = df_adjusted[\n",
"    (\n",
"        df_adjusted[\"id\"].isin(\n",
"            [\n",
"                \"TRAIN\",\n",
"                \"STORE_TRAINED_MODEL\",\n",
"                \"INFORM_SELECTOR_REMAINING_DATA\",\n",
"                \"INFORM_SELECTOR_ABOUT_TRIGGER\",\n",
"                \"EVALUATE_TRIGGER_POLICY\",\n",
"            ]\n",
"        )\n",
"    )\n",
"][[\"pipeline_ref\", \"id\", \"sample_time_year\", \"duration\"]].copy()\n",
"df_new = df_new.sort_values(\"sample_time_year\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state_rename = {\n",
"    \"INFORM_SELECTOR_REMAINING_DATA\": \"inform remaining data\",\n",
"    \"INFORM_SELECTOR_ABOUT_TRIGGER\": \"inform trigger\",\n",
"}\n",
"\n",
"df_new[\"id\"] = df_new[\"id\"].replace(state_rename).str.lower().str.replace(\"_\", \" \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plot_cost_matrix(\n",
"    df_new,\n",
"    [107],\n",
"    grid_alpha=0.75,\n",
"    title_map={\n",
"        107: \"static MMD=0.07 threshold\",\n",
"    },\n",
"    height_factor=0.8,\n",
"    width_factor=1.0,\n",
"    duration_ylabel=\"Duration (sec)\",\n",
"    cumulative_ylabel=\"Cumulative Duration (min)\",\n",
"    x_ticks=[x for x in range(1940, 2010 + 1, 30)],\n",
"    y_ticks_cumulative=[x for x in range(0, 60 + 1, 20)],\n",
"    y_lim_cumulative=(0, 70),\n",
"    y_minutes=False,\n",
"    y_minutes_cumulative=True,\n",
")\n",
"\n",
"save_plot(fig, \"yearbook_drift-trigger-cost-matrix\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
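The notes file in this commit lists arxiv / huffpost as the next drift plots to make. Below is a minimal sketch of how the notebook's final cost-matrix cell could be pointed at one of those datasets, assuming the analytics helpers take the same arguments as in the yearbook notebook; the directory path, pipeline ids, title, and output name are placeholders rather than values from this commit.

# Hypothetical sketch (not part of the commit): cost-matrix plot for an arxiv drift experiment.
from pathlib import Path

import pandas as pd

from analytics.app.data.load import list_pipelines, load_pipeline_logs
from analytics.app.data.transform import pipeline_leaf_times_df
from analytics.plotting.common.cost_matrix import plot_cost_matrix
from analytics.plotting.common.save import save_plot

pipelines_dir = Path("/path/to/triggering/arxiv/<drift_experiment>")  # placeholder path
arxiv_pipeline_ids = [0]  # placeholder id(s) of the arxiv drift pipeline(s)

pipelines = list_pipelines(pipelines_dir)
logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}

# same leaf-stage duration extraction as in the yearbook notebook
df_leaf = pd.concat(
    pipeline_leaf_times_df(logs[p_id], use_traintime_patch_at_trainer=True, pipeline_id=p_id)
    for p_id in arxiv_pipeline_ids
)
# arxiv timestamps are real dates, so no yearbook time patching is needed
df_leaf["sample_time_year"] = df_leaf["sample_time"].dt.year

# restrict to the same pipeline stages plotted in the yearbook notebook
stages = [
    "TRAIN",
    "STORE_TRAINED_MODEL",
    "INFORM_SELECTOR_REMAINING_DATA",
    "INFORM_SELECTOR_ABOUT_TRIGGER",
    "EVALUATE_TRIGGER_POLICY",
]
df_new = df_leaf[df_leaf["id"].isin(stages)][["pipeline_ref", "id", "sample_time_year", "duration"]].copy()
df_new["id"] = df_new["id"].str.lower().str.replace("_", " ")

fig = plot_cost_matrix(
    df_new,
    arxiv_pipeline_ids,
    title_map={arxiv_pipeline_ids[0]: "arxiv drift trigger"},  # placeholder title
    duration_ylabel="Duration (sec)",
    cumulative_ylabel="Cumulative Duration (min)",
    y_minutes=False,
    y_minutes_cumulative=True,
)
save_plot(fig, "arxiv_drift-trigger-cost-matrix")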