-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
85e88a1
commit bec88be
Showing
3 changed files
with
806 additions
and
12 deletions.
There are no files selected for viewing
244 changes: 244 additions & 0 deletions
244
analytics/plotting/rh_thesis/traintime_vs_dataamount/yb_traintime_dataamount.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pathlib import Path\n", | ||
"\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"\n", | ||
"from analytics.app.data.load import list_pipelines\n", | ||
"from analytics.plotting.common.common import init_plot\n", | ||
"from analytics.plotting.common.font import setup_font\n", | ||
"from modyn.supervisor.internal.grpc.enums import PipelineStage\n", | ||
"from modyn.supervisor.internal.pipeline_executor.models import StageLog\n", | ||
"\n", | ||
"%load_ext autoreload\n", | ||
"%autoreload 2" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# INPUTS\n", | ||
"\n", | ||
"# Dataset whose `11_baselines_amount` runs are plotted: `arxiv`, `huffpost` or `yearbook`.\n", | ||
"# The plotting cell picks its colour palette based on whether `yearbook` occurs in the path.\n", | ||
"DATASET = \"arxiv\"\n", | ||
"\n", | ||
"# NOTE(review): machine-specific absolute paths -- adjust them to the local checkout.\n", | ||
"data_root = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-robinholzi-data/data/triggering\")\n", | ||
"pipelines_dir = data_root / DATASET / \"11_baselines_amount\"\n", | ||
"output_dir = Path(\"/Users/robinholzinger/robin/dev/eth/modyn-2/.analytics.log/.data/_plots\")\n", | ||
"\n", | ||
"# Fail early and name the missing directory.\n", | ||
"assert pipelines_dir.exists(), f\"pipelines_dir does not exist: {pipelines_dir}\"\n", | ||
"assert output_dir.exists(), f\"output_dir does not exist: {output_dir}\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# List the pipelines found in `pipelines_dir`; the result is used as a mapping keyed by pipeline id.\n", | ||
"pipelines = list_pipelines(pipelines_dir)\n", | ||
"# Largest key of that mapping.\n", | ||
"max_pipeline_id = max(pipelines)\n", | ||
"pipelines" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from analytics.app.data.load import load_pipeline_logs\n", | ||
"\n", | ||
"# Load the logs of every listed pipeline, keyed by pipeline id.\n", | ||
"# Only the ids are needed here, so the mapping's values are not unpacked.\n", | ||
"pipeline_logs = {p_id: load_pipeline_logs(p_id, pipelines_dir) for p_id in pipelines}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Wrangle data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"list_df_train: list[pd.DataFrame] = []\n", | ||
"\n", | ||
"# Build one frame of TRAIN stage logs per pipeline and tag every row with the first\n", | ||
"# entry of that pipeline's record in `pipelines`.\n", | ||
"for pipeline_id, pipeline_info in pipelines.items():\n", | ||
"    stage_runs = pipeline_logs[pipeline_id].supervisor_logs.stage_runs\n", | ||
"    train_logs = [run for run in stage_runs if run.id == PipelineStage.TRAIN.name]\n", | ||
"    df_pipeline = StageLog.df(stage_logs=train_logs, extended=True)\n", | ||
"    df_pipeline[\"pipeline_id\"] = pipeline_info[0]\n", | ||
"    list_df_train.append(df_pipeline)\n", | ||
"\n", | ||
"# Stack the per-pipeline frames into the single frame used by all later cells.\n", | ||
"df_train = pd.concat(list_df_train)\n", | ||
"df_train.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Conversion" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Clean pipeline name\n", | ||
"\n", | ||
"import re\n", | ||
"\n", | ||
"\n", | ||
"def pipeline_name_cleaner(name: str) -> str:\n", | ||
"    \"\"\"Turn a raw pipeline name into a readable legend label.\n", | ||
"\n", | ||
"    Everything up to and including `_dataamount_<N>` is replaced by\n", | ||
"    `trigger every <N> samples`. Text after the digits is kept, and a name\n", | ||
"    without that marker is returned unchanged.\n", | ||
"    \"\"\"\n", | ||
"    pattern = r\".*_dataamount_(\\d+)\"\n", | ||
"    replacement = \"trigger every \\\\1 samples\"\n", | ||
"    return re.sub(pattern, replacement, name)\n", | ||
"\n", | ||
"\n", | ||
"# Swap the raw pipeline names for their cleaned legend labels.\n", | ||
"df_train[\"pipeline_id\"] = df_train[\"pipeline_id\"].map(pipeline_name_cleaner)\n", | ||
"df_train.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert both time columns to minutes.\n", | ||
"# `duration` is a timedelta column; `train_time_at_trainer` is given in milliseconds.\n", | ||
"# The timedelta dtype doubles as a guard: when this cell is re-run, `duration` is already a\n", | ||
"# float, so the `.dt` accessor is not hit again and the values are not divided a second time.\n", | ||
"if pd.api.types.is_timedelta64_dtype(df_train[\"duration\"]):\n", | ||
"    df_train[\"duration\"] = df_train[\"duration\"].dt.total_seconds() / 60\n", | ||
"    df_train[\"train_time_at_trainer\"] = df_train[\"train_time_at_trainer\"] / 1_000 / 60  # millis to minutes\n", | ||
"df_train.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Order the rows ascending by the `num_samples` column.\n", | ||
"df_train = df_train.sort_values(\"num_samples\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Create Plot" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from analytics.plotting.common.color import discrete_colors, main_color\n", | ||
"\n", | ||
"sns.set_style(\"whitegrid\")\n", | ||
"\n", | ||
"init_plot()\n", | ||
"setup_font(small_label=True, small_title=True)\n", | ||
"\n", | ||
"# Figure geometry in inches: base size scaled by the two factors below.\n", | ||
"DOUBLE_FIG_WIDTH = 10\n", | ||
"DOUBLE_FIG_HEIGHT = 3.5\n", | ||
"width_factor = 0.5\n", | ||
"height_factor = 0.5\n", | ||
"\n", | ||
"# Colours for the `pipeline_id` hue. Yearbook takes slices from both ends of a 14-colour\n", | ||
"# palette; every other dataset takes slices from both ends of an 8-colour palette.\n", | ||
"# The former `huffpost` branch was identical to the fallback, so the two are merged.\n", | ||
"if \"yearbook\" in str(pipelines_dir):\n", | ||
"    base_colors = discrete_colors(14)\n", | ||
"    palette = base_colors[0:5] + base_colors[9:14]\n", | ||
"else:\n", | ||
"    base_colors = discrete_colors(8)\n", | ||
"    palette = base_colors[0:3] + base_colors[6:8]\n", | ||
"\n", | ||
"fig = plt.figure(\n", | ||
"    edgecolor=\"black\",\n", | ||
"    frameon=True,\n", | ||
"    figsize=(\n", | ||
"        DOUBLE_FIG_WIDTH * width_factor,\n", | ||
"        2 * DOUBLE_FIG_HEIGHT * height_factor,\n", | ||
"    ),\n", | ||
"    dpi=300,\n", | ||
")\n", | ||
"# Draw both layers on one explicit axes instead of relying on the implicit current axes.\n", | ||
"ax = fig.add_subplot()\n", | ||
"\n", | ||
"# Regression fit over all training runs.\n", | ||
"sns.regplot(\n", | ||
"    data=df_train,\n", | ||
"    x=\"num_samples\",\n", | ||
"    y=\"train_time_at_trainer\",  # alternative column: duration\n", | ||
"    color=main_color(0),\n", | ||
"    ax=ax,\n", | ||
")\n", | ||
"\n", | ||
"# Individual runs on top, coloured by pipeline.\n", | ||
"sns.scatterplot(\n", | ||
"    data=df_train,\n", | ||
"    x=\"num_samples\",\n", | ||
"    y=\"train_time_at_trainer\",  # alternative column: duration\n", | ||
"    hue=\"pipeline_id\",\n", | ||
"    palette=palette,\n", | ||
"    s=200,\n", | ||
"    legend=True,\n", | ||
"    marker=\"X\",\n", | ||
"    ax=ax,\n", | ||
")\n", | ||
"\n", | ||
"# Display the plot\n", | ||
"plt.tight_layout()\n", | ||
"plt.show()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# TODO: run more variants in less dense areas\n", | ||
"# TODO: plot / add number of datapoints to thesis so that the significance of the regression line is clear\n", | ||
"# State in thesis that there are no outliers to be expected!" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.