Skip to content

Commit

Permalink
Added performance plots
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Dec 3, 2024
1 parent dbd0577 commit 446f34c
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 16 deletions.
79 changes: 63 additions & 16 deletions notebook/performance_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"import mailcom.parse\n",
"import pandas as pd\n",
"import time\n",
"import datetime"
"import datetime\n",
"import matplotlib.pyplot as plt"
]
},
{
Expand All @@ -30,9 +31,11 @@
"outputs": [],
"source": [
"# import files from csv file\n",
"email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg.csv\")\n",
"email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n",
"print(email_list)\n",
"\n",
"t_csv_read = time.time()\n",
"\n",
"# create pseudonymization object\n",
"ps = mailcom.parse.Pseudonymize()\n",
"ps.init_spacy(\"fr\")\n",
Expand All @@ -51,17 +54,33 @@
"out_list = []\n",
"ts_list = []\n",
"for idx, row in email_list.iterrows():\n",
" ts_email_start = time.time()\n",
" text = row[\"message\"]\n",
" email_dict = {\"content\": text}\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" output_text = ps.pseudonymize(text)\n",
" # Pseudonymization is usually done using ps.pseudonymize\n",
" # For performance analysis the process is split into its subprocesses here\n",
" ps.reset()\n",
" sentences = ps.get_sentences(text)\n",
" ts_email_ppr_done = time.time()\n",
" pseudonymized_sentences = []\n",
" for sent in sentences:\n",
" sent = ps.pseudonymize_email_addresses(sent)\n",
" ner = ps.get_ner(sent)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
" ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
" pseudonymized_sentences.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_sentences)\n",
"\n",
" # add output to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)\n",
"\n",
" # timestamp after this email\n",
" ts_list.append(time.time())"
" ts_email_end = time.time()\n",
" ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])"
]
},
{
Expand All @@ -81,18 +100,46 @@
"metadata": {},
"outputs": [],
"source": [
"# print timestamps\n",
"print(\"Time from start to model loaded:\", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), \"s\")\n",
"# time differences between emails\n",
"ts_diffs = []\n",
"for i in range(0, len(ts_list)):\n",
" if i == 0:\n",
" ts_diff = (ts_list[i] - t_model_loaded)\n",
" else:\n",
" ts_diff = (ts_list[i] - ts_list[i-1])\n",
" ts_diffs.append(ts_diff)\n",
" print(\"Time needed for email\", i, \":\", (datetime.datetime.fromtimestamp(ts_diff).strftime('%S')), \"s\")\n",
"print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1] - t_model_loaded).strftime('%M:%S')))"
"# display timestamps\n",
"\n",
"# bar plot for each individual email\n",
"# processing times\n",
"idx_list = [row[0] for row in email_list.iterrows()]\n",
"email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n",
"email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n",
"email_total_list = [ts[2] - ts[0] for ts in ts_list]\n",
"email_bar_height = {\n",
" \"Pre-Processing\": email_ppr_list,\n",
" \"Pseudonymization\": email_duration_list\n",
"}\n",
"bt = [0 for idx in idx_list]\n",
"\n",
"plt.figure(figsize=(10,4), dpi=80)\n",
"\n",
"# plot 1\n",
"plt.subplot(1, 2, 1)\n",
"for key, height in email_bar_height.items():\n",
" plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
" bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
"#plt.yscale(\"log\")\n",
"plt.xlabel(\"Email\")\n",
"plt.ylabel(\"t [s]\")\n",
"plt.title(\"Computation times for emails, model loading and file reading\")\n",
"plt.legend()\n",
"\n",
"# plot for model loading and file reading, as well as average email time\n",
"# processing times\n",
"bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n",
"average_email_time = sum(email_total_list) / len(email_total_list)\n",
"bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n",
"plt.ylabel(\"t [s]\")\n",
"\n",
"# plot 2\n",
"plt.subplot(1, 2, 2)\n",
"plt.bar(bar_x, bar_y, 0.5)\n",
"\n",
"# Total time\n",
"print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))"
]
},
{
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies = [
"torch",
"pandas",
"jupyter",
"matplotlib"
]

[project.optional-dependencies]
Expand Down

0 comments on commit 446f34c

Please sign in to comment.