Added performance plots

ssciwr · Dec 3, 2024 · 446f34c · 446f34c
1 parent dbd0577
commit 446f34c
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 16 deletions.
diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb
@@ -10,7 +10,8 @@
     "import mailcom.parse\n",
     "import pandas as pd\n",
     "import time\n",
-    "import datetime"
+    "import datetime\n",
+    "import matplotlib.pyplot as plt"
    ]
   },
   {
@@ -30,9 +31,11 @@
    "outputs": [],
    "source": [
     "# import files from csv file\n",
-    "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg.csv\")\n",
+    "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n",
     "print(email_list)\n",
     "\n",
+    "t_csv_read = time.time()\n",
+    "\n",
     "# create pseudonymization object\n",
     "ps = mailcom.parse.Pseudonymize()\n",
     "ps.init_spacy(\"fr\")\n",
@@ -51,17 +54,33 @@
     "out_list = []\n",
     "ts_list = []\n",
     "for idx, row in email_list.iterrows():\n",
+    "    ts_email_start = time.time()\n",
     "    text = row[\"message\"]\n",
     "    email_dict = {\"content\": text}\n",
     "    if not text:\n",
     "        continue\n",
     "    # Test functionality of Pseudonymize class\n",
-    "    output_text = ps.pseudonymize(text)\n",
+    "    # Pseudonymization is usually done using ps.pseudonymize\n",
+    "    # For performance analysis the process is split into its subprocesses here\n",
+    "    ps.reset()\n",
+    "    sentences = ps.get_sentences(text)\n",
+    "    ts_email_ppr_done = time.time()\n",
+    "    pseudonymized_sentences = []\n",
+    "    for sent in sentences:\n",
+    "        sent = ps.pseudonymize_email_addresses(sent)\n",
+    "        ner = ps.get_ner(sent)\n",
+    "        ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
+    "        ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
+    "        pseudonymized_sentences.append(ps_sent)\n",
+    "    output_text = ps.concatenate(pseudonymized_sentences)\n",
+    "\n",
+    "    # add output to dict\n",
     "    email_dict[\"pseudo_content\"] = output_text\n",
     "    out_list.append(email_dict)\n",
     "\n",
     "    # timestamp after this email\n",
-    "    ts_list.append(time.time())"
+    "    ts_email_end = time.time()\n",
+    "    ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])"
    ]
   },
   {
@@ -81,18 +100,46 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# print timestamps\n",
-    "print(\"Time from start to model loaded:\", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), \"s\")\n",
-    "# time differences between emails\n",
-    "ts_diffs = []\n",
-    "for i in range(0, len(ts_list)):\n",
-    "    if i == 0:\n",
-    "        ts_diff = (ts_list[i] - t_model_loaded)\n",
-    "    else:\n",
-    "        ts_diff = (ts_list[i] - ts_list[i-1])\n",
-    "    ts_diffs.append(ts_diff)\n",
-    "    print(\"Time needed for email\", i, \":\", (datetime.datetime.fromtimestamp(ts_diff).strftime('%S')), \"s\")\n",
-    "print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1] - t_model_loaded).strftime('%M:%S')))"
+    "# visualize the timestamps collected per email in ts_list\n",
+    "\n",
+    "# stacked bar plot of the processing time of each individual email;\n",
+    "# ts_list only holds processed emails, so the x positions are derived from it\n",
+    "idx_list = list(range(len(ts_list)))\n",
+    "email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n",
+    "email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n",
+    "email_total_list = [ts[2] - ts[0] for ts in ts_list]\n",
+    "email_bar_height = {\n",
+    "    \"Pre-Processing\": email_ppr_list,\n",
+    "    \"Pseudonymization\": email_duration_list\n",
+    "}\n",
+    "bt = [0] * len(idx_list)\n",
+    "\n",
+    "plt.figure(figsize=(10,4), dpi=80)\n",
+    "\n",
+    "# plot 1: per-email bars, each series stacked on top of the previous one\n",
+    "plt.subplot(1, 2, 1)\n",
+    "for key, height in email_bar_height.items():\n",
+    "    plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
+    "    bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
+    "#plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Email\")\n",
+    "plt.ylabel(\"t [s]\")\n",
+    "plt.suptitle(\"Computation times for emails, model loading and file reading\")\n",
+    "plt.legend()\n",
+    "\n",
+    "# plot for model loading and file reading, as well as average email time;\n",
+    "# model loading is timed from t_csv_read (assumes t_model_loaded is taken later)\n",
+    "bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n",
+    "average_email_time = sum(email_total_list) / len(email_total_list)\n",
+    "bar_y = [t_csv_read - t0, t_model_loaded - t_csv_read, average_email_time]\n",
+    "\n",
+    "# plot 2: the y label is set after subplot() so it applies to this axes\n",
+    "plt.subplot(1, 2, 2)\n",
+    "plt.bar(bar_x, bar_y, 0.5)\n",
+    "plt.ylabel(\"t [s]\")\n",
+    "\n",
+    "# total time since model loading as MM:SS, independent of the local time zone\n",
+    "print(\"Total time:\", \"{:02d}:{:02d}\".format(*divmod(int(ts_list[-1][2] - t_model_loaded), 60)))"
    ]
   },
   {

diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
   "torch",
   "pandas",
   "jupyter",
+  "matplotlib"
 ]
 
 [project.optional-dependencies]