From 0eda4693ea9f0a0e394231000dca611aa8359df3 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Sun, 3 Nov 2024 15:59:27 +0100 Subject: [PATCH 1/9] Created demo notebook file --- notebook/demo.ipynb | 133 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 notebook/demo.ipynb diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb new file mode 100644 index 0000000..e0fa2e0 --- /dev/null +++ b/notebook/demo.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import mailcom.inout\n", + "import mailcom.parse\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "# import files from /data/in\n", + "\n", + "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n", + "io.list_of_files()\n", + "\n", + "# create pseudonymization object\n", + "ps = mailcom.parse.Pseudonymize()\n", + "ps.init_spacy(\"fr\")\n", + "ps.init_transformers()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n" + ] + } + ], + "source": [ + "# loop over mails and pseudonymize them\n", + "out_list = []\n", + "for file in io.email_list:\n", + " print(\"Parsing input file {}\".format(file))\n", + " # creating dict\n", + " email_dict = {}\n", + " text = io.get_text(file)\n", + " text = io.get_html_text(text)\n", + " if not text:\n", + " continue\n", + " # Test functionality of Pseudonymize class\n", + " output_text = ps.pseudonymize(text)\n", + " email_dict[\"content\"] = text\n", + " email_dict[\"pseudo_content\"] = output_text\n", + " out_list.append(email_dict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " content \\\n", + "0 J'espère que tu vas bien! Je voulais partager ... \n", + "1 \\nOlá Lino,Espero que este e-mail te encontre ... \n", + "2 From : aitana.garcia@zohomail.euTo : \"Alejandr... \n", + "3 From : pierre.lefevre@myyahoo.comTo : \"Alejand... \n", + "\n", + " pseudo_content \n", + "0 J'espère que tu vas bien! Je voulais partager ... \n", + "1 \\n Olá Claude,Espero que este e-mail te encont... \n", + "2 From : Claude.Dominique@xxxxxxxx.xxTo : \"Claud... \n", + "3 From : Claude.Dominique@myxxxxx.comTo : \" Clau... \n" + ] + } + ], + "source": [ + "# write output to pandas df\n", + "df = pd.DataFrame(out_list)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailcom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4779ec7fcb34f9b019ef20cdf0fc73c94fcb3b7b Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:31:32 +0100 Subject: [PATCH 2/9] Added email metadata to df --- notebook/demo.ipynb | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index e0fa2e0..55aa7bc 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import mailcom.inout\n", "import mailcom.parse\n", @@ -13,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -40,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -59,22 +68,22 @@ "out_list = []\n", "for file in io.email_list:\n", " print(\"Parsing input file {}\".format(file))\n", - " # creating dict\n", - " email_dict = {}\n", " text = io.get_text(file)\n", + " # after this function was called, the email metadata can be accessed via io.email_content\n", + " # the dict already has the entries content, date, attachments, attachment type\n", + " email_dict = io.email_content.copy()\n", " text = io.get_html_text(text)\n", " if not text:\n", " continue\n", " # Test functionality of Pseudonymize class\n", " output_text = ps.pseudonymize(text)\n", - " email_dict[\"content\"] = text\n", " email_dict[\"pseudo_content\"] = output_text\n", " out_list.append(email_dict)\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -83,15 +92,21 @@ "text": [ " content \\\n", "0 J'espère que tu vas bien! Je voulais partager ... \n", - "1 \\nOlá Lino,Espero que este e-mail te encontre ... \n", - "2 From : aitana.garcia@zohomail.euTo : \"Alejandr... \n", - "3 From : pierre.lefevre@myyahoo.comTo : \"Alejand... \n", + "1
Date: Tue, 12 Nov 2024 13:58:57 +0100 Subject: [PATCH 3/9] Added notebook for timestamps --- notebook/performance_demo.ipynb | 192 ++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 notebook/performance_demo.ipynb diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb new file mode 100644 index 0000000..3683b9b --- /dev/null +++ b/notebook/performance_demo.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import mailcom.inout\n", + "import mailcom.parse\n", + "import pandas as pd\n", + "import time\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# create t0 timestamp\n", + "t0 = time.time()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "# import files from /data/in\n", + "\n", + "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n", + "io.list_of_files()\n", + "\n", + "# create pseudonymization object\n", + "ps = mailcom.parse.Pseudonymize()\n", + "ps.init_spacy(\"fr\")\n", + "ps.init_transformers()\n", + "# time stamp after model loading\n", + "t_model_loaded = time.time()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n" + ] + } + ], + "source": [ + "# loop over mails and pseudonymize them\n", + "out_list = []\n", + "ts_list = []\n", + "for file in io.email_list:\n", + " print(\"Parsing input file {}\".format(file))\n", + " text = io.get_text(file)\n", + " # after this function was called, the email metadata can be accessed via io.email_content\n", + " # the dict already has the entries content, date, attachments, attachment type\n", + " email_dict = io.email_content.copy()\n", + " text = io.get_html_text(text)\n", + " if not text:\n", + " continue\n", + " # Test functionality of Pseudonymize class\n", + " output_text = ps.pseudonymize(text)\n", + " email_dict[\"pseudo_content\"] = output_text\n", + " out_list.append(email_dict)\n", + "\n", + " # timestamp after this email\n", + " ts_list.append(time.time())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " content \\\n", + "0 J'espère que tu vas bien! Je voulais partager ... \n", + "1
Date: Wed, 13 Nov 2024 16:32:02 +0100 Subject: [PATCH 4/9] Added time differences --- notebook/performance_demo.ipynb | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb index 3683b9b..c04d21d 100644 --- a/notebook/performance_demo.ipynb +++ b/notebook/performance_demo.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -136,28 +136,33 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "T0: 13:51:17\n", - "T_model_loaded: 13:51:26\n", - "Email 0 finished: 13:51:31\n", - "Email 1 finished: 13:51:44\n", - "Email 2 finished: 13:51:47\n", - "Email 3 finished: 13:51:53\n" + "Time from start to model loaded: 11 s\n", + "Time needed for email 0 : 04 s\n", + "Time needed for email 1 : 15 s\n", + "Time needed for email 2 : 02 s\n", + "Time needed for email 3 : 04 s\n" ] } ], "source": [ "# print timestamps\n", - "print(\"T0:\", datetime.datetime.fromtimestamp(t0).strftime('%H:%M:%S'))\n", - "print(\"T_model_loaded:\", datetime.datetime.fromtimestamp(t_model_loaded).strftime('%H:%M:%S'))\n", - "for i in range(len(ts_list)):\n", - " print(\"Email\", i, \"finished:\", datetime.datetime.fromtimestamp(ts_list[i]).strftime('%H:%M:%S'))" + "print(\"Time from start to model loaded:\", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), \"s\")\n", + "# time differences between emails\n", + "ts_diffs = []\n", + "for i in range(0, len(ts_list)):\n", + " if i == 0:\n", + " ts_diff = (ts_list[i] - t_model_loaded)\n", + " else:\n", + " ts_diff = (ts_list[i] - ts_list[i-1])\n", + " ts_diffs.append(ts_diff)\n", + " print(\"Time needed for email\", i, \":\", (datetime.datetime.fromtimestamp(ts_diff).strftime('%S')), \"s | Email length:\", )" ] }, { From 4f85ad968623f22c5bfdc31e2240328731b35c18 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:03:26 +0100 Subject: [PATCH 5/9] Used csv file for performance test --- notebook/performance_demo.ipynb | 209 +++++++++++++++++++++++++------- 1 file changed, 162 insertions(+), 47 deletions(-) diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb index c04d21d..8d0920c 100644 --- a/notebook/performance_demo.ipynb +++ b/notebook/performance_demo.ipynb @@ -37,6 +37,26 @@ "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 message\n", + "0 242 Von meinem iPhone gesendet Anfang der weiterge...\n", + "1 243 Von meinem iPhone gesendet Anfang der weiterge...\n", + "2 244 Von meinem iPhone gesendet Anfang der weiterge...\n", + "3 245 Von meinem iPhone gesendet Anfang der weiterge...\n", + "4 246 Von meinem iPhone gesendet Anfang der weiterge...\n", + ".. ... ...\n", + "98 1313 \\nVon: Mélissa des Presses de l'Université Lav...\n", + "99 1314 Von: Librairie Classiques Garnier &lt;libr...\n", + "100 1315 La langue s'enrichit #36 - FranceTerme\\nProf. ...\n", + "101 1316 Activités de juin - Presses de l'Université La...\n", + "102 1317 Nouveautés de juin\\nProf. Dr. Sybille Große I ...\n", + "\n", + "[103 rows x 2 columns]\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -48,10 +68,9 @@ } ], "source": [ - "# import files from /data/in\n", - "\n", - "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n", - "io.list_of_files()\n", + "# import files from csv file\n", + "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg.csv\")\n", + "print(email_list)\n", "\n", "# create pseudonymization object\n", "ps = mailcom.parse.Pseudonymize()\n", @@ -63,31 +82,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n" - ] - } - ], + "outputs": [], "source": [ "# loop over mails and pseudonymize them\n", "out_list = []\n", "ts_list = []\n", - "for file in io.email_list:\n", - " print(\"Parsing input file {}\".format(file))\n", - " text = io.get_text(file)\n", - " # after this function was called, the email metadata can be accessed via io.email_content\n", - " # the dict already has the entries content, date, attachments, attachment type\n", - " email_dict = io.email_content.copy()\n", - " text = io.get_html_text(text)\n", + "for idx, row in email_list.iterrows():\n", + " text = row[\"message\"]\n", + " email_dict = {\"content\": text}\n", " if not text:\n", " continue\n", " # Test functionality of Pseudonymize class\n", @@ -101,30 +105,40 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " content \\\n", - "0 J'espère que tu vas bien! Je voulais partager ... \n", - "1
Date: Tue, 19 Nov 2024 15:04:14 +0100 Subject: [PATCH 6/9] Added output print, find a way to visualize in the future --- notebook/demo.ipynb | 147 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 127 insertions(+), 20 deletions(-) diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index 55aa7bc..0ab8858 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -85,35 +85,142 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, + "outputs": [], + "source": [ + "# write output to pandas df\n", + "df = pd.DataFrame(out_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " content \\\n", - "0 J'espère que tu vas bien! Je voulais partager ... \n", - "1
\n", + "

Olá Lino,

Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia 24 de abril às 15:00 na InnovaTech Solutions.

Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva.

Até breve,

[Alejandro Rodriguez]\n", + "InnovaTech Solutions S.L



\n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + " Am Mittwoch, 17. April 2024 um 17:30:19 MESZ hat lino.silva1 <lino.silva1@protonmail.com> Folgendes geschrieben:\n", + "
\n", + "

\n", + "

\n", + "\n", + "\n", + "
Espero que este e-mail o encontre bem. Escrevo para confirmar minha presença na reunião agendada para o dia 24 de abril, às 15:00, nas instalações da InnovaTech Solutions, localizada na Rua Principal, 123.

Agradeceria se pudesse confirmar a participação das seguintes pessoas de nossa equipe, uma vez que os temas a serem discutidos são relevantes para suas respectivas áreas:

- Alessandra Acquarone (Secretária Executiva)
- Dr. Giulia Agostini (Literatura Francesa/Italiana/Espanhola)
- Meggi Altrock (Secretária do Prof. Dr. De Stefani)
- Felicia Augusto-Hönicke (Assistente Acadêmica em Prática de Língua Portuguesa)
- Ronny Beckert (Assistente Acadêmico)
- Dr. Gérald Béreiziat (Professor de Prática da Língua Francesa)
- Dr. Stephanie Béreiziat-Lang (Substituição de Cadeira em Literatura Espanhola)

- Ursula Bergerfurth (Didática do Italiano)
- Maylis Bonetti (Professora de Prática da Língua Francesa)
- Dr. Sarah Burnautzki (Professora Tenure-Track em Literatura Francesa/Espanhola)
- Constanza Cárdenas (Instrutora de Prática da Língua Espanhola)
- Dr. Romanita Constantinescu (Professor de Romeno da Fundação)
- Ricardo Coseano (Assistente Acadêmico em Prática da Língua Espanhola)
- Prof. Dr. Elwys De Stefani (Cadeira de Linguística Italiana/Francesa)

- Brigitta Dierkes (Didática do Espanhol)
- Annachiara Di Taranto (Professora de Prática da Língua Italiana)
- Agathe Duperron (Secretária do Prof. Große)
- Madeleine Eppel (Responsável pelo Catálogo de Cursos e Matrículas)
- José Esplá-Oliver (Assistente Acadêmico em Prática da Língua Espanhola)
- Dr. Carmela Fischer Díaz (Instrutora de Prática da Língua Espanhola)
- Sandrine Flores (Assistente Acadêmica em Prática da Língua Francesa)
- Prof. Dr. Robert Folger (Cadeira de Literatura Espanhola)

Fico no aguardo da sua confirmação para prosseguir com os preparativos necessários.

Atenciosamente,

Lino Silva
\"grafik.png\"
\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + " Sicher versendet mit Proton Mail.\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "Pseudonymized Text:\n", + " \n", + " Olá Claude,Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia [number] de abril às [number]:[number] na [organization] Solutions . Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva. Até breve,Dominique Claude ]\n", + "[organization] Solutions [organization].[organization]\n", + "\n", + "\n", + "\n", + " Am Mittwoch, [number]. April [number] um [number]:[number]:[number] [misc] hat Camille [email] Folgendes geschrieben:\n", + " \n", + "\n", + "\n", + "Espero que este e-mail o encontre bem. Escrevo para confirmar minha presença na reunião agendada para o dia [number] de abril, às [number]:[number], nas instalações da [organization] Solutions , localizada na [location] , [number].Agradeceria se pudesse confirmar a participação das seguintes pessoas de nossa equipe, uma vez que os temas a serem discutidos são relevantes para suas respectivas áreas:- Charlie (Secretária Executiva)- Dr. Florence (Literatura [misc]/[misc]/[misc])- [organization] Francis (Secretária do Prof. Dr. Maxime)- Remy (Assistente Acadêmica em Prática de [misc])- Cécile ( Assistente Acadêmico)- Dr. Claude ([misc]ofessor de [misc][organization] [misc])- DrClaude - Lang ( Substituição de Cadeira em Literatura [misc])- Claude (Didática do [misc]no)- Claude (Professora de Prática da Língua [misc])- Dr. Claude (Professora Tenure-Track em Literatura [misc])- Claude (Instrutora de Prática da Língua [misc])- Dr. Claude (Professor de [misc][location] da Fundação)- Ricardo Cosea[location] (Assistente Acadêmico em Prática da Língua [misc])- Prof. Dr[organization] Claude ([organization])- Claude Claude (Didática do [misc])- Claude [location] Claude (Professora de Prática da Língua [misc])- Claude (Secretária do Prof. Große)- Claude (Responsável pelo Catálogo de Cursos e Matrículas)- Claude -Claude ( Assistente Acadêmico em Prática da Língua [misc])- Dr. Claude (Instrutora de Prática da Língua [misc])- Claude (Assistente Acadêmica em Prática da Língua [misc])- Prof . Dr. Claude ([organization])Fico no aguardo da sua confirmação para prosseguir com os preparativos necessários. Atenciosamente,Lino Silva\n", + "\n", + "\n", + "\n", + "\n", + " Sicher versendet mit [misc][organization].\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "Email 2\n", + "Original Text:\n", + "
From : aitana.garcia@zohomail.eu
To : \"Alejandro Rodriguez\"< alejandro.rodriguez1@gmx.net>
Date : mié., 17 abr. 2024 17:24:41 +0200
Subject : Re: purismo

Muchas gracias!

Sent using Zoho Mail




---- El mié., 17 abr. 2024 17:22:49 +0200, Alejandro Rodriguez <alejandro.rodriguez1@gmx.net> escribió ----

Hola, aquí tienes el escaneo ;) Saludos, Aitana

\n", + "Pseudonymized Text:\n", + " From : [email] : \"Claude\"< [email] : mié. , [number] abr. [number] [number]:[number]:[number] +[number]Subject : Re: purismoMuchas gracias!Sent using [organization] El mié. , [number] abr. [number] [number]:[number]:[number] +[number], Claude [email] escribió ----Hola, aquí tienes el escaneo ;) Saludos, Dominique\n", + "Email 3\n", + "Original Text:\n", + "
From : pierre.lefevre@myyahoo.com
To : \"Alejandro Rodriguez\"< alejandro.rodriguez1@gmx.net>,\"Aitana Garcia\"< aitana.garcia@zohomail.eu>
Date : mié., 17 abr. 2024 17:33:23 +0200
Subject : Re: ¡Voy el 24!

\n", + "

Salut Aitana,

Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du 24 à 15h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours.

N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion.

À très bientôt et encore merci de m'avoir inclus.

Bien cordialement,\n", + "Pierre



\n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + " Am Mittwoch, 17. April 2024 um 17:31:14 MESZ hat Aitana Garcia <aitana.garcia@zohomail.eu> Folgendes geschrieben:\n", + "
\n", + "

\n", + "

\n", + " \n", + " \n", + "
Ah, y me gustaría invitar a mi amigo Pierre, él está en el CC

Sent using Zoho Mail




---- El mié., 17 abr. 2024 17:27:12 +0200, Aitana Garcia <aitana.garcia@zohomail.eu> escribió ----

Hola Ale,

¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el 24 a las 3 en la ofi. ¿Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?

¡Nos vemos!\n", + "Aitana

Sent using Zoho Mail





\n", + "
\n", + "
\n", + "Pseudonymized Text:\n", + " From : [email] : \" Claude\"< [email] Dominique\"< [email] >Date : mié. , [number] abr. [number] [number]:[number]:[number] +[number]Subject : Re: ¡Voy el [number]!\n", + "Salut Claude,Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du [number] à [number]h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours. N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion. À très bientôt et encore merci de m'avoir inclus. Bien cordialement,\n", + "Pierre\n", + "\n", + "\n", + "\n", + " Am Mittwoch, [number] . April [number] um [number]:[number]:[number] [misc] hat Charlie [email] > Folgendes geschrieben:\n", + " \n", + "\n", + "\n", + "Ah, y me gustaría invitar a mi amigo Florence, él está en el [organization] using [misc]---- El mié. , [number] abr. [number] [number]:[number]:[number] +[number], Charlie [email] escribió ----Hola Francis,¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el [number] a las [number] en la ofi. ¿ Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?¡Nos vemos!\n", + "ClaudeSent using [organization]\n", + "\n", + "\n" ] } ], "source": [ - "# write output to pandas df\n", - "df = pd.DataFrame(out_list)\n", - "print(df)" + "# print results\n", + "for idx, mail in df.iterrows():\n", + " print(\"Email\", idx)\n", + " print(\"Original Text:\\n\", mail[\"content\"])\n", + " print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t" ] }, { From e78713ae1f131a47e3884d162a62db880c3ec148 Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 09:55:11 +0100 Subject: [PATCH 7/9] add additional requirements, more explanations in demo notebook --- notebook/demo.ipynb | 102 +++++++++++++++++++++++++++++++++----------- pyproject.toml | 2 + requirements.txt | 4 +- 3 files changed, 82 insertions(+), 26 deletions(-) diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index 0ab8858..e2a7703 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -1,28 +1,45 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demonstration notebook for the mailcom package\n", + "*Scientific Software Center, University of Heidelberg, December 2024*\n", + "The `mailcom` package is used to anonymize/pseudonymize textual data, i.e. email content. It takes an `eml` or `html` file as input and extracts information about attachements, number of attachements and type, and the content of the email body. The latter is then parsed through [`spaCy`](https://spacy.io/) and divided into sentences. The sentences are fed to a [`transformers`](https://huggingface.co/docs/transformers/en/index) named entity recognition (NER) [pipeline](https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/pipelines), and person names, places, organizations, miscellaneous, are detected in the inference task. Names are replaced by pseudos, while locations, organizations and miscellaneous are replaced by `[location]`, `[organization]` and `[misc]`. The text is further parsed using string methods, to replace any numbers with `[number]` and email addresses with `[email]`. The processed text and metadata can then be written to an `xml` file or into a pandas dataframe.\n", + "\n", + "Please note that 100% accuracy is not possible with this task. Any output needs to be further checked by a human to ensure the text has been anonymized completely.\n", + "\n", + "The current set-up is for Romance languages, however [other language models](https://spacy.io/usage/models) can also be loaded into the spaCy pipeline. The transformers pipeline uses the `xlm-roberta-large-finetuned-conll03-english` model revision number `18f95e9` by default, but other models can also be passed (see below).\n", + "\n", + "Before using the `mailcom` package, please install it into your conda environment using\n", + "```\n", + "pip install mailcom\n", + "```\n", + "After that, select the appropriate kernel for your Jupyter notebook and execute the cell below to import the package. The package is currently under active development and any function calls are subject to changes." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import mailcom.inout\n", "import mailcom.parse\n", "import pandas as pd" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `htlm` file type in that directory will be considered input files." + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -36,30 +53,51 @@ } ], "source": [ - "# import files from /data/in\n", + "# import files from input_dir - change this to your own directory\n", + "input_dir = \"../mailcom/test/data\"\n", "\n", - "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n", + "io = mailcom.inout.InoutHandler(directory_name = input_dir)\n", + "\n", + "# some internal processing\n", "io.list_of_files()\n", "\n", - "# create pseudonymization object\n", + "# create pseudonymization object and load spacy and transformers\n", + "# set the spacy language for sentence splitting\n", + "spacy_language = \"fr\"\n", + "# you may also set the model using `model = \"fr_core_news_md\"`\n", + "spacy_model = \"default\"\n", + "# set the model for transformers, here using the default model\n", + "transformers_model = \"xlm-roberta-large-finetuned-conll03-english\"\n", + "# set the revision number for transformers, here using the default revision number\n", + "transformers_revision_number = \"18f95e9\"\n", "ps = mailcom.parse.Pseudonymize()\n", - "ps.init_spacy(\"fr\")\n", - "ps.init_transformers()" + "ps.init_spacy(language=spacy_language, model=spacy_model)\n", + "ps.init_transformers(model=transformers_model, model_revision_number=transformers_revision_number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text and saved to a new file." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n", - "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n" + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re excel 2024-04-17T17_53_28+02 00.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Fw Re viagem 2024-04-17T18_00_36+02 00.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re Invitation à la réunion du 24 avril.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re AW Objet Invitation à notre événement spécial!.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Bonjour Agathe.eml\n", + "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re Uma Mensagem de Amor 2024-04-17T17_43_45+02 00.eml\n" ] } ], @@ -78,12 +116,19 @@ " # Test functionality of Pseudonymize class\n", " output_text = ps.pseudonymize(text)\n", " email_dict[\"pseudo_content\"] = output_text\n", - " out_list.append(email_dict)\n" + " out_list.append(email_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After this, the output can be written to a file or processed further. The output is a list of dictionaries, each containing the metadata of the email and the pseudonymized content. In the below cell, the output is saved in a pandas dataframe." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -91,6 +136,13 @@ "df = pd.DataFrame(out_list)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may print the output for inspection in the notebook as per the cell below." + ] + }, { "cell_type": "code", "execution_count": 5, @@ -247,7 +299,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 3a91efd..e0ef7bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,8 @@ dependencies = [ "bs4", "dicttoxml", "torch", + "pandas", + "jupyter", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index 6cd0a84..2a06ae0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ spacy fr_core_news_md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl es_core_news_md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl -transformers \ No newline at end of file +transformers +pandas +jupyter \ No newline at end of file From dbd0577e7e58ffbe4528aec36ac17a206470eaba Mon Sep 17 00:00:00 2001 From: Inga Ulusoy Date: Mon, 2 Dec 2024 10:02:53 +0100 Subject: [PATCH 8/9] add nbstripout pre-commit hook, run all hooks --- .pre-commit-config.yaml | 12 +- mailcom/inout.py | 65 ++++++---- mailcom/test/test_inout.py | 34 ++++-- notebook/demo.ipynb | 159 ++----------------------- notebook/performance_demo.ipynb | 205 ++------------------------------ 5 files changed, 86 insertions(+), 389 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c8ab9a..7cc1f24 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,11 @@ repos: rev: 24.4.2 hooks: - id: black - - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 - hooks: - - id: flake8 \ No newline at end of file + - repo: https://github.com/pycqa/flake8 + rev: 7.1.0 + hooks: + - id: flake8 + - repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout diff --git a/mailcom/inout.py b/mailcom/inout.py index 15c7252..0028950 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -4,51 +4,62 @@ from bs4 import BeautifulSoup from dicttoxml import dicttoxml + class InoutHandler: def __init__(self, directory_name: str): """Constructor for the InoutHandler class. - - Args: + + Args: directory_name (str): The directory where the files are located. - """ + """ self.directory_name = directory_name # presets self.pattern = [".eml", ".html"] def list_of_files(self): - """Method to create a list of Path objects (files) that are present + """Method to create a list of Path objects (files) that are present in a directory.""" - if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise + if not os.path.exists( + self.directory_name + ): # check if given dir exists raises error otherwise raise OSError("Path {} does not exist".format(self.directory_name)) mypath = Path(self.directory_name) - self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern] + self.email_list = [ + mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern + ] if len(self.email_list) == 0: - raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) + raise ValueError( + """The directory {} does not contain .eml or .html files. + Please check that the directory is containing the + email data files""".format( + mypath + ) + ) def get_html_text(self, text_check: str) -> str: """Clean up a string if it contains html content. Args: text_check (str): The string that may contain html content. - + Returns: str: The (potentially) cleaned up string.""" - soup = BeautifulSoup(text_check , 'html.parser') + soup = BeautifulSoup(text_check, "html.parser") if soup.find(): text_check = soup.get_text() return text_check def get_text(self, file: Path) -> str: """Function to extract the textual content and other metadata from an email file. - + Args: file (Path): The path to the email file. - + Returns: - str: The textual content of the email. In the future, this will return the + str: The textual content of the email. In the future, this will return the complete dictionary with the metadata.""" - if not file.is_file(): # check if given file exists raises error otherwise + if not file.is_file(): # check if given file exists raises error otherwise raise OSError("File {} does not exist".format(file)) - with open(file, 'rb') as fhdl: + with open(file, "rb") as fhdl: raw_email = fhdl.read() ep = eml_parser.EmlParser(include_raw_body=True) parsed_eml = ep.decode_email_bytes(raw_email) @@ -57,23 +68,26 @@ def get_text(self, file: Path) -> str: attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 # find the types of attachements if attachments > 0: - attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - self.email_content = {"content": parsed_eml["body"][0]["content"], - "date": parsed_eml["header"]["date"], - "attachment": attachments, - "attachement type": attachmenttypes - } - return(self.email_content["content"]) + attachmenttypes = [ + parsed_eml["attachment"][i]["extension"] for i in range(attachments) + ] + self.email_content = { + "content": parsed_eml["body"][0]["content"], + "date": parsed_eml["header"]["date"], + "attachment": attachments, + "attachement type": attachmenttypes, + } + return self.email_content["content"] def validate_data(self): pass - + def data_to_xml(self, text): - my_item_func = lambda x: 'content' - xml = dicttoxml(text, custom_root='email', item_func = my_item_func) + my_item_func = lambda x: "content" # noqa + xml = dicttoxml(text, custom_root="email", item_func=my_item_func) return xml.decode() - def write_file(self, text: str, name: str)-> None: + def write_file(self, text: str, name: str) -> None: """Write the extracted string to a text file. Args: @@ -81,4 +95,3 @@ def write_file(self, text: str, name: str)-> None: name (str): The name of the file to be written.""" with open("{}.out".format(name), "w") as file: file.write(text) - \ No newline at end of file diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 1f30ff0..18a32f0 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -11,12 +11,14 @@ XML_PATH = Path(pkg / "test" / "data" / "test.out") TEXT_REF = "J'espère que tu vas bien!" -XML_REF = "" +XML_REF = '' + @pytest.fixture() def get_instant(tmp_path): return inout.InoutHandler(tmp_path) + def test_list_of_files(get_instant): with pytest.raises(ValueError): get_instant.list_of_files() @@ -34,31 +36,37 @@ def test_list_of_files(get_instant): get_instant.list_of_files() assert get_instant.directory_name / "test3.xml" not in get_instant.email_list + def test_get_text(get_instant): p = get_instant.directory_name / "test.eml" p.write_text("test") extracted_text = get_instant.get_text(p) - assert extracted_text == 'test' + assert extracted_text == "test" text = get_instant.get_text(FILE_PATH) assert text[0:25] == TEXT_REF - assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc) + assert get_instant.email_content["date"] == datetime.datetime( + 2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc + ) assert get_instant.email_content["attachment"] == 2 - assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] + assert get_instant.email_content["attachement type"] == ["jpg", "jpg"] with pytest.raises(OSError): get_instant.get_text(get_instant.directory_name / "nonexisting.eml") + def test_get_html_text(get_instant): html = """Test""" - assert get_instant.get_html_text(html) == 'Test' + assert get_instant.get_html_text(html) == "Test" noHtml = """Test""" - assert get_instant.get_html_text(noHtml) == 'Test' - -def test_data_to_xml(get_instant,tmp_path): - xml_content = {"content": "This is nothing more than a test", - "date": "2024-04-17T15:13:56+00:00", - "attachment": 2, - "attachement type": {'jpg', 'jpg'} - } + assert get_instant.get_html_text(noHtml) == "Test" + + +def test_data_to_xml(get_instant, tmp_path): + xml_content = { + "content": "This is nothing more than a test", + "date": "2024-04-17T15:13:56+00:00", + "attachment": 2, + "attachement type": {"jpg", "jpg"}, + } xml = get_instant.data_to_xml(xml_content) get_instant.write_file(xml, tmp_path / "test") assert filecmp.cmp(XML_PATH, tmp_path / "test.out") diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index e2a7703..63f55fa 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -39,19 +39,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", - "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], + "outputs": [], "source": [ "# import files from input_dir - change this to your own directory\n", "input_dir = \"../mailcom/test/data\"\n", @@ -84,23 +74,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re excel 2024-04-17T17_53_28+02 00.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Fw Re viagem 2024-04-17T18_00_36+02 00.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re Invitation à la réunion du 24 avril.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re AW Objet Invitation à notre événement spécial!.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Bonjour Agathe.eml\n", - "Parsing input file /home/inga/projects/anonymization-project/mailcom/mailcom/test/data/Re Uma Mensagem de Amor 2024-04-17T17_43_45+02 00.eml\n" - ] - } - ], + "outputs": [], "source": [ "# loop over mails and pseudonymize them\n", "out_list = []\n", @@ -128,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -145,128 +121,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Email 0\n", - "Original Text:\n", - " J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!\n", - "[Inline-Bild]\n", - "\n", - "[Inline-Bild]\n", - "\n", - "À bientôt,\n", - "\n", - "Pierre\n", - "\n", - "Pseudonymized Text:\n", - " J'espère que tu vas bien! Je voulais partager avec toi quelques photos de mon dernier voyage!\n", - "[Inline-Bild]\n", - "\n", - "[Inline-Bild]\n", - "\n", - "À bientôt,\n", - "\n", - "Claude\n", - "\n", - "Email 1\n", - "Original Text:\n", - "
\n", - "

Olá Lino,

Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia 24 de abril às 15:00 na InnovaTech Solutions.

Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva.

Até breve,

[Alejandro Rodriguez]\n", - "InnovaTech Solutions S.L



\n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - " Am Mittwoch, 17. April 2024 um 17:30:19 MESZ hat lino.silva1 <lino.silva1@protonmail.com> Folgendes geschrieben:\n", - "
\n", - "

\n", - "

\n", - "\n", - "\n", - "
Espero que este e-mail o encontre bem. Escrevo para confirmar minha presença na reunião agendada para o dia 24 de abril, às 15:00, nas instalações da InnovaTech Solutions, localizada na Rua Principal, 123.

Agradeceria se pudesse confirmar a participação das seguintes pessoas de nossa equipe, uma vez que os temas a serem discutidos são relevantes para suas respectivas áreas:

- Alessandra Acquarone (Secretária Executiva)
- Dr. Giulia Agostini (Literatura Francesa/Italiana/Espanhola)
- Meggi Altrock (Secretária do Prof. Dr. De Stefani)
- Felicia Augusto-Hönicke (Assistente Acadêmica em Prática de Língua Portuguesa)
- Ronny Beckert (Assistente Acadêmico)
- Dr. Gérald Béreiziat (Professor de Prática da Língua Francesa)
- Dr. Stephanie Béreiziat-Lang (Substituição de Cadeira em Literatura Espanhola)

- Ursula Bergerfurth (Didática do Italiano)
- Maylis Bonetti (Professora de Prática da Língua Francesa)
- Dr. Sarah Burnautzki (Professora Tenure-Track em Literatura Francesa/Espanhola)
- Constanza Cárdenas (Instrutora de Prática da Língua Espanhola)
- Dr. Romanita Constantinescu (Professor de Romeno da Fundação)
- Ricardo Coseano (Assistente Acadêmico em Prática da Língua Espanhola)
- Prof. Dr. Elwys De Stefani (Cadeira de Linguística Italiana/Francesa)

- Brigitta Dierkes (Didática do Espanhol)
- Annachiara Di Taranto (Professora de Prática da Língua Italiana)
- Agathe Duperron (Secretária do Prof. Große)
- Madeleine Eppel (Responsável pelo Catálogo de Cursos e Matrículas)
- José Esplá-Oliver (Assistente Acadêmico em Prática da Língua Espanhola)
- Dr. Carmela Fischer Díaz (Instrutora de Prática da Língua Espanhola)
- Sandrine Flores (Assistente Acadêmica em Prática da Língua Francesa)
- Prof. Dr. Robert Folger (Cadeira de Literatura Espanhola)

Fico no aguardo da sua confirmação para prosseguir com os preparativos necessários.

Atenciosamente,

Lino Silva
\"grafik.png\"
\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "
\n", - " Sicher versendet mit Proton Mail.\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "Pseudonymized Text:\n", - " \n", - " Olá Claude,Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia [number] de abril às [number]:[number] na [organization] Solutions . Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva. Até breve,Dominique Claude ]\n", - "[organization] Solutions [organization].[organization]\n", - "\n", - "\n", - "\n", - " Am Mittwoch, [number]. April [number] um [number]:[number]:[number] [misc] hat Camille [email] Folgendes geschrieben:\n", - " \n", - "\n", - "\n", - "Espero que este e-mail o encontre bem. Escrevo para confirmar minha presença na reunião agendada para o dia [number] de abril, às [number]:[number], nas instalações da [organization] Solutions , localizada na [location] , [number].Agradeceria se pudesse confirmar a participação das seguintes pessoas de nossa equipe, uma vez que os temas a serem discutidos são relevantes para suas respectivas áreas:- Charlie (Secretária Executiva)- Dr. Florence (Literatura [misc]/[misc]/[misc])- [organization] Francis (Secretária do Prof. Dr. Maxime)- Remy (Assistente Acadêmica em Prática de [misc])- Cécile ( Assistente Acadêmico)- Dr. Claude ([misc]ofessor de [misc][organization] [misc])- DrClaude - Lang ( Substituição de Cadeira em Literatura [misc])- Claude (Didática do [misc]no)- Claude (Professora de Prática da Língua [misc])- Dr. Claude (Professora Tenure-Track em Literatura [misc])- Claude (Instrutora de Prática da Língua [misc])- Dr. Claude (Professor de [misc][location] da Fundação)- Ricardo Cosea[location] (Assistente Acadêmico em Prática da Língua [misc])- Prof. Dr[organization] Claude ([organization])- Claude Claude (Didática do [misc])- Claude [location] Claude (Professora de Prática da Língua [misc])- Claude (Secretária do Prof. Große)- Claude (Responsável pelo Catálogo de Cursos e Matrículas)- Claude -Claude ( Assistente Acadêmico em Prática da Língua [misc])- Dr. Claude (Instrutora de Prática da Língua [misc])- Claude (Assistente Acadêmica em Prática da Língua [misc])- Prof . Dr. Claude ([organization])Fico no aguardo da sua confirmação para prosseguir com os preparativos necessários. Atenciosamente,Lino Silva\n", - "\n", - "\n", - "\n", - "\n", - " Sicher versendet mit [misc][organization].\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "Email 2\n", - "Original Text:\n", - "
From : aitana.garcia@zohomail.eu
To : \"Alejandro Rodriguez\"< alejandro.rodriguez1@gmx.net>
Date : mié., 17 abr. 2024 17:24:41 +0200
Subject : Re: purismo

Muchas gracias!

Sent using Zoho Mail




---- El mié., 17 abr. 2024 17:22:49 +0200, Alejandro Rodriguez <alejandro.rodriguez1@gmx.net> escribió ----

Hola, aquí tienes el escaneo ;) Saludos, Aitana

\n", - "Pseudonymized Text:\n", - " From : [email] : \"Claude\"< [email] : mié. , [number] abr. [number] [number]:[number]:[number] +[number]Subject : Re: purismoMuchas gracias!Sent using [organization] El mié. , [number] abr. [number] [number]:[number]:[number] +[number], Claude [email] escribió ----Hola, aquí tienes el escaneo ;) Saludos, Dominique\n", - "Email 3\n", - "Original Text:\n", - "
From : pierre.lefevre@myyahoo.com
To : \"Alejandro Rodriguez\"< alejandro.rodriguez1@gmx.net>,\"Aitana Garcia\"< aitana.garcia@zohomail.eu>
Date : mié., 17 abr. 2024 17:33:23 +0200
Subject : Re: ¡Voy el 24!

\n", - "

Salut Aitana,

Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du 24 à 15h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours.

N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion.

À très bientôt et encore merci de m'avoir inclus.

Bien cordialement,\n", - "Pierre



\n", - " \n", - "
\n", - "
\n", - " \n", - "
\n", - " Am Mittwoch, 17. April 2024 um 17:31:14 MESZ hat Aitana Garcia <aitana.garcia@zohomail.eu> Folgendes geschrieben:\n", - "
\n", - "

\n", - "

\n", - " \n", - " \n", - "
Ah, y me gustaría invitar a mi amigo Pierre, él está en el CC

Sent using Zoho Mail




---- El mié., 17 abr. 2024 17:27:12 +0200, Aitana Garcia <aitana.garcia@zohomail.eu> escribió ----

Hola Ale,

¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el 24 a las 3 en la ofi. ¿Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?

¡Nos vemos!\n", - "Aitana

Sent using Zoho Mail





\n", - "
\n", - "
\n", - "Pseudonymized Text:\n", - " From : [email] : \" Claude\"< [email] Dominique\"< [email] >Date : mié. , [number] abr. [number] [number]:[number]:[number] +[number]Subject : Re: ¡Voy el [number]!\n", - "Salut Claude,Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du [number] à [number]h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours. N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion. À très bientôt et encore merci de m'avoir inclus. Bien cordialement,\n", - "Pierre\n", - "\n", - "\n", - "\n", - " Am Mittwoch, [number] . April [number] um [number]:[number]:[number] [misc] hat Charlie [email] > Folgendes geschrieben:\n", - " \n", - "\n", - "\n", - "Ah, y me gustaría invitar a mi amigo Florence, él está en el [organization] using [misc]---- El mié. , [number] abr. [number] [number]:[number]:[number] +[number], Charlie [email] escribió ----Hola Francis,¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el [number] a las [number] en la ofi. ¿ Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?¡Nos vemos!\n", - "ClaudeSent using [organization]\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# print results\n", "for idx, mail in df.iterrows():\n", diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb index 8d0920c..1bd8958 100644 --- a/notebook/performance_demo.ipynb +++ b/notebook/performance_demo.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import mailcom.inout\n", "import mailcom.parse\n", @@ -24,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -34,39 +25,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Unnamed: 0 message\n", - "0 242 Von meinem iPhone gesendet Anfang der weiterge...\n", - "1 243 Von meinem iPhone gesendet Anfang der weiterge...\n", - "2 244 Von meinem iPhone gesendet Anfang der weiterge...\n", - "3 245 Von meinem iPhone gesendet Anfang der weiterge...\n", - "4 246 Von meinem iPhone gesendet Anfang der weiterge...\n", - ".. ... ...\n", - "98 1313 \\nVon: Mélissa des Presses de l'Université Lav...\n", - "99 1314 Von: Librairie Classiques Garnier &lt;libr...\n", - "100 1315 La langue s'enrichit #36 - FranceTerme\\nProf. ...\n", - "101 1316 Activités de juin - Presses de l'Université La...\n", - "102 1317 Nouveautés de juin\\nProf. Dr. Sybille Große I ...\n", - "\n", - "[103 rows x 2 columns]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", - "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], + "outputs": [], "source": [ "# import files from csv file\n", "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg.csv\")\n", @@ -82,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -105,43 +66,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " content \\\n", - "0 Von meinem iPhone gesendet Anfang der weiterge... \n", - "1 Von meinem iPhone gesendet Anfang der weiterge... \n", - "2 Von meinem iPhone gesendet Anfang der weiterge... \n", - "3 Von meinem iPhone gesendet Anfang der weiterge... \n", - "4 Von meinem iPhone gesendet Anfang der weiterge... \n", - ".. ... \n", - "98 \\nVon: Mélissa des Presses de l'Université Lav... \n", - "99 Von: Librairie Classiques Garnier &lt;libr... \n", - "100 La langue s'enrichit #36 - FranceTerme\\nProf. ... \n", - "101 Activités de juin - Presses de l'Université La... \n", - "102 Nouveautés de juin\\nProf. Dr. Sybille Große I ... \n", - "\n", - " pseudo_content \n", - "0 Von meinem [misc] gesendet Anfang der weiterge... \n", - "1 Von meinem [misc] gesendet Anfang der weiterge... \n", - "2 Von meinem [misc] gesendet Anfang der weiterge... \n", - "3 Von meinem [misc] gesendet Anfang der weiterge... \n", - "4 Von meinem [misc] gesendet Anfang der weiterge... \n", - ".. ... \n", - "98 \\n Von: Claude [organization] [email] Mittwoch... \n", - "99 Von: [misc]i[misc] Claude [email] -[organizati... \n", - "100 La langue s'enrichit #[number] - [organization... \n", - "101 Activités de juin - Presses de l'[organization... \n", - "102 Nouveautés de juin\\nProf. Dr. Claude I [misc] ... \n", - "\n", - "[103 rows x 2 columns]\n" - ] - } - ], + "outputs": [], "source": [ "# write output to pandas df\n", "df = pd.DataFrame(out_list)\n", @@ -150,121 +77,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time from start to model loaded: 08 s\n", - "Time needed for email 0 : 06 s\n", - "Time needed for email 1 : 08 s\n", - "Time needed for email 2 : 05 s\n", - "Time needed for email 3 : 09 s\n", - "Time needed for email 4 : 06 s\n", - "Time needed for email 5 : 13 s\n", - "Time needed for email 6 : 08 s\n", - "Time needed for email 7 : 06 s\n", - "Time needed for email 8 : 14 s\n", - "Time needed for email 9 : 08 s\n", - "Time needed for email 10 : 09 s\n", - "Time needed for email 11 : 09 s\n", - "Time needed for email 12 : 12 s\n", - "Time needed for email 13 : 11 s\n", - "Time needed for email 14 : 08 s\n", - "Time needed for email 15 : 03 s\n", - "Time needed for email 16 : 01 s\n", - "Time needed for email 17 : 18 s\n", - "Time needed for email 18 : 07 s\n", - "Time needed for email 19 : 03 s\n", - "Time needed for email 20 : 27 s\n", - "Time needed for email 21 : 05 s\n", - "Time needed for email 22 : 04 s\n", - "Time needed for email 23 : 07 s\n", - "Time needed for email 24 : 05 s\n", - "Time needed for email 25 : 05 s\n", - "Time needed for email 26 : 05 s\n", - "Time needed for email 27 : 01 s\n", - "Time needed for email 28 : 03 s\n", - "Time needed for email 29 : 04 s\n", - "Time needed for email 30 : 04 s\n", - "Time needed for email 31 : 03 s\n", - "Time needed for email 32 : 12 s\n", - "Time needed for email 33 : 06 s\n", - "Time needed for email 34 : 14 s\n", - "Time needed for email 35 : 09 s\n", - "Time needed for email 36 : 07 s\n", - "Time needed for email 37 : 07 s\n", - "Time needed for email 38 : 14 s\n", - "Time needed for email 39 : 04 s\n", - "Time needed for email 40 : 14 s\n", - "Time needed for email 41 : 04 s\n", - "Time needed for email 42 : 08 s\n", - "Time needed for email 43 : 14 s\n", - "Time needed for email 44 : 06 s\n", - "Time needed for email 45 : 17 s\n", - "Time needed for email 46 : 14 s\n", - "Time needed for email 47 : 11 s\n", - "Time needed for email 48 : 09 s\n", - "Time needed for email 49 : 12 s\n", - "Time needed for email 50 : 15 s\n", - "Time needed for email 51 : 08 s\n", - "Time needed for email 52 : 15 s\n", - "Time needed for email 53 : 16 s\n", - "Time needed for email 54 : 49 s\n", - "Time needed for email 55 : 05 s\n", - "Time needed for email 56 : 06 s\n", - "Time needed for email 57 : 05 s\n", - "Time needed for email 58 : 07 s\n", - "Time needed for email 59 : 20 s\n", - "Time needed for email 60 : 12 s\n", - "Time needed for email 61 : 32 s\n", - "Time needed for email 62 : 11 s\n", - "Time needed for email 63 : 16 s\n", - "Time needed for email 64 : 12 s\n", - "Time needed for email 65 : 12 s\n", - "Time needed for email 66 : 13 s\n", - "Time needed for email 67 : 14 s\n", - "Time needed for email 68 : 05 s\n", - "Time needed for email 69 : 05 s\n", - "Time needed for email 70 : 24 s\n", - "Time needed for email 71 : 07 s\n", - "Time needed for email 72 : 21 s\n", - "Time needed for email 73 : 22 s\n", - "Time needed for email 74 : 21 s\n", - "Time needed for email 75 : 22 s\n", - "Time needed for email 76 : 08 s\n", - "Time needed for email 77 : 06 s\n", - "Time needed for email 78 : 24 s\n", - "Time needed for email 79 : 08 s\n", - "Time needed for email 80 : 35 s\n", - "Time needed for email 81 : 29 s\n", - "Time needed for email 82 : 39 s\n", - "Time needed for email 83 : 37 s\n", - "Time needed for email 84 : 36 s\n", - "Time needed for email 85 : 02 s\n", - "Time needed for email 86 : 47 s\n", - "Time needed for email 87 : 05 s\n", - "Time needed for email 88 : 45 s\n", - "Time needed for email 89 : 53 s\n", - "Time needed for email 90 : 08 s\n", - "Time needed for email 91 : 19 s\n", - "Time needed for email 92 : 14 s\n", - "Time needed for email 93 : 06 s\n", - "Time needed for email 94 : 28 s\n", - "Time needed for email 95 : 30 s\n", - "Time needed for email 96 : 30 s\n", - "Time needed for email 97 : 15 s\n", - "Time needed for email 98 : 18 s\n", - "Time needed for email 99 : 09 s\n", - "Time needed for email 100 : 21 s\n", - "Time needed for email 101 : 10 s\n", - "Time needed for email 102 : 13 s\n", - "Total time: 27:21\n" - ] - } - ], + "outputs": [], "source": [ "# print timestamps\n", "print(\"Time from start to model loaded:\", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), \"s\")\n", From 446f34c84dc6f03c1921a12f6a97a5ebd0d59173 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 3 Dec 2024 18:56:09 +0100 Subject: [PATCH 9/9] Added performance plots --- notebook/performance_demo.ipynb | 79 ++++++++++++++++++++++++++------- pyproject.toml | 1 + 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb index 1bd8958..b8c48b4 100644 --- a/notebook/performance_demo.ipynb +++ b/notebook/performance_demo.ipynb @@ -10,7 +10,8 @@ "import mailcom.parse\n", "import pandas as pd\n", "import time\n", - "import datetime" + "import datetime\n", + "import matplotlib.pyplot as plt" ] }, { @@ -30,9 +31,11 @@ "outputs": [], "source": [ "# import files from csv file\n", - "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg.csv\")\n", + "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n", "print(email_list)\n", "\n", + "t_csv_read = time.time()\n", + "\n", "# create pseudonymization object\n", "ps = mailcom.parse.Pseudonymize()\n", "ps.init_spacy(\"fr\")\n", @@ -51,17 +54,33 @@ "out_list = []\n", "ts_list = []\n", "for idx, row in email_list.iterrows():\n", + " ts_email_start = time.time()\n", " text = row[\"message\"]\n", " email_dict = {\"content\": text}\n", " if not text:\n", " continue\n", " # Test functionality of Pseudonymize class\n", - " output_text = ps.pseudonymize(text)\n", + " # Pseudonymization is usually done using ps.pseudonymize\n", + " # For performance analysis the process is split into its subprocesses here\n", + " ps.reset()\n", + " sentences = ps.get_sentences(text)\n", + " ts_email_ppr_done = time.time()\n", + " pseudonymized_sentences = []\n", + " for sent in sentences:\n", + " sent = ps.pseudonymize_email_addresses(sent)\n", + " ner = ps.get_ner(sent)\n", + " ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n", + " ps_sent = ps.pseudonymize_numbers(ps_sent)\n", + " pseudonymized_sentences.append(ps_sent)\n", + " output_text = ps.concatenate(pseudonymized_sentences)\n", + "\n", + " # add output to dict\n", " email_dict[\"pseudo_content\"] = output_text\n", " out_list.append(email_dict)\n", "\n", " # timestamp after this email\n", - " ts_list.append(time.time())" + " ts_email_end = time.time()\n", + " ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])" ] }, { @@ -81,18 +100,46 @@ "metadata": {}, "outputs": [], "source": [ - "# print timestamps\n", - "print(\"Time from start to model loaded:\", (datetime.datetime.fromtimestamp(t_model_loaded - t0).strftime('%S')), \"s\")\n", - "# time differences between emails\n", - "ts_diffs = []\n", - "for i in range(0, len(ts_list)):\n", - " if i == 0:\n", - " ts_diff = (ts_list[i] - t_model_loaded)\n", - " else:\n", - " ts_diff = (ts_list[i] - ts_list[i-1])\n", - " ts_diffs.append(ts_diff)\n", - " print(\"Time needed for email\", i, \":\", (datetime.datetime.fromtimestamp(ts_diff).strftime('%S')), \"s\")\n", - "print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1] - t_model_loaded).strftime('%M:%S')))" + "# display timestamps\n", + "\n", + "# bar plot for each individual email\n", + "# processing times\n", + "idx_list = [row[0] for row in email_list.iterrows()]\n", + "email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n", + "email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n", + "email_total_list = [ts[2] - ts[0] for ts in ts_list]\n", + "email_bar_height = {\n", + " \"Pre-Processing\": email_ppr_list,\n", + " \"Pseudonymization\": email_duration_list\n", + "}\n", + "bt = [0 for idx in idx_list]\n", + "\n", + "plt.figure(figsize=(10,4), dpi=80)\n", + "\n", + "# plot 1\n", + "plt.subplot(1, 2, 1)\n", + "for key, height in email_bar_height.items():\n", + " plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n", + " bt = [bi + hi for (bi,hi) in zip(bt, height)]\n", + "#plt.yscale(\"log\")\n", + "plt.xlabel(\"Email\")\n", + "plt.ylabel(\"t [s]\")\n", + "plt.title(\"Computation times for emails, model loading and file reading\")\n", + "plt.legend()\n", + "\n", + "# plot for model loading and file reading, as well as average email time\n", + "# processing times\n", + "bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n", + "average_email_time = sum(email_total_list) / len(email_total_list)\n", + "bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n", + "plt.ylabel(\"t [s]\")\n", + "\n", + "# plot 2\n", + "plt.subplot(1, 2, 2)\n", + "plt.bar(bar_x, bar_y, 0.5)\n", + "\n", + "# Total time\n", + "print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))" ] }, { diff --git a/pyproject.toml b/pyproject.toml index e0ef7bf..04b04d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "torch", "pandas", "jupyter", + "matplotlib" ] [project.optional-dependencies]