From 0eda4693ea9f0a0e394231000dca611aa8359df3 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Sun, 3 Nov 2024 15:59:27 +0100 Subject: [PATCH 1/9] Created demo notebook file --- notebook/demo.ipynb | 133 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 notebook/demo.ipynb diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb new file mode 100644 index 0000000..e0fa2e0 --- /dev/null +++ b/notebook/demo.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import mailcom.inout\n", + "import mailcom.parse\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "# import files from /data/in\n", + "\n", + "io = mailcom.inout.InoutHandler(\"../mailcom/test/data\")\n", + "io.list_of_files()\n", + "\n", + "# create pseudonymization object\n", + "ps = mailcom.parse.Pseudonymize()\n", + "ps.init_spacy(\"fr\")\n", + "ps.init_transformers()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Bonjour Agathe.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re reunião agendada para o dia 24 de abril 2024-04-17T17_39_49+02 00.eml\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ purismo.html\n", + "Parsing input file C:\\Users\\Felix\\Documents\\GitHub\\mailcom\\mailcom\\test\\data\\Re_ ¡Voy el 24!.html\n" + ] + } + ], + "source": [ + "# loop over mails and pseudonymize them\n", + "out_list = []\n", + "for file in io.email_list:\n", + " print(\"Parsing input file {}\".format(file))\n", + " # creating dict\n", + " email_dict = {}\n", + " text = io.get_text(file)\n", + " text = io.get_html_text(text)\n", + " if not text:\n", + " continue\n", + " # Test functionality of Pseudonymize class\n", + " output_text = ps.pseudonymize(text)\n", + " email_dict[\"content\"] = text\n", + " email_dict[\"pseudo_content\"] = output_text\n", + " out_list.append(email_dict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " content \\\n", + "0 J'espère que tu vas bien! Je voulais partager ... \n", + "1 \\nOlá Lino,Espero que este e-mail te encontre ... \n", + "2 From : aitana.garcia@zohomail.euTo : \"Alejandr... \n", + "3 From : pierre.lefevre@myyahoo.comTo : \"Alejand... \n", + "\n", + " pseudo_content \n", + "0 J'espère que tu vas bien! Je voulais partager ... \n", + "1 \\n Olá Claude,Espero que este e-mail te encont... \n", + "2 From : Claude.Dominique@xxxxxxxx.xxTo : \"Claud... \n", + "3 From : Claude.Dominique@myxxxxx.comTo : \" Clau... \n" + ] + } + ], + "source": [ + "# write output to pandas df\n", + "df = pd.DataFrame(out_list)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailcom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4779ec7fcb34f9b019ef20cdf0fc73c94fcb3b7b Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:31:32 +0100 Subject: [PATCH 2/9] Added email metadata to df --- notebook/demo.ipynb | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index e0fa2e0..55aa7bc 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Felix\\miniconda3\\envs\\mailcom\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import mailcom.inout\n", "import mailcom.parse\n", @@ -13,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -40,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -59,22 +68,22 @@ "out_list = []\n", "for file in io.email_list:\n", " print(\"Parsing input file {}\".format(file))\n", - " # creating dict\n", - " email_dict = {}\n", " text = io.get_text(file)\n", + " # after this function was called, the email metadata can be accessed via io.email_content\n", + " # the dict already has the entries content, date, attachments, attachment type\n", + " email_dict = io.email_content.copy()\n", " text = io.get_html_text(text)\n", " if not text:\n", " continue\n", " # Test functionality of Pseudonymize class\n", " output_text = ps.pseudonymize(text)\n", - " email_dict[\"content\"] = text\n", " email_dict[\"pseudo_content\"] = output_text\n", " out_list.append(email_dict)\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -83,15 +92,21 @@ "text": [ " content \\\n", "0 J'espère que tu vas bien! Je voulais partager ... \n", - "1 \\nOlá Lino,Espero que este e-mail te encontre ... \n", - "2 From : aitana.garcia@zohomail.euTo : \"Alejandr... \n", - "3 From : pierre.lefevre@myyahoo.comTo : \"Alejand... \n", + "1
Olá Lino,
Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia 24 de abril às 15:00 na InnovaTech Solutions.
Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva.
Até breve,
[Alejandro Rodriguez]\n", + "InnovaTech Solutions S.L
Sent using Zoho Mail
Hola, aquí tienes el escaneo ;) Saludos, Aitana
Salut Aitana,
Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du 24 à 15h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours.
N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion.
À très bientôt et encore merci de m'avoir inclus.
Bien cordialement,\n", + "Pierre
Sent using Zoho Mail
Hola Ale,¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el 24 a las 3 en la ofi. ¿Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?
¡Nos vemos!\n", + "Aitana
Sent using Zoho Mail
Olá Lino,
Espero que este e-mail te encontre bem. Agradeço a confirmação de sua presença na reunião do dia 24 de abril às 15:00 na InnovaTech Solutions.
Com relação à confirmação dos demais membros da equipe, irei verificar a disponibilidade de cada um e te retorno o mais breve possível para que possamos organizar todos os detalhes necessários. A relevância dos temas a serem discutidos para cada área certamente contribuirá para uma sessão produtiva.
Até breve,
[Alejandro Rodriguez]\n", - "InnovaTech Solutions S.L
Sent using Zoho Mail
Hola, aquí tienes el escaneo ;) Saludos, Aitana
Salut Aitana,
Merci beaucoup pour l'invitation ! Je suis ravi d'apprendre que je pourrai vous rejoindre pour la réunion du 24 à 15h. Cela sera une excellente occasion de rencontrer l'équipe et de discuter des projets en cours.
N'hésite pas à me dire s'il y a des documents spécifiques que je devrais consulter avant la réunion ou si tu veux que je prépare quelque chose de particulier. Je suis tout à fait disposé à contribuer de manière constructive à notre discussion.
À très bientôt et encore merci de m'avoir inclus.
Bien cordialement,\n", - "Pierre
Sent using Zoho Mail
Hola Ale,¡Todo bien por aquí! Solo pasaba a decirte que contarás conmigo el 24 a las 3 en la ofi. ¿Hay algo que deba llevar o algún tema en particular que quieras que cheque antes?
¡Nos vemos!\n", - "Aitana
Sent using Zoho Mail