Skip to content

Commit

Permalink
Adjusted demo notebook to csv changes
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Mar 3, 2025
1 parent a92898b commit 79dea5a
Showing 1 changed file with 12 additions and 23 deletions.
35 changes: 12 additions & 23 deletions notebook/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@
"\n",
"# some internal processing\n",
"io.list_of_files()\n",
"# extract the text of all emails in the directory and clean up HTML content\n",
"io.process_emails()\n",
"\n",
"# create pseudonymization object and load spacy and transformers\n",
"# set the spacy language for sentence splitting\n",
Expand Down Expand Up @@ -121,27 +123,18 @@
"outputs": [],
"source": [
"# loop over mails and pseudonymize them\n",
"out_list = []\n",
"for file in io.email_list:\n",
" print(\"Parsing input file {}\".format(file))\n",
" text = io.get_text(file)\n",
" # after this function was called, the email metadata can be accessed via io.email_content\n",
"for _, email in enumerate(io.get_email_list()):\n",
" # the email text and metadata are stored in a dict\n",
" # the dict already has the entries: content, date, attachments, attachment type\n",
" email_dict = io.email_content.copy()\n",
" html_text = io.get_html_text(text)\n",
" email_dict[\"html_text\"] = html_text\n",
" if not text:\n",
" if not email[\"content\"]:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" output_text = ps.pseudonymize(html_text)\n",
" # The output text is returned and is also stored in the dict under the key \"pseudo_content\"\n",
" _ = ps.pseudonymize(email)\n",
"\n",
" # display original text and highlight found and replaced NEs\n",
" highlighted_html = highlight_ne(html_text, ps.ne_list)\n",
" display(HTML(highlighted_html))\n",
"\n",
" # add pseudonymized text to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)"
" highlighted_html = highlight_ne(email[\"content\"], ps.ne_list)\n",
" display(HTML(highlighted_html))"
]
},
{
Expand All @@ -158,14 +151,14 @@
"outputs": [],
"source": [
"# write output to pandas df\n",
"df = pd.DataFrame(out_list)"
"df = pd.DataFrame(io.get_email_list())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You may print the output for inspection in the notebook as per the cell below."
"The output can also be saved as a CSV file."
]
},
{
Expand All @@ -174,11 +167,7 @@
"metadata": {},
"outputs": [],
"source": [
"# print results\n",
"for idx, mail in df.iterrows():\n",
" print(\"Email\", idx)\n",
" print(\"Original Text:\\n\", mail[\"html_text\"])\n",
" print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t"
"io.write_csv(\"../data/out/out_demo.csv\")"
]
},
{
Expand Down

0 comments on commit 79dea5a

Please sign in to comment.