Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Notebook demos #50

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ repos:
rev: 24.4.2
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
hooks:
- id: flake8
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
hooks:
- id: flake8
- repo: https://github.com/kynan/nbstripout
rev: 0.8.1
hooks:
- id: nbstripout
65 changes: 39 additions & 26 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,62 @@
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml


class InoutHandler:
def __init__(self, directory_name: str):
"""Constructor for the InoutHandler class.
Args:

Args:
directory_name (str): The directory where the files are located.
"""
"""
self.directory_name = directory_name
# presets
self.pattern = [".eml", ".html"]

def list_of_files(self):
"""Method to create a list of Path objects (files) that are present
"""Method to create a list of Path objects (files) that are present
in a directory."""
if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise
if not os.path.exists(
self.directory_name
): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(self.directory_name))
mypath = Path(self.directory_name)
self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
self.email_list = [
mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern
]
if len(self.email_list) == 0:
raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
raise ValueError(
"""The directory {} does not contain .eml or .html files.
Please check that the directory is containing the
email data files""".format(
mypath
)
)

def get_html_text(self, text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.

Returns:
str: The (potentially) cleaned up string."""
soup = BeautifulSoup(text_check , 'html.parser')
soup = BeautifulSoup(text_check, "html.parser")
if soup.find():
text_check = soup.get_text()
return text_check

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.

Args:
file (Path): The path to the email file.

Returns:
str: The textual content of the email. In the future, this will return the
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
with open(file, "rb") as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
Expand All @@ -57,28 +68,30 @@ def get_text(self, file: Path) -> str:
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(self.email_content["content"])
attachmenttypes = [
parsed_eml["attachment"][i]["extension"] for i in range(attachments)
]
self.email_content = {
"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes,
}
return self.email_content["content"]

def validate_data(self):
pass

def data_to_xml(self, text):
my_item_func = lambda x: 'content'
xml = dicttoxml(text, custom_root='email', item_func = my_item_func)
my_item_func = lambda x: "content" # noqa
xml = dicttoxml(text, custom_root="email", item_func=my_item_func)
return xml.decode()

def write_file(self, text: str, name: str)-> None:
def write_file(self, text: str, name: str) -> None:
"""Write the extracted string to a text file.

Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)

34 changes: 21 additions & 13 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
XML_PATH = Path(pkg / "test" / "data" / "test.out")

TEXT_REF = "J'espère que tu vas bien!"
XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"
XML_REF = '<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">'


@pytest.fixture()
def get_instant(tmp_path):
return inout.InoutHandler(tmp_path)


def test_list_of_files(get_instant):
with pytest.raises(ValueError):
get_instant.list_of_files()
Expand All @@ -34,31 +36,37 @@ def test_list_of_files(get_instant):
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list


def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
extracted_text = get_instant.get_text(p)
assert extracted_text == 'test'
assert extracted_text == "test"
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF
assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
assert get_instant.email_content["date"] == datetime.datetime(
2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
assert get_instant.email_content["attachement type"] == ["jpg", "jpg"]
with pytest.raises(OSError):
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")


def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_instant.get_html_text(html) == 'Test'
assert get_instant.get_html_text(html) == "Test"
noHtml = """Test"""
assert get_instant.get_html_text(noHtml) == 'Test'

def test_data_to_xml(get_instant,tmp_path):
xml_content = {"content": "This is nothing more than a test",
"date": "2024-04-17T15:13:56+00:00",
"attachment": 2,
"attachement type": {'jpg', 'jpg'}
}
assert get_instant.get_html_text(noHtml) == "Test"


def test_data_to_xml(get_instant, tmp_path):
xml_content = {
"content": "This is nothing more than a test",
"date": "2024-04-17T15:13:56+00:00",
"attachment": 2,
"attachement type": {"jpg", "jpg"},
}
xml = get_instant.data_to_xml(xml_content)
get_instant.write_file(xml, tmp_path / "test")
assert filecmp.cmp(XML_PATH, tmp_path / "test.out")
164 changes: 164 additions & 0 deletions notebook/demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Demonstration notebook for the mailcom package\n",
"*Scientific Software Center, University of Heidelberg, December 2024*\n",
"\n",
"The `mailcom` package is used to anonymize/pseudonymize textual data, i.e. email content. It takes an `eml` or `html` file as input and extracts the number and types of attachments as well as the content of the email body. The latter is then parsed through [`spaCy`](https://spacy.io/) and divided into sentences. The sentences are fed to a [`transformers`](https://huggingface.co/docs/transformers/en/index) named entity recognition (NER) [pipeline](https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/pipelines), and person names, places, organizations and miscellaneous entities are detected in the inference task. Names are replaced by pseudonyms, while locations, organizations and miscellaneous entities are replaced by `[location]`, `[organization]` and `[misc]`. The text is further parsed using string methods to replace any numbers with `[number]` and email addresses with `[email]`. The processed text and metadata can then be written to an `xml` file or into a pandas dataframe.\n",
"\n",
"Please note that 100% accuracy is not possible with this task. Any output needs to be further checked by a human to ensure the text has been anonymized completely.\n",
"\n",
"The current set-up is for Romance languages; however, [other language models](https://spacy.io/usage/models) can also be loaded into the spaCy pipeline. The transformers pipeline uses the `xlm-roberta-large-finetuned-conll03-english` model with revision number `18f95e9` by default, but other models can also be passed (see below).\n",
"\n",
"Before using the `mailcom` package, please install it into your conda environment using\n",
"```\n",
"pip install mailcom\n",
"```\n",
"After that, select the appropriate kernel for your Jupyter notebook and execute the cell below to import the package. The package is currently under active development and any function calls are subject to change."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import mailcom.inout\n",
"import mailcom.parse\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `html` file type in that directory will be considered input files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import files from input_dir - change this to your own directory\n",
"input_dir = \"../mailcom/test/data\"\n",
"\n",
"io = mailcom.inout.InoutHandler(directory_name = input_dir)\n",
"\n",
"# some internal processing\n",
"io.list_of_files()\n",
"\n",
"# create pseudonymization object and load spacy and transformers\n",
"# set the spacy language for sentence splitting\n",
"spacy_language = \"fr\"\n",
"# you may also set the model using `model = \"fr_core_news_md\"`\n",
"spacy_model = \"default\"\n",
"# set the model for transformers, here using the default model\n",
"transformers_model = \"xlm-roberta-large-finetuned-conll03-english\"\n",
"# set the revision number for transformers, here using the default revision number\n",
"transformers_revision_number = \"18f95e9\"\n",
"ps = mailcom.parse.Pseudonymize()\n",
"ps.init_spacy(language=spacy_language, model=spacy_model)\n",
"ps.init_transformers(model=transformers_model, model_revision_number=transformers_revision_number)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text and stored, together with the email metadata, in a list of dictionaries. Emails without any text content are skipped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# loop over mails and pseudonymize them\n",
"out_list = []\n",
"for file in io.email_list:\n",
" print(\"Parsing input file {}\".format(file))\n",
" text = io.get_text(file)\n",
" # after this function was called, the email metadata can be accessed via io.email_content\n",
" # the dict already has the entries `content`, `date`, `attachment` and `attachement type`\n",
" email_dict = io.email_content.copy()\n",
" text = io.get_html_text(text)\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" output_text = ps.pseudonymize(text)\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After this, the output can be written to a file or processed further. The output is a list of dictionaries, each containing the metadata of the email and the pseudonymized content. In the cell below, the output is saved in a pandas dataframe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write output to pandas df\n",
"df = pd.DataFrame(out_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You may print the output for inspection in the notebook as per the cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print results\n",
"for idx, mail in df.iterrows():\n",
" print(\"Email\", idx)\n",
" print(\"Original Text:\\n\", mail[\"content\"])\n",
" print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mailcom",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading
Loading