diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c8ab9a..7cc1f24 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,11 @@ repos: rev: 24.4.2 hooks: - id: black - - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 - hooks: - - id: flake8 \ No newline at end of file + - repo: https://github.com/pycqa/flake8 + rev: 7.1.0 + hooks: + - id: flake8 + - repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout diff --git a/mailcom/inout.py b/mailcom/inout.py index 15c7252..0028950 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -4,51 +4,62 @@ from bs4 import BeautifulSoup from dicttoxml import dicttoxml + class InoutHandler: def __init__(self, directory_name: str): """Constructor for the InoutHandler class. - - Args: + + Args: directory_name (str): The directory where the files are located. - """ + """ self.directory_name = directory_name # presets self.pattern = [".eml", ".html"] def list_of_files(self): - """Method to create a list of Path objects (files) that are present + """Method to create a list of Path objects (files) that are present in a directory.""" - if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise + if not os.path.exists( + self.directory_name + ): # check if given dir exists raises error otherwise raise OSError("Path {} does not exist".format(self.directory_name)) mypath = Path(self.directory_name) - self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern] + self.email_list = [ + mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern + ] if len(self.email_list) == 0: - raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) + raise ValueError( + """The directory {} does not contain .eml or .html files. 
+ Please check that the directory is containing the + email data files""".format( + mypath + ) + ) def get_html_text(self, text_check: str) -> str: """Clean up a string if it contains html content. Args: text_check (str): The string that may contain html content. - + Returns: str: The (potentially) cleaned up string.""" - soup = BeautifulSoup(text_check , 'html.parser') + soup = BeautifulSoup(text_check, "html.parser") if soup.find(): text_check = soup.get_text() return text_check def get_text(self, file: Path) -> str: """Function to extract the textual content and other metadata from an email file. - + Args: file (Path): The path to the email file. - + Returns: - str: The textual content of the email. In the future, this will return the + str: The textual content of the email. In the future, this will return the complete dictionary with the metadata.""" - if not file.is_file(): # check if given file exists raises error otherwise + if not file.is_file(): # check if given file exists raises error otherwise raise OSError("File {} does not exist".format(file)) - with open(file, 'rb') as fhdl: + with open(file, "rb") as fhdl: raw_email = fhdl.read() ep = eml_parser.EmlParser(include_raw_body=True) parsed_eml = ep.decode_email_bytes(raw_email) @@ -57,23 +68,26 @@ def get_text(self, file: Path) -> str: attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 # find the types of attachements if attachments > 0: - attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - self.email_content = {"content": parsed_eml["body"][0]["content"], - "date": parsed_eml["header"]["date"], - "attachment": attachments, - "attachement type": attachmenttypes - } - return(self.email_content["content"]) + attachmenttypes = [ + parsed_eml["attachment"][i]["extension"] for i in range(attachments) + ] + self.email_content = { + "content": parsed_eml["body"][0]["content"], + "date": parsed_eml["header"]["date"], + "attachment": 
attachments, + "attachement type": attachmenttypes, + } + return self.email_content["content"] def validate_data(self): pass - + def data_to_xml(self, text): - my_item_func = lambda x: 'content' - xml = dicttoxml(text, custom_root='email', item_func = my_item_func) + my_item_func = lambda x: "content" # noqa + xml = dicttoxml(text, custom_root="email", item_func=my_item_func) return xml.decode() - def write_file(self, text: str, name: str)-> None: + def write_file(self, text: str, name: str) -> None: """Write the extracted string to a text file. Args: @@ -81,4 +95,3 @@ def write_file(self, text: str, name: str)-> None: name (str): The name of the file to be written.""" with open("{}.out".format(name), "w") as file: file.write(text) - \ No newline at end of file diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 1f30ff0..18a32f0 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -11,12 +11,14 @@ XML_PATH = Path(pkg / "test" / "data" / "test.out") TEXT_REF = "J'espère que tu vas bien!" 
-XML_REF = "" +XML_REF = '' + @pytest.fixture() def get_instant(tmp_path): return inout.InoutHandler(tmp_path) + def test_list_of_files(get_instant): with pytest.raises(ValueError): get_instant.list_of_files() @@ -34,31 +36,37 @@ def test_list_of_files(get_instant): get_instant.list_of_files() assert get_instant.directory_name / "test3.xml" not in get_instant.email_list + def test_get_text(get_instant): p = get_instant.directory_name / "test.eml" p.write_text("test") extracted_text = get_instant.get_text(p) - assert extracted_text == 'test' + assert extracted_text == "test" text = get_instant.get_text(FILE_PATH) assert text[0:25] == TEXT_REF - assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc) + assert get_instant.email_content["date"] == datetime.datetime( + 2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc + ) assert get_instant.email_content["attachment"] == 2 - assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] + assert get_instant.email_content["attachement type"] == ["jpg", "jpg"] with pytest.raises(OSError): get_instant.get_text(get_instant.directory_name / "nonexisting.eml") + def test_get_html_text(get_instant): html = """Test""" - assert get_instant.get_html_text(html) == 'Test' + assert get_instant.get_html_text(html) == "Test" noHtml = """Test""" - assert get_instant.get_html_text(noHtml) == 'Test' - -def test_data_to_xml(get_instant,tmp_path): - xml_content = {"content": "This is nothing more than a test", - "date": "2024-04-17T15:13:56+00:00", - "attachment": 2, - "attachement type": {'jpg', 'jpg'} - } + assert get_instant.get_html_text(noHtml) == "Test" + + +def test_data_to_xml(get_instant, tmp_path): + xml_content = { + "content": "This is nothing more than a test", + "date": "2024-04-17T15:13:56+00:00", + "attachment": 2, + "attachement type": {"jpg", "jpg"}, + } xml = get_instant.data_to_xml(xml_content) get_instant.write_file(xml, tmp_path / "test") assert 
filecmp.cmp(XML_PATH, tmp_path / "test.out") diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb new file mode 100644 index 0000000..63f55fa --- /dev/null +++ b/notebook/demo.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demonstration notebook for the mailcom package\n", + "*Scientific Software Center, University of Heidelberg, December 2024*\n", + "The `mailcom` package is used to anonymize/pseudonymize textual data, i.e. email content. It takes an `eml` or `html` file as input and extracts the number and type of attachments as well as the content of the email body. The latter is then parsed through [`spaCy`](https://spacy.io/) and divided into sentences. The sentences are fed to a [`transformers`](https://huggingface.co/docs/transformers/en/index) named entity recognition (NER) [pipeline](https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/pipelines), and person names, places, organizations and miscellaneous entities are detected in the inference task. Names are replaced by pseudonyms, while locations, organizations and miscellaneous are replaced by `[location]`, `[organization]` and `[misc]`. The text is further parsed using string methods, to replace any numbers with `[number]` and email addresses with `[email]`. The processed text and metadata can then be written to an `xml` file or into a pandas dataframe.\n", + "\n", + "Please note that 100% accuracy is not possible with this task. Any output needs to be further checked by a human to ensure the text has been anonymized completely.\n", + "\n", + "The current set-up is for Romance languages; however, [other language models](https://spacy.io/usage/models) can also be loaded into the spaCy pipeline. 
The transformers pipeline uses the `xlm-roberta-large-finetuned-conll03-english` model, revision number `18f95e9`, by default, but other models can also be passed (see below).\n", + "\n", + "Before using the `mailcom` package, please install it into your conda environment using\n", + "```\n", + "pip install mailcom\n", + "```\n", + "After that, select the appropriate kernel for your Jupyter notebook and execute the cell below to import the package. The package is currently under active development and any function calls are subject to change." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mailcom.inout\n", + "import mailcom.parse\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `html` file type in that directory will be considered input files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import files from input_dir - change this to your own directory\n", + "input_dir = \"../mailcom/test/data\"\n", + "\n", + "io = mailcom.inout.InoutHandler(directory_name = input_dir)\n", + "\n", + "# collect the eml/html files found in input_dir\n", + "io.list_of_files()\n", + "\n", + "# create pseudonymization object and load spacy and transformers\n", + "# set the spacy language for sentence splitting\n", + "spacy_language = \"fr\"\n", + "# you may also set the model using `model = \"fr_core_news_md\"`\n", + "spacy_model = \"default\"\n", + "# set the model for transformers, here using the default model\n", + "transformers_model = \"xlm-roberta-large-finetuned-conll03-english\"\n", + "# set the revision number for transformers, here using the default revision number\n", + "transformers_revision_number = \"18f95e9\"\n", + "ps = mailcom.parse.Pseudonymize()\n", + "ps.init_spacy(language=spacy_language, model=spacy_model)\n", + "ps.init_transformers(model=transformers_model, model_revision_number=transformers_revision_number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text and stored, together with the email metadata, in a list of dictionaries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loop over mails and pseudonymize them\n", + "out_list = []\n", + "for file in io.email_list:\n", + " print(\"Parsing input file {}\".format(file))\n", + " text = io.get_text(file)\n", + " # after this function was called, the email metadata can be accessed via io.email_content\n", + " # the dict already has the entries content, date, attachment, attachement type\n", + " email_dict = io.email_content.copy()\n", + " text = io.get_html_text(text)\n", + " if not text:\n", + " continue\n", + " # Test functionality of Pseudonymize class\n", + " output_text = ps.pseudonymize(text)\n", + " email_dict[\"pseudo_content\"] = output_text\n", + " out_list.append(email_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After this, the output can be written to a file or processed further. The output is a list of dictionaries, each containing the metadata of the email and the pseudonymized content. In the cell below, the output is saved in a pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# write output to pandas df\n", + "df = pd.DataFrame(out_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may print the output for inspection in the notebook as per the cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print results\n", + "for idx, mail in df.iterrows():\n", + " print(\"Email\", idx)\n", + " print(\"Original Text:\\n\", mail[\"content\"])\n", + " print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailcom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/performance_demo.ipynb b/notebook/performance_demo.ipynb new file mode 100644 index 0000000..b8c48b4 --- /dev/null +++ b/notebook/performance_demo.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mailcom.inout\n", + "import mailcom.parse\n", + "import pandas as pd\n", + "import time\n", + "import datetime\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create t0 timestamp\n", + "t0 = time.time()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import files from csv file\n", + "email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n", + "print(email_list)\n", + "\n", + "t_csv_read = time.time()\n", + "\n", + "# create pseudonymization object\n", + "ps = mailcom.parse.Pseudonymize()\n", + "ps.init_spacy(\"fr\")\n", + "ps.init_transformers()\n", + "# time stamp after model loading\n", + "t_model_loaded = 
time.time()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loop over mails and pseudonymize them\n", + "out_list = []\n", + "ts_list = []\n", + "for idx, row in email_list.iterrows():\n", + " ts_email_start = time.time()\n", + " text = row[\"message\"]\n", + " email_dict = {\"content\": text}\n", + " if not text:\n", + " continue\n", + " # Test functionality of Pseudonymize class\n", + " # Pseudonymization is usually done using ps.pseudonymize\n", + " # For performance analysis the process is split into its subprocesses here\n", + " ps.reset()\n", + " sentences = ps.get_sentences(text)\n", + " ts_email_ppr_done = time.time()\n", + " pseudonymized_sentences = []\n", + " for sent in sentences:\n", + " sent = ps.pseudonymize_email_addresses(sent)\n", + " ner = ps.get_ner(sent)\n", + " ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n", + " ps_sent = ps.pseudonymize_numbers(ps_sent)\n", + " pseudonymized_sentences.append(ps_sent)\n", + " output_text = ps.concatenate(pseudonymized_sentences)\n", + "\n", + " # add output to dict\n", + " email_dict[\"pseudo_content\"] = output_text\n", + " out_list.append(email_dict)\n", + "\n", + " # timestamp after this email\n", + " ts_email_end = time.time()\n", + " ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# write output to pandas df\n", + "df = pd.DataFrame(out_list)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display timestamps\n", + "\n", + "# bar plot for each individual email\n", + "# processing times\n", + "idx_list = [row[0] for row in email_list.iterrows()]\n", + "email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n", + "email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n", + "email_total_list = [ts[2] 
- ts[0] for ts in ts_list]\n", + "email_bar_height = {\n", + " \"Pre-Processing\": email_ppr_list,\n", + " \"Pseudonymization\": email_duration_list\n", + "}\n", + "bt = [0 for idx in idx_list]\n", + "\n", + "plt.figure(figsize=(10,4), dpi=80)\n", + "\n", + "# plot 1\n", + "plt.subplot(1, 2, 1)\n", + "for key, height in email_bar_height.items():\n", + " plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n", + " bt = [bi + hi for (bi,hi) in zip(bt, height)]\n", + "#plt.yscale(\"log\")\n", + "plt.xlabel(\"Email\")\n", + "plt.ylabel(\"t [s]\")\n", + "plt.title(\"Computation times for emails, model loading and file reading\")\n", + "plt.legend()\n", + "\n", + "# plot for model loading and file reading, as well as average email time\n", + "# processing times\n", + "bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n", + "average_email_time = sum(email_total_list) / len(email_total_list)\n", + "bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n", + "plt.ylabel(\"t [s]\")\n", + "\n", + "# plot 2\n", + "plt.subplot(1, 2, 2)\n", + "plt.bar(bar_x, bar_y, 0.5)\n", + "\n", + "# Total time\n", + "print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mailcom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 3a91efd..04b04d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,9 @@ dependencies = [ "bs4", "dicttoxml", "torch", + "pandas", + 
"jupyter", + "matplotlib" ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index 6cd0a84..2a06ae0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ spacy fr_core_news_md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl es_core_news_md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl -transformers \ No newline at end of file +transformers +pandas +jupyter \ No newline at end of file