ssciwr · fexfl · Nov 3, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 12, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,11 @@ repos:
     rev: 24.4.2
     hooks:
     - id: black
-  -   repo: https://github.com/pycqa/flake8
-      rev: 7.1.0 
-      hooks:
-      -   id: flake8
+  - repo: https://github.com/pycqa/flake8
+    rev: 7.1.0 
+    hooks:
+    - id: flake8
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+    - id: nbstripout
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -4,51 +4,62 @@
 from bs4 import BeautifulSoup
 from dicttoxml import dicttoxml
 
+
 class InoutHandler:
     def __init__(self, directory_name: str):
         """Constructor for the InoutHandler class.
-        
-        Args: 
+
+        Args:
             directory_name (str): The directory where the files are located.
-        """        
+        """
         self.directory_name = directory_name
         # presets
         self.pattern = [".eml", ".html"]
 
     def list_of_files(self):
-        """Method to create a list of Path objects (files) that are present 
+        """Method to create a list of Path objects (files) that are present
         in a directory."""
-        if not os.path.exists(self.directory_name):  # check if given dir exists raises error otherwise
+        if not os.path.exists(
+            self.directory_name
+        ):  # check if given dir exists raises error otherwise
             raise OSError("Path {} does not exist".format(self.directory_name))
         mypath = Path(self.directory_name)
-        self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
+        self.email_list = [
+            mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern
+        ]
         if len(self.email_list) == 0:
-            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
+            raise ValueError(
+                """The directory {} does not contain .eml or .html files.
+                Please check that the directory is containing the
+                email data files""".format(
+                    mypath
+                )
+            )
 
     def get_html_text(self, text_check: str) -> str:
         """Clean up a string if it contains html content.
         Args:
             text_check (str): The string that may contain html content.
-            
+
         Returns:
             str: The (potentially) cleaned up string."""
-        soup = BeautifulSoup(text_check , 'html.parser')
+        soup = BeautifulSoup(text_check, "html.parser")
         if soup.find():
             text_check = soup.get_text()
         return text_check
 
     def get_text(self, file: Path) -> str:
         """Function to extract the textual content and other metadata from an email file.
-        
+
         Args:
             file (Path): The path to the email file.
-            
+
         Returns:
-            str: The textual content of the email. In the future, this will return the 
+            str: The textual content of the email. In the future, this will return the
             complete dictionary with the metadata."""
-        if not file.is_file(): # check if given file exists raises error otherwise
+        if not file.is_file():  # check if given file exists raises error otherwise
             raise OSError("File {} does not exist".format(file))
-        with open(file, 'rb') as fhdl:
+        with open(file, "rb") as fhdl:
             raw_email = fhdl.read()
         ep = eml_parser.EmlParser(include_raw_body=True)
         parsed_eml = ep.decode_email_bytes(raw_email)
@@ -57,28 +68,30 @@ def get_text(self, file: Path) -> str:
         attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
         # find the types of attachements
         if attachments > 0:
-            attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-        self.email_content = {"content": parsed_eml["body"][0]["content"], 
-                    "date": parsed_eml["header"]["date"], 
-                    "attachment": attachments, 
-                    "attachement type": attachmenttypes
-                    }
-        return(self.email_content["content"])
+            attachmenttypes = [
+                parsed_eml["attachment"][i]["extension"] for i in range(attachments)
+            ]
+        self.email_content = {
+            "content": parsed_eml["body"][0]["content"],
+            "date": parsed_eml["header"]["date"],
+            "attachment": attachments,
+            "attachement type": attachmenttypes,
+        }
+        return self.email_content["content"]
 
     def validate_data(self):
         pass
-    
+
     def data_to_xml(self, text):
-        my_item_func = lambda x: 'content'
-        xml = dicttoxml(text, custom_root='email', item_func = my_item_func)
+        my_item_func = lambda x: "content"  # noqa
+        xml = dicttoxml(text, custom_root="email", item_func=my_item_func)
         return xml.decode()
 
-    def write_file(self, text: str, name: str)-> None:
+    def write_file(self, text: str, name: str) -> None:
         """Write the extracted string to a text file.
 
         Args:
             text (str): The string to be written to the file.
             name (str): The name of the file to be written."""
         with open("{}.out".format(name), "w") as file:
             file.write(text)
-
diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py
@@ -11,12 +11,14 @@
 XML_PATH = Path(pkg / "test" / "data" / "test.out")
 
 TEXT_REF = "J'espère que tu vas bien!"
-XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"
+XML_REF = '<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">'
+
 
 @pytest.fixture()
 def get_instant(tmp_path):
     return inout.InoutHandler(tmp_path)
 
+
 def test_list_of_files(get_instant):
     with pytest.raises(ValueError):
         get_instant.list_of_files()
@@ -34,31 +36,37 @@ def test_list_of_files(get_instant):
     get_instant.list_of_files()
     assert get_instant.directory_name / "test3.xml" not in get_instant.email_list
 
+
 def test_get_text(get_instant):
     p = get_instant.directory_name / "test.eml"
     p.write_text("test")
     extracted_text = get_instant.get_text(p)
-    assert extracted_text == 'test'
+    assert extracted_text == "test"
     text = get_instant.get_text(FILE_PATH)
     assert text[0:25] == TEXT_REF
-    assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
+    assert get_instant.email_content["date"] == datetime.datetime(
+        2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
+    )
     assert get_instant.email_content["attachment"] == 2
-    assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
+    assert get_instant.email_content["attachement type"] == ["jpg", "jpg"]
     with pytest.raises(OSError):
         get_instant.get_text(get_instant.directory_name / "nonexisting.eml")
 
+
 def test_get_html_text(get_instant):
     html = """<html><head><title>Test</title></head></html>"""
-    assert get_instant.get_html_text(html) == 'Test'
+    assert get_instant.get_html_text(html) == "Test"
     noHtml = """Test"""
-    assert get_instant.get_html_text(noHtml) == 'Test'
-
-def test_data_to_xml(get_instant,tmp_path):
-    xml_content = {"content": "This is nothing more than a test", 
-                    "date": "2024-04-17T15:13:56+00:00", 
-                    "attachment": 2, 
-                    "attachement type": {'jpg', 'jpg'}
-                    }
+    assert get_instant.get_html_text(noHtml) == "Test"
+
+
+def test_data_to_xml(get_instant, tmp_path):
+    xml_content = {
+        "content": "This is nothing more than a test",
+        "date": "2024-04-17T15:13:56+00:00",
+        "attachment": 2,
+        "attachement type": {"jpg", "jpg"},
+    }
     xml = get_instant.data_to_xml(xml_content)
     get_instant.write_file(xml, tmp_path / "test")
     assert filecmp.cmp(XML_PATH, tmp_path / "test.out")
diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Demonstration notebook for the mailcom package\n",
+    "*Scientific Software Center, University of Heidelberg, December 2024*\n",
+    "The `mailcom` package is used to anonymize/pseudonymize textual data, i.e. email content. It takes an `eml` or `html` file as input and extracts information about attachments (their number and types) and the content of the email body. The latter is then parsed through [`spaCy`](https://spacy.io/) and divided into sentences. The sentences are fed to a [`transformers`](https://huggingface.co/docs/transformers/en/index) named entity recognition (NER) [pipeline](https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/pipelines), and person names, places, organizations and miscellaneous entities are detected in the inference task. Names are replaced by pseudonyms, while locations, organizations and miscellaneous entities are replaced by `[location]`, `[organization]` and `[misc]`. The text is further parsed using string methods to replace any numbers with `[number]` and email addresses with `[email]`. The processed text and metadata can then be written to an `xml` file or into a pandas dataframe.\n",
+    "\n",
+    "Please note that 100% accuracy is not possible with this task. Any output needs to be further checked by a human to ensure the text has been anonymized completely.\n",
+    "\n",
+    "The current set-up is for Romance languages, however [other language models](https://spacy.io/usage/models) can also be loaded into the spaCy pipeline. The transformers pipeline uses the `xlm-roberta-large-finetuned-conll03-english` model revision number `18f95e9` by default, but other models can also be passed (see below).\n",
+    "\n",
+    "Before using the `mailcom` package, please install it into your conda environment using\n",
+    "```\n",
+    "pip install mailcom\n",
+    "```\n",
+    "After that, select the appropriate kernel for your Jupyter notebook and execute the cell below to import the package. The package is currently under active development and any function calls are subject to changes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mailcom.inout\n",
+    "import mailcom.parse\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `html` file type in that directory will be considered input files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import files from input_dir - change this to your own directory\n",
+    "input_dir = \"../mailcom/test/data\"\n",
+    "\n",
+    "io = mailcom.inout.InoutHandler(directory_name = input_dir)\n",
+    "\n",
+    "# some internal processing\n",
+    "io.list_of_files()\n",
+    "\n",
+    "# create pseudonymization object and load spacy and transformers\n",
+    "# set the spacy language for sentence splitting\n",
+    "spacy_language = \"fr\"\n",
+    "# you may also set the model using `model = \"fr_core_news_md\"`\n",
+    "spacy_model = \"default\"\n",
+    "# set the model for transformers, here using the default model\n",
+    "transformers_model = \"xlm-roberta-large-finetuned-conll03-english\"\n",
+    "# set the revision number for transformers, here using the default revision number\n",
+    "transformers_revision_number = \"18f95e9\"\n",
+    "ps = mailcom.parse.Pseudonymize()\n",
+    "ps.init_spacy(language=spacy_language, model=spacy_model)\n",
+    "ps.init_transformers(model=transformers_model, model_revision_number=transformers_revision_number)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text, which is stored together with the email metadata in a list of dictionaries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# loop over mails and pseudonymize them\n",
+    "out_list = []\n",
+    "for file in io.email_list:\n",
+    "    print(\"Parsing input file {}\".format(file))\n",
+    "    text = io.get_text(file)\n",
+    "    # after this function was called, the email metadata can be accessed via io.email_content\n",
+    "    # the dict already has the keys \"content\", \"date\", \"attachment\" and \"attachement type\"\n",
+    "    email_dict = io.email_content.copy()\n",
+    "    text = io.get_html_text(text)\n",
+    "    if not text:\n",
+    "        continue\n",
+    "    # pseudonymize the extracted text\n",
+    "    output_text = ps.pseudonymize(text)\n",
+    "    email_dict[\"pseudo_content\"] = output_text\n",
+    "    out_list.append(email_dict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After this, the output can be written to a file or processed further. The output is a list of dictionaries, each containing the metadata of the email and the pseudonymized content. In the cell below, the output is saved in a pandas dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write output to pandas df\n",
+    "df = pd.DataFrame(out_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You may print the output for inspection in the notebook as per the cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print results\n",
+    "for idx, mail in df.iterrows():\n",
+    "    print(\"Email\", idx)\n",
+    "    print(\"Original Text:\\n\", mail[\"content\"])\n",
+    "    print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mailcom",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}