diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7cc1f24..14cd1ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 + rev: 7.1.1 hooks: - id: flake8 - repo: https://github.com/kynan/nbstripout diff --git a/mailcom/inout.py b/mailcom/inout.py index 0028950..b79bd98 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -30,8 +30,8 @@ def list_of_files(self): if len(self.email_list) == 0: raise ValueError( """The directory {} does not contain .eml or .html files. - Please check that the directory is containing the - email data files""".format( + Please check that the directory is containing the email + data files""".format( mypath ) ) @@ -83,7 +83,9 @@ def validate_data(self): pass def data_to_xml(self, text): - my_item_func = lambda x: "content" # noqa + def my_item_func(x): + return "content" + xml = dicttoxml(text, custom_root="email", item_func=my_item_func) return xml.decode() diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb index 7c52e5e..429e6de 100644 --- a/notebook/demo.ipynb +++ b/notebook/demo.ipynb @@ -35,7 +35,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `htlm` file type in that directory will be considered input files." + "The cell below defines a function used to display the result in the end, and highlight all named entities found in the text. It is used for demonstration purposes in this example." ] }, { @@ -67,6 +67,13 @@ " return text" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, the input files are loaded from the given `input_dir` directory. 
You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `html` file type in that directory will be considered input files." + ] + }, { "cell_type": "code", "execution_count": null, @@ -99,7 +106,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text and saved to a new file." + "In the cell below, the emails are looped over and the text is extracted. The text is then split into sentences and the sentences are pseudonymized. The pseudonymized sentences are then joined back into a text and saved to a new file.\n", + "\n", + "The input text is displayed and the found named entities are highlighted for demonstration. Note that emails (all words containing '@') are filtered out separately and thus not highlighted here." ] }, {