diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c8ab9a..3d38264 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,9 @@ repos: - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 + rev: 7.1.1 hooks: - id: flake8 \ No newline at end of file diff --git a/mailcom/inout.py b/mailcom/inout.py index 15c7252..b79bd98 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -4,51 +4,62 @@ from bs4 import BeautifulSoup from dicttoxml import dicttoxml + class InoutHandler: def __init__(self, directory_name: str): """Constructor for the InoutHandler class. - - Args: + + Args: directory_name (str): The directory where the files are located. - """ + """ self.directory_name = directory_name # presets self.pattern = [".eml", ".html"] def list_of_files(self): - """Method to create a list of Path objects (files) that are present + """Method to create a list of Path objects (files) that are present in a directory.""" - if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise + if not os.path.exists( + self.directory_name + ): # check if given dir exists raises error otherwise raise OSError("Path {} does not exist".format(self.directory_name)) mypath = Path(self.directory_name) - self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern] + self.email_list = [ + mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern + ] if len(self.email_list) == 0: - raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) + raise ValueError( + """The directory {} does not contain .eml or .html files. + Please check that the directory is containing the email + data files""".format( + mypath + ) + ) def get_html_text(self, text_check: str) -> str: """Clean up a string if it contains html content. Args: text_check (str): The string that may contain html content. - + Returns: str: The (potentially) cleaned up string.""" - soup = BeautifulSoup(text_check , 'html.parser') + soup = BeautifulSoup(text_check, "html.parser") if soup.find(): text_check = soup.get_text() return text_check def get_text(self, file: Path) -> str: """Function to extract the textual content and other metadata from an email file. - + Args: file (Path): The path to the email file. - + Returns: - str: The textual content of the email. In the future, this will return the + str: The textual content of the email. In the future, this will return the complete dictionary with the metadata.""" - if not file.is_file(): # check if given file exists raises error otherwise + if not file.is_file(): # check if given file exists raises error otherwise raise OSError("File {} does not exist".format(file)) - with open(file, 'rb') as fhdl: + with open(file, "rb") as fhdl: raw_email = fhdl.read() ep = eml_parser.EmlParser(include_raw_body=True) parsed_eml = ep.decode_email_bytes(raw_email) @@ -57,23 +68,28 @@ def get_text(self, file: Path) -> str: attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 # find the types of attachements if attachments > 0: - attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] - self.email_content = {"content": parsed_eml["body"][0]["content"], - "date": parsed_eml["header"]["date"], - "attachment": attachments, - "attachement type": attachmenttypes - } - return(self.email_content["content"]) + attachmenttypes = [ + parsed_eml["attachment"][i]["extension"] for i in range(attachments) + ] + self.email_content = { + "content": parsed_eml["body"][0]["content"], + "date": parsed_eml["header"]["date"], + "attachment": attachments, + "attachement type": attachmenttypes, + } + return self.email_content["content"] def validate_data(self): pass - + def data_to_xml(self, text): - my_item_func = lambda x: 'content' - xml = dicttoxml(text, custom_root='email', item_func = my_item_func) + def my_item_func(x): + return "content" + + xml = dicttoxml(text, custom_root="email", item_func=my_item_func) return xml.decode() - def write_file(self, text: str, name: str)-> None: + def write_file(self, text: str, name: str) -> None: """Write the extracted string to a text file. Args: @@ -81,4 +97,3 @@ def write_file(self, text: str, name: str)-> None: name (str): The name of the file to be written.""" with open("{}.out".format(name), "w") as file: file.write(text) - \ No newline at end of file diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 1f30ff0..18a32f0 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -11,12 +11,14 @@ XML_PATH = Path(pkg / "test" / "data" / "test.out") TEXT_REF = "J'espère que tu vas bien!" -XML_REF = "" +XML_REF = '' + @pytest.fixture() def get_instant(tmp_path): return inout.InoutHandler(tmp_path) + def test_list_of_files(get_instant): with pytest.raises(ValueError): get_instant.list_of_files() @@ -34,31 +36,37 @@ def test_list_of_files(get_instant): get_instant.list_of_files() assert get_instant.directory_name / "test3.xml" not in get_instant.email_list + def test_get_text(get_instant): p = get_instant.directory_name / "test.eml" p.write_text("test") extracted_text = get_instant.get_text(p) - assert extracted_text == 'test' + assert extracted_text == "test" text = get_instant.get_text(FILE_PATH) assert text[0:25] == TEXT_REF - assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc) + assert get_instant.email_content["date"] == datetime.datetime( + 2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc + ) assert get_instant.email_content["attachment"] == 2 - assert get_instant.email_content["attachement type"] == ['jpg', 'jpg'] + assert get_instant.email_content["attachement type"] == ["jpg", "jpg"] with pytest.raises(OSError): get_instant.get_text(get_instant.directory_name / "nonexisting.eml") + def test_get_html_text(get_instant): html = """Test""" - assert get_instant.get_html_text(html) == 'Test' + assert get_instant.get_html_text(html) == "Test" noHtml = """Test""" - assert get_instant.get_html_text(noHtml) == 'Test' - -def test_data_to_xml(get_instant,tmp_path): - xml_content = {"content": "This is nothing more than a test", - "date": "2024-04-17T15:13:56+00:00", - "attachment": 2, - "attachement type": {'jpg', 'jpg'} - } + assert get_instant.get_html_text(noHtml) == "Test" + + +def test_data_to_xml(get_instant, tmp_path): + xml_content = { + "content": "This is nothing more than a test", + "date": "2024-04-17T15:13:56+00:00", + "attachment": 2, + "attachement type": {"jpg", "jpg"}, + } xml = get_instant.data_to_xml(xml_content) get_instant.write_file(xml, tmp_path / "test") assert filecmp.cmp(XML_PATH, tmp_path / "test.out")