Skip to content

Commit

Permalink
Restructured inout to be able to save all emails
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Feb 17, 2025
1 parent d24d170 commit 6ce715c
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 33 deletions.
38 changes: 30 additions & 8 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def __init__(self, directory_name: str):
# presets
self.pattern = [".eml", ".html"]

# list containing all emails
self.email_list = []

def list_of_files(self):
"""Method to create a list of Path objects (files) that are present
in a directory."""
Expand All @@ -24,10 +27,10 @@ def list_of_files(self):
): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(self.directory_name))
mypath = Path(self.directory_name)
self.email_list = [
self.email_path_list = [
mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern
]
if len(self.email_list) == 0:
if len(self.email_path_list) == 0:
raise ValueError(
"""The directory {} does not contain .eml or .html files.
Please check that the directory is containing the email
Expand All @@ -48,15 +51,15 @@ def get_html_text(self, text_check: str) -> str:
text_check = soup.get_text()
return text_check

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
def extract_email_info(self, file: Path) -> dict:
"""Function to extract the textual content and other metadata
from a single email file.
Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
dict: Dictionary containing email text and metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, "rb") as fhdl:
Expand All @@ -71,13 +74,32 @@ def get_text(self, file: Path) -> str:
attachmenttypes = [
parsed_eml["attachment"][i]["extension"] for i in range(attachments)
]
self.email_content = {
email_content = {
"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes,
}
return self.email_content["content"]
# clean up html content
email_content["content"] = self.get_html_text(email_content["content"])

return email_content

def process_emails(self):
    """Extract every email listed in email_path_list into email_list.

    Each path in self.email_path_list is handed to
    self.extract_email_info, and the returned values are stored in
    self.email_list in the same order as the paths.

    The result list is rebuilt from scratch on every call, so calling
    this method more than once does not duplicate entries. The new list
    is only assigned after all files were processed; if extraction
    raises, self.email_list keeps its previous contents.
    """
    processed = []
    for email_path in self.email_path_list:
        print("Processing input file {}".format(email_path))
        processed.append(self.extract_email_info(email_path))
    # replace instead of append so repeated runs stay in sync with
    # email_path_list
    self.email_list = processed

def get_email_list(self):
    """Give access to the stored emails as an iterator.

    Returns:
        iter: A new iterator over the entries of self.email_list.
    """
    stored_emails = self.email_list
    return iter(stored_emails)

def validate_data(self):
pass
Expand Down
14 changes: 5 additions & 9 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,19 +251,15 @@ def make_dir(path: str):
# process the text
io = InoutHandler(path_input)
io.list_of_files()
io.process_emails()
# html_files = list_of_files(path_input, "html")
pseudonymizer = Pseudonymize()
pseudonymizer.init_spacy("fr")
pseudonymizer.init_transformers()
for file in io.email_list:
print("Parsing input file {}".format(file))
text = io.get_text(file)
text = io.get_html_text(text)
xml = io.data_to_xml(text)
io.write_file(xml, path_output / output_filename)
if not text:
for idx, email in enumerate(io.get_email_list()):
if not email["content"]:
continue
# Test functionality of Pseudonymize class
output_text = pseudonymizer.pseudonymize(text)
output_text = pseudonymizer.pseudonymize(email["content"])
print("New text:", output_text)
print("Old text:", text)
print("Old text:", email["content"])
50 changes: 34 additions & 16 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,6 @@ def test_list_of_files(get_instant):
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list


def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
extracted_text = get_instant.get_text(p)
assert extracted_text == "test"
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF
assert get_instant.email_content["date"] == datetime.datetime(
2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ["jpg", "jpg"]
with pytest.raises(OSError):
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")


def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_instant.get_html_text(html) == "Test"
Expand All @@ -70,3 +54,37 @@ def test_data_to_xml(get_instant, tmp_path):
xml = get_instant.data_to_xml(xml_content)
get_instant.write_file(xml, tmp_path / "test")
assert filecmp.cmp(XML_PATH, tmp_path / "test.out")


def test_extract_email_info(get_instant):
    """Check the data extracted from the reference email and the error path."""
    # Reference email: text start, date and attachment metadata must match
    info = get_instant.extract_email_info(FILE_PATH)
    expected_date = datetime.datetime(
        2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
    )
    assert info["content"].startswith(TEXT_REF)
    assert info["date"] == expected_date
    assert info["attachment"] == 2
    assert info["attachement type"] == ["jpg", "jpg"]

    # A path that does not point to a file must be rejected with OSError
    missing_file = get_instant.directory_name / "nonexisting.eml"
    with pytest.raises(OSError):
        get_instant.extract_email_info(missing_file)


def test_process_emails(get_instant):
    """Check that process_emails stores one entry per email file found.

    The order of the processed emails follows the directory listing,
    which is not guaranteed to be the same on every file system, so the
    content check does not rely on positions in email_list.
    """
    # Create some test email files
    email_file_1 = get_instant.directory_name / "test1.eml"
    email_file_1.write_text("Content of test email 1")
    email_file_2 = get_instant.directory_name / "test2.eml"
    email_file_2.write_text("Content of test email 2")

    # Collect the paths of the email files in the directory
    get_instant.list_of_files()

    # Process the emails
    get_instant.process_emails()

    # Check if the emails were processed and added to the email list
    assert len(get_instant.email_list) == 2
    contents = [email["content"] for email in get_instant.email_list]
    assert any("Content of test email 1" in content for content in contents)
    assert any("Content of test email 2" in content for content in contents)

0 comments on commit 6ce715c

Please sign in to comment.