Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pre-commit.ci] pre-commit autoupdate #52

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
repos:
- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
rev: 7.1.1
hooks:
- id: flake8
63 changes: 37 additions & 26 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,60 @@
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml


class InoutHandler:
def __init__(self, directory_name: str):
"""Constructor for the InoutHandler class.
Args:

Args:
directory_name (str): The directory where the files are located.
"""
"""
self.directory_name = directory_name
# presets
self.pattern = [".eml", ".html"]

def list_of_files(self):
"""Method to create a list of Path objects (files) that are present
"""Method to create a list of Path objects (files) that are present
in a directory."""
if not os.path.exists(self.directory_name): # check if given dir exists raises error otherwise
if not os.path.exists(
self.directory_name
): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(self.directory_name))
mypath = Path(self.directory_name)
self.email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern]
self.email_list = [
mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in self.pattern
]
if len(self.email_list) == 0:
raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
raise ValueError(
"The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(
mypath
)
)

def get_html_text(self, text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.

Returns:
str: The (potentially) cleaned up string."""
soup = BeautifulSoup(text_check , 'html.parser')
soup = BeautifulSoup(text_check, "html.parser")
if soup.find():
text_check = soup.get_text()
return text_check

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.

Args:
file (Path): The path to the email file.

Returns:
str: The textual content of the email. In the future, this will return the
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
with open(file, "rb") as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
Expand All @@ -57,28 +66,30 @@ def get_text(self, file: Path) -> str:
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(self.email_content["content"])
attachmenttypes = [
parsed_eml["attachment"][i]["extension"] for i in range(attachments)
]
self.email_content = {
"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes,
}
return self.email_content["content"]

def validate_data(self):
pass

def data_to_xml(self, text):
my_item_func = lambda x: 'content'
xml = dicttoxml(text, custom_root='email', item_func = my_item_func)
my_item_func = lambda x: "content"
xml = dicttoxml(text, custom_root="email", item_func=my_item_func)
return xml.decode()

def write_file(self, text: str, name: str)-> None:
def write_file(self, text: str, name: str) -> None:
"""Write the extracted string to a text file.

Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)

34 changes: 21 additions & 13 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
XML_PATH = Path(pkg / "test" / "data" / "test.out")

TEXT_REF = "J'espère que tu vas bien!"
XML_REF = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><email><content type=\"str\">"
XML_REF = '<?xml version="1.0" encoding="UTF-8" ?><email><content type="str">'


@pytest.fixture()
def get_instant(tmp_path):
return inout.InoutHandler(tmp_path)


def test_list_of_files(get_instant):
with pytest.raises(ValueError):
get_instant.list_of_files()
Expand All @@ -34,31 +36,37 @@ def test_list_of_files(get_instant):
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list


def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
extracted_text = get_instant.get_text(p)
assert extracted_text == 'test'
assert extracted_text == "test"
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF
assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
assert get_instant.email_content["date"] == datetime.datetime(
2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc
)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
assert get_instant.email_content["attachement type"] == ["jpg", "jpg"]
with pytest.raises(OSError):
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")


def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_instant.get_html_text(html) == 'Test'
assert get_instant.get_html_text(html) == "Test"
noHtml = """Test"""
assert get_instant.get_html_text(noHtml) == 'Test'

def test_data_to_xml(get_instant,tmp_path):
xml_content = {"content": "This is nothing more than a test",
"date": "2024-04-17T15:13:56+00:00",
"attachment": 2,
"attachement type": {'jpg', 'jpg'}
}
assert get_instant.get_html_text(noHtml) == "Test"


def test_data_to_xml(get_instant, tmp_path):
xml_content = {
"content": "This is nothing more than a test",
"date": "2024-04-17T15:13:56+00:00",
"attachment": 2,
"attachement type": {"jpg", "jpg"},
}
xml = get_instant.data_to_xml(xml_content)
get_instant.write_file(xml, tmp_path / "test")
assert filecmp.cmp(XML_PATH, tmp_path / "test.out")
Loading