Skip to content

Commit

Permalink
Merge branch 'release/3.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
fedelemantuano committed Dec 30, 2017
2 parents 0455a57 + 569950b commit 7765003
Show file tree
Hide file tree
Showing 8 changed files with 8,408 additions and 66 deletions.
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ script:
- python -m mailparser -h
- python -m mailparser -f tests/mails/mail_malformed_3 -j
- cat tests/mails/mail_malformed_3 | python -m mailparser -k -j
- python -m mailparser -f tests/mails/mail_test_6 -j

after_success:
- coveralls
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,6 @@ $ mailparser -f example_mail -j
```

This example will show you the tokenized mail as pretty-printed JSON.

From [raw mail](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e) to
[parsed mail](https://gist.github.com/fedelemantuano/e958aa2813c898db9d2d09469db8e6f6)
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ Example:
This example will show you the tokenized mail as pretty-printed JSON.

From `raw
mail <https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e>`__
to `parsed
mail <https://gist.github.com/fedelemantuano/e958aa2813c898db9d2d09469db8e6f6>`__

.. |PyPI version| image:: https://badge.fury.io/py/mail-parser.svg
:target: https://badge.fury.io/py/mail-parser
.. |Build Status| image:: https://travis-ci.org/SpamScope/mail-parser.svg?branch=develop
Expand Down
30 changes: 21 additions & 9 deletions mailparser/mailparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"""

from __future__ import unicode_literals
import datetime
import email
import logging
import os
Expand All @@ -29,8 +28,13 @@
import simplejson as json

from .utils import (
ported_string, decode_header_part, ported_open,
find_between, msgconvert)
convert_mail_date,
decode_header_part,
find_between,
msgconvert,
ported_open,
ported_string,
receiveds_parsing)


log = logging.getLogger(__name__)
Expand Down Expand Up @@ -424,12 +428,12 @@ def attachments(self):
@property
def received(self):
    """
    Return a list of all received headers parsed

    Every "received" header of the message is decoded with
    decode_header_part and the resulting list is handed to
    receiveds_parsing, whose return value is returned unchanged.
    The undecoded-by-parser variant is available as received_raw.
    """
    # get_all is given [] as default so a mail without Received
    # headers yields an empty list instead of None.
    raw_headers = [
        decode_header_part(header)
        for header in self.message.get_all("received", [])
    ]
    return receiveds_parsing(raw_headers)

@property
def received_json(self):
Expand All @@ -438,6 +442,16 @@ def received_json(self):
"""
return json.dumps(self.received, ensure_ascii=False, indent=2)

@property
def received_raw(self):
    """
    Return the list of "received" headers of the message, each one
    passed through decode_header_part, without any further parsing.
    The list is empty when the message has no such header.
    """
    return [
        decode_header_part(header)
        for header in self.message.get_all("received", [])
    ]

@property
def message_id(self):
"""
Expand Down Expand Up @@ -483,12 +497,10 @@ def date(self):
"""
Return the mail date in datetime.datetime format and UTC.
"""
date_ = self.message.get('date')
date = self.message.get('date')

try:
d = email.utils.parsedate_tz(date_)
t = email.utils.mktime_tz(d)
return datetime.datetime.utcfromtimestamp(t)
return convert_mail_date(date)
except:
return None

Expand Down
99 changes: 98 additions & 1 deletion mailparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,16 @@

from __future__ import unicode_literals

from collections import namedtuple
from collections import namedtuple, Counter
from email.errors import HeaderParseError
from email.header import decode_header
from unicodedata import normalize
import datetime
import email
import hashlib
import logging
import os
import re
import subprocess
import tempfile

Expand All @@ -35,6 +38,13 @@
log = logging.getLogger(__name__)


# Regular expression for one Received header. After a literal "from" it
# captures four named groups, the last three being optional:
#   from - the tokens following "from", stopping before the word "by"
#   by   - the tokens following "by", stopping before the word "with"
#   with - everything following "with" up to the next ";"
#   date - everything following the ";"
RECEIVED_PATTERN = (r'from\s+(?P<from>(?:\b(?!by\b)\S+[ :]*)*)'
r'(?:by\s+(?P<by>(?:\b(?!with\b)\S+[ :]*)*))?'
r'(?:with\s+(?P<with>[^;]+))?(?:\s*;\s*(?P<date>.*))?')
# Runs of spaces, round/square brackets, tabs and newlines.
JUNK_PATTERN = r'[ \(\)\[\]\t\n]+'
# RECEIVED_PATTERN compiled once at import time, case-insensitive (re.I).
RECEIVED_COMPILED = re.compile(RECEIVED_PATTERN, re.I)


def sanitize(func):
""" NFC is the normalization form recommended by W3C. """

Expand Down Expand Up @@ -190,3 +200,90 @@ def markdown2rst(file_path):
import pypandoc
output = pypandoc.convert_file(file_path, 'rst')
return output


def receiveds_parsing(receiveds):
    """
    Parse a list of raw Received headers into structured hops.

    Each header has its JUNK_PATTERN runs collapsed to a single space
    and is then scanned with RECEIVED_COMPILED; every match contributes
    one dict of named groups.

    Args:
        receiveds (list): list of raw receiveds headers

    Returns:
        the output of receiveds_format when the number of matches is
        equal to the number of headers; otherwise (or when an
        AttributeError/ValueError is raised while matching) the raw
        headers in reversed order
    """

    hops = []

    try:
        for header in receiveds:
            flattened = re.sub(JUNK_PATTERN, " ", header)
            hops.extend(
                match.groupdict()
                for match in RECEIVED_COMPILED.finditer(flattened))

        # A header that matched zero or several times cannot be trusted:
        # fall back to the raw headers through the handler below.
        if len(hops) != len(receiveds):
            raise ValueError

    except (AttributeError, ValueError):
        return receiveds[::-1]

    return receiveds_format(hops)


def convert_mail_date(date):
    """
    Convert a mail date string to a naive datetime expressed in UTC.

    Args:
        date (string): date as found in a mail header

    Returns:
        datetime.datetime built from the UTC timestamp of the date

    The string is parsed with email.utils.parsedate_tz; no check is
    made on its result, so a string it cannot parse makes the
    following mktime_tz call fail with an exception.
    """
    timestamp = email.utils.mktime_tz(email.utils.parsedate_tz(date))
    return datetime.datetime.utcfromtimestamp(timestamp)


def receiveds_format(receiveds):
    """
    Given a list of receiveds hop, adds metadata and reformat
    field values

    Args:
        receiveds (list): list of dicts, one per Received header, in
            the same order as the headers; values may be None

    Returns:
        list of dicts in reversed input order. Each dict keeps the
        non-empty input fields (stripped) and gains:
            hop (int): 1-based position in the returned list
            date_utc (string): ISO 8601 UTC date, only when the
                "date" field is present and can be converted
            delay (number): seconds elapsed since the previous hop,
                0 when either date is not available
    """

    output = []

    # The input is walked backwards so the first hop comes first
    for hop, fields in enumerate(reversed(receiveds), 1):
        # Clean strings, dropping empty/None fields
        entry = {k: v.strip() for k, v in fields.items() if v}

        # Add hop
        entry["hop"] = hop

        # Add UTC date
        raw_date = fields.get("date")
        if raw_date:
            try:
                entry["date_utc"] = convert_mail_date(raw_date)
            except (TypeError, ValueError, OverflowError):
                # The date text could not be converted (for instance
                # it is not a date at all): keep the raw "date" field
                # and go on without "date_utc" instead of failing the
                # whole list of hops.
                pass

        # Add delay, computed against the previous hop when both
        # dates are known
        now = entry.get("date_utc")
        before = output[-1].get("date_utc") if output else None

        if now and before:
            entry["delay"] = (now - before).total_seconds()
        else:
            entry["delay"] = 0

        # append result
        output.append(entry)

    # Dates are kept as datetime objects until every delay has been
    # computed, then turned into strings.
    for entry in output:
        if entry.get("date_utc"):
            entry["date_utc"] = entry["date_utc"].isoformat()

    return output
2 changes: 1 addition & 1 deletion mailparser/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
limitations under the License.
"""

# Package version string. Two assignments are listed here; the later one
# takes effect, so the resulting value is "3.2.0".
__version__ = "3.1.0"
__version__ = "3.2.0"

# Print the version when this module is executed as a script.
if __name__ == "__main__":
print(__version__)
Loading

0 comments on commit 7765003

Please sign in to comment.