From a475130aa5e59a50b7267f755e09147666ffd7e4 Mon Sep 17 00:00:00 2001 From: martmists Date: Mon, 7 Nov 2016 16:06:52 +0100 Subject: [PATCH 1/3] Fix relative import `exceptions` --- mailparser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mailparser/__init__.py b/mailparser/__init__.py index 7609a30..28611a8 100644 --- a/mailparser/__init__.py +++ b/mailparser/__init__.py @@ -20,7 +20,7 @@ from __future__ import unicode_literals from email.errors import HeaderParseError from email.header import decode_header -from exceptions import InvalidMail, NotUnicodeError +from .exceptions import InvalidMail, NotUnicodeError import datetime import email import logging From c404c66e10835e63baabd8f647e595c96bcdae46 Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Tue, 8 Nov 2016 15:46:55 +0100 Subject: [PATCH 2/3] Fix bug charset. Now it's taken the charset of mail part. --- mailparser/__init__.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mailparser/__init__.py b/mailparser/__init__.py index 28611a8..a3c4dc1 100644 --- a/mailparser/__init__.py +++ b/mailparser/__init__.py @@ -76,11 +76,11 @@ def _decode_header_part(self, header): return output - def _force_unicode(self, s): + def _force_unicode(self, string, encoding): try: - u = unicode(s, encoding=self.charset, errors='ignore') + u = unicode(string, encoding=encoding, errors='ignore') except: - u = unicode(s, errors='ignore',) + u = unicode(string, errors='ignore',) if not isinstance(u, unicode): raise NotUnicodeError("Body part is not unicode") @@ -125,7 +125,6 @@ def _make_mail(self): "message_id": self.message_id, "subject": self.subject, "to": self.to_, - "charset": self.charset, "has_defects": self._has_defects, "has_anomalies": self._has_anomalies, } @@ -150,8 +149,8 @@ def _parse(self): epilogue = self.find_between( self._message.epilogue, "{}".format("--" + self._message.get_boundary()), - "{}".format("--" + self._message.get_boundary() + "--"), - ) + "{}".format("--" + self._message.get_boundary() + "--")) + try: p = email.message_from_string(epilogue) parts.append(p) @@ -162,11 +161,12 @@ def _parse(self): for p in parts: if not p.is_multipart(): f = p.get_filename() + charset = p.get_content_charset('utf-8') + if f: filename = self._decode_header_part(f) mail_content_type = self._decode_header_part( - p.get_content_type(), - ) + p.get_content_type()) transfer_encoding = \ unicode(p.get('content-transfer-encoding', '')).lower() @@ -174,7 +174,8 @@ def _parse(self): payload = p.get_payload(decode=False) else: payload = self._force_unicode( - p.get_payload(decode=True)) + string=p.get_payload(decode=True), + encoding=charset) self._attachments.append( { @@ -186,7 +187,8 @@ def _parse(self): ) else: payload = self._force_unicode( - p.get_payload(decode=True)) + string=p.get_payload(decode=True), + encoding=charset) self._text_plain.append(payload) # Parsed object mail @@ -257,10 +259,6 @@ def text_plain_list(self): def attachments_list(self): return self._attachments - @property - def charset(self): - return self._message.get_content_charset('utf-8') - @property def date_mail(self): date_ = self._message.get('date') From 5fdb11e4f85348a9bd774c0a9fd309618c258fe3 Mon Sep 17 00:00:00 2001 From: Fedele Mantuano Date: Tue, 8 Nov 2016 16:05:38 +0100 Subject: [PATCH 3/3] Update version --- README | 1 - README.md | 1 - setup.py | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README b/README index 9a36504..935d465 100644 --- a/README +++ b/README @@ -88,7 +88,6 @@ Then you can get all parts parser.subject parser.text_plain_list: only text plain mail parts in a list parser.attachments_list: list of all attachments - parser.charset parser.date_mail parser.parsed_mail_obj: tokenized mail in a object parser.parsed_mail_json: tokenized mail in a JSON diff --git a/README.md b/README.md index a51d0e7..2689003 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,6 @@ parser.from_ parser.subject parser.text_plain_list: only text plain mail parts in a list parser.attachments_list: list of all attachments -parser.charset parser.date_mail parser.parsed_mail_obj: tokenized mail in a object parser.parsed_mail_json: tokenized mail in a JSON diff --git a/setup.py b/setup.py index eecdc2d..4bcfcf8 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup -VERSION = (0, 3, 7) +VERSION = (0, 4, 0) __version__ = VERSION __versionstr__ = '.'.join(map(str, VERSION))