diff --git a/.travis.yml b/.travis.yml index ce3328c..4db0527 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,9 +5,9 @@ language: python python: - "2.7" - - "3.4" - - "3.5" - - "3.6" + - "3.7" + - "3.8" + - "3.9" before_install: - sudo apt-get -qq update diff --git a/README.md b/README.md index 333c1ec..dbe4b96 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ # mail-parser -## Overview - mail-parser is not only a wrapper for [email](https://docs.python.org/2/library/email.message.html) Python Standard Library. It give you an easy way to pass from raw mail to Python object that you can use in your code. It's the key module of [SpamScope](https://github.com/SpamScope/spamscope). @@ -28,15 +26,29 @@ $ apt-cache show libemail-outlook-message-perl mail-parser supports Python 3. -## mail-parser on Web + +# Apache 2 Open Source License +mail-parser can be downloaded, used, and modified free of charge. It is available under the Apache 2 license. + +If you want support the project: + + +[![Donate](https://www.paypal.com/en_US/i/btn/btn_donateCC_LG.gif "Donate")](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=VEPXYP745KJF2) + +![Bitcoin Donate](https://i.stack.imgur.com/MnQ6V.png) + +![](https://github.com/SpamScope/mail-parser/raw/develop/docs/bitcoin-qrcode.png) + + +# mail-parser on Web - [Splunk app](https://splunkbase.splunk.com/app/4129/) - [FreeBSD port](https://www.freshports.org/mail/py-mail-parser/) - [Arch User Repository](https://aur.archlinux.org/packages/mailparser/) -## Description +# Description -mail-parser takes as input a raw email and generates a parsed object. The properties of this object are the same name of +mail-parser takes as input a raw email and generates a parsed object. The properties of this object are the same name of [RFC headers](https://www.iana.org/assignments/message-headers/message-headers.xhtml): - bcc @@ -107,27 +119,18 @@ $ mail.to_raw (raw header) The command line tool use the JSON format. 
-### Defects +## Defects These defects can be used to evade the antispam filter. An example are the mails with a malformed boundary that can hide a not legitimate epilogue (often malware). This library can take these epilogues. -### Apache 2 Open Source License -mail-parser can be downloaded, used, and modified free of charge. It is available under the Apache 2 license. - -If you want support the project: - - -[![Donate](https://www.paypal.com/en_US/i/btn/btn_donateCC_LG.gif "Donate")](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=VEPXYP745KJF2) - - -## Authors +# Authors -### Main Author +## Main Author **Fedele Mantuano**: [LinkedIn](https://www.linkedin.com/in/fmantuano/) -## Installation +# Installation Clone repository @@ -149,7 +152,7 @@ or use `pip`: $ pip install mail-parser ``` -## Usage in a project +# Usage in a project Import `mailparser` module: @@ -196,7 +199,7 @@ It's possible to write the attachments on disk with the method: mail.write_attachments(base_path) ``` -## Usage from command-line +# Usage from command-line If you installed mailparser with `pip` or `setup.py` you can use it with command-line. @@ -216,7 +219,7 @@ optional arguments: -s STRING, --string STRING Raw email string (default: None) -k, --stdin Enable parsing from stdin (default: False) - -l {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET} + -l {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET} Set log level (default: WARNING) -j, --json Show the JSON of parsed mail (default: False) -b, --body Print the body of mail (default: False) @@ -253,11 +256,11 @@ $ mailparser -f example_mail -j This example will show you the tokenized mail in a JSON pretty format. 
-From [raw mail](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e) to +From [raw mail](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e) to [parsed mail](https://gist.github.com/fedelemantuano/e958aa2813c898db9d2d09469db8e6f6). -## Exceptions +# Exceptions Exceptions hierarchy of mail-parser: diff --git a/docs/bitcoin-qrcode.png b/docs/bitcoin-qrcode.png new file mode 100644 index 0000000..e50dd04 Binary files /dev/null and b/docs/bitcoin-qrcode.png differ diff --git a/mailparser/const.py b/mailparser/const.py index 2ac985a..89efa3c 100644 --- a/mailparser/const.py +++ b/mailparser/const.py @@ -65,16 +65,18 @@ r'envelope-from|\s*[(]?envelope-sender|\s+' r'from|\s+by|\s+id|\s+for|\s+with(?! cipher)|;))' ), - # assumes emails are always inside <> r'(?:envelope-from\s+<(?P<envelope_from>.+?)>)', r'(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)', # datetime comes after ; at the end r';\s*(?P<date>.*)', - + # sendgrid datetime - r'(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+' + ( + r'(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:' + r'\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+' + ) ] RECEIVED_COMPILED_LIST = [ diff --git a/mailparser/mailparser.py b/mailparser/mailparser.py index 0509d0d..4e14bc8 100644 --- a/mailparser/mailparser.py +++ b/mailparser/mailparser.py @@ -42,6 +42,7 @@ msgconvert, ported_open, ported_string, + random_string, receiveds_parsing, write_attachments, ) @@ -353,14 +354,31 @@ def parse(self): charset = p.get_content_charset('utf-8') charset_raw = p.get_content_charset() log.debug("Charset {!r} part {!r}".format(charset, i)) + content_disposition = ported_string( + p.get('content-disposition')) + log.debug("content-disposition {!r} part {!r}".format( + content_disposition, i)) content_id = ported_string(p.get('content-id')) log.debug("content-id {!r} part {!r}".format( content_id, i)) - filename = decode_header_part( - p.get_filename("{}".format(content_id))) + content_subtype = ported_string(p.get_content_subtype()) + 
log.debug("content subtype {!r} part {!r}".format( + content_subtype, i)) + filename = decode_header_part(p.get_filename()) - # this is an attachment + is_attachment = False if filename: + is_attachment = True + else: + if content_id and content_subtype not in ('html', 'plain'): + is_attachment = True + filename = content_id + elif content_subtype in ('rtf'): + is_attachment = True + filename = "{}.rtf".format(random_string()) + + # this is an attachment + if is_attachment: log.debug("Email part {!r} is an attachment".format(i)) log.debug("Filename {!r} part {!r}".format(filename, i)) binary = False @@ -412,8 +430,23 @@ def parse(self): # this isn't an attachments else: log.debug("Email part {!r} is not an attachment".format(i)) - payload = ported_string( - p.get_payload(decode=True), encoding=charset) + + # Get the payload using get_payload method with decode=True + # As Python truly decodes only 'base64', + # 'quoted-printable', 'x-uuencode', + # 'uuencode', 'uue', 'x-uue' + # And for other encodings it breaks the characters so + # we need to decode them with encoding python is appying + # To maintain the characters + payload = p.get_payload(decode=True) + cte = p.get('Content-Transfer-Encoding') + if cte: + cte = cte.lower() + if not cte or cte in ['7bit', '8bit']: + payload = payload.decode('raw-unicode-escape') + else: + payload = ported_string(payload, encoding=charset) + if payload: if p.get_content_subtype() == 'html': self._text_html.append(payload) diff --git a/mailparser/utils.py b/mailparser/utils.py index 428e520..3fd4a3c 100644 --- a/mailparser/utils.py +++ b/mailparser/utils.py @@ -101,19 +101,19 @@ def ported_string(raw_data, encoding='utf-8', errors='ignore'): return six.text_type() if isinstance(raw_data, six.text_type): - return raw_data.strip() + return raw_data if six.PY2: try: - return six.text_type(raw_data, encoding, errors).strip() + return six.text_type(raw_data, encoding, errors) except LookupError: - return six.text_type(raw_data, 
"utf-8", errors).strip() + return six.text_type(raw_data, "utf-8", errors) if six.PY3: try: - return six.text_type(raw_data, encoding).strip() + return six.text_type(raw_data, encoding) except (LookupError, UnicodeDecodeError): - return six.text_type(raw_data, "utf-8", errors).strip() + return six.text_type(raw_data, "utf-8", errors) def decode_header_part(header): @@ -141,7 +141,7 @@ def decode_header_part(header): log.error("Failed decoding header part: {}".format(header)) output += header - return output + return output.strip() def ported_open(file_): @@ -290,7 +290,23 @@ def parse_received(received): if len(values_by_clause) == 0: # we weren't able to match anything... msg = "Unable to match any clauses in %s" % (received) - log.error(msg) + + # Modification #1: Commenting the following log as + # this raised exception is caught above and then + # raw header is updated in response + # We dont want to get so many errors in our error + # logger as we are not even trying to parse the + # received headers + # Wanted to make it configurable via settiings, + # but this package does not depend on django and + # making configurable setting + # will make it django dependent, + # so better to keep it working with only python + # dependent and on any framework of python + # commenting it just for our use + + # log.error(msg) + raise MailParserReceivedParsingError(msg) return values_by_clause @@ -468,7 +484,7 @@ def get_header(message, name): headers = [decode_header_part(i) for i in headers] if len(headers) == 1: # in this case return a string - return headers[0] + return headers[0].strip() # in this case return a list return headers return six.text_type() @@ -551,7 +567,6 @@ def write_sample(binary, payload, path, filename): # pragma: no cover """ if not os.path.exists(path): os.makedirs(path) - sample = os.path.join(path, filename) if binary: diff --git a/mailparser/version.py b/mailparser/version.py index dc0e3bc..41eeab1 100644 --- a/mailparser/version.py +++ 
b/mailparser/version.py @@ -17,7 +17,7 @@ limitations under the License. """ -__version__ = "3.14.0" +__version__ = "3.15.0" if __name__ == "__main__": print(__version__) diff --git a/setup.py b/setup.py index 959bd43..909da61 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,8 @@ "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], install_requires=requires, entry_points={'console_scripts': [ diff --git a/tests/test_mail_parser.py b/tests/test_mail_parser.py index f1740ea..400dff3 100644 --- a/tests/test_mail_parser.py +++ b/tests/test_mail_parser.py @@ -200,14 +200,14 @@ def test_fingerprints_body(self): mail = mailparser.parse_from_file(mail_test_1) md5, sha1, sha256, sha512 = fingerprints( mail.body.encode("utf-8")) - self.assertEqual(md5, "1bbdb7dcf511113bbc0c1b214aeac392") - self.assertEqual(sha1, "ce9e62b50fa4e2168278880b14460b905b24eb4b") - self.assertEqual(sha256, ("1e9b96e3f1bc74702f9703391e8ba0715b849" - "7127a7ff857013ab33385898574")) - self.assertEqual(sha512, ("ad858f7b5ec5549e55650fd13df7683e403489" - "77522995851fb6b625ac54744cf3a4bf652784" - "dba971ef99afeec4e6caf2fdd10be72eabb730" - "c312ffbe1c4de3")) + self.assertEqual(md5, "55852a2efe95e7249887c92cc02123f8") + self.assertEqual(sha1, "62fef1e38327ed09363624c3aff8ea11723ee05f") + self.assertEqual(sha256, ("cd4af1017f2e623f6d38f691048b6" + "a28d8b1f44a0478137b4337eac6de78f71a")) + self.assertEqual(sha512, ("4a573c7929b078f2a2c1c0f869d418b0c020d4" + "d37196bd6dcc209f9ccb29ca67355aa5e47b97" + "c8bf90377204f59efde7ba1fc071b6f250a665" + "72f63b997e92e8")) def test_fingerprints_unicodeencodeerror(self): mail = mailparser.parse_from_file(mail_test_7) @@ -456,7 +456,7 @@ def test_parse_from_file_msg(self): m = mailparser.parse_from_file_msg(mail_outlook_1) email = m.mail self.assertIn("attachments", email) - 
self.assertEqual(len(email["attachments"]), 5) + self.assertEqual(len(email["attachments"]), 6) self.assertIn("from", email) self.assertEqual(email["from"][0][1], "NueblingV@w-vwa.de") self.assertIn("subject", email) @@ -564,11 +564,7 @@ def test_ported_string(self): s = ported_string(raw_data) self.assertEqual(s, six.text_type()) - raw_data = "test " - s = ported_string(raw_data) - self.assertEqual(s, "test") - - raw_data = u"test " + raw_data = u"test" s = ported_string(raw_data) self.assertEqual(s, "test") @@ -671,5 +667,6 @@ def test_write_uuencode_attachment(self): shutil.rmtree(temp_dir) self.assertEqual(md5.hexdigest(), '4f2cf891e7cfb349fca812091f184ecc') + if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/tox.ini b/tox.ini index eba8ffb..9d99fdd 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = begin, py27, py37, end +envlist = begin, py27, py39, end [testenv:begin] commands = coverage erase