# Reconstructed from the patch "Add getAttachFiles() on PdfFileReader"
# (pypdf/pdf.py, Alexis de Lattre, 2019-06-11).  Every definition below is a
# method of PdfFileReader; it is written at column 0 here and has to be
# indented to class level when placed back into the class body.
# ``IndirectObject`` is expected to be in scope in pypdf/pdf.py.
# The patch context ends inside ``_convertToInt(d, size)``; that truncated
# definition is not reproduced here.


def isEncrypted(self):
    """Return whether the trailer dictionary has an ``/Encrypt`` entry."""
    return "/Encrypt" in self._trailer


def _get_dict_entry(self, node, entry):
    """
    Return the dictionary stored under ``entry`` in ``node``.

    A value that is an ``IndirectObject`` is resolved with ``getObject()``
    before being checked.

    :param dict node: dictionary to read from.
    :param entry: key to look up in ``node``.
    :return: the (resolved) dictionary, or ``False`` when the key is missing
        or its value is not a dictionary.
    :raises ValueError: if ``node`` is not a dict.
    """
    if not isinstance(node, dict):
        raise ValueError('The node must be a dict')
    value = node.get(entry)
    if isinstance(value, dict):
        return value
    if isinstance(value, IndirectObject):
        resolved = value.getObject()
        if isinstance(resolved, dict):
            return resolved
    # Missing key, or a value that is not (and does not resolve to) a dict.
    return False


def _parse_embeddedfiles_kids_node(self, kids_node, level, res):
    """
    Collect the ``/Names`` arrays reachable from a ``/Kids`` array.

    Only two levels of ``/Kids`` are followed: a kid met at level 2 that has
    no ``/Names`` entry is treated as malformed.

    :param kids_node: value of a ``/Kids`` entry; must be a list of
        ``IndirectObject`` items that resolve to dictionaries.
    :param int level: nesting level of ``kids_node``, 1 or 2.
    :param list res: accumulator, extended in place with the content of every
        ``/Names`` array found.  It may be partially filled when ``False`` is
        returned.
    :return: ``True`` if the whole subtree is well formed, else ``False``.
    :rtype: bool
    :raises ValueError: if ``level`` is neither 1 nor 2.
    """
    if level not in [1, 2]:
        raise ValueError('Level argument should be 1 or 2')
    # The /Kids entry of the EmbeddedFiles name tree must be an array
    if not isinstance(kids_node, list):
        return False
    for kid_entry in kids_node:
        # ... of IndirectObjects ...
        if not isinstance(kid_entry, IndirectObject):
            return False
        # A separate name is used for the resolved kid so the ``kids_node``
        # parameter is not rebound while it is being iterated.
        kid = kid_entry.getObject()
        # ... that point to dict objects
        if not isinstance(kid, dict):
            return False
        if '/Names' in kid:
            names = kid['/Names']
            # The /Names entry in EmbeddedFiles must be an array
            if not isinstance(names, list):
                return False
            res += names
        elif '/Kids' in kid and level == 1:
            # Propagate a failure found one level down; previously the
            # result of this call was dropped and the error went unnoticed.
            if not self._parse_embeddedfiles_kids_node(kid['/Kids'], 2, res):
                return False
        else:
            # /Kids node should have a /Names or /Kids entry
            return False
    return True


def _get_embeddedfiles(self, embeddedfiles_node):
    """
    Flatten the EmbeddedFiles name tree into one list.

    :param dict embeddedfiles_node: root node of the EmbeddedFiles name tree.
    :return: a list with an even number of items (key, value, key,
        value, ...), or ``False`` when the tree is malformed: no ``/Names``
        or ``/Kids`` entry, an entry of the wrong type, or an odd number of
        collected items.
    :raises ValueError: if ``embeddedfiles_node`` is not a dict.
    """
    if not isinstance(embeddedfiles_node, dict):
        raise ValueError('The EmbeddedFiles node must be a dict')
    if '/Names' in embeddedfiles_node:
        res = embeddedfiles_node['/Names']
        # The /Names entry of the EmbeddedFiles name tree must be an array
        if not isinstance(res, list):
            return False
    elif '/Kids' in embeddedfiles_node:
        res = []
        if not self._parse_embeddedfiles_kids_node(
                embeddedfiles_node['/Kids'], 1, res):
            return False
    else:
        # The EmbeddedFiles name tree should have either a /Names or a
        # /Kids entry
        return False
    # The EmbeddedFiles name tree should point to an even number
    # of elements
    if len(res) % 2 != 0:
        return False
    return res


def getAttachFiles(self):
    """
    Retrieves all the attachments.

    Name-tree entries whose file dictionary has no ``/EF`` or ``/F`` key are
    skipped.  An absent or malformed EmbeddedFiles name tree yields an empty
    list.

    :return: a list of tuples (filename, file), where ``filename`` is the
        key stored in the EmbeddedFiles name tree and ``file`` is what
        ``getData()`` returns for the ``/EF`` ``/F`` entry.
    :rtype: list
    """
    res = []
    # The trailer is read through ``self._trailer``, the attribute that
    # ``isEncrypted`` of the same class uses.
    catalog = self._trailer['/Root']
    # get the name tree
    catalog_name = self._get_dict_entry(catalog, '/Names')
    if not catalog_name:
        return res
    embeddedfiles_node = self._get_dict_entry(
        catalog_name, '/EmbeddedFiles')
    if not embeddedfiles_node:
        return res
    embeddedfiles = self._get_embeddedfiles(embeddedfiles_node)
    if not embeddedfiles:
        return res
    # The flat list alternates key, value, key, value, ...
    for filename, file_obj in zip(embeddedfiles[::2], embeddedfiles[1::2]):
        file_dict = file_obj.getObject()
        try:
            file_content = file_dict['/EF']['/F'].getData()
        except KeyError:
            # Best effort: an entry without embedded data is ignored.
            continue
        res.append((filename, file_content))
    return res