Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add getAttachFiles() on PdfFileReader #56

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions pypdf/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2605,6 +2605,105 @@ def _authenticateUserPassword(self, password):
def isEncrypted(self):
    """
    Tell whether the document trailer carries an /Encrypt entry.

    :return: True if "/Encrypt" is a key of the trailer, False otherwise
    :rtype: bool
    """
    trailer = self._trailer
    return "/Encrypt" in trailer

def _get_dict_entry(self, node, entry):
if not isinstance(node, dict):
raise ValueError('The node must be a dict')
dict_entry = node.get(entry)
if isinstance(dict_entry, dict):
return dict_entry
elif isinstance(dict_entry, IndirectObject):
res_dict_entry = dict_entry.getObject()
if isinstance(res_dict_entry, dict):
return res_dict_entry
else:
return False
else:
return False

def _parse_embeddedfiles_kids_node(self, kids_node, level, res):
if level not in [1, 2]:
raise ValueError('Level argument should be 1 or 2')
# The /Kids entry of the EmbeddedFiles name tree must be an array
if not isinstance(kids_node, list):
return False
for kid_entry in kids_node:
# The /Kids entry of the EmbeddedFiles name tree must be a
# list of IndirectObjects
if not isinstance(kid_entry, IndirectObject):
return False
kids_node = kid_entry.getObject()
# The /Kids entry of the EmbeddedFiles name tree
# must be a list of IndirectObjects that point to dict objects
if not isinstance(kids_node, dict):
return False
if '/Names' in kids_node:
# The /Names entry in EmbeddedFiles must be an array
if not isinstance(kids_node['/Names'], list):
return False
res += kids_node['/Names']
elif '/Kids' in kids_node and level == 1:
kids_node_l2 = kids_node['/Kids']
self._parse_embeddedfiles_kids_node(kids_node_l2, 2, res)
else:
# /Kids node should have a /Names or /Kids entry
return False
return True

def _get_embeddedfiles(self, embeddedfiles_node):
if not isinstance(embeddedfiles_node, dict):
raise ValueError('The EmbeddedFiles node must be a dict')
res = []
if '/Names' in embeddedfiles_node:
# The /Names entry of the EmbeddedFiles name tree must be an array
if not isinstance(embeddedfiles_node['/Names'], list):
return False
res = embeddedfiles_node['/Names']
elif '/Kids' in embeddedfiles_node:
kids_node = embeddedfiles_node['/Kids']
parse_result = self._parse_embeddedfiles_kids_node(
kids_node, 1, res)
if parse_result is False:
return False
else:
# The EmbeddedFiles name tree should have either a /Names or a
# /Kids entry
return False
# The EmbeddedFiles name tree should point to an even number
# of elements
if len(res) % 2 != 0:
return False
return res

def getAttachFiles(self):
    """
    Collect the files attached to the document.

    The catalog's /Names dictionary is followed to the /EmbeddedFiles
    name tree, whose flattened content is read two elements at a time as
    (name, file specification). The data of each file is taken from the
    specification's /EF -> /F entry; specifications lacking one of those
    keys are skipped.

    :return: a list of tuples (filename, file); empty when the document
        has no usable EmbeddedFiles name tree
    :rtype: list
    """
    attachments = []
    root = self.trailer['/Root']
    # Walk down to the name tree, giving up as soon as a step is missing
    names_dict = self._get_dict_entry(root, '/Names')
    if not names_dict:
        return attachments
    tree_root = self._get_dict_entry(names_dict, '/EmbeddedFiles')
    if not tree_root:
        return attachments
    flat = self._get_embeddedfiles(tree_root)
    if not flat:
        return attachments
    # Even positions hold the names, odd positions the file specifications
    for name, spec_ref in zip(flat[::2], flat[1::2]):
        spec = spec_ref.getObject()
        try:
            payload = spec['/EF']['/F'].getData()
        except KeyError:
            continue
        attachments.append((name, payload))
    return attachments


def _convertToInt(d, size):
if size > 8:
Expand Down