Skip to content

Commit

Permalink
Improve the handling of indeterminate and pure reference nodes (again)
Browse files Browse the repository at this point in the history
  • Loading branch information
ashariyar committed Oct 13, 2022
1 parent f847bb1 commit bdb4241
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 42 deletions.
7 changes: 5 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# NEXT RELEASE

# 1.10.5
# 1.10.6
* Improve the handling of indeterminate and pure reference nodes (again)

### 1.10.5
* Improve the handling of indeterminate and pure reference nodes

# 1.10.4
### 1.10.4
* Fix bug with unescaped string in section header

### 1.10.3
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ A PDF analysis tool geared towards visualizing the inner tree-like data structur
### Quick Start
```sh
pipx install pdfalyzer
pdfalyze the_heidiggerian_themes_expressed_in_illmatic.pdf
pdfalyze heidegger_-_being_illmatic.pdf
```

### What It Do
Expand Down
87 changes: 58 additions & 29 deletions pdfalyzer/decorators/pdf_tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
get_type_style, get_type_string_style)
from pdfalyzer.helpers.string_helper import pypdf_class_name
from pdfalyzer.output.layout import get_label_style
from pdfalyzer.util.adobe_strings import (DANGEROUS_PDF_KEYS, FIRST, FONT, LAST, NEXT, NON_TREE_REFERENCES, TYPE1_FONT, S,
SUBTYPE, TRAILER, TYPE, UNLABELED, XREF, XREF_STREAM, PURE_REFERENCE_NODE_LABELS)
from pdfalyzer.util.adobe_strings import *
from pdfalyzer.util.exceptions import PdfWalkError

Relationship = namedtuple('Relationship', ['from_node', 'reference_key'])
Expand All @@ -43,14 +42,13 @@


class PdfTreeNode(NodeMixin):
def __init__(self, obj: PdfObject, known_to_parent_as: str, idnum: int):
def __init__(self, obj: PdfObject, address: str, idnum: int):
"""
address: PDF instruction string used to reference obj
idnum: ID used in the reference
"""
self.obj = obj
self.idnum = idnum
self.known_to_parent_as = known_to_parent_as
self.other_relationships = []
self.all_references_processed = False

Expand All @@ -62,27 +60,31 @@ def __init__(self, obj: PdfObject, known_to_parent_as: str, idnum: int):
self.stream_length = 0

if isinstance(obj, DictionaryObject):
self.type = obj.get(TYPE) or known_to_parent_as
self.label = obj.get(TYPE) or known_to_parent_as
self.type = obj.get(TYPE) or address
self.label = obj.get(TYPE) or address
self.sub_type = obj.get(SUBTYPE) or obj.get(S)

if isinstance(self.type, str):
self.type = self.type.split('[')[0]
else:
self.type = known_to_parent_as.split('[')[0] if isinstance(known_to_parent_as, str) else known_to_parent_as
self.label = known_to_parent_as
# TODO: hacky/incorrect
self.type = address.split('[')[0] if isinstance(address, str) else address
self.label = address
self.sub_type = None

if isinstance(self.known_to_parent_as, int):
self.known_to_parent_as = f"[{known_to_parent_as}]"
# TODO: this is hacky and possibly incorrect
if isinstance(address, int):
self.known_to_parent_as = f"[{address}]"
else:
self.known_to_parent_as = address

if isinstance(self.label, int):
self.label = f"{UNLABELED}[{self.label}]"

@classmethod
def from_reference(cls, ref: IndirectObject, known_to_parent_as: str) -> 'PdfTreeNode':
def from_reference(cls, ref: IndirectObject, address: str) -> 'PdfTreeNode':
"""Builds a PdfTreeNode from an IndirectObject"""
return cls(ref.get_object(), known_to_parent_as, ref.idnum)
return cls(ref.get_object(), address, ref.idnum)

def set_parent(self, parent: 'PdfTreeNode') -> None:
"""Set the parent of this node"""
Expand Down Expand Up @@ -153,7 +155,7 @@ def referenced_by_keys(self) -> List[str]:
return [r.reference_key for r in self.other_relationships] + [self.known_to_parent_as]

# Old clause: elif key in NON_TREE_REFERENCES or node.label.startswith(NUMS) or node.label in PURE_REFERENCE_NODE_LABELS:
def is_pure_reference(self, reference_key):
def is_pure_reference(self, reference_key) -> bool:
"""Returns True if the reference is probably not in the tree"""
# if self.idnum == 505:
# import pdb;pdb.set_trace()
Expand All @@ -165,8 +167,35 @@ def is_pure_reference(self, reference_key):
# TODO: Checking startswith(NUMS) etc. is a hack that probably will not cover all cases with /StructElem
return any(self.label.startswith(key) for key in PURE_REFERENCE_NODE_LABELS)

# def is_indeterminat_reference(self, reference_key):
def is_parent_reference(self, reference_key) -> bool:
    """Report whether reference_key is an explicit reference to a parent.

    True when the key is PARENT, or when this node's type is STRUCT_ELEM
    and the key is P. False for every other key.
    """
    return reference_key == PARENT or (self.type == STRUCT_ELEM and reference_key == P)

def is_child_reference(self, reference_key) -> bool:
    """Report whether reference_key is an explicit reference to a child.

    True when the key is KIDS, when this node's type is STRUCT_ELEM and the
    key is K, or when this node's type is OBJR and the key is OBJ.
    """
    if reference_key == KIDS:
        return True

    # TODO: there can be multiple OBJR refs to the same object... which wouldn't work w/this code
    return (self.type == STRUCT_ELEM and reference_key == K) \
        or (self.type == OBJR and reference_key == OBJ)

# old check: elif key in INDETERMINATE_REFERENCES and key == address:
# TODO: why did we check equality?
def is_indeterminate_reference(self, reference_key) -> bool:
    """Returns True if we need to wait for all objects to be parsed before placement.

    reference_key: the key to look up in INDETERMINATE_REFERENCES.
    The result depends only on the key; no node state is consulted.
    """
    return reference_key in INDETERMINATE_REFERENCES

def references(self) -> List[PdfObjectRef]:
"""Returns all nodes referenced from this node (see PdfObjectRef definition)"""
Expand Down Expand Up @@ -270,6 +299,21 @@ def generate_rich_table(self) -> Table:

return table

def generate_rich_tree(self, tree=None, depth=0):
    """Build a rich.tree.Tree for this node and, recursively, its children.

    tree: branch to attach children to; when falsy a new Tree is created
          from this node's rich table.
    depth: accepted but not used by this method.
    Returns the tree (either the one passed in or the newly created one).
    """
    if not tree:
        tree = Tree(self.generate_rich_table())

    for child in self.children:
        if not isinstance(child, SymlinkNode):
            # Regular child: add its table as a branch, then recurse into that branch
            branch = tree.add(child.generate_rich_table())
            child.generate_rich_tree(branch)
        else:
            # Symlinks are rendered as a panel and are not descended into
            link = get_symlink_representation(self, child)
            tree.add(Panel(link.text, style=link.style, expand=False))

    return tree

def _get_stream_preview_rows(self) -> List[List[Text]]:
"""Get rows that preview the stream data"""
return_rows: List[List[Text]] = []
Expand Down Expand Up @@ -310,21 +354,6 @@ def add_preview_row(hex_or_stream: str, stream_string: str):
return_rows.append([Text('StreamLength', style='grey'), size_text(len(self.stream_data))])
return return_rows

def generate_rich_tree(self, tree=None, depth=0):
    """Build a rich.tree.Tree for this node and, recursively, its children.

    tree: branch to attach children to; when falsy a new Tree is created
          from this node's rich table.
    depth: accepted but not used by this method.
    Returns the tree (either the one passed in or the newly created one).
    """
    if not tree:
        tree = Tree(self.generate_rich_table())

    for child in self.children:
        if not isinstance(child, SymlinkNode):
            # Regular child: add its table as a branch, then recurse into that branch
            branch = tree.add(child.generate_rich_table())
            child.generate_rich_tree(branch)
        else:
            # Symlinks are rendered as a panel and are not descended into
            link = get_symlink_representation(self, child)
            tree.add(Panel(link.text, style=link.style, expand=False))

    return tree

def _node_label(self) -> Text:
text = Text('<', style='white')
text.append(f'{self.idnum}', style='bright_white')
Expand Down
1 change: 0 additions & 1 deletion pdfalyzer/helpers/pdf_object_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])



def get_references(obj: PdfObject, ref_key=None, ref_address=None) -> List[PdfObjectRef]:
"""Return list of PdfObjectRefs"""
if isinstance(obj, IndirectObject):
Expand Down
24 changes: 17 additions & 7 deletions pdfalyzer/pdfalyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,9 @@ def _process_reference(self, node: PdfTreeNode, key: str, address: str, referenc
return []

# If there's an explicit /Parent or /Kids reference then we know the correct relationship
if key in [PARENT, KIDS] or (node.type == STRUCT_ELEM and key in [K, P]):
if node.is_parent_reference(key) or node.is_child_reference(key):
log.debug(f"Explicit parent/child reference in {node} at {key}")
if key in [PARENT, P]:
if node.is_parent_reference(key):
# try:
node.set_parent(referenced_node)
# except Exception as e:
Expand All @@ -256,7 +256,7 @@ def _process_reference(self, node: PdfTreeNode, key: str, address: str, referenc
references_to_return = [referenced_node]

# Indeterminate references need to wait until everything has been scanned to be placed
elif key in INDETERMINATE_REFERENCES and key == address:
elif node.is_indeterminate_reference(key):
log.info(f' Indeterminate {reference_log_string}')
referenced_node.add_relationship(node, address)
self.indeterminate_ids.add(referenced_node.idnum)
Expand Down Expand Up @@ -326,11 +326,18 @@ def _resolve_indeterminate_nodes(self) -> None:
if node.label == RESOURCES:
self._place_resources_node(node)
continue
# TODO: these almost all have the same outcome; could be one super ugly if statement
elif len(referenced_by_keys) == 1:
log.info(f"{node}'s other relationships all use key {referenced_by_keys[0]}, linking to lowest id")
set_lowest_id_node_as_parent = True
possible_parents = node.other_relationships
elif len(referenced_by_keys) == 2 and (referenced_by_keys[0] in referenced_by_keys[1] or referenced_by_keys[1] in referenced_by_keys[0]):
elif all([EXTERNAL_GRAPHICS_STATE_REGEX.match(key) for key in referenced_by_keys]):
log.info(f"{node}'s other relationships are all {EXT_G_STATE} refs; linking to lowest id")
set_lowest_id_node_as_parent = True
possible_parents = node.other_relationships
elif len(referenced_by_keys) == 2 and \
( referenced_by_keys[0] in referenced_by_keys[1] \
or referenced_by_keys[1] in referenced_by_keys[0]):
log.info(f"{node}'s other relationships ref keys are same except slice: {referenced_by_keys}, linking to lowest id")
set_lowest_id_node_as_parent = True
possible_parents = node.other_relationships
Expand All @@ -357,8 +364,8 @@ def _resolve_indeterminate_nodes(self) -> None:
continue

self.print_tree()
log.fatal("Dumped tree status for debugging.")
node.print_other_relationships()
log.fatal("Dumped tree status and other_relationships for debugging")
raise PdfWalkError(f"Cannot place {node}")

def _extract_font_infos(self) -> None:
Expand Down Expand Up @@ -436,10 +443,13 @@ def _print_traversed_nodes(self) -> None:

def _verify_all_traversed_nodes_are_in_tree(self) -> None:
"""Make sure every node we can see is reachable from the root of the tree"""
missing_nodes = [node for idnum, node in self.traversed_nodes.items() if self.find_node_by_idnum(idnum) is None]
missing_nodes = [
node for idnum, node in self.traversed_nodes.items()
if self.find_node_by_idnum(idnum) is None
]

if len(missing_nodes) > 0:
msg = f"Nodes were traversed but never placed: {missing_nodes}"
msg = f"Nodes were traversed but never placed: {escape(str(missing_nodes))}"
console.print(msg)
log.warning(msg)
#raise PdfWalkError(msg)
Expand Down
14 changes: 13 additions & 1 deletion pdfalyzer/util/adobe_strings.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""
String constants specified in the Adobe specs for PDFs, fonts, etc.
"""
import re

from PyPDF2.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
PagesAttributes, Ressources as Resources)


# Fake PDF instructions used to create more explanatory tables/trees/addresses/etc.
ARRAY_ELEMENT = '/ArrayElement'
TRAILER = '/Trailer'
Expand All @@ -14,11 +14,13 @@
# Actual PDF instructions
AA = CatalogDictionary.AA # Automatic Action
ACRO_FORM = CatalogDictionary.ACRO_FORM # Can trigger Javascript on open
ANNOTS = '/Annots'
COLOR_SPACE = Resources.COLOR_SPACE
D = '/D' # Destination, usually of a link or action
CONTENTS = '/Contents'
DEST = '/Dest' # Similar to /D?
EXT_G_STATE = Resources.EXT_G_STATE
FIELDS = '/Fields'
FIRST = '/First'
FONT = Resources.FONT
FONT_FILE = '/FontFile'
Expand All @@ -34,9 +36,13 @@
NEXT = '/Next'
NUMS = '/Nums'
OBJECT_STREAM = '/ObjStm'
OBJ = '/Obj'
# TODO: /Pg refs could be the parents of /OBJR?
OBJR = '/OBJR' # Object reference to "an entire PDF object"
OPEN_ACTION = CatalogDictionary.OPEN_ACTION
P = '/P' # Equivalent of /Parent for /StructElem
PARENT = PagesAttributes.PARENT
PG = '/Pg' # Page ref for OBJR
PREV = '/Prev'
RESOURCES = PageAttributes.RESOURCES
S = '/S' # Equivalent of /Subtype for /StructElem
Expand All @@ -47,6 +53,7 @@
TYPE = PageAttributes.TYPE
TYPE1_FONT = '/Type1'
W = '/W' # Equivalent of /Widths in some situations
WIDGET = '/Widget'
WIDTHS = '/Widths'
XOBJECT = Resources.XOBJECT
XREF = '/XRef'
Expand Down Expand Up @@ -87,14 +94,19 @@

# Reference keys whose targets can't be properly placed in the tree until the entire
# tree is parsed. Membership in this list is a plain key lookup.
INDETERMINATE_REFERENCES = [
ANNOTS, # At least when it appears in a page
COLOR_SPACE,
D,
DEST,
EXT_G_STATE,
FIELDS, # At least for /AcroForm
FIRST,
FONT,
OPEN_ACTION,
P, # At least for widgets...
RESOURCES,
XOBJECT,
UNLABELED, # TODO: this might be wrong? maybe this is where the /Resources actually live?
]

# Matches addresses shaped like /Resources[/ExtGState][/GS<digits>]
EXTERNAL_GRAPHICS_STATE_REGEX = re.compile(r'/Resources\[/ExtGState\]\[/GS\d+\]')
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdfalyzer"
version = "1.10.5"
version = "1.10.6"
description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
authors = ["Michel de Cryptadamus <[email protected]>"]
license = "GPL-3.0-or-later"
Expand Down

0 comments on commit bdb4241

Please sign in to comment.