Improve the handling of indeterminate and pure reference nodes (again)

michelcrypt4d4mus · Oct 13, 2022 · bdb4241 · bdb4241
1 parent f847bb1
commit bdb4241
Show file tree

Hide file tree

Showing 7 changed files with 95 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,12 @@
 # NEXT RELEASE
 
-# 1.10.5
+### 1.10.6
+* Improve the handling of indeterminate and pure reference nodes (again)
+
+### 1.10.5
 * Improve the handling of indeterminate and pure reference nodes
 
-# 1.10.4
+### 1.10.4
 * Fix bug with unescaped string in section header
 
 ### 1.10.3

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ A PDF analysis tool geared towards visualizing the inner tree-like data structur
 ### Quick Start
 ```sh
 pipx install pdfalyzer
-pdfalyze the_heidiggerian_themes_expressed_in_illmatic.pdf
+pdfalyze heidegger_-_being_illmatic.pdf
 ```
 
 ### What It Do

diff --git a/pdfalyzer/decorators/pdf_tree_node.py b/pdfalyzer/decorators/pdf_tree_node.py
@@ -29,8 +29,7 @@
      get_type_style, get_type_string_style)
 from pdfalyzer.helpers.string_helper import pypdf_class_name
 from pdfalyzer.output.layout import get_label_style
-from pdfalyzer.util.adobe_strings import (DANGEROUS_PDF_KEYS, FIRST, FONT, LAST, NEXT, NON_TREE_REFERENCES, TYPE1_FONT, S,
-     SUBTYPE, TRAILER, TYPE, UNLABELED, XREF, XREF_STREAM, PURE_REFERENCE_NODE_LABELS)
+from pdfalyzer.util.adobe_strings import *
 from pdfalyzer.util.exceptions import PdfWalkError
 
 Relationship = namedtuple('Relationship', ['from_node', 'reference_key'])
@@ -43,14 +42,13 @@
 
 
 class PdfTreeNode(NodeMixin):
-    def __init__(self, obj: PdfObject, known_to_parent_as: str, idnum: int):
+    def __init__(self, obj: PdfObject, address: str, idnum: int):
         """
         reference_key: PDF instruction string used to reference obj
         idnum: ID used in the reference
         """
         self.obj = obj
         self.idnum = idnum
-        self.known_to_parent_as = known_to_parent_as
         self.other_relationships = []
         self.all_references_processed = False
 
@@ -62,27 +60,31 @@ def __init__(self, obj: PdfObject, known_to_parent_as: str, idnum: int):
             self.stream_length = 0
 
         if isinstance(obj, DictionaryObject):
-            self.type = obj.get(TYPE) or known_to_parent_as
-            self.label = obj.get(TYPE) or known_to_parent_as
+            self.type = obj.get(TYPE) or address
+            self.label = obj.get(TYPE) or address
             self.sub_type = obj.get(SUBTYPE) or obj.get(S)
 
             if isinstance(self.type, str):
                 self.type = self.type.split('[')[0]
         else:
-            self.type = known_to_parent_as.split('[')[0] if isinstance(known_to_parent_as, str) else known_to_parent_as
-            self.label = known_to_parent_as
+            # TODO: hacky/incorrect
+            self.type = address.split('[')[0] if isinstance(address, str) else address
+            self.label = address
             self.sub_type = None
 
-        if isinstance(self.known_to_parent_as, int):
-            self.known_to_parent_as = f"[{known_to_parent_as}]"
+        # TODO: this is hacky and possibly incorrect
+        if isinstance(address, int):
+            self.known_to_parent_as = f"[{address}]"
+        else:
+            self.known_to_parent_as = address
 
         if isinstance(self.label, int):
             self.label = f"{UNLABELED}[{self.label}]"
 
     @classmethod
-    def from_reference(cls, ref: IndirectObject, known_to_parent_as: str) -> 'PdfTreeNode':
+    def from_reference(cls, ref: IndirectObject, address: str) -> 'PdfTreeNode':
         """Builds a PdfTreeDecorator from an IndirectObject"""
-        return cls(ref.get_object(), known_to_parent_as, ref.idnum)
+        return cls(ref.get_object(), address, ref.idnum)
 
     def set_parent(self, parent: 'PdfTreeNode') -> None:
         """Set the parent of this node"""
@@ -153,7 +155,7 @@ def referenced_by_keys(self) -> List[str]:
         return [r.reference_key for r in self.other_relationships] + [self.known_to_parent_as]
 
     # Old clause:  elif key in NON_TREE_REFERENCES or node.label.startswith(NUMS) or node.label in PURE_REFERENCE_NODE_LABELS:
-    def is_pure_reference(self, reference_key):
+    def is_pure_reference(self, reference_key) -> bool:
         """Returns True if the reference is probably not in the tree"""
         # if self.idnum == 505:
         #     import pdb;pdb.set_trace()
@@ -165,8 +167,35 @@ def is_pure_reference(self, reference_key):
         # TODO: Checking startswith(NUMS) etc. is a hack that probably will not cover all cases with /StructElem
         return any(self.label.startswith(key) for key in PURE_REFERENCE_NODE_LABELS)
 
-#    def is_indeterminat_reference(self, reference_key):
+    def is_parent_reference(self, reference_key) -> bool:
+        """Returns True for explicit parent refs: the PARENT key, or the P key when this node's type is STRUCT_ELEM"""
+        if reference_key == PARENT:
+            return True
+        elif self.type == STRUCT_ELEM and reference_key == P:
+            return True
+        else:
+            return False
 
+    def is_child_reference(self, reference_key) -> bool:
+        """Returns True for explicit child refs: KIDS, K from a STRUCT_ELEM node, or OBJ from an OBJR node"""
+        if reference_key == KIDS:
+            return True
+        elif self.type == STRUCT_ELEM and reference_key == K:
+            return True
+        elif self.type == OBJR and reference_key == OBJ:
+            # TODO: there can be multiple OBJR refs to the same object... which wouldn't work w/this code
+            return True
+        else:
+            return False
+
+    # old check: elif key in INDETERMINATE_REFERENCES and key == address:
+    # TODO: why did we check equality?
+    def is_indeterminate_reference(self, reference_key) -> bool:
+        """Returns True if reference_key is in INDETERMINATE_REFERENCES (we need to wait for all objects to be parsed before placement)"""
+        if reference_key in INDETERMINATE_REFERENCES:
+            return True
+        else:
+            return False
 
     def references(self) -> List[PdfObjectRef]:
         """Returns all nodes referenced from this node (see PdfObjectRef definition)"""
@@ -270,6 +299,21 @@ def generate_rich_table(self) -> Table:
 
         return table
 
+    def generate_rich_tree(self, tree=None, depth=0):
+        """Recursively generates a rich.tree.Tree object from this node and returns it (depth is accepted but not used here)"""
+        tree = tree or Tree(self.generate_rich_table())  # no tree passed in: start a new one labeled with this node's table
+
+        for child in self.children:
+            if isinstance(child, SymlinkNode):
+                symlink_rep = get_symlink_representation(self, child)
+                tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False))
+                continue  # symlinks are rendered as a Panel and are not recursed into
+
+            child_branch = tree.add(child.generate_rich_table())
+            child.generate_rich_tree(child_branch)
+
+        return tree
+
     def _get_stream_preview_rows(self) -> List[List[Text]]:
         """Get rows that preview the stream data"""
         return_rows: List[List[Text]] = []
@@ -310,21 +354,6 @@ def add_preview_row(hex_or_stream: str, stream_string: str):
         return_rows.append([Text('StreamLength', style='grey'), size_text(len(self.stream_data))])
         return return_rows
 
-    def generate_rich_tree(self, tree=None, depth=0):
-        """Recursively generates a rich.tree.Tree object from this node"""
-        tree = tree or Tree(self.generate_rich_table())
-
-        for child in self.children:
-            if isinstance(child, SymlinkNode):
-                symlink_rep = get_symlink_representation(self, child)
-                tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False))
-                continue
-
-            child_branch = tree.add(child.generate_rich_table())
-            child.generate_rich_tree(child_branch)
-
-        return tree
-
     def _node_label(self) -> Text:
         text = Text('<', style='white')
         text.append(f'{self.idnum}', style='bright_white')

diff --git a/pdfalyzer/helpers/pdf_object_helper.py b/pdfalyzer/helpers/pdf_object_helper.py
@@ -23,7 +23,6 @@
 SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])
 
 
-
 def get_references(obj: PdfObject, ref_key=None, ref_address=None) -> List[PdfObjectRef]:
     """Return list of PdfObjectRefs"""
     if isinstance(obj, IndirectObject):

diff --git a/pdfalyzer/pdfalyzer.py b/pdfalyzer/pdfalyzer.py
@@ -238,9 +238,9 @@ def _process_reference(self, node: PdfTreeNode, key: str, address: str, referenc
             return []
 
         # If there's an explicit /Parent or /Kids reference then we know the correct relationship
-        if key in [PARENT, KIDS] or (node.type == STRUCT_ELEM and key in [K, P]):
+        if node.is_parent_reference(key) or node.is_child_reference(key):
             log.debug(f"Explicit parent/child reference in {node} at {key}")
-            if key in [PARENT, P]:
+            if node.is_parent_reference(key):
                 # try:
                     node.set_parent(referenced_node)
                 # except Exception as e:
@@ -256,7 +256,7 @@ def _process_reference(self, node: PdfTreeNode, key: str, address: str, referenc
                 references_to_return = [referenced_node]
 
         # Indeterminate references need to wait until everything has been scanned to be placed
-        elif key in INDETERMINATE_REFERENCES and key == address:
+        elif node.is_indeterminate_reference(key):
             log.info(f'  Indeterminate {reference_log_string}')
             referenced_node.add_relationship(node, address)
             self.indeterminate_ids.add(referenced_node.idnum)
@@ -326,11 +326,18 @@ def _resolve_indeterminate_nodes(self) -> None:
             if node.label == RESOURCES:
                 self._place_resources_node(node)
                 continue
+            # TODO: these almost all have the same outcome; could be one super ugly if statement
             elif len(referenced_by_keys) == 1:
                 log.info(f"{node}'s other relationships all use key {referenced_by_keys[0]}, linking to lowest id")
                 set_lowest_id_node_as_parent = True
                 possible_parents = node.other_relationships
-            elif len(referenced_by_keys) == 2 and (referenced_by_keys[0] in referenced_by_keys[1] or referenced_by_keys[1] in referenced_by_keys[0]):
+            elif all([EXTERNAL_GRAPHICS_STATE_REGEX.match(key) for key in referenced_by_keys]):
+                log.info(f"{node}'s other relationships are all {EXT_G_STATE} refs; linking to lowest id")
+                set_lowest_id_node_as_parent = True
+                possible_parents = node.other_relationships
+            elif len(referenced_by_keys) == 2 and \
+                    (   referenced_by_keys[0] in referenced_by_keys[1] \
+                     or referenced_by_keys[1] in referenced_by_keys[0]):
                 log.info(f"{node}'s other relationships ref keys are same except slice: {referenced_by_keys}, linking to lowest id")
                 set_lowest_id_node_as_parent = True
                 possible_parents = node.other_relationships
@@ -357,8 +364,8 @@ def _resolve_indeterminate_nodes(self) -> None:
                 continue
 
             self.print_tree()
-            log.fatal("Dumped tree status for debugging.")
             node.print_other_relationships()
+            log.fatal("Dumped tree status and other_relationships for debugging")
             raise PdfWalkError(f"Cannot place {node}")
 
     def _extract_font_infos(self) -> None:
@@ -436,10 +443,13 @@ def _print_traversed_nodes(self) -> None:
 
     def _verify_all_traversed_nodes_are_in_tree(self) -> None:
         """Make sure every node we can see is reachable from the root of the tree"""
-        missing_nodes = [node for idnum, node in self.traversed_nodes.items() if self.find_node_by_idnum(idnum) is None]
+        missing_nodes = [
+            node for idnum, node in self.traversed_nodes.items()
+            if self.find_node_by_idnum(idnum) is None
+        ]
 
         if len(missing_nodes) > 0:
-            msg = f"Nodes were traversed but never placed: {missing_nodes}"
+            msg = f"Nodes were traversed but never placed: {escape(str(missing_nodes))}"
             console.print(msg)
             log.warning(msg)
             #raise PdfWalkError(msg)

diff --git a/pdfalyzer/util/adobe_strings.py b/pdfalyzer/util/adobe_strings.py
@@ -1,11 +1,11 @@
 """
 String constants specified in the Adobe specs for PDFs, fonts, etc.
 """
+import re
 
 from PyPDF2.constants import (CatalogDictionary, ImageAttributes, PageAttributes,
      PagesAttributes, Ressources as Resources)
 
-
 # Fake PDF instructions used to create more explanatory tables/trees/addresses/etc.
 ARRAY_ELEMENT = '/ArrayElement'
 TRAILER = '/Trailer'
@@ -14,11 +14,13 @@
 # Actual PDF instructions
 AA              = CatalogDictionary.AA  # Automatic Action
 ACRO_FORM       = CatalogDictionary.ACRO_FORM  # Can trigger Javascript on open
+ANNOTS          = '/Annots'
 COLOR_SPACE     = Resources.COLOR_SPACE
 D               = '/D'  # Destination, usually of a link or action
 CONTENTS        = '/Contents'
 DEST            = '/Dest'  # Similar to /D?
 EXT_G_STATE     = Resources.EXT_G_STATE
+FIELDS          = '/Fields'
 FIRST           = '/First'
 FONT            = Resources.FONT
 FONT_FILE       = '/FontFile'
@@ -34,9 +36,13 @@
 NEXT            = '/Next'
 NUMS            = '/Nums'
 OBJECT_STREAM   = '/ObjStm'
+OBJ             = '/Obj'
+# TODO: /Pg refs could be the parents of /OBJR?
+OBJR            = '/OBJR'  # Object reference to "an entire PDF object"
 OPEN_ACTION     = CatalogDictionary.OPEN_ACTION
 P               = '/P'  # Equivalent of /Parent for /StructElem
 PARENT          = PagesAttributes.PARENT
+PG              = '/Pg'  # Page ref for OBJR
 PREV            = '/Prev'
 RESOURCES       = PageAttributes.RESOURCES
 S               = '/S'  # Equivalent of /Subtype for /StructElem
@@ -47,6 +53,7 @@
 TYPE            = PageAttributes.TYPE
 TYPE1_FONT      = '/Type1'
 W               = '/W'  # Equivalen of /Widths in some situations
+WIDGET          = '/Widget'
 WIDTHS          = '/Widths'
 XOBJECT         = Resources.XOBJECT
 XREF            = '/XRef'
@@ -87,14 +94,19 @@
 
 # Some PdfObjects can't be properly placed in the tree until the entire tree is parsed
 INDETERMINATE_REFERENCES = [
+    ANNOTS,  # At least when it appears in a page
     COLOR_SPACE,
     D,
     DEST,
     EXT_G_STATE,
+    FIELDS,   # At least for  /AcroForm
     FIRST,
     FONT,
     OPEN_ACTION,
+    P,   # At least for widgets...
     RESOURCES,
     XOBJECT,
     UNLABELED, # TODO: this might be wrong? maybe this is where the /Resources actually live?
 ]
+
+EXTERNAL_GRAPHICS_STATE_REGEX = re.compile('/Resources\\[/ExtGState\\]\\[/GS\\d+\\]')
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdfalyzer"
-version = "1.10.5"
+version = "1.10.6"
 description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
 authors = ["Michel de Cryptadamus <[email protected]>"]
 license = "GPL-3.0-or-later"