Scan all binaries (not just font binaries) with included PDF related …

…YARA rules, lots of refactoring
michelcrypt4d4mus · Oct 14, 2022 · 0173885 · 0173885
1 parent 524ceed
commit 0173885
Show file tree

Hide file tree

Showing 19 changed files with 342 additions and 184 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,14 @@
 # NEXT RELEASE
+
+# 1.11.0
+* Scan all binaries (not just font binaries) with included PDF related YARA rules
 * Better warning about stream decode failures
 * Remove warnings that should not be warnings
 * Refactor rich table view code to `pdf_node_rich_table.py`
+* Refactor `Relationship` and `PdfObjectRef` to single class, `PdfObjectRelationship`
 
 ### 1.10.8
-* Fix `importlib.resources` usage in case there's a zip file
+* Fix `importlib.resources` usage in case pdfalyzer is packaged as a zip file
 * `/Names` is an indeterminate reference type
 * Catch stream decode exceptions and show error instead of failing.
 

diff --git a/pdfalyzer/decorators/pdf_tree_node.py b/pdfalyzer/decorators/pdf_tree_node.py
@@ -6,63 +6,59 @@
 methods and not set directly. (TODO: this could be done better with anytree
 hooks)
 """
-from collections import namedtuple
 from typing import List, Optional, Union
 
-from anytree import NodeMixin, SymlinkNode
-from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject
+from anytree import NodeMixin
 from PyPDF2.errors import PdfReadError
+from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject
 from rich.markup import escape
-from rich.panel import Panel
 from rich.text import Text
-from rich.tree import Tree
 from yaralyzer.output.rich_console import console
 from yaralyzer.util.logging import log
 
-from pdfalyzer.helpers.pdf_object_helper import PdfObjectRelationship, get_symlink_representation
-from pdfalyzer.helpers.rich_text_helper import get_type_style, get_type_string_style
-from pdfalyzer.helpers.string_helper import pypdf_class_name
-from pdfalyzer.output.layout import get_label_style
-from pdfalyzer.output.pdf_node_rich_table import build_pdf_node_table, get_node_type_style
+from pdfalyzer.helpers.pdf_object_helper import node_label
+from pdfalyzer.helpers.rich_text_helper import get_type_string_style, get_type_style
+from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address
+from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
 from pdfalyzer.util.adobe_strings import *
 from pdfalyzer.util.exceptions import PdfWalkError
 
 DEFAULT_MAX_ADDRESS_LENGTH = 90
+DECODE_FAILURE_LEN = -1
 
 
 class PdfTreeNode(NodeMixin):
     def __init__(self, obj: PdfObject, address: str, idnum: int):
         """
-        reference_key: PDF instruction string used to reference obj
+        address: PDF instruction string + modifiers used to reference obj
         idnum: ID used in the reference
         """
         self.obj = obj
         self.idnum = idnum
         self.all_references_processed: bool = False
         self.other_relationships: List[PdfObjectRelationship] = []
+        self.sub_type = None
 
         if isinstance(obj, DictionaryObject):
             self.type = obj.get(TYPE) or address
-            self.label = obj.get(TYPE) or address
             self.sub_type = obj.get(SUBTYPE) or obj.get(S)
+            self.label = obj.get(TYPE) or address  # TODO: should we use sub_type?
 
             if isinstance(self.type, str):
-                self.type = self.type.split('[')[0]
+                self.type = root_address(self.type)
+        elif isinstance(address, int):
+            self.label = f"{UNLABELED}[{address}]"
+            self.type = str(address)
         else:
-            # TODO: hacky
-            self.type = address.split('[')[0] if isinstance(address, str) else address
             self.label = address
-            self.sub_type = None
+            self.type = root_address(address)
 
         # TODO: this is hacky/temporarily incorrect bc we often don't know the parent when node is being constructed
         if isinstance(address, int):
             self.known_to_parent_as = f"[{address}]"
         else:
             self.known_to_parent_as = address
 
-        if isinstance(self.label, int):
-            self.label = f"{UNLABELED}[{self.label}]"
-
         if isinstance(obj, StreamObject):
             try:
                 self.stream_data = self.obj.get_data()
@@ -71,8 +67,8 @@ def __init__(self, obj: PdfObject, address: str, idnum: int):
                 msg = f"Failed to decode stream in {self}: {e}"
                 console.print_exception()
                 log.warning(msg)
-                self.stream_data = msg
-                self.stream_length = -1
+                self.stream_data = msg.encode()
+                self.stream_length = DECODE_FAILURE_LEN
         else:
             self.stream_data = None
             self.stream_length = 0
@@ -245,7 +241,8 @@ def _find_address_of_this_node(self, from_node: 'PdfTreeNode') -> Optional[str]:
             reference_address = refs_to_this_node[0].reference_address
 
             if not all(ref.reference_address in [FIRST, LAST] for ref in refs_to_this_node):
-                log.warning(f"Multiple refs from {from_node} to {self}: {refs_to_this_node}. Using {reference_address} as address")
+                msg = f"Multiple refs from {from_node} to {self}: {refs_to_this_node}"
+                log.warning(msg + ", using {reference_address}")
 
             return reference_address
 
@@ -272,31 +269,8 @@ def tree_address(self, max_length: Optional[int] = None) -> str:
 
         return '...' + address[-max_length:][3:]
 
-    def generate_rich_tree(self, tree=None, depth=0) -> Tree:
-        """Recursively generates a rich.tree.Tree object from this node"""
-        tree = tree or Tree(build_pdf_node_table(self))
-
-        for child in self.children:
-            if isinstance(child, SymlinkNode):
-                symlink_rep = get_symlink_representation(self, child)
-                tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False))
-                continue
-
-            child_branch = tree.add(build_pdf_node_table(child))
-            child.generate_rich_tree(child_branch)
-
-        return tree
-
     def _node_label(self) -> Text:
-        text = Text('<', style='white')
-        text.append(f'{self.idnum}', style='bright_white')
-        text.append(':', style='white')
-        text.append(self.label[1:], style=f'{get_label_style(self.label)} underline bold')
-        text.append('(', style='white')
-        text.append(pypdf_class_name(self.obj), style=get_node_type_style(self.obj))
-        text.append(')', style='white')
-        text.append('>')
-        return text
+        return node_label(self.idnum, self.label, self.obj)
 
     def _colored_address(self, max_length: Optional[int] = None) -> Text:
         """Rich text version of tree_address()"""

diff --git a/pdfalyzer/font_info.py b/pdfalyzer/font_info.py
@@ -143,13 +143,6 @@ def __init__(self, label, idnum, font, font_descriptor, font_file, obj_with_reso
             self._char_map = None
             self.character_mapping = None
 
-    def yara_scan(self) -> None:
-        if self.binary_scanner is None:
-            log.debug(f"No binary to scan for {self.display_title}")
-            return
-
-        get_bytes_yaralyzer(self.stream_data, str(self)).yaralyze()
-
     def width_stats(self):
         if self.widths is None:
             return {}

diff --git a/pdfalyzer/helpers/pdf_object_helper.py b/pdfalyzer/helpers/pdf_object_helper.py
@@ -1,125 +1,40 @@
 """
 Some methods to help with the direct manipulation/processing of PyPDF2's PdfObjects
 """
-from collections import namedtuple
 from typing import List, Optional
 
 from PyPDF2.generic import IndirectObject, PdfObject
-from rich.markup import escape
-from yaralyzer.helpers.string_helper import comma_join
-from yaralyzer.util.logging import log
+from rich.text import Text
 
+from pdfalyzer.helpers.string_helper import pypdf_class_name
 from pdfalyzer.output.layout import get_label_style
+from pdfalyzer.output.pdf_node_rich_table import get_node_type_style
+from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
 from pdfalyzer.util.adobe_strings import *
-from pdfalyzer.util.exceptions import PdfWalkError
 
-# For printing SymlinkNodes
-SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])
 
-
-class PdfObjectRelationship:
-    """
-    Simple container class for information about a link between two PDF objects.
-    In the case of easy key/value pairs the reference_key and the reference_address are the same but
-    for more complicated references the reference_address will be the reference_key plus sub references.
-
-    e.g. a reference_key for a /Font labeled /F1 could be '/Resources' but the reference_address
-    might be '/Resources[/Font][/F1] if the /Font is a directly embedded reference instead of a remote one.
-    """
-    def __init__(
-            self,
-            from_obj: PdfObject,
-            to_obj: IndirectObject,
-            reference_key: str,
-            reference_address: str
-        ) -> None:
-        self.from_obj = from_obj
-        self.to_obj = to_obj
-        self.reference_key = reference_key
-        self.reference_address = reference_address
-        self.from_node: Optional['PdfTreeNode'] = None  # To be filled in later.  TODO: Hacky
-
-    @classmethod
-    def get_references(
-            cls,
-            obj: PdfObject,
-            ref_key: Optional[str] = None,
-            ref_address: Optional[str] = None
-        ) -> List['PdfObjectRelationship']:
-        """
-        Recurse through elements in 'obj' and return list of PdfObjectRelationships containing all IndirectObjects
-        referenced from addresses in 'obj'.
-        """
-        if isinstance(obj, IndirectObject):
-            if ref_key is None or ref_address is None:
-                raise PdfWalkError(f"{obj} is a reference but key or address not provided")
-            else:
-                return [cls(obj, obj, ref_key, ref_address)]
-
-        return_list: List[PdfObjectRelationship] = []
-
-        if isinstance(obj, list):
-            for i, element in enumerate(obj):
-                if not isinstance(element, (IndirectObject, list, dict)):
-                    continue
-
-                _ref_address = f"[{i}]" if ref_address is None else f"{ref_address}[{i}]"
-                return_list += cls.get_references(element, ref_key or i, _ref_address)
-        elif isinstance(obj, dict):
-            for k, v in obj.items():
-                _ref_address = k if ref_address is None else f"{ref_address}[{k}]"
-                return_list += cls.get_references(v, ref_key or k, _ref_address)
-        else:
-            log.debug(f"Adding no references for PdfObject reference '{ref_key}' -> '{obj}'")
-
-        for ref in return_list:
-            ref.from_obj = obj
-
-        return return_list
-
-    def __eq__(self, other: 'PdfObjectRelationship') -> bool:
-        """Note that equality does not check self.from_obj."""
-        if (self.to_obj.idnum != other.to_obj.idnum) or (self.from_node != other.from_node):
-            return False
-
-        for k in ['reference_key', 'reference_address']:
-            if getattr(self, k) != getattr(other, k):
-                return False
-
-        return True
-
-    def __str__(self) -> str:
-        return comma_join([f"{k}: {v}" for k, v in vars(self).items()])
-
-    def description(self) -> str:
-        """Sort of like __str__ but w/out the extra lines"""
-        return f"{self.from_node}: {self.reference_address} to {self.to_obj}"
-
-
-def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation:
-    """Returns a tuple (symlink_text, style) that can be used for pretty printing, tree creation, etc"""
-    reference_key = str(to_node.get_address_for_relationship(from_node))
-    pdf_instruction = reference_key.split('[')[0]  # In case we ended up with a [0] or similar
-
-    if pdf_instruction in DANGEROUS_PDF_KEYS:
-        symlink_style = 'red_alert'
-    else:
-        symlink_style = get_label_style(to_node.label) + ' dim'
-
-    symlink_str = f"{escape(reference_key)} [bright_white]=>[/bright_white]"
-    symlink_str += f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]"
-    return SymlinkRepresentation(symlink_str, symlink_style)
-
-
-def pdf_object_id(pdf_object):
+def pdf_object_id(pdf_object) -> Optional[int]:
     """Return the ID of an IndirectObject and None for everything else"""
     return pdf_object.idnum if isinstance(pdf_object, IndirectObject) else None
 
 
 def does_list_have_any_references(_list) -> bool:
-    """Return true if any element of _list is an IndirectObject"""
+    """Return true if any element of _list is an IndirectObject."""
     return any(isinstance(item, IndirectObject) for item in _list)
 
 
+def node_label(idnum: int, label: str, pdf_object: PdfObject) -> Text:
+    """Colored text representation of a node."""
+    text = Text('<', style='white')
+    text.append(f'{idnum}', style='bright_white')
+    text.append(':', style='white')
+    text.append(label[1:], style=f'{get_label_style(label)} underline bold')
+    text.append('(', style='white')
+    text.append(pypdf_class_name(pdf_object), style=get_node_type_style(pdf_object))
+    text.append(')', style='white')
+    text.append('>')
+    return text
+
+
 def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRelationship]:
     return sorted(refs, key=lambda ref: ref.to_obj.idnum)
diff --git a/pdfalyzer/helpers/string_helper.py b/pdfalyzer/helpers/string_helper.py
@@ -8,12 +8,9 @@
 from PyPDF2.generic import PdfObject
 from yaralyzer.output.rich_console import console_width
 
-
-# Style
 INDENT_DEPTH = 4
 PRETTY_PRINT_WIDTH = 60
 
-
 # Pretty Printer
 pp = PrettyPrinter(
     indent=INDENT_DEPTH,
@@ -52,3 +49,8 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int:
 def count_regex_matches_in_text(regex: Pattern, text: str) -> int:
     """For use when you precompile the regex"""
     return sum(1 for _ in regex.finditer(text))
+
+
+def root_address(_string: str) -> str:
+    """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'."""
+    return _string.split('[')[0]
diff --git a/pdfalyzer/output/pdf_node_rich_table.py b/pdfalyzer/output/pdf_node_rich_table.py
@@ -1,29 +1,67 @@
 """
 Methods to create the rich table view for a PdfTreeNode.
 """
-from typing import List
+from collections import namedtuple
+from typing import List, Optional
 
+from anytree import SymlinkNode
 from PyPDF2.generic import StreamObject
 from rich.markup import escape
+from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
+from rich.tree import Tree
 from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
 from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text
 from yaralyzer.helpers.rich_text_helper import size_text
 from yaralyzer.output.rich_console import BYTES_NO_DIM, YARALYZER_THEME
 from yaralyzer.util.logging import log
 
 from pdfalyzer.helpers.rich_text_helper import PDF_ARRAY, TYPE_STYLES
-from pdfalyzer.helpers.string_helper import pypdf_class_name
+from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address
 from pdfalyzer.output.layout import get_label_style
 from pdfalyzer.util.adobe_strings import *
 
+# For printing SymlinkNodes
+SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])
+
 HEX = 'Hex'
 STREAM = 'Stream'
 STREAM_PREVIEW_LENGTH_IN_TABLE = 500
 PREVIEW_STYLES = {HEX: BYTES_NO_DIM, STREAM: 'bytes'}
 
 
+def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation:
+    """Returns a tuple (symlink_text, style) that can be used for pretty printing, tree creation, etc"""
+    reference_key = str(to_node.get_address_for_relationship(from_node))
+    pdf_instruction = root_address(reference_key)  # In case we ended up with a [0] or similar
+
+    if pdf_instruction in DANGEROUS_PDF_KEYS:
+        symlink_style = 'red_alert'
+    else:
+        symlink_style = get_label_style(to_node.label) + ' dim'
+
+    symlink_str = f"{escape(reference_key)} [bright_white]=>[/bright_white]"
+    symlink_str += f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]"
+    return SymlinkRepresentation(symlink_str, symlink_style)
+
+
+def generate_rich_tree(node: 'PdfTreeNode', tree: Optional[Tree] = None, depth: int = 0) -> Tree:
+    """Recursively build a rich.tree.Tree from 'node'; SymlinkNode children are rendered as Panels."""
+    tree = tree or Tree(build_pdf_node_table(node))
+
+    for child in node.children:
+        if isinstance(child, SymlinkNode):
+            symlink_rep = get_symlink_representation(node, child)
+            tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False))
+            continue
+
+        child_branch = tree.add(build_pdf_node_table(child))
+        generate_rich_tree(child, child_branch)  # NOTE: 'depth' is accepted but never read or forwarded
+
+    return tree
+
+
 def build_pdf_node_table(node: 'PdfTreeNode') -> Table:
     """
     Generate a Rich table representation of this node's PDF object and its properties.