Skip to content

Commit

Permalink
Scan all binaries (not just font binaries) with included PDF related …
Browse files Browse the repository at this point in the history
…YARA rules, lots of refactoring
  • Loading branch information
ashariyar committed Oct 14, 2022
1 parent 524ceed commit 0173885
Show file tree
Hide file tree
Showing 19 changed files with 342 additions and 184 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# NEXT RELEASE

# 1.11.0
* Scan all binaries (not just font binaries) with included PDF related YARA rules
* Better warning about stream decode failures
* Remove warnings that should not be warnings
* Refactor rich table view code to `pdf_node_rich_table.py`
* Refactor `Relationship` and `PdfObjectRef` to single class, `PdfObjectRelationship`

### 1.10.8
* Fix `importlib.resources` usage in case there's a zip file
* Fix `importlib.resources` usage in case pdfalyzer is packaged as a zip file
* `/Names` is an indeterminate reference type
* Catch stream decode exceptions and show error instead of failing.

Expand Down
66 changes: 20 additions & 46 deletions pdfalyzer/decorators/pdf_tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,63 +6,59 @@
methods and not set directly. (TODO: this could be done better with anytree
hooks)
"""
from collections import namedtuple
from typing import List, Optional, Union

from anytree import NodeMixin, SymlinkNode
from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject
from anytree import NodeMixin
from PyPDF2.errors import PdfReadError
from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject
from rich.markup import escape
from rich.panel import Panel
from rich.text import Text
from rich.tree import Tree
from yaralyzer.output.rich_console import console
from yaralyzer.util.logging import log

from pdfalyzer.helpers.pdf_object_helper import PdfObjectRelationship, get_symlink_representation
from pdfalyzer.helpers.rich_text_helper import get_type_style, get_type_string_style
from pdfalyzer.helpers.string_helper import pypdf_class_name
from pdfalyzer.output.layout import get_label_style
from pdfalyzer.output.pdf_node_rich_table import build_pdf_node_table, get_node_type_style
from pdfalyzer.helpers.pdf_object_helper import node_label
from pdfalyzer.helpers.rich_text_helper import get_type_string_style, get_type_style
from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
from pdfalyzer.util.adobe_strings import *
from pdfalyzer.util.exceptions import PdfWalkError

DEFAULT_MAX_ADDRESS_LENGTH = 90
DECODE_FAILURE_LEN = -1


class PdfTreeNode(NodeMixin):
def __init__(self, obj: PdfObject, address: str, idnum: int):
"""
reference_key: PDF instruction string used to reference obj
address: PDF instruction string + modifiers used to reference obj
idnum: ID used in the reference
"""
self.obj = obj
self.idnum = idnum
self.all_references_processed: bool = False
self.other_relationships: List[PdfObjectRelationship] = []
self.sub_type = None

if isinstance(obj, DictionaryObject):
self.type = obj.get(TYPE) or address
self.label = obj.get(TYPE) or address
self.sub_type = obj.get(SUBTYPE) or obj.get(S)
self.label = obj.get(TYPE) or address # TODO: should we use sub_type?

if isinstance(self.type, str):
self.type = self.type.split('[')[0]
self.type = root_address(self.type)
elif isinstance(address, int):
self.label = f"{UNLABELED}[{address}]"
self.type = str(address)
else:
# TODO: hacky
self.type = address.split('[')[0] if isinstance(address, str) else address
self.label = address
self.sub_type = None
self.type = root_address(address)

# TODO: this is hacky/temporarily incorrect bc we often don't know the parent when node is being constructed
if isinstance(address, int):
self.known_to_parent_as = f"[{address}]"
else:
self.known_to_parent_as = address

if isinstance(self.label, int):
self.label = f"{UNLABELED}[{self.label}]"

if isinstance(obj, StreamObject):
try:
self.stream_data = self.obj.get_data()
Expand All @@ -71,8 +67,8 @@ def __init__(self, obj: PdfObject, address: str, idnum: int):
msg = f"Failed to decode stream in {self}: {e}"
console.print_exception()
log.warning(msg)
self.stream_data = msg
self.stream_length = -1
self.stream_data = msg.encode()
self.stream_length = DECODE_FAILURE_LEN
else:
self.stream_data = None
self.stream_length = 0
Expand Down Expand Up @@ -245,7 +241,8 @@ def _find_address_of_this_node(self, from_node: 'PdfTreeNode') -> Optional[str]:
reference_address = refs_to_this_node[0].reference_address

if not all(ref.reference_address in [FIRST, LAST] for ref in refs_to_this_node):
log.warning(f"Multiple refs from {from_node} to {self}: {refs_to_this_node}. Using {reference_address} as address")
# Both pieces must be f-strings: without the prefix the log would contain the literal
# text "{reference_address}" instead of the address that was selected above.
msg = f"Multiple refs from {from_node} to {self}: {refs_to_this_node}"
log.warning(f"{msg}, using {reference_address}")

return reference_address

Expand All @@ -272,31 +269,8 @@ def tree_address(self, max_length: Optional[int] = None) -> str:

return '...' + address[-max_length:][3:]

def generate_rich_tree(self, tree=None, depth=0) -> Tree:
    """
    Recursively build a rich.tree.Tree rooted at this node.

    tree: branch to attach children to; when falsy a new Tree is created from this node's table.
    depth: accepted for interface compatibility; not read by this method.
    """
    if not tree:
        tree = Tree(build_pdf_node_table(self))

    for child_node in self.children:
        if not isinstance(child_node, SymlinkNode):
            # Regular child: add its table as a new branch and descend into it.
            branch = tree.add(build_pdf_node_table(child_node))
            child_node.generate_rich_tree(branch)
            continue

        # Symlinks are rendered as a styled panel and are not recursed into.
        link = get_symlink_representation(self, child_node)
        tree.add(Panel(link.text, style=link.style, expand=False))

    return tree

def _node_label(self) -> Text:
text = Text('<', style='white')
text.append(f'{self.idnum}', style='bright_white')
text.append(':', style='white')
text.append(self.label[1:], style=f'{get_label_style(self.label)} underline bold')
text.append('(', style='white')
text.append(pypdf_class_name(self.obj), style=get_node_type_style(self.obj))
text.append(')', style='white')
text.append('>')
return text
return node_label(self.idnum, self.label, self.obj)

def _colored_address(self, max_length: Optional[int] = None) -> Text:
"""Rich text version of tree_address()"""
Expand Down
7 changes: 0 additions & 7 deletions pdfalyzer/font_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,6 @@ def __init__(self, label, idnum, font, font_descriptor, font_file, obj_with_reso
self._char_map = None
self.character_mapping = None

def yara_scan(self) -> None:
    """Run a YARA scan over this font's stream data; only log a debug message if there is no binary scanner."""
    if self.binary_scanner is not None:
        get_bytes_yaralyzer(self.stream_data, str(self)).yaralyze()
        return

    log.debug(f"No binary to scan for {self.display_title}")

def width_stats(self):
if self.widths is None:
return {}
Expand Down
123 changes: 19 additions & 104 deletions pdfalyzer/helpers/pdf_object_helper.py
Original file line number Diff line number Diff line change
@@ -1,125 +1,40 @@
"""
Some methods to help with the direct manipulation/processing of PyPDF2's PdfObjects
"""
from collections import namedtuple
from typing import List, Optional

from PyPDF2.generic import IndirectObject, PdfObject
from rich.markup import escape
from yaralyzer.helpers.string_helper import comma_join
from yaralyzer.util.logging import log
from rich.text import Text

from pdfalyzer.helpers.string_helper import pypdf_class_name
from pdfalyzer.output.layout import get_label_style
from pdfalyzer.output.pdf_node_rich_table import get_node_type_style
from pdfalyzer.pdf_object_relationship import PdfObjectRelationship
from pdfalyzer.util.adobe_strings import *
from pdfalyzer.util.exceptions import PdfWalkError

# For printing SymlinkNodes
SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])


class PdfObjectRelationship:
    """
    Simple container class for information about a link between two PDF objects.
    In the case of easy key/value pairs the reference_key and the reference_address are the same but
    for more complicated references the reference_address will be the reference_key plus sub references.
    e.g. a reference_key for a /Font labeled /F1 could be '/Resources' but the reference_address
    might be '/Resources[/Font][/F1]' if the /Font is a directly embedded reference instead of a remote one.

    Instances define __eq__ without __hash__, so they are not hashable.
    """
    def __init__(
            self,
            from_obj: PdfObject,
            to_obj: IndirectObject,
            reference_key: str,
            reference_address: str
        ) -> None:
        """
        from_obj: object containing the reference
        to_obj: the IndirectObject being referenced
        reference_key: root key under which the reference was found in from_obj
        reference_address: reference_key plus any nested keys/indexes leading to to_obj
        """
        self.from_obj = from_obj
        self.to_obj = to_obj
        self.reference_key = reference_key
        self.reference_address = reference_address
        self.from_node: Optional['PdfTreeNode'] = None  # To be filled in later. TODO: Hacky

    @classmethod
    def get_references(
            cls,
            obj: PdfObject,
            ref_key: Optional[str] = None,
            ref_address: Optional[str] = None
        ) -> List['PdfObjectRelationship']:
        """
        Recurse through elements in 'obj' and return list of PdfObjectRelationships containing all IndirectObjects
        referenced from addresses in 'obj'.

        obj: the object (reference, list, dict, or anything else) to search
        ref_key: root key of the search; None on the outermost call
        ref_address: address accumulated so far; None on the outermost call

        Raises PdfWalkError if 'obj' is itself an IndirectObject and ref_key or ref_address is missing.
        """
        if isinstance(obj, IndirectObject):
            if ref_key is None or ref_address is None:
                raise PdfWalkError(f"{obj} is a reference but key or address not provided")

            return [cls(obj, obj, ref_key, ref_address)]

        return_list: List[PdfObjectRelationship] = []

        if isinstance(obj, list):
            for i, element in enumerate(obj):
                if not isinstance(element, (IndirectObject, list, dict)):
                    continue

                _ref_address = f"[{i}]" if ref_address is None else f"{ref_address}[{i}]"
                # Compare against None rather than truthiness: a root key of 0 (first list
                # index) is valid and must not be replaced by a nested index/key.
                _ref_key = i if ref_key is None else ref_key
                return_list += cls.get_references(element, _ref_key, _ref_address)
        elif isinstance(obj, dict):
            for k, v in obj.items():
                _ref_address = k if ref_address is None else f"{ref_address}[{k}]"
                _ref_key = k if ref_key is None else ref_key
                return_list += cls.get_references(v, _ref_key, _ref_address)
        else:
            log.debug(f"Adding no references for PdfObject reference '{ref_key}' -> '{obj}'")

        # Every relationship found below this level is attributed to this level's object,
        # so after the recursion unwinds from_obj is the outermost 'obj'.
        for ref in return_list:
            ref.from_obj = obj

        return return_list

    def __eq__(self, other: object) -> bool:
        """Note that equality does not check self.from_obj."""
        if not isinstance(other, PdfObjectRelationship):
            # Let Python fall back to the other operand / identity instead of raising AttributeError.
            return NotImplemented

        return (
            self.to_obj.idnum == other.to_obj.idnum
            and self.from_node == other.from_node
            and self.reference_key == other.reference_key
            and self.reference_address == other.reference_address
        )

    def __str__(self) -> str:
        return comma_join([f"{k}: {v}" for k, v in vars(self).items()])

    def description(self) -> str:
        """Sort of like __str__ but w/out the extra lines"""
        return f"{self.from_node}: {self.reference_address} to {self.to_obj}"


def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation:
    """Build the (text, style) pair used to pretty print a symlink from from_node to to_node."""
    address = str(to_node.get_address_for_relationship(from_node))
    # Keep only the part before the first '[' in case we ended up with a [0] or similar
    instruction, _, _ = address.partition('[')

    style = 'red_alert' if instruction in DANGEROUS_PDF_KEYS else get_label_style(to_node.label) + ' dim'

    markup = (
        f"{escape(address)} [bright_white]=>[/bright_white]"
        f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]"
    )
    return SymlinkRepresentation(markup, style)


def pdf_object_id(pdf_object) -> Optional[int]:
    """Return the idnum of pdf_object if it is an IndirectObject, otherwise None."""
    if isinstance(pdf_object, IndirectObject):
        return pdf_object.idnum

    return None


def does_list_have_any_references(_list) -> bool:
    """Return True as soon as an IndirectObject is found in _list, False if there is none."""
    for element in _list:
        if isinstance(element, IndirectObject):
            return True

    return False


def node_label(idnum: int, label: str, pdf_object: PdfObject) -> Text:
    """
    Colored text representation of a node, laid out as '<idnum:label(class name)>'.
    The first character of label is dropped (presumably the leading '/' of a PDF name - TODO confirm).
    """
    segments = [
        (f'{idnum}', 'bright_white'),
        (':', 'white'),
        (label[1:], f'{get_label_style(label)} underline bold'),
        ('(', 'white'),
        (pypdf_class_name(pdf_object), get_node_type_style(pdf_object)),
        (')', 'white'),
        ('>', None),  # closing bracket is appended without an explicit style
    ]

    text = Text('<', style='white')

    for segment, segment_style in segments:
        text.append(segment, style=segment_style)

    return text


def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRelationship]:
    """Return a new list holding refs ordered by the idnum of the object each one points to."""
    ordered = list(refs)
    ordered.sort(key=lambda relationship: relationship.to_obj.idnum)
    return ordered
8 changes: 5 additions & 3 deletions pdfalyzer/helpers/string_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,9 @@
from PyPDF2.generic import PdfObject
from yaralyzer.output.rich_console import console_width


# Style
INDENT_DEPTH = 4
PRETTY_PRINT_WIDTH = 60


# Pretty Printer
pp = PrettyPrinter(
indent=INDENT_DEPTH,
Expand Down Expand Up @@ -52,3 +49,8 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int:
def count_regex_matches_in_text(regex: Pattern, text: str) -> int:
    """Count the non-overlapping matches of an already compiled regex in text."""
    total = 0

    for _match in regex.finditer(text):
        total += 1

    return total


def root_address(_string: str) -> str:
    """Return everything before the first '[' of an address, e.g. '/Root[1]' => '/Root'."""
    root, _bracket, _remainder = _string.partition('[')
    return root
42 changes: 40 additions & 2 deletions pdfalyzer/output/pdf_node_rich_table.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,67 @@
"""
Methods to create the rich table view for a PdfTreeNode.
"""
from typing import List
from collections import namedtuple
from typing import List, Optional

from anytree import SymlinkNode
from PyPDF2.generic import StreamObject
from rich.markup import escape
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
from rich.tree import Tree
from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text
from yaralyzer.helpers.rich_text_helper import size_text
from yaralyzer.output.rich_console import BYTES_NO_DIM, YARALYZER_THEME
from yaralyzer.util.logging import log

from pdfalyzer.helpers.rich_text_helper import PDF_ARRAY, TYPE_STYLES
from pdfalyzer.helpers.string_helper import pypdf_class_name
from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address
from pdfalyzer.output.layout import get_label_style
from pdfalyzer.util.adobe_strings import *

# For printing SymlinkNodes
SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style'])

HEX = 'Hex'
STREAM = 'Stream'
STREAM_PREVIEW_LENGTH_IN_TABLE = 500
PREVIEW_STYLES = {HEX: BYTES_NO_DIM, STREAM: 'bytes'}


def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation:
    """Return a SymlinkRepresentation (text, style) for pretty printing the link from from_node to to_node."""
    reference_key = str(to_node.get_address_for_relationship(from_node))

    # The bare instruction is what gets checked, in case we ended up with a [0] or similar
    if root_address(reference_key) in DANGEROUS_PDF_KEYS:
        symlink_style = 'red_alert'
    else:
        symlink_style = get_label_style(to_node.label) + ' dim'

    pieces = [
        f"{escape(reference_key)} [bright_white]=>[/bright_white]",
        f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]",
    ]
    return SymlinkRepresentation(''.join(pieces), symlink_style)


def generate_rich_tree(node: 'PdfTreeNode', tree: Optional[Tree] = None, depth: int = 0) -> Tree:
    """
    Recursively generate a rich.tree.Tree object from node.

    tree: branch to attach node's children to; when falsy a new Tree is created from node's table.
    depth: accepted but not read by this function.
    """
    if not tree:
        tree = Tree(build_pdf_node_table(node))

    for child in node.children:
        if isinstance(child, SymlinkNode):
            # Symlinks become a styled panel; they are not descended into.
            link = get_symlink_representation(node, child)
            panel = Panel(link.text, style=link.style, expand=False)
            tree.add(panel)
        else:
            generate_rich_tree(child, tree.add(build_pdf_node_table(child)))

    return tree


def build_pdf_node_table(node: 'PdfTreeNode') -> Table:
"""
Generate a Rich table representation of this node's PDF object and its properties.
Expand Down
Loading

0 comments on commit 0173885

Please sign in to comment.