diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c78786..1741b52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,14 @@ # NEXT RELEASE + +# 1.11.0 +* Scan all binaries (not just font binaries) with included PDF related YARA rules * Better warning about stream decode failures * Remove warnings that should not be warnings * Refactor rich table view code to `pdf_node_rich_table.py` +* Refactor `Relationship` and `PdfObjectRef` to single class, `PdfObjectRelationship` ### 1.10.8 -* Fix `importlib.resources` usage in case there's a zip file +* Fix `importlib.resources` usage in case pdfalyzer is packaged as a zip file * `/Names` is an indeterminate reference type * Catch stream decode exceptions and show error instead of failing. diff --git a/pdfalyzer/decorators/pdf_tree_node.py b/pdfalyzer/decorators/pdf_tree_node.py index d9385c0..8b7cd84 100644 --- a/pdfalyzer/decorators/pdf_tree_node.py +++ b/pdfalyzer/decorators/pdf_tree_node.py @@ -6,53 +6,52 @@ methods and not set directly. (TODO: this could be done better with anytree hooks) """ -from collections import namedtuple from typing import List, Optional, Union -from anytree import NodeMixin, SymlinkNode -from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject +from anytree import NodeMixin from PyPDF2.errors import PdfReadError +from PyPDF2.generic import DictionaryObject, IndirectObject, NumberObject, PdfObject, StreamObject from rich.markup import escape -from rich.panel import Panel from rich.text import Text -from rich.tree import Tree from yaralyzer.output.rich_console import console from yaralyzer.util.logging import log -from pdfalyzer.helpers.pdf_object_helper import PdfObjectRelationship, get_symlink_representation -from pdfalyzer.helpers.rich_text_helper import get_type_style, get_type_string_style -from pdfalyzer.helpers.string_helper import pypdf_class_name -from pdfalyzer.output.layout import get_label_style -from pdfalyzer.output.pdf_node_rich_table
import build_pdf_node_table, get_node_type_style +from pdfalyzer.helpers.pdf_object_helper import node_label +from pdfalyzer.helpers.rich_text_helper import get_type_string_style, get_type_style +from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address +from pdfalyzer.pdf_object_relationship import PdfObjectRelationship from pdfalyzer.util.adobe_strings import * from pdfalyzer.util.exceptions import PdfWalkError DEFAULT_MAX_ADDRESS_LENGTH = 90 +DECODE_FAILURE_LEN = -1 class PdfTreeNode(NodeMixin): def __init__(self, obj: PdfObject, address: str, idnum: int): """ - reference_key: PDF instruction string used to reference obj + address: PDF instruction string + modifiers used to reference obj idnum: ID used in the reference """ self.obj = obj self.idnum = idnum self.all_references_processed: bool = False self.other_relationships: List[PdfObjectRelationship] = [] + self.sub_type = None if isinstance(obj, DictionaryObject): self.type = obj.get(TYPE) or address - self.label = obj.get(TYPE) or address self.sub_type = obj.get(SUBTYPE) or obj.get(S) + self.label = obj.get(TYPE) or address # TODO: should we use sub_type? 
if isinstance(self.type, str): - self.type = self.type.split('[')[0] + self.type = root_address(self.type) + elif isinstance(address, int): + self.label = f"{UNLABELED}[{address}]" + self.type = str(address) else: - # TODO: hacky - self.type = address.split('[')[0] if isinstance(address, str) else address self.label = address - self.sub_type = None + self.type = root_address(address) # TODO: this is hacky/temporarily incorrect bc we often don't know the parent when node is being constructed if isinstance(address, int): @@ -60,9 +59,6 @@ def __init__(self, obj: PdfObject, address: str, idnum: int): else: self.known_to_parent_as = address - if isinstance(self.label, int): - self.label = f"{UNLABELED}[{self.label}]" - if isinstance(obj, StreamObject): try: self.stream_data = self.obj.get_data() @@ -71,8 +67,8 @@ def __init__(self, obj: PdfObject, address: str, idnum: int): msg = f"Failed to decode stream in {self}: {e}" console.print_exception() log.warning(msg) - self.stream_data = msg - self.stream_length = -1 + self.stream_data = msg.encode() + self.stream_length = DECODE_FAILURE_LEN else: self.stream_data = None self.stream_length = 0 @@ -245,7 +241,8 @@ def _find_address_of_this_node(self, from_node: 'PdfTreeNode') -> Optional[str]: reference_address = refs_to_this_node[0].reference_address if not all(ref.reference_address in [FIRST, LAST] for ref in refs_to_this_node): - log.warning(f"Multiple refs from {from_node} to {self}: {refs_to_this_node}. Using {reference_address} as address") + msg = f"Multiple refs from {from_node} to {self}: {refs_to_this_node}" + log.warning(msg + f", using {reference_address}") return reference_address @@ -272,31 +269,8 @@ def tree_address(self, max_length: Optional[int] = None) -> str: return '...'
+ address[-max_length:][3:] - def generate_rich_tree(self, tree=None, depth=0) -> Tree: - """Recursively generates a rich.tree.Tree object from this node""" - tree = tree or Tree(build_pdf_node_table(self)) - - for child in self.children: - if isinstance(child, SymlinkNode): - symlink_rep = get_symlink_representation(self, child) - tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False)) - continue - - child_branch = tree.add(build_pdf_node_table(child)) - child.generate_rich_tree(child_branch) - - return tree - def _node_label(self) -> Text: - text = Text('<', style='white') - text.append(f'{self.idnum}', style='bright_white') - text.append(':', style='white') - text.append(self.label[1:], style=f'{get_label_style(self.label)} underline bold') - text.append('(', style='white') - text.append(pypdf_class_name(self.obj), style=get_node_type_style(self.obj)) - text.append(')', style='white') - text.append('>') - return text + return node_label(self.idnum, self.label, self.obj) def _colored_address(self, max_length: Optional[int] = None) -> Text: """Rich text version of tree_address()""" diff --git a/pdfalyzer/font_info.py b/pdfalyzer/font_info.py index 29c2bb4..116fa46 100644 --- a/pdfalyzer/font_info.py +++ b/pdfalyzer/font_info.py @@ -143,13 +143,6 @@ def __init__(self, label, idnum, font, font_descriptor, font_file, obj_with_reso self._char_map = None self.character_mapping = None - def yara_scan(self) -> None: - if self.binary_scanner is None: - log.debug(f"No binary to scan for {self.display_title}") - return - - get_bytes_yaralyzer(self.stream_data, str(self)).yaralyze() - def width_stats(self): if self.widths is None: return {} diff --git a/pdfalyzer/helpers/pdf_object_helper.py b/pdfalyzer/helpers/pdf_object_helper.py index cde90d1..dc15076 100644 --- a/pdfalyzer/helpers/pdf_object_helper.py +++ b/pdfalyzer/helpers/pdf_object_helper.py @@ -1,125 +1,40 @@ """ Some methods to help with the direct manipulation/processing of PyPDF2's PdfObjects """ 
-from collections import namedtuple from typing import List, Optional from PyPDF2.generic import IndirectObject, PdfObject -from rich.markup import escape -from yaralyzer.helpers.string_helper import comma_join -from yaralyzer.util.logging import log +from rich.text import Text +from pdfalyzer.helpers.string_helper import pypdf_class_name from pdfalyzer.output.layout import get_label_style +from pdfalyzer.output.pdf_node_rich_table import get_node_type_style +from pdfalyzer.pdf_object_relationship import PdfObjectRelationship from pdfalyzer.util.adobe_strings import * -from pdfalyzer.util.exceptions import PdfWalkError -# For printing SymlinkNodes -SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style']) - -class PdfObjectRelationship: - """ - Simple container class for information about a link between two PDF objects. - In the case of easy key/value pairs the reference_key and the reference_address are the same but - for more complicated references the reference_address will be the reference_key plus sub references. - - e.g. a reference_key for a /Font labeled /F1 could be '/Resources' but the reference_address - might be '/Resources[/Font][/F1] if the /Font is a directly embedded reference instead of a remote one. - """ - def __init__( - self, - from_obj: PdfObject, - to_obj: IndirectObject, - reference_key: str, - reference_address: str - ) -> None: - self.from_obj = from_obj - self.to_obj = to_obj - self.reference_key = reference_key - self.reference_address = reference_address - self.from_node: Optional['PdfTreeNode'] = None # To be filled in later. TODO: Hacky - - @classmethod - def get_references( - cls, - obj: PdfObject, - ref_key: Optional[str] = None, - ref_address: Optional[str] = None - ) -> List['PdfObjectRelationship']: - """ - Recurse through elements in 'obj' and return list of PdfObjectRelationships containing all IndirectObjects - referenced from addresses in 'obj'. 
- """ - if isinstance(obj, IndirectObject): - if ref_key is None or ref_address is None: - raise PdfWalkError(f"{obj} is a reference but key or address not provided") - else: - return [cls(obj, obj, ref_key, ref_address)] - - return_list: List[PdfObjectRelationship] = [] - - if isinstance(obj, list): - for i, element in enumerate(obj): - if not isinstance(element, (IndirectObject, list, dict)): - continue - - _ref_address = f"[{i}]" if ref_address is None else f"{ref_address}[{i}]" - return_list += cls.get_references(element, ref_key or i, _ref_address) - elif isinstance(obj, dict): - for k, v in obj.items(): - _ref_address = k if ref_address is None else f"{ref_address}[{k}]" - return_list += cls.get_references(v, ref_key or k, _ref_address) - else: - log.debug(f"Adding no references for PdfObject reference '{ref_key}' -> '{obj}'") - - for ref in return_list: - ref.from_obj = obj - - return return_list - - def __eq__(self, other: 'PdfObjectRelationship') -> bool: - """Note that equality does not check self.from_obj.""" - if (self.to_obj.idnum != other.to_obj.idnum) or (self.from_node != other.from_node): - return False - - for k in ['reference_key', 'reference_address']: - if getattr(self, k) != getattr(other, k): - return False - - return True - - def __str__(self) -> str: - return comma_join([f"{k}: {v}" for k, v in vars(self).items()]) - - def description(self) -> str: - """Sort of like __str__ but w/out the extra lines""" - return f"{self.from_node}: {self.reference_address} to {self.to_obj}" - - -def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation: - """Returns a tuple (symlink_text, style) that can be used for pretty printing, tree creation, etc""" - reference_key = str(to_node.get_address_for_relationship(from_node)) - pdf_instruction = reference_key.split('[')[0] # In case we ended up with a [0] or similar - - if pdf_instruction in DANGEROUS_PDF_KEYS: - symlink_style = 'red_alert' - else: - symlink_style = 
get_label_style(to_node.label) + ' dim' - - symlink_str = f"{escape(reference_key)} [bright_white]=>[/bright_white]" - symlink_str += f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]" - return SymlinkRepresentation(symlink_str, symlink_style) - - -def pdf_object_id(pdf_object): +def pdf_object_id(pdf_object) -> Optional[int]: """Return the ID of an IndirectObject and None for everything else""" return pdf_object.idnum if isinstance(pdf_object, IndirectObject) else None def does_list_have_any_references(_list) -> bool: - """Return true if any element of _list is an IndirectObject""" + """Return true if any element of _list is an IndirectObject.""" return any(isinstance(item, IndirectObject) for item in _list) +def node_label(idnum: int, label: str, pdf_object: PdfObject) -> Text: + """Colored text representation of a node.""" + text = Text('<', style='white') + text.append(f'{idnum}', style='bright_white') + text.append(':', style='white') + text.append(label[1:], style=f'{get_label_style(label)} underline bold') + text.append('(', style='white') + text.append(pypdf_class_name(pdf_object), style=get_node_type_style(pdf_object)) + text.append(')', style='white') + text.append('>') + return text + + def _sort_pdf_object_refs(refs: List[PdfObjectRelationship]) -> List[PdfObjectRelationship]: return sorted(refs, key=lambda ref: ref.to_obj.idnum) diff --git a/pdfalyzer/helpers/string_helper.py b/pdfalyzer/helpers/string_helper.py index 4ccf055..eb13065 100644 --- a/pdfalyzer/helpers/string_helper.py +++ b/pdfalyzer/helpers/string_helper.py @@ -8,12 +8,9 @@ from PyPDF2.generic import PdfObject from yaralyzer.output.rich_console import console_width - -# Style INDENT_DEPTH = 4 PRETTY_PRINT_WIDTH = 60 - # Pretty Printer pp = PrettyPrinter( indent=INDENT_DEPTH, @@ -52,3 +49,8 @@ def count_pattern_matches_in_text(pattern: str, text: str) -> int: def count_regex_matches_in_text(regex: Pattern, text: str) -> int: """For use when you precompile the regex""" 
return sum(1 for _ in regex.finditer(text)) + + +def root_address(_string: str) -> str: + """Strip the bracketed part off an address, e.g. '/Root[1]' => '/Root'.""" + return _string.split('[')[0] diff --git a/pdfalyzer/output/pdf_node_rich_table.py b/pdfalyzer/output/pdf_node_rich_table.py index b22a2b5..0e50a4e 100644 --- a/pdfalyzer/output/pdf_node_rich_table.py +++ b/pdfalyzer/output/pdf_node_rich_table.py @@ -1,12 +1,16 @@ """ Methods to create the rich table view for a PdfTreeNode. """ -from typing import List +from collections import namedtuple +from typing import List, Optional +from anytree import SymlinkNode from PyPDF2.generic import StreamObject from rich.markup import escape +from rich.panel import Panel from rich.table import Table from rich.text import Text +from rich.tree import Tree from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text from yaralyzer.helpers.rich_text_helper import size_text @@ -14,16 +18,50 @@ from yaralyzer.util.logging import log from pdfalyzer.helpers.rich_text_helper import PDF_ARRAY, TYPE_STYLES -from pdfalyzer.helpers.string_helper import pypdf_class_name +from pdfalyzer.helpers.string_helper import pypdf_class_name, root_address from pdfalyzer.output.layout import get_label_style from pdfalyzer.util.adobe_strings import * +# For printing SymlinkNodes +SymlinkRepresentation = namedtuple('SymlinkRepresentation', ['text', 'style']) + HEX = 'Hex' STREAM = 'Stream' STREAM_PREVIEW_LENGTH_IN_TABLE = 500 PREVIEW_STYLES = {HEX: BYTES_NO_DIM, STREAM: 'bytes'} +def get_symlink_representation(from_node, to_node) -> SymlinkRepresentation: + """Returns a tuple (symlink_text, style) that can be used for pretty printing, tree creation, etc""" + reference_key = str(to_node.get_address_for_relationship(from_node)) + pdf_instruction = root_address(reference_key) # In case we ended up with a [0] or similar + + if pdf_instruction in DANGEROUS_PDF_KEYS: + 
symlink_style = 'red_alert' + else: + symlink_style = get_label_style(to_node.label) + ' dim' + + symlink_str = f"{escape(reference_key)} [bright_white]=>[/bright_white]" + symlink_str += f" {escape(str(to_node.target))} [grey](Non Child Reference)[/grey]" + return SymlinkRepresentation(symlink_str, symlink_style) + + +def generate_rich_tree(node: 'PdfTreeNode', tree: Optional[Tree] = None, depth: int = 0) -> Tree: + """Recursively generates a rich.tree.Tree object from this node""" + tree = tree or Tree(build_pdf_node_table(node)) + + for child in node.children: + if isinstance(child, SymlinkNode): + symlink_rep = get_symlink_representation(node, child) + tree.add(Panel(symlink_rep.text, style=symlink_rep.style, expand=False)) + continue + + child_branch = tree.add(build_pdf_node_table(child)) + generate_rich_tree(child, child_branch) + + return tree + + def build_pdf_node_table(node: 'PdfTreeNode') -> Table: """ Generate a Rich table representation of this node's PDF object and its properties. diff --git a/pdfalyzer/pdf_object_relationship.py b/pdfalyzer/pdf_object_relationship.py new file mode 100644 index 0000000..eab4b80 --- /dev/null +++ b/pdfalyzer/pdf_object_relationship.py @@ -0,0 +1,89 @@ +""" +Simple container class for information about a link between two PDF objects. +""" +from typing import List, Optional, Union + +from PyPDF2.generic import IndirectObject, PdfObject +from yaralyzer.helpers.string_helper import comma_join +from yaralyzer.util.logging import log + +from pdfalyzer.util.adobe_strings import * +from pdfalyzer.util.exceptions import PdfWalkError + + +class PdfObjectRelationship: + """ + In the case of easy key/value pairs the reference_key and the reference_address are the same but + for more complicated references the reference_address will be the reference_key plus sub references. + e.g. 
a reference_key for a /Font labeled /F1 could be '/Resources' but the reference_address + might be '/Resources[/Font][/F1]' if the /Font is a directly embedded reference instead of a remote one. + """ + def __init__( + self, + from_obj: PdfObject, + to_obj: IndirectObject, + reference_key: str, + reference_address: str + ) -> None: + self.from_obj = from_obj + self.to_obj = to_obj + self.reference_key = reference_key + self.reference_address = reference_address + self.from_node: Optional['PdfTreeNode'] = None # To be filled in later. TODO: Hacky + + @classmethod + def get_references( + cls, + obj: PdfObject, + ref_key: Optional[Union[str, int]] = None, + ref_address: Optional[Union[str, int]] = None + ) -> List['PdfObjectRelationship']: + """ + Recurse through elements in 'obj' and return list of PdfObjectRelationships containing all IndirectObjects + referenced from addresses in 'obj'. + """ + if isinstance(obj, IndirectObject): + if ref_key is None or ref_address is None: + raise PdfWalkError(f"{obj} is a reference but key or address not provided") + else: + return [cls(obj, obj, str(ref_key), str(ref_address))] + + return_list: List[PdfObjectRelationship] = [] + + if isinstance(obj, list): + for i, element in enumerate(obj): + if not isinstance(element, (IndirectObject, list, dict)): + continue + + idx = f"[{i}]" + _ref_address = idx if ref_address is None else f"{ref_address}{idx}" + return_list += cls.get_references(element, ref_key or i, _ref_address) + elif isinstance(obj, dict): + for k, v in obj.items(): + _ref_address = k if ref_address is None else f"{ref_address}[{k}]" + return_list += cls.get_references(v, ref_key or k, _ref_address) + else: + log.debug(f"Adding no references for PdfObject reference '{ref_key}' -> '{obj}'") + + for ref in return_list: + ref.from_obj = obj + + return return_list + + def __eq__(self, other: 'PdfObjectRelationship') -> bool: + """Note that equality does not check self.from_obj.""" + if (self.to_obj.idnum !=
other.to_obj.idnum) or (self.from_node != other.from_node): + return False + + for k in ['reference_key', 'reference_address']: + if getattr(self, k) != getattr(other, k): + return False + + return True + + def __str__(self) -> str: + return comma_join([f"{k}: {v}" for k, v in vars(self).items()]) + + def description(self) -> str: + """Sort of like __str__ but w/out the extra lines""" + return f"{self.from_node}: {self.reference_address} to {self.to_obj}" diff --git a/pdfalyzer/pdfalyzer.py b/pdfalyzer/pdfalyzer.py index c4a565f..2af70fa 100644 --- a/pdfalyzer/pdfalyzer.py +++ b/pdfalyzer/pdfalyzer.py @@ -5,7 +5,7 @@ """ from collections import defaultdict from os.path import basename -from typing import List, Optional +from typing import Dict, List, Optional from anytree import LevelOrderIter, RenderTree, SymlinkNode from anytree.render import DoubleStyle @@ -27,12 +27,14 @@ from pdfalyzer.binary.binary_scanner import BinaryScanner from pdfalyzer.decorators.document_model_printer import print_with_header -from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode +from pdfalyzer.decorators.pdf_tree_node import DECODE_FAILURE_LEN, PdfTreeNode from pdfalyzer.detection.yaralyzer_helper import get_file_yaralyzer -from pdfalyzer.helpers.pdf_object_helper import PdfObjectRelationship, get_symlink_representation -from pdfalyzer.helpers.string_helper import pp from pdfalyzer.font_info import FontInfo +from pdfalyzer.helpers.string_helper import pp +from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer from pdfalyzer.output.layout import print_section_header, print_section_subheader, print_section_sub_subheader +from pdfalyzer.output.pdf_node_rich_table import generate_rich_tree, get_symlink_representation +from pdfalyzer.pdf_object_relationship import PdfObjectRelationship from pdfalyzer.util.adobe_strings import * from pdfalyzer.util.exceptions import PdfWalkError @@ -50,10 +52,10 @@ def __init__(self, pdf_path: str): self.yaralyzer = 
get_file_yaralyzer(pdf_path) # Initialize tracking variables self.indeterminate_ids = set() # See INDETERMINATE_REFERENCES comment - self.traversed_nodes = {} # Nodes we've seen already - self.font_infos = [] # Font summary objects - self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered - self.walk_pdf() # Build the tree + self.traversed_nodes: Dict[int, PdfTreeNode] = {} # Nodes we've seen already + self.font_infos: List[FontInfo] = [] # Font summary objects + self.max_generation = 0 # PDF revisions are "generations"; this is the max generation encountered + self.walk_pdf() # Build the tree def walk_pdf(self): """ @@ -104,7 +106,7 @@ def find_node_by_idnum(self, idnum) -> Optional[PdfTreeNode]: raise PdfWalkError(f"Too many nodes had id {idnum}: {nodes}") def print_everything(self) -> None: - """Print every kind of analysis on offer to Rich console""" + """Print every kind of analysis on offer to Rich console.""" self.print_document_info() self.print_summary() self.print_tree() @@ -113,14 +115,15 @@ def print_everything(self) -> None: self.print_other_relationships() def print_document_info(self) -> None: - """Print the embedded document info (author, timestamps, version, etc)""" + """Print the embedded document info (author, timestamps, version, etc).""" print_section_header(f'Document Info for {self.pdf_basename}') console.print(pp.pformat(self.pdf_reader.getDocumentInfo())) console.line() console.print(bytes_hashes_table(self.pdf_bytes, self.pdf_basename)) console.line() - def print_tree(self): + def print_tree(self) -> None: + """Print the simple view of the PDF tree.""" print_section_header(f'Simple tree view of {self.pdf_basename}') for pre, _fill, node in RenderTree(self.pdf_tree, style=DoubleStyle): @@ -134,8 +137,9 @@ def print_tree(self): console.print("\n\n") def print_rich_table_tree(self) -> None: + """Print the rich view of the PDF tree.""" print_section_header(f'Rich tree view of {self.pdf_basename}') - 
console.print(self.pdf_tree.generate_rich_tree()) + console.print(generate_rich_tree(self.pdf_tree)) self._verify_all_traversed_nodes_are_in_tree() def print_summary(self) -> None: @@ -177,17 +181,21 @@ def print_streams_analysis(self, idnum: Optional[int] = None) -> None: binary_scanner.print_decoding_stats_table() - def print_yara_results(self, font_idnum=None) -> None: + def print_yara_results(self) -> None: print_section_header(f"YARA Scan of PDF rules for '{self.pdf_basename}'") self.yaralyzer.yaralyze() - # TODO: we should really scan all the binary streams not just those in the fonts - for font_info in [fi for fi in self.font_infos if font_idnum is None or font_idnum == fi.idnum]: - font_info.yara_scan() + for node in self.stream_nodes(): + if node.stream_length == DECODE_FAILURE_LEN: + log.warning(f"{node} binary stream could not be extracted") + elif node.stream_length == 0 or node.stream_data is None: + log.debug(f"No binary to scan for {node}") + else: + get_bytes_yaralyzer(node.stream_data, str(node)).yaralyze() def print_other_relationships(self) -> None: """Print the inter-node, non-tree relationships for all nodes in the tree""" - console.print("\n\n") + console.line(2) console.print(Panel(f"Other Relationships", expand=False), style='reverse') for node in LevelOrderIter(self.pdf_tree): @@ -199,6 +207,7 @@ def print_other_relationships(self) -> None: node.print_other_relationships() def stream_objects_table(self) -> Table: + """Build a table of stream objects and their lengths.""" table = Table('Stream Length', 'Node') table.columns[0].justify = 'right' @@ -279,7 +288,6 @@ def _process_reference(self, reference: PdfObjectRelationship) -> List[PdfTreeNo log.debug("Nodes to walk next: " + ', '.join([str(r) for r in references_to_return])) return references_to_return - # TODO: this should probably be in the PdfTreeNode class def _symlink_other_relationships(self) -> None: """Create SymlinkNodes for relationships between PDF objects that are not 
parent/child relationships""" for node in LevelOrderIter(self.pdf_tree): @@ -288,6 +296,7 @@ def _symlink_other_relationships(self) -> None: log.info(f"Symlinking {node}'s {node.other_relationship_count()} other relationships...") + # TODO: this should probably be in the PdfTreeNode class for relationship in node.other_relationships: log.debug(f" Linking {relationship.description()} to {node}") SymlinkNode(node, parent=relationship.from_node) diff --git a/pyproject.toml b/pyproject.toml index d3b7c86..cfd10ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdfalyzer" -version = "1.10.8" +version = "1.11.0" description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more." authors = ["Michel de Cryptadamus "] license = "GPL-3.0-or-later" diff --git a/tests/pdfalyzer/lib/binary/test_binary_scanner.py b/tests/pdfalyzer/lib/binary/test_binary_scanner.py new file mode 100644 index 0000000..e959543 --- /dev/null +++ b/tests/pdfalyzer/lib/binary/test_binary_scanner.py @@ -0,0 +1,23 @@ +def test_quote_extraction_methods(font_info): + _check_matches(font_info.binary_scanner.extract_backtick_quoted_bytes, 163, 52840) + + +def test_front_slash_quoted_bytes_extraction(font_info): + _check_matches(font_info.binary_scanner.extract_front_slash_quoted_bytes, 756, 167814) + + +def test_extract_guillemet(font_info): + _check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 78763) + + +def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None: + quoted_bytes_found = 0 + quoted_sections_found = 0 + + for quoted_bytes, _decoder in match_iterator(): + quoted_bytes_found += quoted_bytes.match_length + quoted_sections_found += 1 + + print(f"sections: {quoted_sections_found}, bytes: {quoted_bytes_found}") + assert quoted_sections_found == expected_matches + assert 
quoted_bytes_found == expected_bytes diff --git a/tests/pdfalyzer/lib/detection/test_encoding_detector.py b/tests/pdfalyzer/lib/detection/test_encoding_detector.py new file mode 100644 index 0000000..53d1146 --- /dev/null +++ b/tests/pdfalyzer/lib/detection/test_encoding_detector.py @@ -0,0 +1,11 @@ +import pytest +from rich.text import Text + + +@pytest.fixture +def hebrew_win_1255(): + return { + 'encoding': 'Windows-1255', + 'language': 'Hebrew', + 'confidence': 0.62538832, + } diff --git a/tests/pdfalyzer/lib/detection/test_javascript_hunter.py b/tests/pdfalyzer/lib/detection/test_javascript_hunter.py new file mode 100644 index 0000000..5f42202 --- /dev/null +++ b/tests/pdfalyzer/lib/detection/test_javascript_hunter.py @@ -0,0 +1,11 @@ +from pdfalyzer.detection.javascript_hunter import JavascriptHunter + +TEST_STRING = 'export then gracefully exit before finally rising to the moon' + + +def test_count_js_keywords_in_text(): + assert JavascriptHunter.count_js_keywords_in_text(TEST_STRING) == 3 + + +def test_js_keyword_matches(): + assert JavascriptHunter.js_keyword_matches(TEST_STRING) == ['export', 'for', 'final'] diff --git a/tests/pdfalyzer/lib/helpers/test_pdf_object_helper.py b/tests/pdfalyzer/lib/helpers/test_pdf_object_helper.py new file mode 100644 index 0000000..03d3129 --- /dev/null +++ b/tests/pdfalyzer/lib/helpers/test_pdf_object_helper.py @@ -0,0 +1,50 @@ +from PyPDF2 import PdfReader +from PyPDF2.generic import IndirectObject + +from pdfalyzer.helpers.pdf_object_helper import _sort_pdf_object_refs +from pdfalyzer.pdf_object_relationship import PdfObjectRelationship +from pdfalyzer.util.adobe_strings import * + +FONT_IDS = [5, 9, 11, 14, 20, 22, 24] +ANNOTS_IDS = [13, 19] + [i for i in range(26, 54)] +EXT_G_STATE_IDS = [7, 8] + + +def test_get_references(analyzing_malicious_documents_pdf_path): + pdf_file = open(analyzing_malicious_documents_pdf_path, 'rb') + pdf_reader = PdfReader(pdf_file) + pdf_obj = IndirectObject(3, 0, pdf_reader) + + 
direct_refs = [ + PdfObjectRelationship(pdf_obj, IndirectObject(2, 0, pdf_reader), PARENT, PARENT), + PdfObjectRelationship(pdf_obj, IndirectObject(4, 0, pdf_reader), CONTENTS, CONTENTS), + ] + + ext_g_state_refs = [ + PdfObjectRelationship( + pdf_obj, + IndirectObject(id, 0, pdf_reader), + RESOURCES, + f"{RESOURCES}[{EXT_G_STATE}][/GS{id}]" + ) + for id in EXT_G_STATE_IDS + ] + + font_refs = [ + PdfObjectRelationship( + pdf_obj, + IndirectObject(id, 0, pdf_reader), + RESOURCES, + f"{RESOURCES}[{FONT}][/F{i + 1}]" + ) + for i, id in enumerate(FONT_IDS) + ] + + annots_refs = [ + PdfObjectRelationship(pdf_obj, IndirectObject(id, 0, pdf_reader), ANNOTS, ANNOTS + f"[{i}]") + for i, id in enumerate(ANNOTS_IDS) + ] + + expected_references = _sort_pdf_object_refs(direct_refs + ext_g_state_refs + font_refs + annots_refs) + assert _sort_pdf_object_refs(PdfObjectRelationship.get_references(pdf_obj.get_object())) == expected_references + pdf_file.close() diff --git a/tests/pdfalyzer/lib/test_pdf_parser_manager.py b/tests/pdfalyzer/lib/test_pdf_parser_manager.py new file mode 100644 index 0000000..108413e --- /dev/null +++ b/tests/pdfalyzer/lib/test_pdf_parser_manager.py @@ -0,0 +1,6 @@ +from pdfalyzer.util.pdf_parser_manager import PdfParserManager + + +def test_pdf_parser_manager(analyzing_malicious_documents_pdf_path): + pdf_parser_manager = PdfParserManager(analyzing_malicious_documents_pdf_path) + assert pdf_parser_manager.object_ids_containing_stream_data == [4, 71, 411, 412, 416, 419, 421, 423, 424, 426] diff --git a/tests/pdfalyzer/lib/test_pdf_walker.py b/tests/pdfalyzer/lib/test_pdf_walker.py new file mode 100644 index 0000000..458ea6f --- /dev/null +++ b/tests/pdfalyzer/lib/test_pdf_walker.py @@ -0,0 +1,24 @@ +import pytest + +from pdfalyzer.util.pdf_parser_manager import PdfParserManager + + +class TestPdfalyzer: + def test_struct_elem_parent(self, analyzing_malicious_documents_pdfalyzer): + struct_elem_node =
analyzing_malicious_documents_pdfalyzer.find_node_by_idnum(120) + assert struct_elem_node.parent.idnum == 119 + + def test_all_nodes_in_tree(self, analyzing_malicious_documents_pdfalyzer, analyzing_malicious_documents_pdf_path): + for object_id in PdfParserManager(analyzing_malicious_documents_pdf_path).object_ids: + if object_id == 71: + # 71 is the ID of the object stream holding many of the /StructElem + continue + elif object_id == 67: + # 67 is an object without any references or data + continue + elif object_id == 426: + # 426 is a Cross-reference stream containing the same info as the trailer + continue + + node = analyzing_malicious_documents_pdfalyzer.find_node_by_idnum(object_id) + assert node is not None, f"Expected {object_id} to appear in tree." diff --git a/tests/pdfalyzer/lib/util/test_dict_helper.py b/tests/pdfalyzer/lib/util/test_dict_helper.py new file mode 100644 index 0000000..43936e0 --- /dev/null +++ b/tests/pdfalyzer/lib/util/test_dict_helper.py @@ -0,0 +1,9 @@ +from pdfalyzer.helpers.dict_helper import get_dict_key_by_value + + +def test_get_dict_key_by_value(): + arr = [1, 2, 3] + hsh = {'a': 1, 'b': b'BYTES', 1: arr} + assert get_dict_key_by_value(hsh, 1) == 'a' + assert get_dict_key_by_value(hsh, b'BYTES') == 'b' + assert get_dict_key_by_value(hsh, arr) == 1 diff --git a/tests/pdfalyzer/lib/util/test_string_utils.py b/tests/pdfalyzer/lib/util/test_string_utils.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_file_export.py b/tests/test_file_export.py index a093aa7..0ea48e1 100644 --- a/tests/test_file_export.py +++ b/tests/test_file_export.py @@ -20,7 +20,7 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir): rendered_files = files_in_dir(tmp_dir) assert len(rendered_files) == 7 file_sizes = sorted([path.getsize(f) for f in rendered_files]) - assert_array_is_close(file_sizes, [3193, 8724, 35908, 79141, 181688, 1465273, 6948612]) + assert_array_is_close(file_sizes, [3193, 8724, 35908, 86645, 
181688, 1465273, 6948612]) for file in rendered_files: remove(file) diff --git a/tests/test_pdfalyzer.py b/tests/test_pdfalyzer.py index 4497e61..a4ecc6d 100644 --- a/tests/test_pdfalyzer.py +++ b/tests/test_pdfalyzer.py @@ -41,7 +41,7 @@ def test_font_scan(adobe_type1_fonts_pdf_path): def test_yara_scan(adobe_type1_fonts_pdf_path): font_scan_output = _run_with_args(adobe_type1_fonts_pdf_path, '-y') - _assert_line_count_within_range(471, font_scan_output) + _assert_line_count_within_range(544, font_scan_output) def _run_with_args(pdf, *args) -> str: