
Commit

--streams optional ID, show more than just first 512 bytes of YARA matches
ashariyar committed Oct 7, 2022
1 parent 631e4d3 commit 927927c
Showing 18 changed files with 135 additions and 139 deletions.
12 changes: 9 additions & 3 deletions CHANGELOG.md
@@ -1,19 +1,25 @@
# NEXT RELEASE

# 1.10.0
* `--streams` arg now takes an optional PDF object ID
* `--fonts` no longer takes an optional PDF object ID
* YARA matches will display more than 512 bytes
* Improved output formatting

# 1.9.0
* Scan all binary streams, not just fonts. Separate `--streams` option is provided. (`--font` option has much less output)
* Display MD5, SHA1, and SHA256 for all binary streams as well as overall file

# 1.8.3
### 1.8.3
* Highlight suspicious instructions in red, BOMs in green
* Reenable guillemet quote matching
* Clearer labeling of binary scan results
* Sync with `yaralyzer` v0.4.0

# 1.8.2
### 1.8.2
* Sync with `yaralyzer` v0.3.3

# 1.8.1
### 1.8.1
* Show defaults and valid values for command line options

# 1.8.0
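As an aside, the per-stream MD5/SHA1/SHA256 digests mentioned under 1.9.0 can be reproduced from Python with the standard `hashlib` module. This is only a sketch: it assumes the `stream_nodes()`, `stream_data`, and `idnum` accessors that appear elsewhere in this commit's README and tree-node diffs, and its output will not necessarily match the CLI's formatting.

```python
import hashlib

from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# Print three digests for each binary stream in the PDF.
for node in pdfalyzer.stream_nodes():
    stream_bytes = node.stream_data or b''
    print(f"Object {node.idnum}:")
    print(f"  MD5:    {hashlib.md5(stream_bytes).hexdigest()}")
    print(f"  SHA1:   {hashlib.sha1(stream_bytes).hexdigest()}")
    print(f"  SHA256: {hashlib.sha256(stream_bytes).hexdigest()}")
```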
16 changes: 8 additions & 8 deletions README.md
@@ -151,27 +151,27 @@ Here's a short intro to how to access these objects:
from pdfalyzer.pdfalyzer import Pdfalyzer

# Load a PDF and parse its nodes into the tree.
walker = Pdfalyzer("/path/to/the/evil.pdf")
actual_pdf_tree = walker.pdf_tree
pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")
actual_pdf_tree = pdfalyzer.pdf_tree

# Find a PDF object by its ID in the PDF
node = walker.find_node_by_idnum(44)
node = pdfalyzer.find_node_by_idnum(44)
pdf_object = node.obj

# Use anytree's findall_by_attr to find nodes with a given property
from anytree.search import findall_by_attr
page_nodes = findall_by_attr(walker.pdf_tree, name='type', value='/Page')
page_nodes = findall_by_attr(pdfalyzer.pdf_tree, name='type', value='/Page')

# Get the fonts
font1 = walker.font_infos[0]
font1 = pdfalyzer.font_infos[0]

# Iterate over backtick quoted strings from a font binary and process them
for backtick_quoted_string in font1.binary_scanner.extract_backtick_quoted_bytes():
process(backtick_quoted_string)

# Try to decode - by force if necessary - everything in the font binary that looks like a quoted string
# or regex (meaning bytes between single quotes, double quotes, front slashes, backticks, or guillemet quotes)
font1.force_decode_all_quoted_bytes()
# Iterate over all stream objects:
for node in pdfalyzer.stream_nodes():
do_stuff(node.stream_data)
```
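Continuing from the snippet above, the `contains_stream()` helper added in this commit could be combined with `find_node_by_idnum()` to dump a single stream to disk. This is a sketch only: the object ID and output filename are arbitrary, and it assumes `stream_data` holds the raw bytes.

```python
# Dump one stream, selected by object ID, to a file (ID 44 is arbitrary).
node = pdfalyzer.find_node_by_idnum(44)

if node is not None and node.contains_stream():
    with open(f"object_{node.idnum}.bin", "wb") as f:
        f.write(node.stream_data)
```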


Binary file not shown.
Binary file modified doc/svgs/rendered_images/pdfalyzer_help.png
6 changes: 3 additions & 3 deletions pdfalyzer/__init__.py
@@ -18,7 +18,7 @@

from pdfalyzer.pdfalyzer import Pdfalyzer
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
from pdfalyzer.util.argument_parser import ALL_FONTS_OPTION, output_sections, parse_arguments
from pdfalyzer.util.argument_parser import ALL_STREAMS, output_sections, parse_arguments


def pdfalyze():
@@ -41,8 +41,8 @@ def get_output_basepath(export_method):
if export_type == 'font_info':
output_basename += '_'

if args.font != ALL_FONTS_OPTION:
output_basename += f"_id{args.font}"
if args.streams != ALL_STREAMS:
output_basename += f"_id{args.streams}"

output_basename += f"_maxdecode{YaralyzerConfig.MAX_DECODE_LENGTH}"

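To illustrate the effect of the change above, here is the shape of the exported filename stem the new `--streams` argument produces. The values are hypothetical; the real `output_basename` prefix is built earlier in `get_output_basepath()`.

```python
# Hypothetical values, just to show the shape of the generated stem.
args_streams = 44          # stand-in for args.streams
max_decode_length = 256    # stand-in for YaralyzerConfig.MAX_DECODE_LENGTH
output_basename = "evil.pdf.streams"

output_basename += f"_id{args_streams}"
output_basename += f"_maxdecode{max_decode_length}"
print(output_basename)  # -> evil.pdf.streams_id44_maxdecode256
```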
39 changes: 24 additions & 15 deletions pdfalyzer/binary/binary_scanner.py
@@ -2,12 +2,12 @@
Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
various character encodings upon it to see what comes out.
"""
import re
from collections import defaultdict
from numbers import Number
from typing import Any, Iterator, Optional, Pattern, Tuple
from typing import Any, Iterator, Optional, Pattern, Tuple, Union

from deprecated import deprecated
from rich.markup import escape
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
@@ -25,42 +25,44 @@
from yaralyzer.util.logging import log

from pdfalyzer.config import PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS
from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG,
generate_subtable, get_label_style, pad_header)
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET,
QUOTE_PATTERNS)
from pdfalyzer.helpers.rich_text_helper import NOT_FOUND_MSG, generate_subtable, get_label_style, pad_header
from pdfalyzer.helpers.string_helper import generate_hyphen_line
from pdfalyzer.output.layout import half_width, print_section_header, print_section_subheader, subheading_width
from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC
from pdfalyzer.output.layout import half_width, print_headline_panel, print_section_sub_subheader
from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC

# For rainbow colors
CHAR_ENCODING_1ST_COLOR_NUMBER = 203


class BinaryScanner:
def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None):
def __init__(self, _bytes: bytes, owner: Union['FontInfo', 'PdfTreeNode'], label: Optional[Text] = None):
"""owner is an optional link back to the object containing this binary"""
self.bytes = _bytes
self.label = label
self.owner = owner

if label is None and owner is not None:
self.label = Text(owner.label, get_label_style(owner.label))
if label is None and isinstance(owner, PdfTreeNode):
self.label = owner.__rich__()

self.stream_length = len(_bytes)
self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())
self.suppression_notice_queue = []

def check_for_dangerous_instructions(self) -> None:
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them"""
print_section_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"{DANGER_HEADER} reverse")
print_section_sub_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"bright_red")

for instruction in DANGEROUS_STRINGS:
yaralyzer = self._pattern_yaralyzer(instruction, REGEX)
yaralyzer.highlight_style = 'bright_red bold'
self.process_yara_matches(yaralyzer, instruction, force=True)

def check_for_boms(self) -> None:
print_section_subheader("Scanning Binary for any BOMs...", style='BOM')
"""Check the binary data for BOMs"""
print_section_sub_subheader("Scanning Binary for any BOMs...", style='BOM')

for bom_bytes, bom_name in BOMS.items():
yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name)
@@ -72,8 +74,13 @@ def force_decode_all_quoted_bytes(self) -> None:
quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE]

for quote_type in quote_types:
if self.owner and self.owner.type == CONTENTS and quote_type in [FRONTSLASH, GUILLEMET]:
msg = f"Not attempting {quote_type} decode for {CONTENTS} node type..."
print_headline_panel(msg, style='dim')
continue

quote_pattern = QUOTE_PATTERNS[quote_type]
print_section_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
print_section_sub_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type)
self.process_yara_matches(yaralyzer, f"{quote_type}_quoted")

@@ -169,8 +176,10 @@ def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool =
self._print_suppression_notices()
self._record_decode_stats(bytes_match, bytes_decoder, pattern)

if self.regex_extraction_stats[pattern].match_count == 0:
console.print(f"{pattern} was not found for {self.label}...", style='dim')
# This check initializes the defaultdict for 'pattern'
if self.regex_extraction_stats[pattern].match_count == 0:
#console.print(f"{pattern} was not found for {escape(self.label.plain)}...", style='dim')
pass

def bytes_after_eexec_statement(self) -> bytes:
"""Get the bytes after the 'eexec' demarcation line (if it appears). See Adobe docs for details."""
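The scan passes touched in this file can also be driven through the public API. A minimal sketch, reusing the `font_infos` / `binary_scanner` attributes shown in the README diff; only the method names come straight from this file.

```python
from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")
scanner = pdfalyzer.font_infos[0].binary_scanner  # BinaryScanner for the first font's stream

# The three scan passes whose headers were reworked in this file:
scanner.check_for_dangerous_instructions()
scanner.check_for_boms()
scanner.force_decode_all_quoted_bytes()
```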
15 changes: 10 additions & 5 deletions pdfalyzer/decorators/pdf_tree_node.py
@@ -24,7 +24,7 @@
from yaralyzer.util.logging import log

from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
from pdfalyzer.helpers.pdf_object_helper import get_references, get_symlink_representation
from pdfalyzer.helpers.pdf_object_helper import PdfObjectRef, get_references, get_symlink_representation
from pdfalyzer.helpers.rich_text_helper import (PDF_ARRAY, TYPE_STYLES, get_label_style,
get_type_style, get_type_string_style)
from pdfalyzer.helpers.string_helper import pypdf_class_name
@@ -134,7 +134,7 @@ def remove_relationship(self, from_node: 'PdfTreeNode') -> None:
log.debug(f"Removing relationship {relationship} from {self}")
self.other_relationships.remove(relationship)

def other_relationship_count(self):
def other_relationship_count(self) -> int:
return len(self.other_relationships)

def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'):
@@ -147,13 +147,18 @@ def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'):
return relationship.reference_key

# TODO: this doesn't include /Parent references
def referenced_by_keys(self) -> list[str]:
def referenced_by_keys(self) -> List[str]:
"""All the PDF instruction strings that referred to this object"""
return [r.reference_key for r in self.other_relationships] + [self.known_to_parent_as]

def references(self):
def references(self) -> List[PdfObjectRef]:
"""Returns all nodes referenced from this node (see PdfObjectRef definition)"""
return get_references(self.obj)

def contains_stream(self) -> bool:
"""Returns True for ContentStream, DecodedStream, and EncodedStream objects"""
return isinstance(self.obj, StreamObject)

def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str:
"""Find the address used in other_node to refer to this node"""
refs_to_this_node = [ref for ref in other_node.references() if ref.pdf_obj.idnum == self.idnum]
@@ -179,7 +184,7 @@ def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str:
# BELOW HERE IS JUST TEXT FORMATTING #
######################################

def print_other_relationships(self):
def print_other_relationships(self) -> None:
"""Print this node's non tree relationships (the ones represented by SymlinkNodes in the tree)"""
console.print(f"Other relationships of {escape(str(self))}")

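The new `contains_stream()` helper also suggests a simple way to collect stream-bearing nodes with anytree's iterators, since the README already treats `pdf_tree` as an anytree structure. A sketch under those assumptions; the `hasattr` guard is there because symlink nodes may not expose the helper.

```python
from anytree import PreOrderIter

from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# contains_stream() is True for ContentStream, DecodedStream, and EncodedStream nodes.
stream_nodes = [
    node for node in PreOrderIter(pdfalyzer.pdf_tree)
    if hasattr(node, 'contains_stream') and node.contains_stream()
]

for node in stream_nodes:
    print(node.idnum, len(node.stream_data or b''))
```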
35 changes: 7 additions & 28 deletions pdfalyzer/detection/constants/binary_regexes.py
Expand Up @@ -6,24 +6,19 @@
import re
from typing import Union

from deprecated import deprecated

from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS
from yaralyzer.encoding_detection.character_encodings import BOMS

DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval']
DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH = ['/F', '/AA']

# Potentially dangerous PDF instructions: Remove the leading '/' and convert to bytes except /F ("URL")
DANGEROUS_BYTES = [instruction[1:].encode() for instruction in DANGEROUS_PDF_KEYS] + [b'/F']
DANGEROUS_JAVASCRIPT_INSTRUCTIONS = [b'eval']
DANGEROUS_INSTRUCTIONS = DANGEROUS_BYTES + DANGEROUS_JAVASCRIPT_INSTRUCTIONS + list(BOMS.keys())

# Yaralyzer
DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS] + ['/F', 'eval']
DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS]
DANGEROUS_STRINGS.extend(DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH)
DANGEROUS_STRINGS.extend(DANGEROUS_JAVASCRIPT_INSTRUCTIONS)

# Quote capture regexes
CAPTURE_BYTES = b'(.+?)'
FRONT_SLASH_BYTE = b"/"
ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"'
ESCAPED_SINGLE_QUOTE_BYTES = b"\\'"

GUILLEMET = 'guillemet'
FRONTSLASH = 'frontslash'
BACKSLASH = 'backslash'
@@ -40,19 +35,3 @@
FRONTSLASH: '/.+/',
GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ANSI so require byte pattern
}


def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None):
"""Regex that captures everything between open and close quote (close_quote defaults to open_quote)"""
return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)


# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest
QUOTE_REGEXES = {
BACKTICK: build_quote_capture_group(b'`'),
GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'),
ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE),
}

18 changes: 8 additions & 10 deletions pdfalyzer/font_info.py
Expand Up @@ -22,7 +22,7 @@
from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer
from pdfalyzer.helpers.rich_text_helper import get_label_style, get_type_style
from pdfalyzer.helpers.string_helper import pp
from pdfalyzer.output.layout import print_section_subheader, subheading_width
from pdfalyzer.output.layout import print_section_subheader, print_headline_panel, subheading_width
from pdfalyzer.util.adobe_strings import (FONT, FONT_DESCRIPTOR, FONT_FILE, FONT_LENGTHS, RESOURCES, SUBTYPE,
TO_UNICODE, TYPE, W, WIDTHS)

@@ -162,20 +162,21 @@ def width_stats(self):

def print_summary(self):
"""Prints a table of info about the font drawn from the various PDF objects. quote_type of None means all."""
self.print_header_panel()
print_section_subheader(str(self), style='font.title')
#console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title')
console.print(self._summary_table())
console.line()
self.print_character_mapping()
self.print_prepared_charmap()
console.line(2)
console.line()

def print_character_mapping(self):
"""Prints the character mapping extracted by PyPDF2._charmap in tidy columns"""
if self.character_mapping is None or len(self.character_mapping) == 0:
log.info(f"No character map found in {self}")
return

header_panel = Panel(f"{CHARMAP_TITLE} for {self.display_title}", style='charmap.title', expand=False)
console.print(Padding(header_panel, CHARMAP_TITLE_PADDING))
print_headline_panel(f"{self} {CHARMAP_TITLE}", style='charmap.title')
charmap_entries = [_format_charmap_entry(k, v) for k, v in self.character_mapping.items()]

charmap_columns = Columns(
@@ -194,8 +195,8 @@ def print_prepared_charmap(self):
log.info(f"No prepared_charmap found in {self}")
return

section_title = f"Adobe PostScript charmap prepared by PyPDF2 for {self.display_title}"
console.print(Padding(Panel(section_title, style='charmap.prepared_title', expand=False), CHARMAP_TITLE_PADDING))
headline = f"{self} Adobe PostScript charmap prepared by PyPDF2"
print_headline_panel(headline, style='charmap.prepared_title')
print_bytes(self.prepared_char_map, style='charmap.prepared')
console.print('')

@@ -213,9 +214,6 @@ def preview_bytes_at_advertised_lengths(self):

print(f"\nfinal bytes back from {self.stream_data.lengths[2]} + 10: {self.stream_data[-10 - -f.lengths[2]:]}")

def print_header_panel(self):
console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title')

def _summary_table(self):
"""Build a Rich Table with important info about the font"""
table = Table('', '', show_header=False)
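The reorganized printing methods can be exercised per font via the `font_infos` list from the README. A minimal sketch, assuming `print_summary()` remains the public entry point:

```python
from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# Print the reworked per-font output (summary table, charmap, prepared charmap).
for font_info in pdfalyzer.font_infos:
    font_info.print_summary()
```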
21 changes: 16 additions & 5 deletions pdfalyzer/output/layout.py
Expand Up @@ -5,6 +5,7 @@

from yaralyzer.output.rich_console import console, console_width

HEADER_PADDING = (1, 1)


def subheading_width() -> int:
@@ -16,14 +16,24 @@ def half_width() -> int:


def print_section_header(headline: str, style: str = '') -> None:
print_section_subheader(headline, f"{style} reverse", True, console_width())
console.line(2)
_print_header_panel(headline, f"{style} reverse", True, console_width(), HEADER_PADDING)
console.line()


def print_section_subheader(headline: str, style: str = '', expand: bool = True, width = None) -> None:
console.line(2)
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width()))
def print_section_subheader(headline: str, style: str = '') -> None:
console.line()
_print_header_panel(headline, style, True, subheading_width(), HEADER_PADDING)


def print_section_sub_subheader(headline: str, style: str = ''):
print_section_subheader(headline, style, False, half_width())
console.line()
_print_header_panel(headline, style, True, half_width())


def print_headline_panel(headline, style: str = ''):
_print_header_panel(headline, style, False, console_width())


def _print_header_panel(headline: str, style: str, expand: bool, width: int, padding: tuple = (0,)) -> None:
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width(), padding=padding))
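For reference, the width cascade introduced here runs from the full console width down to half width. A sketch of how the four helpers might be called; the headline strings and styles are made up for illustration.

```python
from pdfalyzer.output.layout import (print_headline_panel, print_section_header,
                                     print_section_subheader, print_section_sub_subheader)

print_section_header("File Overview", style='bright_white')    # full console width, reversed style
print_section_subheader("Binary Streams", style='cyan')        # subheading width
print_section_sub_subheader("Object 44", style='cyan')         # half width
print_headline_panel("Nothing suspicious found", style='dim')  # unexpanded panel, no extra padding
```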