
Commit

--streams optional ID, show more than just first 512 bytes of YARA matches
ashariyar committed Oct 7, 2022
1 parent 631e4d3 commit 927927c
Showing 18 changed files with 135 additions and 139 deletions.
12 changes: 9 additions & 3 deletions CHANGELOG.md
@@ -1,19 +1,25 @@
# NEXT RELEASE

# 1.10.0
* `--streams` arg now takes an optional PDF object ID
* `--fonts` no longer takes an optional PDF object ID
* YARA matches will display more than 512 bytes
* Improved output formatting

# 1.9.0
* Scan all binary streams, not just fonts. Separate `--streams` option is provided. (`--font` option has much less output)
* Display MD5, SHA1, and SHA256 for all binary streams as well as overall file

# 1.8.3
### 1.8.3
* Highlight suspicious instructions in red, BOMs in green
* Reenable guillemet quote matching
* Clearer labeling of binary scan results
* Sync with `yaralyzer` v0.4.0

# 1.8.2
### 1.8.2
* Sync with `yaralyzer` v0.3.3

# 1.8.1
### 1.8.1
* Show defaults and valid values for command line options

# 1.8.0
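As an aside, the per-stream MD5/SHA1/SHA256 digests mentioned under 1.9.0 can be reproduced from Python with the standard `hashlib` module. This is only a sketch: it assumes the `stream_nodes()`, `stream_data`, and `idnum` accessors that appear elsewhere in this commit's README and tree-node diffs, and its output will not necessarily match the CLI's formatting.

```python
import hashlib

from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# Print three digests for each binary stream in the PDF.
for node in pdfalyzer.stream_nodes():
    stream_bytes = node.stream_data or b''
    print(f"Object {node.idnum}:")
    print(f"  MD5:    {hashlib.md5(stream_bytes).hexdigest()}")
    print(f"  SHA1:   {hashlib.sha1(stream_bytes).hexdigest()}")
    print(f"  SHA256: {hashlib.sha256(stream_bytes).hexdigest()}")
```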
16 changes: 8 additions & 8 deletions README.md
@@ -151,27 +151,27 @@ Here's a short intro to how to access these objects:
from pdfalyzer.pdfalyzer import Pdfalyzer

# Load a PDF and parse its nodes into the tree.
walker = Pdfalyzer("/path/to/the/evil.pdf")
actual_pdf_tree = walker.pdf_tree
pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")
actual_pdf_tree = pdfalyzer.pdf_tree

# Find a PDF object by its ID in the PDF
node = walker.find_node_by_idnum(44)
node = pdfalyzer.find_node_by_idnum(44)
pdf_object = node.obj

# Use anytree's findall_by_attr to find nodes with a given property
from anytree.search import findall_by_attr
page_nodes = findall_by_attr(walker.pdf_tree, name='type', value='/Page')
page_nodes = findall_by_attr(pdfalyzer.pdf_tree, name='type', value='/Page')

# Get the fonts
font1 = walker.font_infos[0]
font1 = pdfalyzer.font_infos[0]

# Iterate over backtick quoted strings from a font binary and process them
for backtick_quoted_string in font1.binary_scanner.extract_backtick_quoted_bytes():
process(backtick_quoted_string)

# Try to decode - by force if necessary - everything in the font binary that looks like a quoted string
# or regex (meaning bytes between single quotes, double quotes, front slashes, backticks, or guillemet quotes)
font1.force_decode_all_quoted_bytes()
# Iterate over all stream objects:
for node in pdfalyzer.stream_nodes():
do_stuff(node.stream_data)
```
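Continuing from the snippet above, the `contains_stream()` helper added in this commit could be combined with `find_node_by_idnum()` to dump a single stream to disk. This is a sketch only: the object ID and output filename are arbitrary, and it assumes `stream_data` holds the raw bytes.

```python
# Dump one stream, selected by object ID, to a file (ID 44 is arbitrary).
node = pdfalyzer.find_node_by_idnum(44)

if node is not None and node.contains_stream():
    with open(f"object_{node.idnum}.bin", "wb") as f:
        f.write(node.stream_data)
```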


Binary file not shown.
Binary file modified doc/svgs/rendered_images/pdfalyzer_help.png
6 changes: 3 additions & 3 deletions pdfalyzer/__init__.py
@@ -18,7 +18,7 @@

from pdfalyzer.pdfalyzer import Pdfalyzer
from pdfalyzer.util.pdf_parser_manager import PdfParserManager
from pdfalyzer.util.argument_parser import ALL_FONTS_OPTION, output_sections, parse_arguments
from pdfalyzer.util.argument_parser import ALL_STREAMS, output_sections, parse_arguments


def pdfalyze():
@@ -41,8 +41,8 @@ def get_output_basepath(export_method):
if export_type == 'font_info':
output_basename += '_'

if args.font != ALL_FONTS_OPTION:
output_basename += f"_id{args.font}"
if args.streams != ALL_STREAMS:
output_basename += f"_id{args.streams}"

output_basename += f"_maxdecode{YaralyzerConfig.MAX_DECODE_LENGTH}"

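To illustrate the effect of the change above, here is the shape of the exported filename stem the new `--streams` argument produces. The values are hypothetical; the real `output_basename` prefix is built earlier in `get_output_basepath()`.

```python
# Hypothetical values, just to show the shape of the generated stem.
args_streams = 44          # stand-in for args.streams
max_decode_length = 256    # stand-in for YaralyzerConfig.MAX_DECODE_LENGTH
output_basename = "evil.pdf.streams"

output_basename += f"_id{args_streams}"
output_basename += f"_maxdecode{max_decode_length}"
print(output_basename)  # -> evil.pdf.streams_id44_maxdecode256
```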
39 changes: 24 additions & 15 deletions pdfalyzer/binary/binary_scanner.py
@@ -2,12 +2,12 @@
Class for handling binary data - scanning through it for various suspicious patterns as well as forcing
various character encodings upon it to see what comes out.
"""
import re
from collections import defaultdict
from numbers import Number
from typing import Any, Iterator, Optional, Pattern, Tuple
from typing import Any, Iterator, Optional, Pattern, Tuple, Union

from deprecated import deprecated
from rich.markup import escape
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
@@ -25,42 +25,44 @@
from yaralyzer.util.logging import log

from pdfalyzer.config import PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS
from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG,
generate_subtable, get_label_style, pad_header)
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET,
QUOTE_PATTERNS)
from pdfalyzer.helpers.rich_text_helper import NOT_FOUND_MSG, generate_subtable, get_label_style, pad_header
from pdfalyzer.helpers.string_helper import generate_hyphen_line
from pdfalyzer.output.layout import half_width, print_section_header, print_section_subheader, subheading_width
from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC
from pdfalyzer.output.layout import half_width, print_headline_panel, print_section_sub_subheader
from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC

# For rainbow colors
CHAR_ENCODING_1ST_COLOR_NUMBER = 203


class BinaryScanner:
def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None):
def __init__(self, _bytes: bytes, owner: Union['FontInfo', 'PdfTreeNode'], label: Optional[Text] = None):
"""owner is an optional link back to the object containing this binary"""
self.bytes = _bytes
self.label = label
self.owner = owner

if label is None and owner is not None:
self.label = Text(owner.label, get_label_style(owner.label))
if label is None and isinstance(owner, PdfTreeNode):
self.label = owner.__rich__()

self.stream_length = len(_bytes)
self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics())
self.suppression_notice_queue = []

def check_for_dangerous_instructions(self) -> None:
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them"""
print_section_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"{DANGER_HEADER} reverse")
print_section_sub_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"bright_red")

for instruction in DANGEROUS_STRINGS:
yaralyzer = self._pattern_yaralyzer(instruction, REGEX)
yaralyzer.highlight_style = 'bright_red bold'
self.process_yara_matches(yaralyzer, instruction, force=True)

def check_for_boms(self) -> None:
print_section_subheader("Scanning Binary for any BOMs...", style='BOM')
"""Check the binary data for BOMs"""
print_section_sub_subheader("Scanning Binary for any BOMs...", style='BOM')

for bom_bytes, bom_name in BOMS.items():
yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name)
@@ -72,8 +74,13 @@ def force_decode_all_quoted_bytes(self) -> None:
quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE]

for quote_type in quote_types:
if self.owner and self.owner.type == CONTENTS and quote_type in [FRONTSLASH, GUILLEMET]:
msg = f"Not attempting {quote_type} decode for {CONTENTS} node type..."
print_headline_panel(msg, style='dim')
continue

quote_pattern = QUOTE_PATTERNS[quote_type]
print_section_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
print_section_sub_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type)
self.process_yara_matches(yaralyzer, f"{quote_type}_quoted")

@@ -169,8 +176,10 @@ def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool =
self._print_suppression_notices()
self._record_decode_stats(bytes_match, bytes_decoder, pattern)

if self.regex_extraction_stats[pattern].match_count == 0:
console.print(f"{pattern} was not found for {self.label}...", style='dim')
# This check initializes the defaultdict for 'pattern'
if self.regex_extraction_stats[pattern].match_count == 0:
#console.print(f"{pattern} was not found for {escape(self.label.plain)}...", style='dim')
pass

def bytes_after_eexec_statement(self) -> bytes:
"""Get the bytes after the 'eexec' demarcation line (if it appears). See Adobe docs for details."""
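The scan passes touched in this file can also be driven through the public API. A minimal sketch, reusing the `font_infos` / `binary_scanner` attributes shown in the README diff; only the method names come straight from this file.

```python
from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")
scanner = pdfalyzer.font_infos[0].binary_scanner  # BinaryScanner for the first font's stream

# The three scan passes whose headers were reworked in this file:
scanner.check_for_dangerous_instructions()
scanner.check_for_boms()
scanner.force_decode_all_quoted_bytes()
```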
15 changes: 10 additions & 5 deletions pdfalyzer/decorators/pdf_tree_node.py
@@ -24,7 +24,7 @@
from yaralyzer.util.logging import log

from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
from pdfalyzer.helpers.pdf_object_helper import get_references, get_symlink_representation
from pdfalyzer.helpers.pdf_object_helper import PdfObjectRef, get_references, get_symlink_representation
from pdfalyzer.helpers.rich_text_helper import (PDF_ARRAY, TYPE_STYLES, get_label_style,
get_type_style, get_type_string_style)
from pdfalyzer.helpers.string_helper import pypdf_class_name
@@ -134,7 +134,7 @@ def remove_relationship(self, from_node: 'PdfTreeNode') -> None:
log.debug(f"Removing relationship {relationship} from {self}")
self.other_relationships.remove(relationship)

def other_relationship_count(self):
def other_relationship_count(self) -> int:
return len(self.other_relationships)

def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'):
@@ -147,13 +147,18 @@ def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'):
return relationship.reference_key

# TODO: this doesn't include /Parent references
def referenced_by_keys(self) -> list[str]:
def referenced_by_keys(self) -> List[str]:
"""All the PDF instruction strings that referred to this object"""
return [r.reference_key for r in self.other_relationships] + [self.known_to_parent_as]

def references(self):
def references(self) -> List[PdfObjectRef]:
"""Returns all nodes referenced from this node (see PdfObjectRef definition)"""
return get_references(self.obj)

def contains_stream(self) -> bool:
"""Returns True for ContentStream, DecodedStream, and EncodedStream objects"""
return isinstance(self.obj, StreamObject)

def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str:
"""Find the address used in other_node to refer to this node"""
refs_to_this_node = [ref for ref in other_node.references() if ref.pdf_obj.idnum == self.idnum]
@@ -179,7 +184,7 @@ def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str:
# BELOW HERE IS JUST TEXT FORMATTING #
######################################

def print_other_relationships(self):
def print_other_relationships(self) -> None:
"""Print this node's non tree relationships (the ones represented by SymlinkNodes in the tree)"""
console.print(f"Other relationships of {escape(str(self))}")

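The new `contains_stream()` helper also suggests a simple way to collect stream-bearing nodes with anytree's iterators, since the README already treats `pdf_tree` as an anytree structure. A sketch under those assumptions; the `hasattr` guard is there because symlink nodes may not expose the helper.

```python
from anytree import PreOrderIter

from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# contains_stream() is True for ContentStream, DecodedStream, and EncodedStream nodes.
stream_nodes = [
    node for node in PreOrderIter(pdfalyzer.pdf_tree)
    if hasattr(node, 'contains_stream') and node.contains_stream()
]

for node in stream_nodes:
    print(node.idnum, len(node.stream_data or b''))
```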
35 changes: 7 additions & 28 deletions pdfalyzer/detection/constants/binary_regexes.py
Expand Up @@ -6,24 +6,19 @@
import re
from typing import Union

from deprecated import deprecated

from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS
from yaralyzer.encoding_detection.character_encodings import BOMS

DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval']
DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH = ['/F', '/AA']

# Potentially dangerous PDF instructions: Remove the leading '/' and convert to bytes except /F ("URL")
DANGEROUS_BYTES = [instruction[1:].encode() for instruction in DANGEROUS_PDF_KEYS] + [b'/F']
DANGEROUS_JAVASCRIPT_INSTRUCTIONS = [b'eval']
DANGEROUS_INSTRUCTIONS = DANGEROUS_BYTES + DANGEROUS_JAVASCRIPT_INSTRUCTIONS + list(BOMS.keys())

# Yaralyzer
DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS] + ['/F', 'eval']
DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS]
DANGEROUS_STRINGS.extend(DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH)
DANGEROUS_STRINGS.extend(DANGEROUS_JAVASCRIPT_INSTRUCTIONS)

# Quote capture regexes
CAPTURE_BYTES = b'(.+?)'
FRONT_SLASH_BYTE = b"/"
ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"'
ESCAPED_SINGLE_QUOTE_BYTES = b"\\'"

GUILLEMET = 'guillemet'
FRONTSLASH = 'frontslash'
BACKSLASH = 'backslash'
@@ -40,19 +35,3 @@
FRONTSLASH: '/.+/',
GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ANSI so require byte pattern
}


def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None):
"""Regex that captures everything between open and close quote (close_quote defaults to open_quote)"""
return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)


# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest
QUOTE_REGEXES = {
BACKTICK: build_quote_capture_group(b'`'),
GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'),
ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE),
}

18 changes: 8 additions & 10 deletions pdfalyzer/font_info.py
Expand Up @@ -22,7 +22,7 @@
from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer
from pdfalyzer.helpers.rich_text_helper import get_label_style, get_type_style
from pdfalyzer.helpers.string_helper import pp
from pdfalyzer.output.layout import print_section_subheader, subheading_width
from pdfalyzer.output.layout import print_section_subheader, print_headline_panel, subheading_width
from pdfalyzer.util.adobe_strings import (FONT, FONT_DESCRIPTOR, FONT_FILE, FONT_LENGTHS, RESOURCES, SUBTYPE,
TO_UNICODE, TYPE, W, WIDTHS)

@@ -162,20 +162,21 @@ def width_stats(self):

def print_summary(self):
"""Prints a table of info about the font drawn from the various PDF objects. quote_type of None means all."""
self.print_header_panel()
print_section_subheader(str(self), style='font.title')
#console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title')
console.print(self._summary_table())
console.line()
self.print_character_mapping()
self.print_prepared_charmap()
console.line(2)
console.line()

def print_character_mapping(self):
"""Prints the character mapping extracted by PyPDF2._charmap in tidy columns"""
if self.character_mapping is None or len(self.character_mapping) == 0:
log.info(f"No character map found in {self}")
return

header_panel = Panel(f"{CHARMAP_TITLE} for {self.display_title}", style='charmap.title', expand=False)
console.print(Padding(header_panel, CHARMAP_TITLE_PADDING))
print_headline_panel(f"{self} {CHARMAP_TITLE}", style='charmap.title')
charmap_entries = [_format_charmap_entry(k, v) for k, v in self.character_mapping.items()]

charmap_columns = Columns(
@@ -194,8 +195,8 @@ def print_prepared_charmap(self):
log.info(f"No prepared_charmap found in {self}")
return

section_title = f"Adobe PostScript charmap prepared by PyPDF2 for {self.display_title}"
console.print(Padding(Panel(section_title, style='charmap.prepared_title', expand=False), CHARMAP_TITLE_PADDING))
headline = f"{self} Adobe PostScript charmap prepared by PyPDF2"
print_headline_panel(headline, style='charmap.prepared_title')
print_bytes(self.prepared_char_map, style='charmap.prepared')
console.print('')

@@ -213,9 +214,6 @@ def preview_bytes_at_advertised_lengths(self):

print(f"\nfinal bytes back from {self.stream_data.lengths[2]} + 10: {self.stream_data[-10 - -f.lengths[2]:]}")

def print_header_panel(self):
console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title')

def _summary_table(self):
"""Build a Rich Table with important info about the font"""
table = Table('', '', show_header=False)
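The reorganized printing methods can be exercised per font via the `font_infos` list from the README. A minimal sketch, assuming `print_summary()` remains the public entry point:

```python
from pdfalyzer.pdfalyzer import Pdfalyzer

pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf")

# Print the reworked per-font output (summary table, charmap, prepared charmap).
for font_info in pdfalyzer.font_infos:
    font_info.print_summary()
```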
21 changes: 16 additions & 5 deletions pdfalyzer/output/layout.py
Expand Up @@ -5,6 +5,7 @@

from yaralyzer.output.rich_console import console, console_width

HEADER_PADDING = (1, 1)


def subheading_width() -> int:
@@ -16,14 +16,24 @@ def half_width() -> int:


def print_section_header(headline: str, style: str = '') -> None:
print_section_subheader(headline, f"{style} reverse", True, console_width())
console.line(2)
_print_header_panel(headline, f"{style} reverse", True, console_width(), HEADER_PADDING)
console.line()


def print_section_subheader(headline: str, style: str = '', expand: bool = True, width = None) -> None:
console.line(2)
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width()))
def print_section_subheader(headline: str, style: str = '') -> None:
console.line()
_print_header_panel(headline, style, True, subheading_width(), HEADER_PADDING)


def print_section_sub_subheader(headline: str, style: str = ''):
print_section_subheader(headline, style, False, half_width())
console.line()
_print_header_panel(headline, style, True, half_width())


def print_headline_panel(headline, style: str = ''):
_print_header_panel(headline, style, False, console_width())


def _print_header_panel(headline: str, style: str, expand: bool, width: int, padding: tuple = (0,)) -> None:
console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width(), padding=padding))
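For reference, the width cascade introduced here runs from the full console width down to half width. A sketch of how the four helpers might be called; the headline strings and styles are made up for illustration.

```python
from pdfalyzer.output.layout import (print_headline_panel, print_section_header,
                                     print_section_subheader, print_section_sub_subheader)

print_section_header("File Overview", style='bright_white')    # full console width, reversed style
print_section_subheader("Binary Streams", style='cyan')        # subheading width
print_section_sub_subheader("Object 44", style='cyan')         # half width
print_headline_panel("Nothing suspicious found", style='dim')  # unexpanded panel, no extra padding
```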