diff --git a/CHANGELOG.md b/CHANGELOG.md index 9357820..9814569 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,25 @@ # NEXT RELEASE +# 1.10.0 +* `--streams` arg now takes an optional PDF object ID +* `--fonts` no longer takes an optional PDF object ID +* YARA matches will display more than 512 bytes +* Improved output formatting + # 1.9.0 * Scan all binary streams, not just fonts. Separate `--streams` option is provided. (`--font` option has much less output) * Display MD5, SHA1, and SHA256 for all binary streams as well as overall file -# 1.8.3 +### 1.8.3 * Highlight suspicious instructions in red, BOMs in green * Reenable guillemet quote matching * Clearer labeling of binary scan results * Sync with `yaralyzer` v0.4.0 -# 1.8.2 +### 1.8.2 * Sync with `yaralyzer` v0.3.3 -# 1.8.1 +### 1.8.1 * Show defaults and valid values for command line options # 1.8.0 diff --git a/README.md b/README.md index 57b1217..53f1136 100644 --- a/README.md +++ b/README.md @@ -151,27 +151,27 @@ Here's a short intro to how to access these objects: from pdfalyzer.pdfalyzer import Pdfalyzer # Load a PDF and parse its nodes into the tree. -walker = Pdfalyzer("/path/to/the/evil.pdf") -actual_pdf_tree = walker.pdf_tree +pdfalyzer = Pdfalyzer("/path/to/the/evil.pdf") +actual_pdf_tree = pdfalyzer.pdf_tree # Find a PDF object by its ID in the PDF -node = walker.find_node_by_idnum(44) +node = pdfalyzer.find_node_by_idnum(44) pdf_object = node.obj # Use anytree's findall_by_attr to find nodes with a given property from anytree.search import findall_by_attr -page_nodes = findall_by_attr(walker.pdf_tree, name='type', value='/Page') +page_nodes = findall_by_attr(pdfalyzer.pdf_tree, name='type', value='/Page') # Get the fonts -font1 = walker.font_infos[0] +font1 = pdfalyzer.font_infos[0] # Iterate over backtick quoted strings from a font binary and process them for backtick_quoted_string in font1.binary_scanner.extract_backtick_quoted_bytes(): process(backtick_quoted_string) -# Try to decode - by force if necessary - everything in the font binary that looks like a quoted string -# or regex (meaning bytes between single quotes, double quotes, front slashes, backticks, or guillemet quotes) -font1.force_decode_all_quoted_bytes() +# Iterate over all stream objects: +for node in pdfalyzer.stream_nodes(): + do_stuff(node.stream_data) ``` diff --git a/doc/PDF Forensic Analysis System using YARA 20170511.pdf b/doc/PDF Forensic Analysis System using YARA 20170511.pdf new file mode 100644 index 0000000..23349d1 Binary files /dev/null and b/doc/PDF Forensic Analysis System using YARA 20170511.pdf differ diff --git a/doc/svgs/rendered_images/pdfalyzer_help.png b/doc/svgs/rendered_images/pdfalyzer_help.png index ef64d39..b8afdf1 100644 Binary files a/doc/svgs/rendered_images/pdfalyzer_help.png and b/doc/svgs/rendered_images/pdfalyzer_help.png differ diff --git a/pdfalyzer/__init__.py b/pdfalyzer/__init__.py index f78fc0f..e319f2b 100644 --- a/pdfalyzer/__init__.py +++ b/pdfalyzer/__init__.py @@ -18,7 +18,7 @@ from pdfalyzer.pdfalyzer import Pdfalyzer from pdfalyzer.util.pdf_parser_manager import PdfParserManager -from pdfalyzer.util.argument_parser import ALL_FONTS_OPTION, output_sections, parse_arguments +from pdfalyzer.util.argument_parser import ALL_STREAMS, output_sections, parse_arguments def pdfalyze(): @@ -41,8 +41,8 @@ def get_output_basepath(export_method): if export_type == 'font_info': output_basename += '_' - if args.font != ALL_FONTS_OPTION: - output_basename += f"_id{args.font}" + if args.streams != ALL_STREAMS: + output_basename += f"_id{args.streams}" output_basename += f"_maxdecode{YaralyzerConfig.MAX_DECODE_LENGTH}" diff --git a/pdfalyzer/binary/binary_scanner.py b/pdfalyzer/binary/binary_scanner.py index 23704fa..4b64e4c 100644 --- a/pdfalyzer/binary/binary_scanner.py +++ b/pdfalyzer/binary/binary_scanner.py @@ -2,12 +2,12 @@ Class for handling binary data - scanning through it for various suspicious patterns as well as forcing various character encodings upon it to see what comes out. """ -import re from collections import defaultdict from numbers import Number -from typing import Any, Iterator, Optional, Pattern, Tuple +from typing import Any, Iterator, Optional, Pattern, Tuple, Union from deprecated import deprecated +from rich.markup import escape from rich.panel import Panel from rich.table import Table from rich.text import Text @@ -25,26 +25,27 @@ from yaralyzer.util.logging import log from pdfalyzer.config import PdfalyzerConfig -from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS -from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG, - generate_subtable, get_label_style, pad_header) +from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode +from pdfalyzer.detection.constants.binary_regexes import (BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, + QUOTE_PATTERNS) +from pdfalyzer.helpers.rich_text_helper import NOT_FOUND_MSG, generate_subtable, get_label_style, pad_header from pdfalyzer.helpers.string_helper import generate_hyphen_line -from pdfalyzer.output.layout import half_width, print_section_header, print_section_subheader, subheading_width -from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC +from pdfalyzer.output.layout import half_width, print_headline_panel, print_section_sub_subheader +from pdfalyzer.util.adobe_strings import CONTENTS, CURRENTFILE_EEXEC # For rainbow colors CHAR_ENCODING_1ST_COLOR_NUMBER = 203 class BinaryScanner: - def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None): + def __init__(self, _bytes: bytes, owner: Union['FontInfo', 'PdfTreeNode'], label: Optional[Text] = None): """owner is an optional link back to the object containing this binary""" self.bytes = _bytes self.label = label self.owner = owner - if label is None and owner is not None: - self.label = Text(owner.label, get_label_style(owner.label)) + if label is None and isinstance(owner, PdfTreeNode): + self.label = owner.__rich__() self.stream_length = len(_bytes) self.regex_extraction_stats = defaultdict(lambda: RegexMatchMetrics()) @@ -52,7 +53,7 @@ def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None): def check_for_dangerous_instructions(self) -> None: """Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them""" - print_section_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"{DANGER_HEADER} reverse") + print_section_sub_subheader("Scanning Binary For Anything 'Mad Sus'...", style=f"bright_red") for instruction in DANGEROUS_STRINGS: yaralyzer = self._pattern_yaralyzer(instruction, REGEX) @@ -60,7 +61,8 @@ def check_for_dangerous_instructions(self) -> None: self.process_yara_matches(yaralyzer, instruction, force=True) def check_for_boms(self) -> None: - print_section_subheader("Scanning Binary for any BOMs...", style='BOM') + """Check the binary data for BOMs""" + print_section_sub_subheader("Scanning Binary for any BOMs...", style='BOM') for bom_bytes, bom_name in BOMS.items(): yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name) @@ -72,8 +74,13 @@ def force_decode_all_quoted_bytes(self) -> None: quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE] for quote_type in quote_types: + if self.owner and self.owner.type == CONTENTS and quote_type in [FRONTSLASH, GUILLEMET]: + msg = f"Not attempting {quote_type} decode for {CONTENTS} node type..." + print_headline_panel(msg, style='dim') + continue + quote_pattern = QUOTE_PATTERNS[quote_type] - print_section_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM) + print_section_sub_subheader(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM) yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type) self.process_yara_matches(yaralyzer, f"{quote_type}_quoted") @@ -169,8 +176,10 @@ def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = self._print_suppression_notices() self._record_decode_stats(bytes_match, bytes_decoder, pattern) - if self.regex_extraction_stats[pattern].match_count == 0: - console.print(f"{pattern} was not found for {self.label}...", style='dim') + # This check initializes the defaultdic for 'pattern' + if self.regex_extraction_stats[pattern].match_count == 0: + #console.print(f"{pattern} was not found for {escape(self.label.plain)}...", style='dim') + pass def bytes_after_eexec_statement(self) -> bytes: """Get the bytes after the 'eexec' demarcation line (if it appears). See Adobe docs for details.""" diff --git a/pdfalyzer/decorators/pdf_tree_node.py b/pdfalyzer/decorators/pdf_tree_node.py index 129d2c1..cceba45 100644 --- a/pdfalyzer/decorators/pdf_tree_node.py +++ b/pdfalyzer/decorators/pdf_tree_node.py @@ -24,7 +24,7 @@ from yaralyzer.util.logging import log from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE -from pdfalyzer.helpers.pdf_object_helper import get_references, get_symlink_representation +from pdfalyzer.helpers.pdf_object_helper import PdfObjectRef, get_references, get_symlink_representation from pdfalyzer.helpers.rich_text_helper import (PDF_ARRAY, TYPE_STYLES, get_label_style, get_type_style, get_type_string_style) from pdfalyzer.helpers.string_helper import pypdf_class_name @@ -134,7 +134,7 @@ def remove_relationship(self, from_node: 'PdfTreeNode') -> None: log.debug(f"Removing relationship {relationship} from {self}") self.other_relationships.remove(relationship) - def other_relationship_count(self): + def other_relationship_count(self) -> int: return len(self.other_relationships) def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'): @@ -147,13 +147,18 @@ def get_reference_key_for_relationship(self, from_node: 'PdfTreeNode'): return relationship.reference_key # TODO: this doesn't include /Parent references - def referenced_by_keys(self) -> list[str]: + def referenced_by_keys(self) -> List[str]: """All the PDF instruction strings that referred to this object""" return [r.reference_key for r in self.other_relationships] + [self.known_to_parent_as] - def references(self): + def references(self) -> List[PdfObjectRef]: + """Returns all nodes referenced from this node (see PdfObjectRef definition)""" return get_references(self.obj) + def contains_stream(self) -> bool: + """Returns True for ContentStream, DecodedStream, and EncodedStream objects""" + return isinstance(self.obj, StreamObject) + def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str: """Find the address used in other_node to refer to this node""" refs_to_this_node = [ref for ref in other_node.references() if ref.pdf_obj.idnum == self.idnum] @@ -179,7 +184,7 @@ def _find_address_of_this_node(self, other_node: 'PdfTreeNode') -> str: # BELOW HERE IS JUST TEXT FORMATTING # ###################################### - def print_other_relationships(self): + def print_other_relationships(self) -> None: """Print this node's non tree relationships (the ones represented by SymlinkNodes in the tree)""" console.print(f"Other relationships of {escape(str(self))}") diff --git a/pdfalyzer/detection/constants/binary_regexes.py b/pdfalyzer/detection/constants/binary_regexes.py index 72ada04..8ee6eba 100644 --- a/pdfalyzer/detection/constants/binary_regexes.py +++ b/pdfalyzer/detection/constants/binary_regexes.py @@ -6,24 +6,19 @@ import re from typing import Union +from deprecated import deprecated + from pdfalyzer.util.adobe_strings import DANGEROUS_PDF_KEYS -from yaralyzer.encoding_detection.character_encodings import BOMS +DANGEROUS_JAVASCRIPT_INSTRUCTIONS = ['eval'] +DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH = ['/F', '/AA'] # Potentially dangerous PDF instructions: Remove the leading '/' and convert to bytes except /F ("URL") -DANGEROUS_BYTES = [instruction[1:].encode() for instruction in DANGEROUS_PDF_KEYS] + [b'/F'] -DANGEROUS_JAVASCRIPT_INSTRUCTIONS = [b'eval'] -DANGEROUS_INSTRUCTIONS = DANGEROUS_BYTES + DANGEROUS_JAVASCRIPT_INSTRUCTIONS + list(BOMS.keys()) - -# Yaralyzer -DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS] + ['/F', 'eval'] +DANGEROUS_STRINGS = [instruction[1:] for instruction in DANGEROUS_PDF_KEYS] +DANGEROUS_STRINGS.extend(DANGEROUS_PDF_KEYS_TO_HUNT_WITH_SLASH) +DANGEROUS_STRINGS.extend(DANGEROUS_JAVASCRIPT_INSTRUCTIONS) # Quote capture regexes -CAPTURE_BYTES = b'(.+?)' -FRONT_SLASH_BYTE = b"/" -ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"' -ESCAPED_SINGLE_QUOTE_BYTES = b"\\'" - GUILLEMET = 'guillemet' FRONTSLASH = 'frontslash' BACKSLASH = 'backslash' @@ -40,19 +35,3 @@ FRONTSLASH: '/.+/', GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ANSI so require byte pattern } - - -def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None): - """Regex that captures everything between open and close quote (close_quote defaults to open_quote)""" - return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL) - - -# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest -QUOTE_REGEXES = { - BACKTICK: build_quote_capture_group(b'`'), - GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'), - ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES), - ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES), - FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE), -} - diff --git a/pdfalyzer/font_info.py b/pdfalyzer/font_info.py index 6dc5f62..e911c41 100644 --- a/pdfalyzer/font_info.py +++ b/pdfalyzer/font_info.py @@ -22,7 +22,7 @@ from pdfalyzer.detection.yaralyzer_helper import get_bytes_yaralyzer from pdfalyzer.helpers.rich_text_helper import get_label_style, get_type_style from pdfalyzer.helpers.string_helper import pp -from pdfalyzer.output.layout import print_section_subheader, subheading_width +from pdfalyzer.output.layout import print_section_subheader, print_headline_panel, subheading_width from pdfalyzer.util.adobe_strings import (FONT, FONT_DESCRIPTOR, FONT_FILE, FONT_LENGTHS, RESOURCES, SUBTYPE, TO_UNICODE, TYPE, W, WIDTHS) @@ -162,11 +162,13 @@ def width_stats(self): def print_summary(self): """Prints a table of info about the font drawn from the various PDF objects. quote_type of None means all.""" - self.print_header_panel() + print_section_subheader(str(self), style='font.title') + #console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title') console.print(self._summary_table()) + console.line() self.print_character_mapping() self.print_prepared_charmap() - console.line(2) + console.line() def print_character_mapping(self): """Prints the character mapping extracted by PyPDF2._charmap in tidy columns""" @@ -174,8 +176,7 @@ def print_character_mapping(self): log.info(f"No character map found in {self}") return - header_panel = Panel(f"{CHARMAP_TITLE} for {self.display_title}", style='charmap.title', expand=False) - console.print(Padding(header_panel, CHARMAP_TITLE_PADDING)) + print_headline_panel(f"{self} {CHARMAP_TITLE}", style='charmap.title') charmap_entries = [_format_charmap_entry(k, v) for k, v in self.character_mapping.items()] charmap_columns = Columns( @@ -194,8 +195,8 @@ def print_prepared_charmap(self): log.info(f"No prepared_charmap found in {self}") return - section_title = f"Adobe PostScript charmap prepared by PyPDF2 for {self.display_title}" - console.print(Padding(Panel(section_title, style='charmap.prepared_title', expand=False), CHARMAP_TITLE_PADDING)) + headline = f"{self} Adobe PostScript charmap prepared by PyPDF2" + print_headline_panel(headline, style='charmap.prepared_title') print_bytes(self.prepared_char_map, style='charmap.prepared') console.print('') @@ -213,9 +214,6 @@ def preview_bytes_at_advertised_lengths(self): print(f"\nfinal bytes back from {self.stream_data.lengths[2]} + 10: {self.stream_data[-10 - -f.lengths[2]:]}") - def print_header_panel(self): - console.print(Panel(self.display_title, width=subheading_width(), padding=(1, 1)), style='font.title') - def _summary_table(self): """Build a Rich Table with important info about the font""" table = Table('', '', show_header=False) diff --git a/pdfalyzer/output/layout.py b/pdfalyzer/output/layout.py index ac736eb..384256e 100644 --- a/pdfalyzer/output/layout.py +++ b/pdfalyzer/output/layout.py @@ -5,6 +5,7 @@ from yaralyzer.output.rich_console import console, console_width +HEADER_PADDING = (1, 1) def subheading_width() -> int: @@ -16,14 +17,24 @@ def half_width() -> int: def print_section_header(headline: str, style: str = '') -> None: - print_section_subheader(headline, f"{style} reverse", True, console_width()) + console.line(2) + _print_header_panel(headline, f"{style} reverse", True, console_width(), HEADER_PADDING) + console.line() -def print_section_subheader(headline: str, style: str = '', expand: bool = True, width = None) -> None: - console.line(2) - console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width())) +def print_section_subheader(headline: str, style: str = '') -> None: console.line() + _print_header_panel(headline, style, True, subheading_width(), HEADER_PADDING) def print_section_sub_subheader(headline: str, style: str = ''): - print_section_subheader(headline, style, False, half_width()) + console.line() + _print_header_panel(headline, style, True, half_width()) + + +def print_headline_panel(headline, style: str = ''): + _print_header_panel(headline, style, False, console_width()) + + +def _print_header_panel(headline: str, style: str, expand: bool, width: int, padding: tuple = (0,)) -> None: + console.print(Panel(headline, style=style, expand=expand, width=width or subheading_width(), padding=padding)) diff --git a/pdfalyzer/pdfalyzer.py b/pdfalyzer/pdfalyzer.py index ac10737..fc42795 100644 --- a/pdfalyzer/pdfalyzer.py +++ b/pdfalyzer/pdfalyzer.py @@ -3,14 +3,13 @@ managed by the anytree library. Once the PDF is parsed this class manages things like searching the tree and printing out information. """ -import hashlib from collections import defaultdict from os.path import basename -from typing import Iterator, List +from typing import List, Optional from anytree import LevelOrderIter, RenderTree, SymlinkNode from anytree.render import DoubleStyle -from anytree.search import findall_by_attr +from anytree.search import findall, findall_by_attr from PyPDF2 import PdfReader from PyPDF2.errors import PdfReadError from PyPDF2.generic import IndirectObject, NameObject, NumberObject, StreamObject @@ -21,7 +20,7 @@ from yaralyzer.helpers.bytes_helper import get_bytes_info from yaralyzer.helpers.file_helper import load_binary_data from yaralyzer.helpers.rich_text_helper import CENTER, LEFT, size_in_bytes_text -from yaralyzer.output.rich_console import BYTES_HIGHLIGHT, GREY, console, console_width, theme_colors_with_prefix +from yaralyzer.output.rich_console import BYTES_HIGHLIGHT, GREY, console from yaralyzer.output.rich_layout_elements import bytes_hashes_table from yaralyzer.util.logging import log @@ -171,11 +170,11 @@ def print_font_info(self, font_idnum=None) -> None: for font_info in [fi for fi in self.font_infos if font_idnum is None or font_idnum == fi.idnum]: font_info.print_summary() - def print_streams_analysis(self) -> None: + def print_streams_analysis(self, idnum: Optional[int] = None) -> None: print_section_header(f'Binary Stream Analysis / Extraction') console.print(self.stream_objects_table()) - for node in self.stream_node_iterator(): + for node in [n for n in self.stream_nodes() if idnum is None or idnum == n.idnum]: node_stream_bytes = node.stream_data if node_stream_bytes is None or node.stream_length == 0: @@ -188,8 +187,8 @@ def print_streams_analysis(self) -> None: log.warning(msg) node_stream_bytes = node_stream_bytes.encode() - print_section_subheader(f"{node} Analysis", style=BYTES_HIGHLIGHT, width=console_width()) - binary_scanner = BinaryScanner(node_stream_bytes, node, node.__rich__()) + print_section_subheader(f"{node} Summary and Analysis", style=f"{BYTES_HIGHLIGHT} reverse") + binary_scanner = BinaryScanner(node_stream_bytes, node) console.print(bytes_hashes_table(binary_scanner.bytes)) binary_scanner.print_stream_preview() binary_scanner.check_for_dangerous_instructions() @@ -201,10 +200,7 @@ def print_streams_analysis(self) -> None: binary_scanner.print_decoding_stats_table() def print_yara_results(self, font_idnum=None) -> None: - print_section_header(f'YARA Scan for {self.pdf_basename}') - theme_colors = [color[len('yara') + 1:] for color in theme_colors_with_prefix('yara')] - color_key = Text('Color Code: ') + Text(' ').join(theme_colors) + Text('\n') - console.print(color_key, justify='center') + print_section_header(f"YARA Scan of PDF rules for '{self.pdf_basename}'") self.yaralyzer.yaralyze() # TODO: we should really scan all the binary streams not just those in the fonts @@ -226,22 +222,17 @@ def print_other_relationships(self) -> None: def stream_objects_table(self) -> Table: table = Table('Stream Length', 'Node') - table.columns[0].justify = 'right' - stream_nodes: List[PdfTreeNode] = [] - - for node in self.stream_node_iterator(): - stream_nodes.append(node) - for node in sorted(stream_nodes, key=lambda r: r.idnum): + for node in self.stream_nodes(): table.add_row(size_in_bytes_text(node.stream_length), node.__rich__()) return table - def stream_node_iterator(self) -> Iterator[PdfTreeNode]: - for node in LevelOrderIter(self.pdf_tree): - if isinstance(node.obj, StreamObject): - yield node + def stream_nodes(self) -> List[PdfTreeNode]: + """List of actual nodes (not SymlinkNodes) containing streams sorted by PDF object ID""" + stream_filter = lambda node: node.contains_stream() and not isinstance(node, SymlinkNode) + return sorted(findall(self.pdf_tree, stream_filter), key=lambda r: r.idnum) def _process_reference(self, node: PdfTreeNode, key: str, address: str, reference: IndirectObject) -> [PdfTreeNode]: """Place the referenced node in the tree. Returns a list of nodes to walk next.""" diff --git a/pdfalyzer/util/adobe_strings.py b/pdfalyzer/util/adobe_strings.py index dcb2420..cce29b7 100644 --- a/pdfalyzer/util/adobe_strings.py +++ b/pdfalyzer/util/adobe_strings.py @@ -16,6 +16,7 @@ ACRO_FORM = CatalogDictionary.ACRO_FORM # Can trigger Javascript on open COLOR_SPACE = Resources.COLOR_SPACE D = '/D' # Destination, usually of a link or action +CONTENTS = '/Contents' DEST = '/Dest' # Similar to /D? EXT_G_STATE = Resources.EXT_G_STATE FIRST = '/First' @@ -58,7 +59,7 @@ # Instructions to flag when scanning stream data for malicious content. DANGEROUS_PDF_KEYS = [ - AA, + # AA, # AA is too generic; can't afford to remove the frontslash ACRO_FORM, JAVASCRIPT, JS, diff --git a/pdfalyzer/util/argument_parser.py b/pdfalyzer/util/argument_parser.py index 8e21f88..aea9672 100644 --- a/pdfalyzer/util/argument_parser.py +++ b/pdfalyzer/util/argument_parser.py @@ -13,12 +13,12 @@ from yaralyzer.util.logging import log, log_and_print, log_argparse_result, log_current_config, log_invocation from pdfalyzer.config import LOG_DIR_ENV_VAR, PdfalyzerConfig -from pdfalyzer.detection.constants.binary_regexes import QUOTE_REGEXES +from pdfalyzer.detection.constants.binary_regexes import QUOTE_PATTERNS # NamedTuple to keep our argument selection orderly OutputSection = namedtuple('OutputSection', ['argument', 'method']) -ALL_FONTS_OPTION = -1 +ALL_STREAMS = -1 DESCRIPTION = "Explore PDF's inner data structure with absurdly large and in depth visualizations. " + \ "Track the control flow of her darker impulses, scan rivers of her binary data for signs " + \ @@ -60,28 +60,28 @@ select.add_argument('-r', '--rich', action='store_true', help='show much larger / more detailed tree visualization (one row per PDF object property)') +select.add_argument('-f', '--fonts', action='store_true', + help="show info about fonts included character mappings for embedded font binaries") + +select.add_argument('-y', '--yara', action='store_true', + help="scan the PDF with YARA rules") + select.add_argument('-c', '--counts', action='store_true', help='show counts of some of the properties of the objects in the PDF') -select.add_argument('-f', '--font', - help="scan font binaries for sus content. brute force is involved. brutes are slow and so " + \ - "is slow. a single font can be optionally be selected by its internal PDF [ID]. " + \ - "not a multiselect but choosing nothing is still choosing everything. " + \ - "try '-f -- [the rest]' if you run into an argument position related piccadilly.", +select.add_argument('-s', '--streams', + help="scan all the PDF's decoded/decrypted streams for sus content as well as any YARA rule matches. " + \ + "brute force is involved; output is verbose. a single OBJ_ID can be optionally provided to " + \ + "limit the output to a single internal object. try '-s -- [OTHERARGS]' if you run into an " + \ + "argument position related piccadilly.", nargs='?', - const=ALL_FONTS_OPTION, - metavar='ID', + const=ALL_STREAMS, + metavar='OBJ_ID', type=int) -select.add_argument('-y', '--yara', action='store_true', - help="scan the PDF with YARA rules") - -select.add_argument('-s', '--streams', action='store_true', - help="scan all the PDF's decoded/decrypted for suspicious bytes as well as any YARA rule matches") - select.add_argument('--quote-type', - help='scan binary data for quoted data of this type only or all types if not set', - choices=list(QUOTE_REGEXES.keys())) + help='optionally limit stream extraction of quoted bytes to this quote type only', + choices=list(QUOTE_PATTERNS.keys())) # Make sure the selection section is at the top parser._action_groups = parser._action_groups[:2] + [parser._action_groups[-1]] + parser._action_groups[2:-1] @@ -122,9 +122,9 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]: """ # Create a partial for print_font_info() because it's the only one that can take an argument # partials have no __name__ so update_wrapper() propagates the 'print_font_info' as this partial's name - font_id = None if args.font == ALL_FONTS_OPTION else args.font - font_info = partial(pdfalyzer.print_font_info, font_idnum=font_id) - update_wrapper(font_info, pdfalyzer.print_font_info) + stream_id = None if args.streams == ALL_STREAMS else args.streams + stream_scan = partial(pdfalyzer.print_streams_analysis, idnum=stream_id) + update_wrapper(stream_scan, pdfalyzer.print_streams_analysis) # The first element string matches the argument in 'select' group. # Top to bottom is the default order of output. @@ -132,10 +132,10 @@ def output_sections(args, pdfalyzer) -> List[OutputSection]: OutputSection('docinfo', pdfalyzer.print_document_info), OutputSection('tree', pdfalyzer.print_tree), OutputSection('rich', pdfalyzer.print_rich_table_tree), - OutputSection('font', font_info), + OutputSection('fonts', pdfalyzer.print_font_info), OutputSection('counts', pdfalyzer.print_summary), OutputSection('yara', pdfalyzer.print_yara_results), - OutputSection('streams', pdfalyzer.print_streams_analysis), + OutputSection('streams', stream_scan), ] output_sections = [section for section in possible_output_sections if vars(args)[section.argument]] diff --git a/poetry.lock b/poetry.lock index ba16b43..d839dae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -242,7 +242,7 @@ python-versions = "*" [[package]] name = "yaralyzer" -version = "0.5.2" +version = "0.6.0" description = "Visualize and force decode YARA and regex matches found in a file or byte stream. With colors. Lots of colors." category = "main" optional = false @@ -258,7 +258,7 @@ yara-python = ">=4.2.3,<5.0.0" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "0b376615f77298ff1e3768681ec6c5b36011f97a7fd641e5473d3579e0e92bf4" +content-hash = "294b4e3f8a1d27d0597333b7bf8945c87b95e897ba2d5f3bf5d1f55c1dee4437" [metadata.files] anytree = [ @@ -423,6 +423,6 @@ yara-python = [ {file = "yara_python-4.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:87125ede7fbc18ae65aab550f1a36f4ebf73bb828c5d7a3dd2bb99176f0faa15"}, ] yaralyzer = [ - {file = "yaralyzer-0.5.2-py3-none-any.whl", hash = "sha256:303eb48da7a6f97444180b60642eb4022ced9566e7bdb4dd8c313813cab7834e"}, - {file = "yaralyzer-0.5.2.tar.gz", hash = "sha256:965eba49fd25188f910cb11602e8f06dc61b2e992ac0c31438d61483f0161d3a"}, + {file = "yaralyzer-0.6.0-py3-none-any.whl", hash = "sha256:fa3243059f608ce0cffc76e9d8e2c7ca8a290fad9133b0b9bd46a7f8149c479c"}, + {file = "yaralyzer-0.6.0.tar.gz", hash = "sha256:d5a0c76256b95a43ef8616b2c510a7d6b922d7b4dde4e70da6e905cfc978f153"}, ] diff --git a/pyproject.toml b/pyproject.toml index e720e1e..3ed7a9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdfalyzer" -version = "1.9.0" +version = "1.10.0" description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more." authors = ["Michel de Cryptadamus "] license = "GPL-3.0-or-later" @@ -49,7 +49,7 @@ PyPDF2 = "^2.10" python-dotenv = "^0.21.0" rich = "^12.5.1" rich-argparse = "^0.3.0" -yaralyzer = "^0.5.2" +yaralyzer = "^0.6.0" [tool.poetry.dev-dependencies] diff --git a/tests/lib/binary/test_binary_scanner.py b/tests/lib/binary/test_binary_scanner.py index 7d24aff..e959543 100644 --- a/tests/lib/binary/test_binary_scanner.py +++ b/tests/lib/binary/test_binary_scanner.py @@ -1,17 +1,13 @@ -from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode -from pdfalyzer.pdfalyzer import Pdfalyzer - - def test_quote_extraction_methods(font_info): - _check_matches(font_info.binary_scanner.extract_backtick_quoted_bytes, 163, 33267) + _check_matches(font_info.binary_scanner.extract_backtick_quoted_bytes, 163, 52840) def test_front_slash_quoted_bytes_extraction(font_info): - _check_matches(font_info.binary_scanner.extract_front_slash_quoted_bytes, 756, 106625) + _check_matches(font_info.binary_scanner.extract_front_slash_quoted_bytes, 756, 167814) def test_extract_guillemet(font_info): - _check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 23138) + _check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 78763) def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None: diff --git a/tests/test_file_export.py b/tests/test_file_export.py index 858354c..a093aa7 100644 --- a/tests/test_file_export.py +++ b/tests/test_file_export.py @@ -20,7 +20,7 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir): rendered_files = files_in_dir(tmp_dir) assert len(rendered_files) == 7 file_sizes = sorted([path.getsize(f) for f in rendered_files]) - assert_array_is_close(file_sizes, [2815, 8346, 35523, 78146, 181310, 1464895, 6948612]) + assert_array_is_close(file_sizes, [3193, 8724, 35908, 79141, 181688, 1465273, 6948612]) for file in rendered_files: remove(file) @@ -28,5 +28,5 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir): def assert_array_is_close(_list1, _list2): for i, item in enumerate(_list1): - if not isclose(item, _list2[i], rel_tol=0.10): + if not isclose(item, _list2[i], rel_tol=0.05): assert False, f"File size of {item} too far from {_list2[i]}" diff --git a/tests/test_pdfalyzer.py b/tests/test_pdfalyzer.py index d051179..4497e61 100644 --- a/tests/test_pdfalyzer.py +++ b/tests/test_pdfalyzer.py @@ -21,7 +21,7 @@ def test_help_option(): def test_pdfalyzer_basic_tree(adobe_type1_fonts_pdf_path, analyzing_malicious_documents_pdf_path): type1_tree = _run_with_args(adobe_type1_fonts_pdf_path, '-t') - _assert_line_count_within_range(88, type1_tree) + _assert_line_count_within_range(90, type1_tree) analyzing_malicious_tree = _run_with_args(analyzing_malicious_documents_pdf_path, '-t') _assert_line_count_within_range(1004, analyzing_malicious_tree) @@ -52,7 +52,7 @@ def _run_with_args(pdf, *args) -> str: def _assert_line_count_within_range(line_count, text): lines_in_text = len(text.split("\n")) - if not isclose(line_count, lines_in_text, rel_tol=0.02): + if not isclose(line_count, lines_in_text, rel_tol=0.05): for i, line in enumerate(text.split("\n")): print(f"{i}: {line}")