diff --git a/.gitignore b/.gitignore index 017ccfa..1d4f5c0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ dist/ /tools/ /tmp/ /output/export/ +/tests/tmp/ # Other .DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ea9bf6..9b589d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,14 @@ # NEXT RELEASE +# 1.8.3 +* Highlight suspicious instructions in red, BOMs in green +* Reenable guillemet quote matching +* Clearer labeling of binary scan results +* Sync with `yaralyzer` v0.4.0 + # 1.8.2 -* Sync with `yaralyzer` 0.3.3 +* Sync with `yaralyzer` v0.3.3 # 1.8.1 * Show defaults and valid values for command line options diff --git a/pdfalyzer/binary/binary_scanner.py b/pdfalyzer/binary/binary_scanner.py index 1a00041..e3d927d 100644 --- a/pdfalyzer/binary/binary_scanner.py +++ b/pdfalyzer/binary/binary_scanner.py @@ -5,7 +5,7 @@ import re from collections import defaultdict from numbers import Number -from typing import Any, Iterator, Pattern, Tuple +from typing import Any, Iterator, Optional, Pattern, Tuple from deprecated import deprecated from rich.panel import Panel @@ -15,20 +15,21 @@ from yaralyzer.config import YaralyzerConfig from yaralyzer.decoding.bytes_decoder import BytesDecoder from yaralyzer.encoding_detection.character_encodings import BOMS -from yaralyzer.helpers.bytes_helper import print_bytes +from yaralyzer.helpers.bytes_helper import hex_string, print_bytes from yaralyzer.helpers.rich_text_helper import CENTER, na_txt, prefix_with_plain_text_obj from yaralyzer.helpers.string_helper import escape_yara_pattern -from yaralyzer.output.rich_console import console, console_width +from yaralyzer.output.rich_console import BYTES_NO_DIM, console, console_width from yaralyzer.output.regex_match_metrics import RegexMatchMetrics +from yaralyzer.yara.yara_rule_builder import HEX, REGEX, safe_label from yaralyzer.yaralyzer import Yaralyzer from yaralyzer.util.logging import log from pdfalyzer.config import PdfalyzerConfig -from pdfalyzer.detection.constants.binary_regexes import DANGEROUS_STRINGS, QUOTE_REGEXES, QUOTE_PATTERNS +from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG, - generate_subtable, get_label_style, pad_header, print_section_header) + generate_subtable, get_label_style, pad_header) from pdfalyzer.helpers.string_helper import generate_hyphen_line -from pdfalyzer.output.layout import subheading_width +from pdfalyzer.output.layout import print_section_header, print_section_subheader, subheading_width from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC # For rainbow colors @@ -51,20 +52,30 @@ def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None): def check_for_dangerous_instructions(self) -> None: """Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them""" - print_section_header("Scanning Font Binary For Anything 'Mad Sus'...", style=DANGER_HEADER) + print_section_header("Scanning Binary For Anything 'Mad Sus'...", style=DANGER_HEADER) for instruction in DANGEROUS_STRINGS: - label = f"({BOMS[instruction]}) " if instruction in BOMS else instruction - self.process_yara_matches(instruction, label, force=True) + yaralyzer = self._pattern_yaralyzer(instruction, REGEX) + yaralyzer.highlight_style = 'bright_red bold' + self.process_yara_matches(yaralyzer, instruction, force=True) + + def check_for_boms(self) -> None: + print_section_subheader("Scanning Binary for any BOMs...") + + for bom_bytes, bom_name in BOMS.items(): + yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name) + yaralyzer.highlight_style = 'bright_green bold' + self.process_yara_matches(yaralyzer, bom_name, force=True) def force_decode_all_quoted_bytes(self) -> None: """Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various encodings""" quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE] for quote_type in quote_types: - quote_regex = QUOTE_PATTERNS[quote_type] - print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style='color(100)') - self.process_yara_matches(quote_regex, rules_label=f"{quote_type} quoted") + quote_pattern = QUOTE_PATTERNS[quote_type] + print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM) + yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type) + self.process_yara_matches(yaralyzer, f"{quote_type}_quoted") # ------------------------------------------------------------------------------- # These extraction iterators will iterate over all matches for a specific pattern. @@ -72,15 +83,15 @@ def force_decode_all_quoted_bytes(self) -> None: # ------------------------------------------------------------------------------- def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]: """Iterate on all strings surrounded by Guillemet quotes, e.g. «string»""" - return self._pattern_yaralyzer(QUOTE_PATTERNS['guillemet'], 'guillemet').match_iterator() + return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator() def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]: """Returns an interator over all strings surrounded by backticks""" - return self._pattern_yaralyzer(QUOTE_PATTERNS['backtick'], 'backtick').match_iterator() + return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator() def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]: """Returns an interator over all strings surrounded by front_slashes (hint: regular expressions)""" - return self._pattern_yaralyzer(QUOTE_PATTERNS['frontslash'], 'frontslash').match_iterator() + return self._quote_yaralyzer(QUOTE_PATTERNS[FRONTSLASH], FRONTSLASH).match_iterator() def print_stream_preview(self, num_bytes=None, title_suffix=None) -> None: """Print a preview showing the beginning and end of the stream data""" @@ -141,21 +152,22 @@ def print_decoding_stats_table(self) -> None: console.line(2) console.print(stats_table) - def process_yara_matches(self, pattern: str, rules_label: str, force: bool = False) -> None: + def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None: """Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length""" - for bytes_match, bytes_decoder in self._pattern_yaralyzer(pattern, rules_label).match_iterator(): + for bytes_match, bytes_decoder in yaralyzer.match_iterator(): self.regex_extraction_stats[pattern].match_count += 1 self.regex_extraction_stats[pattern].bytes_matched += bytes_match.match_length self.regex_extraction_stats[pattern].bytes_match_objs.append(bytes_match) # Send suppressed decodes to a queue and track the reason for the suppression in the stats - if not (force or (YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH)): - self._queue_suppression_notice(bytes_match, rules_label) + if not ((YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH) \ + or force): + self._queue_suppression_notice(bytes_match, pattern) continue # Print out any queued suppressed notices before printing non suppressed matches self._print_suppression_notices() - self._record_decode_stats(bytes_match, bytes_decoder, rules_label or pattern) + self._record_decode_stats(bytes_match, bytes_decoder, pattern) if self.regex_extraction_stats[pattern].match_count == 0: console.print(f"{pattern} was not found for {self.label}...", style='dim') @@ -170,8 +182,31 @@ def extract_regex_capture_bytes(self, regex: Pattern[bytes]) -> Iterator[BytesMa for i, match in enumerate(regex.finditer(self.bytes, self._eexec_idx())): yield(BytesMatch.from_regex_match(self.bytes, match, i + 1)) - def _pattern_yaralyzer(self, pattern: str, rules_label: str): - return Yaralyzer.for_patterns([escape_yara_pattern(pattern)], self.bytes, self.label.plain, rules_label) + def _pattern_yaralyzer( + self, + pattern: str, + pattern_type: str, + rules_label: Optional[str] = None, + pattern_label: Optional[str] = None + ) -> Yaralyzer: + """Build a yaralyzer to scan self.bytes""" + return Yaralyzer.for_patterns( + patterns=[escape_yara_pattern(pattern)], + patterns_type=pattern_type, + scannable=self.bytes, + scannable_label=self.label.plain, + rules_label=safe_label(rules_label or pattern), + pattern_label=safe_label(pattern_label or pattern) + ) + + def _quote_yaralyzer(self, quote_pattern: str, quote_type: str): + """Helper method to build a Yaralyzer for a quote_pattern""" + label = f"{quote_type}_Quoted" + + if quote_type == GUILLEMET: + return self._pattern_yaralyzer(quote_pattern, HEX, label, label) + else: + return self._pattern_yaralyzer(quote_pattern, REGEX, label, label) def _record_decode_stats(self, bytes_match: BytesMatch, decoder: BytesDecoder, label: str) -> None: """Attempt to decode _bytes with all configured encodings and print a table of the results""" @@ -198,7 +233,7 @@ def _queue_suppression_notice(self, bytes_match: BytesMatch, quote_type: str) -> if bytes_match.match_length < YaralyzerConfig.MIN_DECODE_LENGTH: txt = Text('Too little to actually attempt decode at ', style='grey') + txt else: - txt.append(" is too large to decode ") + txt.append(" too long to decode ") txt.append(f"(--max-decode-length is {YaralyzerConfig.MAX_DECODE_LENGTH} bytes)", style='grey') log.debug(Text('Queueing suppression notice: ') + txt) diff --git a/pdfalyzer/decorators/pdf_tree_node.py b/pdfalyzer/decorators/pdf_tree_node.py index a49ddf1..c399252 100644 --- a/pdfalyzer/decorators/pdf_tree_node.py +++ b/pdfalyzer/decorators/pdf_tree_node.py @@ -18,7 +18,7 @@ from rich.table import Table from rich.text import Text from rich.tree import Tree -from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_string +from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text from yaralyzer.output.rich_console import BYTES_NO_DIM, YARALYZER_THEME, console from yaralyzer.util.logging import log @@ -262,7 +262,7 @@ def _get_stream_preview_rows(self) -> List[List[Text]]: stream_preview_length = len(stream_preview) if isinstance(self.stream_data, bytes): - stream_preview_hex = hex_string(stream_preview).plain + stream_preview_hex = hex_text(stream_preview).plain else: stream_preview_hex = f"N/A (Stream data is type '{type(self.stream_data).__name__}', not bytes)" diff --git a/pdfalyzer/detection/constants/binary_regexes.py b/pdfalyzer/detection/constants/binary_regexes.py index 5913e1b..72ada04 100644 --- a/pdfalyzer/detection/constants/binary_regexes.py +++ b/pdfalyzer/detection/constants/binary_regexes.py @@ -24,25 +24,35 @@ ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"' ESCAPED_SINGLE_QUOTE_BYTES = b"\\'" +GUILLEMET = 'guillemet' +FRONTSLASH = 'frontslash' +BACKSLASH = 'backslash' +BACKTICK = 'backtick' +SINGLE_QUOTE = 'single_quote' +DOUBLE_QUOTE = 'double_quote' +ESCAPED_SINGLE = f"escaped_{SINGLE_QUOTE}" +ESCAPED_DOUBLE = f"escaped_{DOUBLE_QUOTE}" + +QUOTE_PATTERNS = { + BACKTICK: '`.+`', + ESCAPED_SINGLE: "\\'.+\\'", + ESCAPED_DOUBLE: '\\".+\\"', + FRONTSLASH: '/.+/', + GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ANSI so require byte pattern +} + -# Quote regexes used to hunt for particular binary patterns of interest def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None): """Regex that captures everything between open and close quote (close_quote defaults to open_quote)""" return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL) +# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest QUOTE_REGEXES = { - 'backtick': build_quote_capture_group(b'`'), - 'guillemet': build_quote_capture_group(b'\xab', b'\xbb'), - 'escaped_single': build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES), - 'escaped_double': build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES), - 'frontslash': build_quote_capture_group(FRONT_SLASH_BYTE), + BACKTICK: build_quote_capture_group(b'`'), + GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'), + ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES), + ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES), + FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE), } -QUOTE_PATTERNS = { - 'backtick': '`.+`', - 'escaped_single': "\\'.+\\'", - 'escaped_double': '\\".+\\"', - 'guillemet': '\\xab.+\\xbb', - 'frontslash': '/.+/', -} diff --git a/pdfalyzer/detection/yaralyzer_helper.py b/pdfalyzer/detection/yaralyzer_helper.py index 0b933a1..ed26e59 100644 --- a/pdfalyzer/detection/yaralyzer_helper.py +++ b/pdfalyzer/detection/yaralyzer_helper.py @@ -1,10 +1,10 @@ """ Class to help with the pre-configured YARA rules in /yara. """ +import re from os import path -from typing import Iterator, Union -from yaralyzer.bytes_match import BytesMatch +from yaralyzer.util.logging import log from yaralyzer.yaralyzer import Yaralyzer from pdfalyzer.util.filesystem_awareness import PROJECT_DIR, YARA_RULES_DIR diff --git a/pdfalyzer/font_info.py b/pdfalyzer/font_info.py index 9a5469b..65fbf73 100644 --- a/pdfalyzer/font_info.py +++ b/pdfalyzer/font_info.py @@ -169,6 +169,7 @@ def print_summary(self): if self.binary_scanner is not None: self.binary_scanner.print_stream_preview(title_suffix=f" of /FontFile for {self.display_title}") self.binary_scanner.check_for_dangerous_instructions() + self.binary_scanner.check_for_boms() if not YaralyzerConfig.SUPPRESS_DECODES: self.binary_scanner.force_decode_all_quoted_bytes() diff --git a/pdfalyzer/helpers/rich_text_helper.py b/pdfalyzer/helpers/rich_text_helper.py index 34381a0..528e527 100644 --- a/pdfalyzer/helpers/rich_text_helper.py +++ b/pdfalyzer/helpers/rich_text_helper.py @@ -138,16 +138,6 @@ NOT_FOUND_MSG = Text('(not found)', style='grey.dark_italic') -def print_section_header(headline: str, style: str = '') -> None: - print_section_subheader(headline, f"{style} reverse", True) - - -def print_section_subheader(headline: str, style: str = '', expand: bool = False) -> None: - console.line(2) - console.print(Panel(headline, style=style, expand=expand)) - console.line() - - def get_label_style(label: str) -> str: """Lookup a style based on the label string""" return next((ls[1] for ls in LABEL_STYLES if ls[0].search(label)), DEFAULT_LABEL_STYLE) diff --git a/pdfalyzer/pdfalyzer.py b/pdfalyzer/pdfalyzer.py index d4772a1..6cf23bb 100644 --- a/pdfalyzer/pdfalyzer.py +++ b/pdfalyzer/pdfalyzer.py @@ -26,9 +26,9 @@ from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode from pdfalyzer.helpers.number_helper import size_string, size_in_bytes_string from pdfalyzer.helpers.pdf_object_helper import get_symlink_representation -from pdfalyzer.helpers.rich_text_helper import print_section_header, print_section_subheader from pdfalyzer.helpers.string_helper import pp from pdfalyzer.font_info import FontInfo +from pdfalyzer.output.layout import print_section_header, print_section_subheader from pdfalyzer.util.adobe_strings import (COLOR_SPACE, D, DEST, EXT_G_STATE, FONT, K, KIDS, NON_TREE_REFERENCES, NUMS, OBJECT_STREAM, OPEN_ACTION, P, PARENT, PREV, RESOURCES, SIZE, STRUCT_ELEM, TRAILER, TYPE, UNLABELED, XOBJECT, XREF, XREF_STREAM) diff --git a/poetry.lock b/poetry.lock index 5da3de5..f408a5b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -242,7 +242,7 @@ python-versions = "*" [[package]] name = "yaralyzer" -version = "0.3.3" +version = "0.4.0" description = "Visualize and force decode YARA and regex matches found in a file or byte stream. With colors. Lots of colors." category = "main" optional = false @@ -258,7 +258,7 @@ yara-python = ">=4.2.3,<5.0.0" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "233b0227d984ca2b9dfc6278695927efd8192e2c4fd7afec4c5c8cb9ade8533c" +content-hash = "d3d85192197e608e48fea30832d7964bf1ebc18c468f431a7856fa3a55554ed2" [metadata.files] anytree = [ @@ -423,6 +423,6 @@ yara-python = [ {file = "yara_python-4.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:87125ede7fbc18ae65aab550f1a36f4ebf73bb828c5d7a3dd2bb99176f0faa15"}, ] yaralyzer = [ - {file = "yaralyzer-0.3.3-py3-none-any.whl", hash = "sha256:a70edc8197a6ecc470d0ce405041bb930a09421346703231ef1e5d9b26cc697d"}, - {file = "yaralyzer-0.3.3.tar.gz", hash = "sha256:5b0c2a61489ced81005f321c603c5548434aaa62766f62426f3ae3c378e0ea1b"}, + {file = "yaralyzer-0.4.0-py3-none-any.whl", hash = "sha256:b6b1e8e5d32447ec308c2a1fbade9b624172ffa6a3e4d5b550c295718463dcda"}, + {file = "yaralyzer-0.4.0.tar.gz", hash = "sha256:99f8721f64346e1f0281b783a3b15813bfff5837a89219c90bd3089a5f802d25"}, ] diff --git a/pyproject.toml b/pyproject.toml index 4c72c0c..6ce9d13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdfalyzer" -version = "1.8.2" +version = "1.8.3" description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more." authors = ["Michel de Cryptadamus "] license = "GPL-3.0-or-later" @@ -49,7 +49,7 @@ PyPDF2 = "^2.10" python-dotenv = "^0.21.0" rich = "^12.5.1" rich-argparse = "^0.3.0" -yaralyzer = "^0.3.3" +yaralyzer = "^0.4.0" [tool.poetry.dev-dependencies] pytest = "^7.1.2" diff --git a/tests/lib/binary/test_binary_scanner.py b/tests/lib/binary/test_binary_scanner.py index b000e38..7d24aff 100644 --- a/tests/lib/binary/test_binary_scanner.py +++ b/tests/lib/binary/test_binary_scanner.py @@ -11,7 +11,7 @@ def test_front_slash_quoted_bytes_extraction(font_info): def test_extract_guillemet(font_info): - _check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 12, 2167) + _check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 23138) def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None: diff --git a/tests/test_file_export.py b/tests/test_file_export.py index 40a7863..1c903fa 100644 --- a/tests/test_file_export.py +++ b/tests/test_file_export.py @@ -20,7 +20,7 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir): rendered_files = files_in_dir(tmp_dir) assert len(rendered_files) == 6 file_sizes = sorted([path.getsize(f) for f in rendered_files]) - assert_array_is_close(file_sizes, [7031, 8346, 55178, 142742, 181310, 1464895]) + assert_array_is_close(file_sizes, [7031, 8346, 55178, 181310, 437873, 1464895]) for file in rendered_files: remove(file)