
Sync yaralyzer 0.4.0, enable coloring of non-quote matches, better labeling
ashariyar committed Oct 6, 2022
1 parent ff4d6f6 commit fcdc483
Showing 13 changed files with 103 additions and 60 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ dist/
/tools/
/tmp/
/output/export/
/tests/tmp/

# Other
.DS_Store
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,8 +1,14 @@
# NEXT RELEASE


# 1.8.3
* Highlight suspicious instructions in red, BOMs in green
* Reenable guillemet quote matching
* Clearer labeling of binary scan results
* Sync with `yaralyzer` v0.4.0

# 1.8.2
* Sync with `yaralyzer` 0.3.3
* Sync with `yaralyzer` v0.3.3

# 1.8.1
* Show defaults and valid values for command line options
81 changes: 58 additions & 23 deletions pdfalyzer/binary/binary_scanner.py
@@ -5,7 +5,7 @@
import re
from collections import defaultdict
from numbers import Number
from typing import Any, Iterator, Pattern, Tuple
from typing import Any, Iterator, Optional, Pattern, Tuple

from deprecated import deprecated
from rich.panel import Panel
@@ -15,20 +15,21 @@
from yaralyzer.config import YaralyzerConfig
from yaralyzer.decoding.bytes_decoder import BytesDecoder
from yaralyzer.encoding_detection.character_encodings import BOMS
from yaralyzer.helpers.bytes_helper import print_bytes
from yaralyzer.helpers.bytes_helper import hex_string, print_bytes
from yaralyzer.helpers.rich_text_helper import CENTER, na_txt, prefix_with_plain_text_obj
from yaralyzer.helpers.string_helper import escape_yara_pattern
from yaralyzer.output.rich_console import console, console_width
from yaralyzer.output.rich_console import BYTES_NO_DIM, console, console_width
from yaralyzer.output.regex_match_metrics import RegexMatchMetrics
from yaralyzer.yara.yara_rule_builder import HEX, REGEX, safe_label
from yaralyzer.yaralyzer import Yaralyzer
from yaralyzer.util.logging import log

from pdfalyzer.config import PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import DANGEROUS_STRINGS, QUOTE_REGEXES, QUOTE_PATTERNS
from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS
from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG,
generate_subtable, get_label_style, pad_header, print_section_header)
generate_subtable, get_label_style, pad_header)
from pdfalyzer.helpers.string_helper import generate_hyphen_line
from pdfalyzer.output.layout import subheading_width
from pdfalyzer.output.layout import print_section_header, print_section_subheader, subheading_width
from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC

# For rainbow colors
@@ -51,36 +52,46 @@ def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None):

def check_for_dangerous_instructions(self) -> None:
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them"""
print_section_header("Scanning Font Binary For Anything 'Mad Sus'...", style=DANGER_HEADER)
print_section_header("Scanning Binary For Anything 'Mad Sus'...", style=DANGER_HEADER)

for instruction in DANGEROUS_STRINGS:
label = f"({BOMS[instruction]}) " if instruction in BOMS else instruction
self.process_yara_matches(instruction, label, force=True)
yaralyzer = self._pattern_yaralyzer(instruction, REGEX)
yaralyzer.highlight_style = 'bright_red bold'
self.process_yara_matches(yaralyzer, instruction, force=True)

def check_for_boms(self) -> None:
print_section_subheader("Scanning Binary for any BOMs...")

for bom_bytes, bom_name in BOMS.items():
yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name)
yaralyzer.highlight_style = 'bright_green bold'
self.process_yara_matches(yaralyzer, bom_name, force=True)

def force_decode_all_quoted_bytes(self) -> None:
"""Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various encodings"""
quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE]

for quote_type in quote_types:
quote_regex = QUOTE_PATTERNS[quote_type]
print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style='color(100)')
self.process_yara_matches(quote_regex, rules_label=f"{quote_type} quoted")
quote_pattern = QUOTE_PATTERNS[quote_type]
print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type)
self.process_yara_matches(yaralyzer, f"{quote_type}_quoted")

# -------------------------------------------------------------------------------
# These extraction iterators will iterate over all matches for a specific pattern.
# extract_regex_capture_bytes() is the generalized method that accepts any regex.
# -------------------------------------------------------------------------------
def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string»"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['guillemet'], 'guillemet').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator()

def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Returns an interator over all strings surrounded by backticks"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['backtick'], 'backtick').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()

def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Returns an interator over all strings surrounded by front_slashes (hint: regular expressions)"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['frontslash'], 'frontslash').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[FRONTSLASH], FRONTSLASH).match_iterator()

def print_stream_preview(self, num_bytes=None, title_suffix=None) -> None:
"""Print a preview showing the beginning and end of the stream data"""
@@ -141,21 +152,22 @@ def print_decoding_stats_table(self) -> None:
console.line(2)
console.print(stats_table)

def process_yara_matches(self, pattern: str, rules_label: str, force: bool = False) -> None:
def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
"""Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length"""
for bytes_match, bytes_decoder in self._pattern_yaralyzer(pattern, rules_label).match_iterator():
for bytes_match, bytes_decoder in yaralyzer.match_iterator():
self.regex_extraction_stats[pattern].match_count += 1
self.regex_extraction_stats[pattern].bytes_matched += bytes_match.match_length
self.regex_extraction_stats[pattern].bytes_match_objs.append(bytes_match)

# Send suppressed decodes to a queue and track the reason for the suppression in the stats
if not (force or (YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH)):
self._queue_suppression_notice(bytes_match, rules_label)
if not ((YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH) \
or force):
self._queue_suppression_notice(bytes_match, pattern)
continue

# Print out any queued suppression notices before printing non-suppressed matches
self._print_suppression_notices()
self._record_decode_stats(bytes_match, bytes_decoder, rules_label or pattern)
self._record_decode_stats(bytes_match, bytes_decoder, pattern)

if self.regex_extraction_stats[pattern].match_count == 0:
console.print(f"{pattern} was not found for {self.label}...", style='dim')
@@ -170,8 +182,31 @@ def extract_regex_capture_bytes(self, regex: Pattern[bytes]) -> Iterator[BytesMa
for i, match in enumerate(regex.finditer(self.bytes, self._eexec_idx())):
yield(BytesMatch.from_regex_match(self.bytes, match, i + 1))

def _pattern_yaralyzer(self, pattern: str, rules_label: str):
return Yaralyzer.for_patterns([escape_yara_pattern(pattern)], self.bytes, self.label.plain, rules_label)
def _pattern_yaralyzer(
self,
pattern: str,
pattern_type: str,
rules_label: Optional[str] = None,
pattern_label: Optional[str] = None
) -> Yaralyzer:
"""Build a yaralyzer to scan self.bytes"""
return Yaralyzer.for_patterns(
patterns=[escape_yara_pattern(pattern)],
patterns_type=pattern_type,
scannable=self.bytes,
scannable_label=self.label.plain,
rules_label=safe_label(rules_label or pattern),
pattern_label=safe_label(pattern_label or pattern)
)

def _quote_yaralyzer(self, quote_pattern: str, quote_type: str):
"""Helper method to build a Yaralyzer for a quote_pattern"""
label = f"{quote_type}_Quoted"

if quote_type == GUILLEMET:
return self._pattern_yaralyzer(quote_pattern, HEX, label, label)
else:
return self._pattern_yaralyzer(quote_pattern, REGEX, label, label)

def _record_decode_stats(self, bytes_match: BytesMatch, decoder: BytesDecoder, label: str) -> None:
"""Attempt to decode _bytes with all configured encodings and print a table of the results"""
@@ -198,7 +233,7 @@ def _queue_suppression_notice(self, bytes_match: BytesMatch, quote_type: str) ->
if bytes_match.match_length < YaralyzerConfig.MIN_DECODE_LENGTH:
txt = Text('Too little to actually attempt decode at ', style='grey') + txt
else:
txt.append(" is too large to decode ")
txt.append(" too long to decode ")
txt.append(f"(--max-decode-length is {YaralyzerConfig.MAX_DECODE_LENGTH} bytes)", style='grey')

log.debug(Text('Queueing suppression notice: ') + txt)
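A minimal sketch of how the new pieces in binary_scanner.py fit together (not part of the commit; the class name BinaryScanner and the toy inputs are assumptions, and the calls mirror the signatures shown in this diff):

# Hedged sketch: composing the per-pattern Yaralyzers introduced in this commit.
# BinaryScanner (assumed class name) and the toy font bytes are for illustration only.
from rich.text import Text
from pdfalyzer.binary.binary_scanner import BinaryScanner
from pdfalyzer.detection.constants.binary_regexes import GUILLEMET, QUOTE_PATTERNS

font_bytes = b'prefix \xab quoted bytes \xbb and a UTF-8 BOM \xef\xbb\xbf suffix'
scanner = BinaryScanner(font_bytes, label=Text('/FontFile'))

# Quoted-string scan: _quote_yaralyzer() builds a HEX rule for guillemets, REGEX rules otherwise
yaralyzer = scanner._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET)
scanner.process_yara_matches(yaralyzer, f"{GUILLEMET}_quoted", force=True)

# BOM scan: each BOM gets its own hex pattern rule and a 'bright_green bold' highlight
scanner.check_for_boms()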
4 changes: 2 additions & 2 deletions pdfalyzer/decorators/pdf_tree_node.py
@@ -18,7 +18,7 @@
from rich.table import Table
from rich.text import Text
from rich.tree import Tree
from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_string
from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text
from yaralyzer.output.rich_console import BYTES_NO_DIM, YARALYZER_THEME, console
from yaralyzer.util.logging import log

@@ -262,7 +262,7 @@ def _get_stream_preview_rows(self) -> List[List[Text]]:
stream_preview_length = len(stream_preview)

if isinstance(self.stream_data, bytes):
stream_preview_hex = hex_string(stream_preview).plain
stream_preview_hex = hex_text(stream_preview).plain
else:
stream_preview_hex = f"N/A (Stream data is type '{type(self.stream_data).__name__}', not bytes)"

36 changes: 23 additions & 13 deletions pdfalyzer/detection/constants/binary_regexes.py
@@ -24,25 +24,35 @@
ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"'
ESCAPED_SINGLE_QUOTE_BYTES = b"\\'"

GUILLEMET = 'guillemet'
FRONTSLASH = 'frontslash'
BACKSLASH = 'backslash'
BACKTICK = 'backtick'
SINGLE_QUOTE = 'single_quote'
DOUBLE_QUOTE = 'double_quote'
ESCAPED_SINGLE = f"escaped_{SINGLE_QUOTE}"
ESCAPED_DOUBLE = f"escaped_{DOUBLE_QUOTE}"

QUOTE_PATTERNS = {
BACKTICK: '`.+`',
ESCAPED_SINGLE: "\\'.+\\'",
ESCAPED_DOUBLE: '\\".+\\"',
FRONTSLASH: '/.+/',
GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ASCII so they require a byte pattern
}


# Quote regexes used to hunt for particular binary patterns of interest
def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None):
"""Regex that captures everything between open and close quote (close_quote defaults to open_quote)"""
return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)


# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest
QUOTE_REGEXES = {
'backtick': build_quote_capture_group(b'`'),
'guillemet': build_quote_capture_group(b'\xab', b'\xbb'),
'escaped_single': build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
'escaped_double': build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
'frontslash': build_quote_capture_group(FRONT_SLASH_BYTE),
BACKTICK: build_quote_capture_group(b'`'),
GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'),
ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE),
}

QUOTE_PATTERNS = {
'backtick': '`.+`',
'escaped_single': "\\'.+\\'",
'escaped_double': '\\".+\\"',
'guillemet': '\\xab.+\\xbb',
'frontslash': '/.+/',
}
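For reference, a standalone sketch of what the deprecated build_quote_capture_group() regexes do, and why the guillemet entry in QUOTE_PATTERNS needs a hex pattern (CAPTURE_BYTES is defined elsewhere in binary_regexes.py; the non-greedy group below is an assumption):

# Hedged sketch: the deprecated quote-capture regexes in action. CAPTURE_BYTES is assumed
# to be a non-greedy catch-all group; the real constant is defined earlier in this file.
# Guillemets (0xAB / 0xBB) are non-ASCII bytes, which is why the YARA-based QUOTE_PATTERNS
# entry uses the hex pattern 'AB [-] BB' instead of a text regex.
import re
from typing import Optional

CAPTURE_BYTES = b'(.+?)'  # assumption for illustration

def build_quote_capture_group(open_quote: bytes, close_quote: Optional[bytes] = None) -> re.Pattern:
    """Compile a regex capturing everything between open_quote and close_quote (defaults to open_quote)."""
    return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)

guillemet_regex = build_quote_capture_group(b'\xab', b'\xbb')
match = guillemet_regex.search(b'junk \xabhidden payload\xbb more junk')
print(match.group(1))  # b'hidden payload'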
4 changes: 2 additions & 2 deletions pdfalyzer/detection/yaralyzer_helper.py
@@ -1,10 +1,10 @@
"""
Class to help with the pre-configured YARA rules in /yara.
"""
import re
from os import path
from typing import Iterator, Union

from yaralyzer.bytes_match import BytesMatch
from yaralyzer.util.logging import log
from yaralyzer.yaralyzer import Yaralyzer

from pdfalyzer.util.filesystem_awareness import PROJECT_DIR, YARA_RULES_DIR
1 change: 1 addition & 0 deletions pdfalyzer/font_info.py
@@ -169,6 +169,7 @@ def print_summary(self):
if self.binary_scanner is not None:
self.binary_scanner.print_stream_preview(title_suffix=f" of /FontFile for {self.display_title}")
self.binary_scanner.check_for_dangerous_instructions()
self.binary_scanner.check_for_boms()

if not YaralyzerConfig.SUPPRESS_DECODES:
self.binary_scanner.force_decode_all_quoted_bytes()
10 changes: 0 additions & 10 deletions pdfalyzer/helpers/rich_text_helper.py
@@ -138,16 +138,6 @@
NOT_FOUND_MSG = Text('(not found)', style='grey.dark_italic')


def print_section_header(headline: str, style: str = '') -> None:
print_section_subheader(headline, f"{style} reverse", True)


def print_section_subheader(headline: str, style: str = '', expand: bool = False) -> None:
console.line(2)
console.print(Panel(headline, style=style, expand=expand))
console.line()


def get_label_style(label: str) -> str:
"""Lookup a style based on the label string"""
return next((ls[1] for ls in LABEL_STYLES if ls[0].search(label)), DEFAULT_LABEL_STYLE)
2 changes: 1 addition & 1 deletion pdfalyzer/pdfalyzer.py
@@ -26,9 +26,9 @@
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
from pdfalyzer.helpers.number_helper import size_string, size_in_bytes_string
from pdfalyzer.helpers.pdf_object_helper import get_symlink_representation
from pdfalyzer.helpers.rich_text_helper import print_section_header, print_section_subheader
from pdfalyzer.helpers.string_helper import pp
from pdfalyzer.font_info import FontInfo
from pdfalyzer.output.layout import print_section_header, print_section_subheader
from pdfalyzer.util.adobe_strings import (COLOR_SPACE, D, DEST, EXT_G_STATE, FONT, K, KIDS,
NON_TREE_REFERENCES, NUMS, OBJECT_STREAM, OPEN_ACTION, P, PARENT, PREV, RESOURCES, SIZE,
STRUCT_ELEM, TRAILER, TYPE, UNLABELED, XOBJECT, XREF, XREF_STREAM)
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdfalyzer"
version = "1.8.2"
version = "1.8.3"
description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
authors = ["Michel de Cryptadamus <[email protected]>"]
license = "GPL-3.0-or-later"
@@ -49,7 +49,7 @@ PyPDF2 = "^2.10"
python-dotenv = "^0.21.0"
rich = "^12.5.1"
rich-argparse = "^0.3.0"
yaralyzer = "^0.3.3"
yaralyzer = "^0.4.0"

[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
2 changes: 1 addition & 1 deletion tests/lib/binary/test_binary_scanner.py
@@ -11,7 +11,7 @@ def test_front_slash_quoted_bytes_extraction(font_info):


def test_extract_guillemet(font_info):
_check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 12, 2167)
_check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 23138)


def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None:
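The helper's body is cut off above; a plausible implementation, assuming the extraction iterators yield (BytesMatch, BytesDecoder) tuples as in binary_scanner.py:

# Hedged guess at the truncated helper, for illustration only: tally the matches
# yielded by the iterator and compare the count and the total matched bytes.
def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None:
    matches = list(match_iterator())
    assert len(matches) == expected_matches
    assert sum(bytes_match.match_length for bytes_match, _decoder in matches) == expected_bytes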
2 changes: 1 addition & 1 deletion tests/test_file_export.py
@@ -20,7 +20,7 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir):
rendered_files = files_in_dir(tmp_dir)
assert len(rendered_files) == 6
file_sizes = sorted([path.getsize(f) for f in rendered_files])
assert_array_is_close(file_sizes, [7031, 8346, 55178, 142742, 181310, 1464895])
assert_array_is_close(file_sizes, [7031, 8346, 55178, 181310, 437873, 1464895])

for file in rendered_files:
remove(file)
