
Sync yaralyzer 0.4.0, enable coloring of non-quote matches, better labeling
ashariyar committed Oct 6, 2022
1 parent ff4d6f6 commit fcdc483
Showing 13 changed files with 103 additions and 60 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ dist/
/tools/
/tmp/
/output/export/
/tests/tmp/

# Other
.DS_Store
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,8 +1,14 @@
# NEXT RELEASE


# 1.8.3
* Highlight suspicious instructions in red, BOMs in green
* Reenable guillemet quote matching
* Clearer labeling of binary scan results
* Sync with `yaralyzer` v0.4.0

# 1.8.2
* Sync with `yaralyzer` 0.3.3
* Sync with `yaralyzer` v0.3.3

# 1.8.1
* Show defaults and valid values for command line options
81 changes: 58 additions & 23 deletions pdfalyzer/binary/binary_scanner.py
@@ -5,7 +5,7 @@
import re
from collections import defaultdict
from numbers import Number
from typing import Any, Iterator, Pattern, Tuple
from typing import Any, Iterator, Optional, Pattern, Tuple

from deprecated import deprecated
from rich.panel import Panel
@@ -15,20 +15,21 @@
from yaralyzer.config import YaralyzerConfig
from yaralyzer.decoding.bytes_decoder import BytesDecoder
from yaralyzer.encoding_detection.character_encodings import BOMS
from yaralyzer.helpers.bytes_helper import print_bytes
from yaralyzer.helpers.bytes_helper import hex_string, print_bytes
from yaralyzer.helpers.rich_text_helper import CENTER, na_txt, prefix_with_plain_text_obj
from yaralyzer.helpers.string_helper import escape_yara_pattern
from yaralyzer.output.rich_console import console, console_width
from yaralyzer.output.rich_console import BYTES_NO_DIM, console, console_width
from yaralyzer.output.regex_match_metrics import RegexMatchMetrics
from yaralyzer.yara.yara_rule_builder import HEX, REGEX, safe_label
from yaralyzer.yaralyzer import Yaralyzer
from yaralyzer.util.logging import log

from pdfalyzer.config import PdfalyzerConfig
from pdfalyzer.detection.constants.binary_regexes import DANGEROUS_STRINGS, QUOTE_REGEXES, QUOTE_PATTERNS
from pdfalyzer.detection.constants.binary_regexes import BACKTICK, DANGEROUS_STRINGS, FRONTSLASH, GUILLEMET, QUOTE_PATTERNS
from pdfalyzer.helpers.rich_text_helper import (DANGER_HEADER, NOT_FOUND_MSG,
generate_subtable, get_label_style, pad_header, print_section_header)
generate_subtable, get_label_style, pad_header)
from pdfalyzer.helpers.string_helper import generate_hyphen_line
from pdfalyzer.output.layout import subheading_width
from pdfalyzer.output.layout import print_section_header, print_section_subheader, subheading_width
from pdfalyzer.util.adobe_strings import CURRENTFILE_EEXEC

# For rainbow colors
@@ -51,36 +52,46 @@ def __init__(self, _bytes: bytes, owner: Any = None, label: Any = None):

def check_for_dangerous_instructions(self) -> None:
"""Scan for all the strings in DANGEROUS_INSTRUCTIONS list and decode bytes around them"""
print_section_header("Scanning Font Binary For Anything 'Mad Sus'...", style=DANGER_HEADER)
print_section_header("Scanning Binary For Anything 'Mad Sus'...", style=DANGER_HEADER)

for instruction in DANGEROUS_STRINGS:
label = f"({BOMS[instruction]}) " if instruction in BOMS else instruction
self.process_yara_matches(instruction, label, force=True)
yaralyzer = self._pattern_yaralyzer(instruction, REGEX)
yaralyzer.highlight_style = 'bright_red bold'
self.process_yara_matches(yaralyzer, instruction, force=True)

def check_for_boms(self) -> None:
print_section_subheader("Scanning Binary for any BOMs...")

for bom_bytes, bom_name in BOMS.items():
yaralyzer = self._pattern_yaralyzer(hex_string(bom_bytes), HEX, bom_name)
yaralyzer.highlight_style = 'bright_green bold'
self.process_yara_matches(yaralyzer, bom_name, force=True)

def force_decode_all_quoted_bytes(self) -> None:
"""Find all strings matching QUOTE_PATTERNS (AKA between quote chars) and decode them with various encodings"""
quote_types = QUOTE_PATTERNS.keys() if PdfalyzerConfig.QUOTE_TYPE is None else [PdfalyzerConfig.QUOTE_TYPE]

for quote_type in quote_types:
quote_regex = QUOTE_PATTERNS[quote_type]
print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style='color(100)')
self.process_yara_matches(quote_regex, rules_label=f"{quote_type} quoted")
quote_pattern = QUOTE_PATTERNS[quote_type]
print_section_header(f"Forcing Decode of {quote_type.capitalize()} Quoted Strings", style=BYTES_NO_DIM)
yaralyzer = self._quote_yaralyzer(quote_pattern, quote_type)
self.process_yara_matches(yaralyzer, f"{quote_type}_quoted")

# -------------------------------------------------------------------------------
# These extraction iterators will iterate over all matches for a specific pattern.
# extract_regex_capture_bytes() is the generalized method that accepts any regex.
# -------------------------------------------------------------------------------
def extract_guillemet_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Iterate on all strings surrounded by Guillemet quotes, e.g. «string»"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['guillemet'], 'guillemet').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET).match_iterator()

def extract_backtick_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Returns an interator over all strings surrounded by backticks"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['backtick'], 'backtick').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[BACKTICK], BACKTICK).match_iterator()

def extract_front_slash_quoted_bytes(self) -> Iterator[Tuple[BytesMatch, BytesDecoder]]:
"""Returns an interator over all strings surrounded by front_slashes (hint: regular expressions)"""
return self._pattern_yaralyzer(QUOTE_PATTERNS['frontslash'], 'frontslash').match_iterator()
return self._quote_yaralyzer(QUOTE_PATTERNS[FRONTSLASH], FRONTSLASH).match_iterator()

def print_stream_preview(self, num_bytes=None, title_suffix=None) -> None:
"""Print a preview showing the beginning and end of the stream data"""
@@ -141,21 +152,22 @@ def print_decoding_stats_table(self) -> None:
console.line(2)
console.print(stats_table)

def process_yara_matches(self, pattern: str, rules_label: str, force: bool = False) -> None:
def process_yara_matches(self, yaralyzer: Yaralyzer, pattern: str, force: bool = False) -> None:
"""Decide whether to attempt to decode the matched bytes, track stats. force param ignores min/max length"""
for bytes_match, bytes_decoder in self._pattern_yaralyzer(pattern, rules_label).match_iterator():
for bytes_match, bytes_decoder in yaralyzer.match_iterator():
self.regex_extraction_stats[pattern].match_count += 1
self.regex_extraction_stats[pattern].bytes_matched += bytes_match.match_length
self.regex_extraction_stats[pattern].bytes_match_objs.append(bytes_match)

# Send suppressed decodes to a queue and track the reason for the suppression in the stats
if not (force or (YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH)):
self._queue_suppression_notice(bytes_match, rules_label)
if not ((YaralyzerConfig.MIN_DECODE_LENGTH < bytes_match.match_length < YaralyzerConfig.MAX_DECODE_LENGTH) \
or force):
self._queue_suppression_notice(bytes_match, pattern)
continue

# Print out any queued suppression notices before printing non-suppressed matches
self._print_suppression_notices()
self._record_decode_stats(bytes_match, bytes_decoder, rules_label or pattern)
self._record_decode_stats(bytes_match, bytes_decoder, pattern)

if self.regex_extraction_stats[pattern].match_count == 0:
console.print(f"{pattern} was not found for {self.label}...", style='dim')
@@ -170,8 +182,31 @@ def extract_regex_capture_bytes(self, regex: Pattern[bytes]) -> Iterator[BytesMa
for i, match in enumerate(regex.finditer(self.bytes, self._eexec_idx())):
yield(BytesMatch.from_regex_match(self.bytes, match, i + 1))

def _pattern_yaralyzer(self, pattern: str, rules_label: str):
return Yaralyzer.for_patterns([escape_yara_pattern(pattern)], self.bytes, self.label.plain, rules_label)
def _pattern_yaralyzer(
self,
pattern: str,
pattern_type: str,
rules_label: Optional[str] = None,
pattern_label: Optional[str] = None
) -> Yaralyzer:
"""Build a yaralyzer to scan self.bytes"""
return Yaralyzer.for_patterns(
patterns=[escape_yara_pattern(pattern)],
patterns_type=pattern_type,
scannable=self.bytes,
scannable_label=self.label.plain,
rules_label=safe_label(rules_label or pattern),
pattern_label=safe_label(pattern_label or pattern)
)

def _quote_yaralyzer(self, quote_pattern: str, quote_type: str):
"""Helper method to build a Yaralyzer for a quote_pattern"""
label = f"{quote_type}_Quoted"

if quote_type == GUILLEMET:
return self._pattern_yaralyzer(quote_pattern, HEX, label, label)
else:
return self._pattern_yaralyzer(quote_pattern, REGEX, label, label)

def _record_decode_stats(self, bytes_match: BytesMatch, decoder: BytesDecoder, label: str) -> None:
"""Attempt to decode _bytes with all configured encodings and print a table of the results"""
@@ -198,7 +233,7 @@ def _queue_suppression_notice(self, bytes_match: BytesMatch, quote_type: str) ->
if bytes_match.match_length < YaralyzerConfig.MIN_DECODE_LENGTH:
txt = Text('Too little to actually attempt decode at ', style='grey') + txt
else:
txt.append(" is too large to decode ")
txt.append(" too long to decode ")
txt.append(f"(--max-decode-length is {YaralyzerConfig.MAX_DECODE_LENGTH} bytes)", style='grey')

log.debug(Text('Queueing suppression notice: ') + txt)
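A minimal sketch of how the new pieces in binary_scanner.py fit together (not part of the commit; the class name BinaryScanner and the toy inputs are assumptions, and the calls mirror the signatures shown in this diff):

# Hedged sketch: composing the per-pattern Yaralyzers introduced in this commit.
# BinaryScanner (assumed class name) and the toy font bytes are for illustration only.
from rich.text import Text
from pdfalyzer.binary.binary_scanner import BinaryScanner
from pdfalyzer.detection.constants.binary_regexes import GUILLEMET, QUOTE_PATTERNS

font_bytes = b'prefix \xab quoted bytes \xbb and a UTF-8 BOM \xef\xbb\xbf suffix'
scanner = BinaryScanner(font_bytes, label=Text('/FontFile'))

# Quoted-string scan: _quote_yaralyzer() builds a HEX rule for guillemets, REGEX rules otherwise
yaralyzer = scanner._quote_yaralyzer(QUOTE_PATTERNS[GUILLEMET], GUILLEMET)
scanner.process_yara_matches(yaralyzer, f"{GUILLEMET}_quoted", force=True)

# BOM scan: each BOM gets its own hex pattern rule and a 'bright_green bold' highlight
scanner.check_for_boms()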
4 changes: 2 additions & 2 deletions pdfalyzer/decorators/pdf_tree_node.py
@@ -18,7 +18,7 @@
from rich.table import Table
from rich.text import Text
from rich.tree import Tree
from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_string
from yaralyzer.helpers.bytes_helper import clean_byte_string, hex_text
from yaralyzer.output.rich_console import BYTES_NO_DIM, YARALYZER_THEME, console
from yaralyzer.util.logging import log

@@ -262,7 +262,7 @@ def _get_stream_preview_rows(self) -> List[List[Text]]:
stream_preview_length = len(stream_preview)

if isinstance(self.stream_data, bytes):
stream_preview_hex = hex_string(stream_preview).plain
stream_preview_hex = hex_text(stream_preview).plain
else:
stream_preview_hex = f"N/A (Stream data is type '{type(self.stream_data).__name__}', not bytes)"

36 changes: 23 additions & 13 deletions pdfalyzer/detection/constants/binary_regexes.py
@@ -24,25 +24,35 @@
ESCAPED_DOUBLE_QUOTE_BYTES = b'\\"'
ESCAPED_SINGLE_QUOTE_BYTES = b"\\'"

GUILLEMET = 'guillemet'
FRONTSLASH = 'frontslash'
BACKSLASH = 'backslash'
BACKTICK = 'backtick'
SINGLE_QUOTE = 'single_quote'
DOUBLE_QUOTE = 'double_quote'
ESCAPED_SINGLE = f"escaped_{SINGLE_QUOTE}"
ESCAPED_DOUBLE = f"escaped_{DOUBLE_QUOTE}"

QUOTE_PATTERNS = {
BACKTICK: '`.+`',
ESCAPED_SINGLE: "\\'.+\\'",
ESCAPED_DOUBLE: '\\".+\\"',
FRONTSLASH: '/.+/',
GUILLEMET: 'AB [-] BB', # Guillemet quotes are not ASCII so they require a byte pattern
}


# Quote regexes used to hunt for particular binary patterns of interest
def build_quote_capture_group(open_quote: bytes, close_quote: Union[bytes, None]=None):
"""Regex that captures everything between open and close quote (close_quote defaults to open_quote)"""
return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)


# Deprecated binary Quote regexes used to hunt for particular binary patterns of interest
QUOTE_REGEXES = {
'backtick': build_quote_capture_group(b'`'),
'guillemet': build_quote_capture_group(b'\xab', b'\xbb'),
'escaped_single': build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
'escaped_double': build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
'frontslash': build_quote_capture_group(FRONT_SLASH_BYTE),
BACKTICK: build_quote_capture_group(b'`'),
GUILLEMET: build_quote_capture_group(b'\xab', b'\xbb'),
ESCAPED_SINGLE: build_quote_capture_group(ESCAPED_SINGLE_QUOTE_BYTES),
ESCAPED_DOUBLE: build_quote_capture_group(ESCAPED_DOUBLE_QUOTE_BYTES),
FRONTSLASH: build_quote_capture_group(FRONT_SLASH_BYTE),
}

QUOTE_PATTERNS = {
'backtick': '`.+`',
'escaped_single': "\\'.+\\'",
'escaped_double': '\\".+\\"',
'guillemet': '\\xab.+\\xbb',
'frontslash': '/.+/',
}
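For reference, a standalone sketch of what the deprecated build_quote_capture_group() regexes do, and why the guillemet entry in QUOTE_PATTERNS needs a hex pattern (CAPTURE_BYTES is defined elsewhere in binary_regexes.py; the non-greedy group below is an assumption):

# Hedged sketch: the deprecated quote-capture regexes in action. CAPTURE_BYTES is assumed
# to be a non-greedy catch-all group; the real constant is defined earlier in this file.
# Guillemets (0xAB / 0xBB) are non-ASCII bytes, which is why the YARA-based QUOTE_PATTERNS
# entry uses the hex pattern 'AB [-] BB' instead of a text regex.
import re
from typing import Optional

CAPTURE_BYTES = b'(.+?)'  # assumption for illustration

def build_quote_capture_group(open_quote: bytes, close_quote: Optional[bytes] = None) -> re.Pattern:
    """Compile a regex capturing everything between open_quote and close_quote (defaults to open_quote)."""
    return re.compile(open_quote + CAPTURE_BYTES + (close_quote or open_quote), re.DOTALL)

guillemet_regex = build_quote_capture_group(b'\xab', b'\xbb')
match = guillemet_regex.search(b'junk \xabhidden payload\xbb more junk')
print(match.group(1))  # b'hidden payload'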
4 changes: 2 additions & 2 deletions pdfalyzer/detection/yaralyzer_helper.py
@@ -1,10 +1,10 @@
"""
Class to help with the pre-configured YARA rules in /yara.
"""
import re
from os import path
from typing import Iterator, Union

from yaralyzer.bytes_match import BytesMatch
from yaralyzer.util.logging import log
from yaralyzer.yaralyzer import Yaralyzer

from pdfalyzer.util.filesystem_awareness import PROJECT_DIR, YARA_RULES_DIR
1 change: 1 addition & 0 deletions pdfalyzer/font_info.py
@@ -169,6 +169,7 @@ def print_summary(self):
if self.binary_scanner is not None:
self.binary_scanner.print_stream_preview(title_suffix=f" of /FontFile for {self.display_title}")
self.binary_scanner.check_for_dangerous_instructions()
self.binary_scanner.check_for_boms()

if not YaralyzerConfig.SUPPRESS_DECODES:
self.binary_scanner.force_decode_all_quoted_bytes()
10 changes: 0 additions & 10 deletions pdfalyzer/helpers/rich_text_helper.py
@@ -138,16 +138,6 @@
NOT_FOUND_MSG = Text('(not found)', style='grey.dark_italic')


def print_section_header(headline: str, style: str = '') -> None:
print_section_subheader(headline, f"{style} reverse", True)


def print_section_subheader(headline: str, style: str = '', expand: bool = False) -> None:
console.line(2)
console.print(Panel(headline, style=style, expand=expand))
console.line()


def get_label_style(label: str) -> str:
"""Lookup a style based on the label string"""
return next((ls[1] for ls in LABEL_STYLES if ls[0].search(label)), DEFAULT_LABEL_STYLE)
2 changes: 1 addition & 1 deletion pdfalyzer/pdfalyzer.py
@@ -26,9 +26,9 @@
from pdfalyzer.decorators.pdf_tree_node import PdfTreeNode
from pdfalyzer.helpers.number_helper import size_string, size_in_bytes_string
from pdfalyzer.helpers.pdf_object_helper import get_symlink_representation
from pdfalyzer.helpers.rich_text_helper import print_section_header, print_section_subheader
from pdfalyzer.helpers.string_helper import pp
from pdfalyzer.font_info import FontInfo
from pdfalyzer.output.layout import print_section_header, print_section_subheader
from pdfalyzer.util.adobe_strings import (COLOR_SPACE, D, DEST, EXT_G_STATE, FONT, K, KIDS,
NON_TREE_REFERENCES, NUMS, OBJECT_STREAM, OPEN_ACTION, P, PARENT, PREV, RESOURCES, SIZE,
STRUCT_ELEM, TRAILER, TYPE, UNLABELED, XOBJECT, XREF, XREF_STREAM)
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdfalyzer"
version = "1.8.2"
version = "1.8.3"
description = "A PDF analysis toolkit. Scan a PDF with relevant YARA rules, visualize its inner tree-like data structure in living color (lots of colors), force decodes of suspicious font binaries, and more."
authors = ["Michel de Cryptadamus <[email protected]>"]
license = "GPL-3.0-or-later"
@@ -49,7 +49,7 @@ PyPDF2 = "^2.10"
python-dotenv = "^0.21.0"
rich = "^12.5.1"
rich-argparse = "^0.3.0"
yaralyzer = "^0.3.3"
yaralyzer = "^0.4.0"

[tool.poetry.dev-dependencies]
pytest = "^7.1.2"
2 changes: 1 addition & 1 deletion tests/lib/binary/test_binary_scanner.py
@@ -11,7 +11,7 @@ def test_front_slash_quoted_bytes_extraction(font_info):


def test_extract_guillemet(font_info):
_check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 12, 2167)
_check_matches(font_info.binary_scanner.extract_guillemet_quoted_bytes, 59, 23138)


def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None:
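The helper's body is cut off above; a plausible implementation, assuming the extraction iterators yield (BytesMatch, BytesDecoder) tuples as in binary_scanner.py:

# Hedged guess at the truncated helper, for illustration only: tally the matches
# yielded by the iterator and compare the count and the total matched bytes.
def _check_matches(match_iterator, expected_matches: int, expected_bytes: int) -> None:
    matches = list(match_iterator())
    assert len(matches) == expected_matches
    assert sum(bytes_match.match_length for bytes_match, _decoder in matches) == expected_bytes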
2 changes: 1 addition & 1 deletion tests/test_file_export.py
@@ -20,7 +20,7 @@ def test_file_export(analyzing_malicious_documents_pdf_path, tmp_dir):
rendered_files = files_in_dir(tmp_dir)
assert len(rendered_files) == 6
file_sizes = sorted([path.getsize(f) for f in rendered_files])
assert_array_is_close(file_sizes, [7031, 8346, 55178, 142742, 181310, 1464895])
assert_array_is_close(file_sizes, [7031, 8346, 55178, 181310, 437873, 1464895])

for file in rendered_files:
remove(file)
