Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New analysis module for find_float_consts #78

Merged
merged 5 commits into from
Jan 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions reccmp/isledecomp/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .float_const import find_float_consts
123 changes: 123 additions & 0 deletions reccmp/isledecomp/analysis/float_const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Analysis related to x86 floating point instructions.
All floating point instructions use two byte opcodes. The first byte is in the range D8 to DF.
The second indicates the operation and pointer or registers used.

We are interested in floating point constants, so we want to exclude instructions that:
- access the status register or environment (FLDCW, FLDENV)
- store a value (FST, FSTP)
- refer to integers (FI*)

Then filter on pointers into read-only sections.
"""
import re
import struct
from typing import Iterator, NamedTuple
from reccmp.isledecomp.formats import PEImage

# Second opcode bytes (ModR/M) that encode a 32-bit absolute address operand.
# Each value selects one operation; in order these are:
# fadd, fmul, fcom, fcomp, fsub, fsubr, fdiv, fdivr.
_ABSOLUTE_OPERAND_MODRM = (0x05, 0x0D, 0x15, 0x1D, 0x25, 0x2D, 0x35, 0x3D)

# Two-byte opcodes that read a dword (single precision) memory operand:
# the eight D8 arithmetic/compare operations, plus fld (D9 05).
SINGLE_PRECISION_OPCODES = frozenset(
    {(0xD8, modrm) for modrm in _ABSOLUTE_OPERAND_MODRM} | {(0xD9, 0x05)}
)

# Two-byte opcodes that read a qword (double precision) memory operand:
# the eight DC arithmetic/compare operations, plus fld (DD 05).
DOUBLE_PRECISION_OPCODES = frozenset(
    {(0xDC, modrm) for modrm in _ABSOLUTE_OPERAND_MODRM} | {(0xDD, 0x05)}
)

# Every opcode we treat as a float instruction with a pointer operand.
FLOAT_OPCODES = SINGLE_PRECISION_OPCODES | DOUBLE_PRECISION_OPCODES


# Cheap pre-filter: one of the four first bytes, one of the eight second bytes,
# then any four bytes for the operand. This accepts more combinations than
# FLOAT_OPCODES contains, so callers must still check the opcode pair.
# The pattern is wrapped in a positive lookahead so that candidates which
# overlap each other are all reported.
FLOAT_INSTRUCTION_RE = re.compile(
    rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", re.DOTALL
)


class FloatInstruction(NamedTuple):
    """A candidate floating-point instruction that takes a pointer operand.

    Fields:
        address: The address (or offset) of the instruction.
        opcode: The two opcode bytes of the instruction, in order.
        pointer: The address used in the operand.
    """

    address: int
    opcode: tuple[int, int]
    pointer: int


def find_float_instructions_in_buffer(
    buf: bytes, base_addr: int = 0
) -> Iterator[FloatInstruction]:
    """Yield each floating-point instruction in `buf` that references a pointer.

    The buffer is scanned byte by byte rather than disassembled, so candidates
    may overlap and a hit is not guaranteed to be a real instruction.

    `base_addr` is added to the offset of each hit, so passing the address at
    which the buffer is loaded gives absolute instruction addresses.

    TODO: Uses `bytes` as the generic type for the Buffer protocol. See PEP 688 added in Python 3.12.
    """
    for candidate in FLOAT_INSTRUCTION_RE.finditer(buf):
        raw = candidate.group(1)
        opcode = tuple(raw[:2])

        # The regex accepts a superset of the opcodes we care about.
        if opcode not in FLOAT_OPCODES:
            continue

        # The operand is the little-endian dword that follows the opcode.
        (pointer,) = struct.unpack_from("<I", raw, 2)
        yield FloatInstruction(
            address=base_addr + candidate.start(),
            opcode=opcode,
            pointer=pointer,
        )


class FloatConstant(NamedTuple):
    """A floating-point constant found in the image.

    Fields:
        address: The address where the constant is stored.
        size: The width of the constant in bytes.
        value: The decoded value.
    """

    address: int
    size: int
    value: float


def find_float_consts(image: PEImage) -> Iterator[FloatConstant]:
    """Find floating point constants referenced by code in the image.

    Floating point instructions that refer to a memory address can
    point to constant values. Search the code sections to find FP
    instructions and check whether the pointer address refers to
    read-only data.

    Each constant is yielded only once, even if several instructions
    refer to it. The size (4 or 8 bytes) is taken from the first valid
    instruction that references the address.

    Floats stored in writable sections are deliberately not reported.
    For example, a global such as `float g_Gravity = 9.8f;` that is
    assigned elsewhere lives in a writable section and is skipped.
    Such values should be annotated as variables rather than being
    labelled as float constants.
    """

    # Multiple instructions can refer to the same float.
    # Return each float only once from this function.
    seen = set()

    # TODO: Should check all code and const data sections.
    code_sections = (image.get_section_by_name(".text"),)
    const_sections = (image.get_section_by_name(".rdata"),)

    for sect in code_sections:
        for inst in find_float_instructions_in_buffer(sect.view, sect.virtual_address):
            if inst.pointer in seen:
                continue

            # Make sure that the address of the operand is a relocation.
            # This must be checked before the pointer is marked as seen.
            # The scan can hit bytes that merely look like an instruction,
            # and such a hit must not hide a later genuine instruction
            # that references the same constant.
            if inst.address + 2 not in image.relocations:
                continue

            # From here on the outcome depends only on the pointer, so
            # there is no need to look at this pointer again.
            seen.add(inst.pointer)

            # Ignore instructions that point to variables
            if not any(
                const_sect.contains_vaddr(inst.pointer) for const_sect in const_sections
            ):
                continue

            if inst.opcode in SINGLE_PRECISION_OPCODES:
                # dword ptr -- single precision
                (float_value,) = struct.unpack("<f", image.read(inst.pointer, 4))
                yield FloatConstant(inst.pointer, 4, float_value)

            elif inst.opcode in DOUBLE_PRECISION_OPCODES:
                # qword ptr -- double precision
                (float_value,) = struct.unpack("<d", image.read(inst.pointer, 8))
                yield FloatConstant(inst.pointer, 8, float_value)
5 changes: 3 additions & 2 deletions reccmp/isledecomp/compare/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from reccmp.isledecomp.compare.asm import ParseAsm
from reccmp.isledecomp.compare.asm.replacement import create_name_lookup
from reccmp.isledecomp.compare.asm.fixes import assert_fixup, find_effective_match
from reccmp.isledecomp.analysis import find_float_consts
from .db import EntityDb, ReccmpEntity, ReccmpMatch
from .diff import combined_diff, CombinedDiffOutput
from .lines import LinesDb
Expand Down Expand Up @@ -422,12 +423,12 @@ def _find_float_const(self):
"""Add floating point constants in each binary to the database.
We are not matching anything right now because these values are not
deduped like strings."""
for addr, size, float_value in self.orig_bin.find_float_consts():
for addr, size, float_value in find_float_consts(self.orig_bin):
self._db.set_orig_symbol(
addr, type=EntityType.FLOAT, name=str(float_value), size=size
)

for addr, size, float_value in self.recomp_bin.find_float_consts():
for addr, size, float_value in find_float_consts(self.recomp_bin):
self._db.set_recomp_symbol(
addr, type=EntityType.FLOAT, name=str(float_value), size=size
)
Expand Down
40 changes: 2 additions & 38 deletions reccmp/isledecomp/formats/pe.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ class PEImage(Image):

# FIXME: do these belong to PEImage? Shouldn't the loader apply these to the data?
_relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False)
_relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
# find_str: bool = dataclasses.field(default=False, repr=False)
imports: list[tuple[str, str, int]] = dataclasses.field(
default_factory=list, repr=False
Expand Down Expand Up @@ -734,49 +734,13 @@ def _populate_relocations(self):
# We are now interested in the relocated addresses themselves. Seek to the
# address where there is a relocation, then read the four bytes into our set.
reloc_addrs.sort()
self._relocations = set(reloc_addrs)
self.relocations = set(reloc_addrs)

for section_id, offset in map(self.get_relative_addr, reloc_addrs):
section = self.get_section_by_index(section_id)
(relocated_addr,) = struct.unpack("<I", section.view[offset : offset + 4])
self._relocated_addrs.add(relocated_addr)

def find_float_consts(self) -> Iterator[tuple[int, int, float]]:
"""Floating point instructions that refer to a memory address can
point to constant values. Search the code sections to find FP
instructions and check whether the pointer address refers to
read-only data."""

# TODO: Should check any section that has code, not just .text
text = self.get_section_by_name(".text")
rdata = self.get_section_by_name(".rdata")

# These are the addresses where a relocation occurs.
# Meaning: it points to an absolute address of something
for addr in self._relocations:
if not text.contains_vaddr(addr):
continue

# Read the two bytes before the relocated address.
# We will check against possible float opcodes
raw = text.read_virtual(addr - 2, 6)
(opcode, opcode_ext, const_addr) = struct.unpack("<BBL", raw)

# Skip right away if this is not const data
if not rdata.contains_vaddr(const_addr):
continue

if opcode_ext in (0x5, 0xD, 0x15, 0x1D, 0x25, 0x2D, 0x35, 0x3D):
if opcode in (0xD8, 0xD9):
# dword ptr -- single precision
(float_value,) = struct.unpack("<f", self.read(const_addr, 4))
yield (const_addr, 4, float_value)

elif opcode in (0xDC, 0xDD):
# qword ptr -- double precision
(float_value,) = struct.unpack("<d", self.read(const_addr, 8))
yield (const_addr, 8, float_value)

def _populate_imports(self):
"""Parse .idata to find imported DLLs and their functions."""
import_directory = self.get_data_directory_region(
Expand Down
43 changes: 43 additions & 0 deletions tests/test_analysis_float_const.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Test find_float_const for PE images"""

from reccmp.isledecomp.formats import PEImage
from reccmp.isledecomp.analysis.float_const import (
find_float_instructions_in_buffer,
find_float_consts,
)


def test_float_detect_overlap():
    """Must be able to match potential instructions that overlap.
    Because we are not disassembling, we don't know whether a given
    byte is the start of an instruction."""
    code = b"\xd8\x05\xd8\x05\x00\x10\x00\x10"
    floats = list(find_float_instructions_in_buffer(code))
    assert len(floats) == 2

    # Check the decoded fields too, not just the count.
    # The operand of the first candidate is the next four bytes, which
    # include the opcode of the second candidate. No base address is
    # given, so the addresses are plain offsets into the buffer.
    assert floats[0] == (0, (0xD8, 0x05), 0x100005D8)
    assert floats[1] == (2, (0xD8, 0x05), 0x10001000)


def test_basic_float_detection(binfile: PEImage):
    """Check that some known floats are found in the sample PE image,
    and that a float variable from .data is left out."""
    found = list(find_float_consts(binfile))

    expected = (
        # The same value in single and double precision
        (0x100DBD38, 4, 0.5),
        (0x100D8BC0, 8, 0.5),
        # A float with an integer value
        (0x100D6F88, 4, 1024.0),
        # Both pi, both doubles, but different levels of precision
        (0x100DB8F0, 8, 3.141592653589793),
        (0x100DBD50, 8, 3.14159265359),
    )
    for const in expected:
        assert const in found

    unexpected = (
        # A float variable from .data must be ignored
        (0x100F7500, 4, 0.1),
    )
    for const in unexpected:
        assert const not in found


def test_floats_appear_once(binfile: PEImage):
    """Multiple instructions may point at the same constant.
    Our list should only return each constant once."""
    floats = list(find_float_consts(binfile))

    # Compare addresses rather than whole tuples. The same address
    # reported twice with a different size, or with a NaN value, would
    # otherwise count as two distinct entries and go unnoticed.
    addresses = [addr for addr, _size, _value in floats]

    assert len(addresses) == len(set(addresses))
Loading