isledecomp · disinvite · Jan 25, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/reccmp/isledecomp/analysis/__init__.py b/reccmp/isledecomp/analysis/__init__.py
@@ -0,0 +1 @@
+from .float_const import find_float_consts
diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py
@@ -0,0 +1,123 @@
+"""Analysis related to x86 floating point instructions.
+All floating point instructions use two byte opcodes. The first byte is in the range D8 to DF.
+The second indicates the operation and pointer or registers used.
+
+We are interested in floating point constants, so we want to exclude instructions that:
+- access the status register or environment (FLDCW, FLDENV)
+- store a value (FST, FSTP)
+- refer to integers (FI*)
+
+Then filter on pointers into read-only sections.
+"""
+import re
+import struct
+from typing import Iterator, NamedTuple
+from reccmp.isledecomp.formats import PEImage
+
+# Two-byte opcodes whose operand is a pointer to a dword (single precision).
+SINGLE_PRECISION_OPCODES = frozenset(
+    {
+        (0xD9, 0x05),  # fld
+        (0xD8, 0x05),  # fadd
+        (0xD8, 0x0D),  # fmul
+        (0xD8, 0x15),  # fcom
+        (0xD8, 0x1D),  # fcomp
+        (0xD8, 0x25),  # fsub
+        (0xD8, 0x2D),  # fsubr
+        (0xD8, 0x35),  # fdiv
+        (0xD8, 0x3D),  # fdivr
+    }
+)
+
+# The same operations where the pointer refers to a qword (double precision).
+DOUBLE_PRECISION_OPCODES = frozenset(
+    {
+        (0xDD, 0x05),  # fld
+        (0xDC, 0x05),  # fadd
+        (0xDC, 0x0D),  # fmul
+        (0xDC, 0x15),  # fcom
+        (0xDC, 0x1D),  # fcomp
+        (0xDC, 0x25),  # fsub
+        (0xDC, 0x2D),  # fsubr
+        (0xDC, 0x35),  # fdiv
+        (0xDC, 0x3D),  # fdivr
+    }
+)
+
+FLOAT_OPCODES = SINGLE_PRECISION_OPCODES | DOUBLE_PRECISION_OPCODES
+
+# Lookahead so that overlapping candidates all match. Superset of FLOAT_OPCODES.
+FLOAT_INSTRUCTION_RE = re.compile(
+    rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", flags=re.S
+)
+
+
+class FloatInstruction(NamedTuple):
+    # Where the instruction starts: base address plus its offset in the buffer
+    address: int
+    # The two opcode bytes, in order, as integers
+    opcode: tuple[int, int]
+    # Operand: the four bytes after the opcode, read as a little-endian uint32
+    pointer: int
+
+
+def find_float_instructions_in_buffer(
+    buf: bytes, base_addr: int = 0
+) -> Iterator[FloatInstruction]:
+    """Scan a binary blob and yield every floating-point instruction that uses a pointer operand.
+    The address of each result is its offset in the blob plus `base_addr` (zero if not given).
+    TODO: Uses `bytes` as the generic type for the Buffer protocol. See PEP 688 added in Python 3.12.
+    """
+    for candidate in FLOAT_INSTRUCTION_RE.finditer(buf):
+        raw = candidate.group(1)
+        opcode = (raw[0], raw[1])
+        if opcode not in FLOAT_OPCODES:
+            continue
+        (pointer,) = struct.unpack_from("<I", raw, 2)
+        yield FloatInstruction(base_addr + candidate.start(), opcode, pointer)
+
+
+class FloatConstant(NamedTuple):
+    address: int  # the pointer operand that refers to the constant
+    size: int  # number of bytes read: 4 for single precision, 8 for double
+    value: float  # the number unpacked from those bytes
+
+
+def find_float_consts(image: PEImage) -> Iterator[FloatConstant]:
+    """Floating point instructions that refer to a memory address can
+    point to constant values. Search the code sections to find FP
+    instructions and check whether the pointer address refers to
+    read-only data. Each constant address is returned at most once."""
+
+    # Multiple instructions can refer to the same float.
+    # Only pointers that produced a constant are recorded here, so a rejected
+    # candidate cannot hide a valid instruction that uses the same pointer.
+    seen = set()
+
+    # TODO: Should check all code and const data sections.
+    code_sections = (image.get_section_by_name(".text"),)
+    const_sections = (image.get_section_by_name(".rdata"),)
+
+    for sect in code_sections:
+        for inst in find_float_instructions_in_buffer(sect.view, sect.virtual_address):
+            if inst.pointer in seen:
+                continue
+
+            # Make sure that the address of the operand is a relocation.
+            if inst.address + 2 not in image.relocations:
+                continue
+
+            # Ignore instructions that point to variables
+            if not any(s.contains_vaddr(inst.pointer) for s in const_sections):
+                continue
+
+            if inst.opcode in SINGLE_PRECISION_OPCODES:
+                size, fmt = 4, "<f"  # dword ptr -- single precision
+            elif inst.opcode in DOUBLE_PRECISION_OPCODES:
+                size, fmt = 8, "<d"  # qword ptr -- double precision
+            else:
+                continue
+
+            seen.add(inst.pointer)
+            (float_value,) = struct.unpack(fmt, image.read(inst.pointer, size))
+            yield FloatConstant(inst.pointer, size, float_value)
diff --git a/reccmp/isledecomp/compare/core.py b/reccmp/isledecomp/compare/core.py
@@ -19,6 +19,7 @@
 from reccmp.isledecomp.compare.asm import ParseAsm
 from reccmp.isledecomp.compare.asm.replacement import create_name_lookup
 from reccmp.isledecomp.compare.asm.fixes import assert_fixup, find_effective_match
+from reccmp.isledecomp.analysis import find_float_consts
 from .db import EntityDb, ReccmpEntity, ReccmpMatch
 from .diff import combined_diff, CombinedDiffOutput
 from .lines import LinesDb
@@ -422,12 +423,12 @@ def _find_float_const(self):
         """Add floating point constants in each binary to the database.
         We are not matching anything right now because these values are not
         deduped like strings."""
-        for addr, size, float_value in self.orig_bin.find_float_consts():
+        for addr, size, float_value in find_float_consts(self.orig_bin):
             self._db.set_orig_symbol(
                 addr, type=EntityType.FLOAT, name=str(float_value), size=size
             )
 
-        for addr, size, float_value in self.recomp_bin.find_float_consts():
+        for addr, size, float_value in find_float_consts(self.recomp_bin):
             self._db.set_recomp_symbol(
                 addr, type=EntityType.FLOAT, name=str(float_value), size=size
             )

diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py
@@ -492,7 +492,7 @@ class PEImage(Image):
 
     # FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data?
     _relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False)
-    _relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
+    relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
     # find_str: bool = dataclasses.field(default=False, repr=False)
     imports: list[tuple[str, str, int]] = dataclasses.field(
         default_factory=list, repr=False
@@ -734,49 +734,13 @@ def _populate_relocations(self):
         # We are now interested in the relocated addresses themselves. Seek to the
         # address where there is a relocation, then read the four bytes into our set.
         reloc_addrs.sort()
-        self._relocations = set(reloc_addrs)
+        self.relocations = set(reloc_addrs)
 
         for section_id, offset in map(self.get_relative_addr, reloc_addrs):
             section = self.get_section_by_index(section_id)
             (relocated_addr,) = struct.unpack("<I", section.view[offset : offset + 4])
             self._relocated_addrs.add(relocated_addr)
 
-    def find_float_consts(self) -> Iterator[tuple[int, int, float]]:
-        """Floating point instructions that refer to a memory address can
-        point to constant values. Search the code sections to find FP
-        instructions and check whether the pointer address refers to
-        read-only data."""
-
-        # TODO: Should check any section that has code, not just .text
-        text = self.get_section_by_name(".text")
-        rdata = self.get_section_by_name(".rdata")
-
-        # These are the addresses where a relocation occurs.
-        # Meaning: it points to an absolute address of something
-        for addr in self._relocations:
-            if not text.contains_vaddr(addr):
-                continue
-
-            # Read the two bytes before the relocated address.
-            # We will check against possible float opcodes
-            raw = text.read_virtual(addr - 2, 6)
-            (opcode, opcode_ext, const_addr) = struct.unpack("<BBL", raw)
-
-            # Skip right away if this is not const data
-            if not rdata.contains_vaddr(const_addr):
-                continue
-
-            if opcode_ext in (0x5, 0xD, 0x15, 0x1D, 0x25, 0x2D, 0x35, 0x3D):
-                if opcode in (0xD8, 0xD9):
-                    # dword ptr -- single precision
-                    (float_value,) = struct.unpack("<f", self.read(const_addr, 4))
-                    yield (const_addr, 4, float_value)
-
-                elif opcode in (0xDC, 0xDD):
-                    # qword ptr -- double precision
-                    (float_value,) = struct.unpack("<d", self.read(const_addr, 8))
-                    yield (const_addr, 8, float_value)
-
     def _populate_imports(self):
         """Parse .idata to find imported DLLs and their functions."""
         import_directory = self.get_data_directory_region(

diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py
@@ -0,0 +1,43 @@
+"""Test find_float_consts for PE images"""
+
+from reccmp.isledecomp.formats import PEImage
+from reccmp.isledecomp.analysis.float_const import (
+    find_float_instructions_in_buffer,
+    find_float_consts,
+)
+
+
+def test_float_detect_overlap():
+    """Must be able to match potential instructions that overlap.
+    Because we are not disassembling, we don't know whether a given
+    byte is the start of an instruction. Both candidates must decode fully."""
+    code = b"\xd8\x05\xd8\x05\x00\x10\x00\x10"
+    floats = list(find_float_instructions_in_buffer(code))
+    assert floats == [(0, (0xD8, 0x05), 0x100005D8), (2, (0xD8, 0x05), 0x10001000)]
+
+
+def test_basic_float_detection(binfile: PEImage):
+    """Make sure we detect some known floats in our sample PE image"""
+    floats = list(find_float_consts(binfile))
+
+    # Single and double precision, same value
+    assert (0x100DBD38, 4, 0.5) in floats
+    assert (0x100D8BC0, 8, 0.5) in floats
+
+    # Whole number stored as a single precision float
+    assert (0x100D6F88, 4, 1024.0) in floats
+
+    # Both pi, both doubles, but different levels of precision
+    assert (0x100DB8F0, 8, 3.141592653589793) in floats
+    assert (0x100DBD50, 8, 3.14159265359) in floats
+
+    # Ignore float variable from .data. Check the address: unpacked 0.1f != 0.1
+    assert all(addr != 0x100F7500 for addr, _, _ in floats)
+
+
+def test_floats_appear_once(binfile: PEImage):
+    """Multiple instructions may point at the same constant.
+    Our list should only return each constant address once."""
+    addrs = [addr for addr, _, _ in find_float_consts(binfile)]
+
+    assert len(addrs) == len(set(addrs))