From bce7cf5abb5822a8055f9e81cb79dde741244f5e Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Tue, 21 Jan 2025 23:32:16 -0500
Subject: [PATCH 1/5] Floating point analysis module

---
 reccmp/isledecomp/analysis/__init__.py    |   1 +
 reccmp/isledecomp/analysis/float_const.py | 124 ++++++++++++++++++++++
 reccmp/isledecomp/compare/core.py         |   5 +-
 tests/test_analysis_float_const.py        |  40 +++++++
 4 files changed, 168 insertions(+), 2 deletions(-)
 create mode 100644 reccmp/isledecomp/analysis/__init__.py
 create mode 100644 reccmp/isledecomp/analysis/float_const.py
 create mode 100644 tests/test_analysis_float_const.py

diff --git a/reccmp/isledecomp/analysis/__init__.py b/reccmp/isledecomp/analysis/__init__.py
new file mode 100644
index 00000000..1a1d3eec
--- /dev/null
+++ b/reccmp/isledecomp/analysis/__init__.py
@@ -0,0 +1 @@
+from .float_const import find_float_consts
diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py
new file mode 100644
index 00000000..d3ee4a71
--- /dev/null
+++ b/reccmp/isledecomp/analysis/float_const.py
@@ -0,0 +1,124 @@
+"""Analysis related to x86 floating point instructions.
+All floating point instructions use two byte opcodes. The first byte is in the range D8 to DF.
+The second indicates the operation and pointer or registers used.
+
+We are interested in floating point constants, so we want to exclude instructions that:
+- access the status register or environment (FLDCW, FLDENV)
+- store a value (FST, FSTP)
+- refer to integers (FI*)
+
+Then filter on pointers into read-only sections.
+"""
+import re
+import struct
+from typing import Iterator, NamedTuple
+from reccmp.isledecomp.formats import PEImage
+
+SINGLE_PRECISION_OPCODES = frozenset(
+    [
+        (0xD8, 0x05),  # fadd
+        (0xD8, 0x0D),  # fmul
+        (0xD8, 0x15),  # fcom
+        (0xD8, 0x1D),  # fcomp
+        (0xD8, 0x25),  # fsub
+        (0xD8, 0x2D),  # fsubr
+        (0xD8, 0x35),  # fdiv
+        (0xD8, 0x3D),  # fdivr
+        (0xD9, 0x05),  # fld
+    ]
+)
+
+DOUBLE_PRECISION_OPCODES = frozenset(
+    [
+        (0xDC, 0x05),  # fadd
+        (0xDC, 0x0D),  # fmul
+        (0xDC, 0x15),  # fcom
+        (0xDC, 0x1D),  # fcomp
+        (0xDC, 0x25),  # fsub
+        (0xDC, 0x2D),  # fsubr
+        (0xDC, 0x35),  # fdiv
+        (0xDC, 0x3D),  # fdivr
+        (0xDD, 0x05),  # fld
+    ]
+)
+
+FLOAT_OPCODES = frozenset([*SINGLE_PRECISION_OPCODES, *DOUBLE_PRECISION_OPCODES])
+
+
+# Match floating point instructions.
+FLOAT_INSTRUCTION_RE = re.compile(
+    rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", flags=re.S
+)
+
+
+class FloatInstruction(NamedTuple):
+    # The address (or offset) of the instruction
+    address: int
+    # Two byte opcode of the instruction
+    opcode: tuple[int, int]
+    # The address used in the operand
+    pointer: int
+
+
+def find_float_instructions_in_bytes(
+    raw: bytes, base_addr: int = 0
+) -> Iterator[FloatInstruction]:
+    """Search the given binary blob for floating-point instructions that reference a pointer.
+    If the base addr is given, add it to the offset of the instruction to get an absolute address.
+    """
+    for match in FLOAT_INSTRUCTION_RE.finditer(raw):
+        inst = match.group(1)
+        opcode = (inst[0], inst[1])
+
+        if opcode in FLOAT_OPCODES:
+            (pointer,) = struct.unpack("<I", inst[2:6])
+            yield FloatInstruction(base_addr + match.start(), opcode, pointer)
+
+
+class FloatConstant(NamedTuple):
+    address: int
+    size: int
+    value: float
+
+
+def find_float_consts(image: PEImage) -> Iterator[FloatConstant]:
+    """Floating point instructions that refer to a memory address can
+    point to constant values. Search the code sections to find FP
+    instructions and check whether the pointer address refers to
+    read-only data."""
+
+    # Multiple instructions can refer to the same float.
+    # Return each float only once from this function.
+    seen = set()
+
+    # TODO: Should check all code and const data sections.
+    code_sections = (image.get_section_by_name(".text"),)
+    const_sections = (image.get_section_by_name(".rdata"),)
+
+    for sect in code_sections:
+        for inst in find_float_instructions_in_bytes(
+            bytes(sect.view), sect.virtual_address
+        ):
+            if inst.pointer in seen:
+                continue
+
+            seen.add(inst.pointer)
+
+            # Make sure that the address of the operand is a relocation.
+            # pylint: disable=protected-access
+            if inst.address + 2 not in image._relocations:
+                continue
+
+            # Ignore instructions that point to variables
+            if any(
+                const_sect.contains_vaddr(inst.pointer) for const_sect in const_sections
+            ):
+                if inst.opcode in SINGLE_PRECISION_OPCODES:
+                    # dword ptr -- single precision
+                    (float_value,) = struct.unpack("<f", image.read(inst.pointer, 4))
+                    yield FloatConstant(inst.pointer, 4, float_value)
+
+                elif inst.opcode in DOUBLE_PRECISION_OPCODES:
+                    # qword ptr -- double precision
+                    (float_value,) = struct.unpack("<d", image.read(inst.pointer, 8))
+                    yield FloatConstant(inst.pointer, 8, float_value)
diff --git a/reccmp/isledecomp/compare/core.py b/reccmp/isledecomp/compare/core.py
index e9f17e6e..9726ecd6 100644
--- a/reccmp/isledecomp/compare/core.py
+++ b/reccmp/isledecomp/compare/core.py
@@ -19,6 +19,7 @@
 from reccmp.isledecomp.compare.asm import ParseAsm
 from reccmp.isledecomp.compare.asm.replacement import create_name_lookup
 from reccmp.isledecomp.compare.asm.fixes import assert_fixup, find_effective_match
+from reccmp.isledecomp.analysis import find_float_consts
 from .db import EntityDb, ReccmpEntity, ReccmpMatch
 from .diff import combined_diff, CombinedDiffOutput
 from .lines import LinesDb
@@ -422,12 +423,12 @@ def _find_float_const(self):
         """Add floating point constants in each binary to the database.
         We are not matching anything right now because these values are not
         deduped like strings."""
-        for addr, size, float_value in self.orig_bin.find_float_consts():
+        for addr, size, float_value in find_float_consts(self.orig_bin):
             self._db.set_orig_symbol(
                 addr, type=EntityType.FLOAT, name=str(float_value), size=size
             )
 
-        for addr, size, float_value in self.recomp_bin.find_float_consts():
+        for addr, size, float_value in find_float_consts(self.recomp_bin):
             self._db.set_recomp_symbol(
                 addr, type=EntityType.FLOAT, name=str(float_value), size=size
             )
diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py
new file mode 100644
index 00000000..b2f1e7a9
--- /dev/null
+++ b/tests/test_analysis_float_const.py
@@ -0,0 +1,40 @@
+"""Test find_float_consts for PE images"""
+
+from reccmp.isledecomp.formats import PEImage
+from reccmp.isledecomp.analysis.float_const import (
+    find_float_instructions_in_bytes,
+    find_float_consts,
+)
+
+
+def test_float_detect_overlap():
+    """Must be able to match potential instructions that overlap.
+    Because we are not disassembling, we don't know whether a given
+    byte is the start of an instruction."""
+    code = b"\xd8\x05\xd8\x05\x00\x10\x00\x10"
+    floats = list(find_float_instructions_in_bytes(code))
+    assert len(floats) == 2
+
+
+def test_basic_float_detection(binfile: PEImage):
+    """Make sure we detect some known floats in our sample PE image"""
+    floats = list(find_float_consts(binfile))
+
+    # Single and double precision, same value
+    assert (0x100DBD38, 4, 0.5) in floats
+    assert (0x100D8BC0, 8, 0.5) in floats
+
+    # Integer
+    assert (0x100D6F88, 4, 1024.0) in floats
+
+    # Both pi, both doubles, but different levels of precision
+    assert (0x100DB8F0, 8, 3.141592653589793) in floats
+    assert (0x100DBD50, 8, 3.14159265359) in floats
+
+
+def test_floats_appear_once(binfile: PEImage):
+    """Multiple instructions may point at the same constant.
+    Our list should only return each constant once."""
+    floats = list(find_float_consts(binfile))
+
+    assert len(floats) == len(set(floats))

From 2847c915cb76682887ff1f8b548edd7f91653955 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Tue, 21 Jan 2025 23:35:38 -0500
Subject: [PATCH 2/5] Remove find_float_consts from PE class

---
 reccmp/isledecomp/formats/pe.py | 36 ---------------------------------
 1 file changed, 36 deletions(-)

diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py
index 55c72a87..68fba05e 100644
--- a/reccmp/isledecomp/formats/pe.py
+++ b/reccmp/isledecomp/formats/pe.py
@@ -741,42 +741,6 @@ def _populate_relocations(self):
             (relocated_addr,) = struct.unpack("<I", section.view[offset : offset + 4])
             self._relocated_addrs.add(relocated_addr)
 
-    def find_float_consts(self) -> Iterator[tuple[int, int, float]]:
-        """Floating point instructions that refer to a memory address can
-        point to constant values. Search the code sections to find FP
-        instructions and check whether the pointer address refers to
-        read-only data."""
-
-        # TODO: Should check any section that has code, not just .text
-        text = self.get_section_by_name(".text")
-        rdata = self.get_section_by_name(".rdata")
-
-        # These are the addresses where a relocation occurs.
-        # Meaning: it points to an absolute address of something
-        for addr in self._relocations:
-            if not text.contains_vaddr(addr):
-                continue
-
-            # Read the two bytes before the relocated address.
-            # We will check against possible float opcodes
-            raw = text.read_virtual(addr - 2, 6)
-            (opcode, opcode_ext, const_addr) = struct.unpack("<BBL", raw)
-
-            # Skip right away if this is not const data
-            if not rdata.contains_vaddr(const_addr):
-                continue
-
-            if opcode_ext in (0x5, 0xD, 0x15, 0x1D, 0x25, 0x2D, 0x35, 0x3D):
-                if opcode in (0xD8, 0xD9):
-                    # dword ptr -- single precision
-                    (float_value,) = struct.unpack("<f", self.read(const_addr, 4))
-                    yield (const_addr, 4, float_value)
-
-                elif opcode in (0xDC, 0xDD):
-                    # qword ptr -- double precision
-                    (float_value,) = struct.unpack("<d", self.read(const_addr, 8))
-                    yield (const_addr, 8, float_value)
-
     def _populate_imports(self):
         """Parse .idata to find imported DLLs and their functions."""
         import_directory = self.get_data_directory_region(

From 195bfc8be86fcf7e5d1cb567a761964ac9f1b654 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Wed, 22 Jan 2025 14:38:11 -0500
Subject: [PATCH 3/5] Use collections.abc.Buffer, make pe.relocations public

---
 reccmp/isledecomp/analysis/float_const.py | 17 ++++++++---------
 reccmp/isledecomp/formats/pe.py           |  4 ++--
 tests/test_analysis_float_const.py        |  4 ++--
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py
index d3ee4a71..3b1cfc5e 100644
--- a/reccmp/isledecomp/analysis/float_const.py
+++ b/reccmp/isledecomp/analysis/float_const.py
@@ -11,6 +11,7 @@
 """
 import re
 import struct
+from collections.abc import Buffer
 from typing import Iterator, NamedTuple
 from reccmp.isledecomp.formats import PEImage
 
@@ -45,7 +46,8 @@
 FLOAT_OPCODES = frozenset([*SINGLE_PRECISION_OPCODES, *DOUBLE_PRECISION_OPCODES])
 
 
-# Match floating point instructions.
+# Match a superset of the floating point instructions above.
+# Uses positive lookahead to support overlapping matches.
 FLOAT_INSTRUCTION_RE = re.compile(
     rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", flags=re.S
 )
@@ -60,13 +62,13 @@ class FloatInstruction(NamedTuple):
     pointer: int
 
 
-def find_float_instructions_in_bytes(
-    raw: bytes, base_addr: int = 0
+def find_float_instructions_in_buffer(
+    buf: Buffer, base_addr: int = 0
 ) -> Iterator[FloatInstruction]:
     """Search the given binary blob for floating-point instructions that reference a pointer.
     If the base addr is given, add it to the offset of the instruction to get an absolute address.
     """
-    for match in FLOAT_INSTRUCTION_RE.finditer(raw):
+    for match in FLOAT_INSTRUCTION_RE.finditer(buf):
         inst = match.group(1)
         opcode = (inst[0], inst[1])
 
@@ -96,17 +98,14 @@ def find_float_consts(image: PEImage) -> Iterator[FloatConstant]:
     const_sections = (image.get_section_by_name(".rdata"),)
 
     for sect in code_sections:
-        for inst in find_float_instructions_in_bytes(
-            bytes(sect.view), sect.virtual_address
-        ):
+        for inst in find_float_instructions_in_buffer(sect.view, sect.virtual_address):
             if inst.pointer in seen:
                 continue
 
             seen.add(inst.pointer)
 
             # Make sure that the address of the operand is a relocation.
-            # pylint: disable=protected-access
-            if inst.address + 2 not in image._relocations:
+            if inst.address + 2 not in image.relocations:
                 continue
 
             # Ignore instructions that point to variables
diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py
index 68fba05e..677c79cf 100644
--- a/reccmp/isledecomp/formats/pe.py
+++ b/reccmp/isledecomp/formats/pe.py
@@ -492,7 +492,7 @@ class PEImage(Image):
 
     # FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data?
     _relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False)
-    _relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
+    relocations: set[int] = dataclasses.field(default_factory=set, repr=False)
     # find_str: bool = dataclasses.field(default=False, repr=False)
     imports: list[tuple[str, str, int]] = dataclasses.field(
         default_factory=list, repr=False
@@ -734,7 +734,7 @@ def _populate_relocations(self):
         # We are now interested in the relocated addresses themselves. Seek to the
         # address where there is a relocation, then read the four bytes into our set.
         reloc_addrs.sort()
-        self._relocations = set(reloc_addrs)
+        self.relocations = set(reloc_addrs)
 
         for section_id, offset in map(self.get_relative_addr, reloc_addrs):
             section = self.get_section_by_index(section_id)
diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py
index b2f1e7a9..c78722f5 100644
--- a/tests/test_analysis_float_const.py
+++ b/tests/test_analysis_float_const.py
@@ -2,7 +2,7 @@
 
 from reccmp.isledecomp.formats import PEImage
 from reccmp.isledecomp.analysis.float_const import (
-    find_float_instructions_in_bytes,
+    find_float_instructions_in_buffer,
     find_float_consts,
 )
 
@@ -12,7 +12,7 @@ def test_float_detect_overlap():
     Because we are not disassembling, we don't know whether a given
     byte is the start of an instruction."""
     code = b"\xd8\x05\xd8\x05\x00\x10\x00\x10"
-    floats = list(find_float_instructions_in_bytes(code))
+    floats = list(find_float_instructions_in_buffer(code))
     assert len(floats) == 2
 
 

From e6786e9b3dd50c12790073c4379260b54e4bfa75 Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Wed, 22 Jan 2025 14:43:27 -0500
Subject: [PATCH 4/5] Remove collections.abc.Buffer for pre-3.12 Python

---
 reccmp/isledecomp/analysis/float_const.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py
index 3b1cfc5e..95093769 100644
--- a/reccmp/isledecomp/analysis/float_const.py
+++ b/reccmp/isledecomp/analysis/float_const.py
@@ -11,7 +11,6 @@
 """
 import re
 import struct
-from collections.abc import Buffer
 from typing import Iterator, NamedTuple
 from reccmp.isledecomp.formats import PEImage
 
@@ -63,10 +62,11 @@ class FloatInstruction(NamedTuple):
 
 
 def find_float_instructions_in_buffer(
-    buf: Buffer, base_addr: int = 0
+    buf: bytes, base_addr: int = 0
 ) -> Iterator[FloatInstruction]:
     """Search the given binary blob for floating-point instructions that reference a pointer.
     If the base addr is given, add it to the offset of the instruction to get an absolute address.
+    TODO: `buf` accepts any buffer-protocol object; annotate as Buffer (PEP 688) on Python 3.12+.
     """
     for match in FLOAT_INSTRUCTION_RE.finditer(buf):
         inst = match.group(1)

From 7e8a6a9823b3202b20da8c96c483af675f074e8a Mon Sep 17 00:00:00 2001
From: disinvite <disinvite@users.noreply.github.com>
Date: Thu, 23 Jan 2025 23:17:20 -0500
Subject: [PATCH 5/5] Should ignore float variable

---
 tests/test_analysis_float_const.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py
index c78722f5..5ab3fd53 100644
--- a/tests/test_analysis_float_const.py
+++ b/tests/test_analysis_float_const.py
@@ -31,6 +31,9 @@ def test_basic_float_detection(binfile: PEImage):
     assert (0x100DB8F0, 8, 3.141592653589793) in floats
     assert (0x100DBD50, 8, 3.14159265359) in floats
 
+    # Ignore float variable from .data
+    assert (0x100F7500, 4, 0.1) not in floats
+
 
 def test_floats_appear_once(binfile: PEImage):
     """Multiple instructions may point at the same constant.