From bce7cf5abb5822a8055f9e81cb79dde741244f5e Mon Sep 17 00:00:00 2001 From: disinvite Date: Tue, 21 Jan 2025 23:32:16 -0500 Subject: [PATCH 1/5] Floating point analysis module --- reccmp/isledecomp/analysis/__init__.py | 1 + reccmp/isledecomp/analysis/float_const.py | 124 ++++++++++++++++++++++ reccmp/isledecomp/compare/core.py | 5 +- tests/test_analysis_float_const.py | 40 +++++++ 4 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 reccmp/isledecomp/analysis/__init__.py create mode 100644 reccmp/isledecomp/analysis/float_const.py create mode 100644 tests/test_analysis_float_const.py diff --git a/reccmp/isledecomp/analysis/__init__.py b/reccmp/isledecomp/analysis/__init__.py new file mode 100644 index 00000000..1a1d3eec --- /dev/null +++ b/reccmp/isledecomp/analysis/__init__.py @@ -0,0 +1 @@ +from .float_const import find_float_consts diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py new file mode 100644 index 00000000..d3ee4a71 --- /dev/null +++ b/reccmp/isledecomp/analysis/float_const.py @@ -0,0 +1,124 @@ +"""Analysis related to x86 floating point instructions. +All floating point instructions use two byte opcodes. The first byte is in the range D8 to DF. +The second indicates the operation and pointer or registers used. + +We are interested in floating point constants, so we want to exclude instructions that: +- access the status register or environment (FLDCW, FLDENV) +- store a value (FST, FSTP) +- refer to integers (FI*) + +Then filter on pointers into read-only sections. +""" +import re +import struct +from typing import Iterator, NamedTuple +from reccmp.isledecomp.formats import PEImage + +SINGLE_PRECISION_OPCODES = frozenset( + [ + (0xD8, 0x05), # fadd + (0xD8, 0x0D), # fmul + (0xD8, 0x15), # fcom + (0xD8, 0x1D), # fcomp + (0xD8, 0x25), # fsub + (0xD8, 0x2D), # fsubr + (0xD8, 0x35), # fdiv + (0xD8, 0x3D), # fdivr + (0xD9, 0x05), # fld + ] +) + +DOUBLE_PRECISION_OPCODES = frozenset( + [ + (0xDC, 0x05), # fadd + (0xDC, 0x0D), # fmul + (0xDC, 0x15), # fcom + (0xDC, 0x1D), # fcomp + (0xDC, 0x25), # fsub + (0xDC, 0x2D), # fsubr + (0xDC, 0x35), # fdiv + (0xDC, 0x3D), # fdivr + (0xDD, 0x05), # fld + ] +) + +FLOAT_OPCODES = frozenset([*SINGLE_PRECISION_OPCODES, *DOUBLE_PRECISION_OPCODES]) + + +# Match floating point instructions. +FLOAT_INSTRUCTION_RE = re.compile( + rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", flags=re.S +) + + +class FloatInstruction(NamedTuple): + # The address (or offset) of the instruction + address: int + # Two byte opcode of the instruction + opcode: tuple[int, int] + # The address used in the operand + pointer: int + + +def find_float_instructions_in_bytes( + raw: bytes, base_addr: int = 0 +) -> Iterator[FloatInstruction]: + """Search the given binary blob for floating-point instructions that reference a pointer. + If the base addr is given, add it to the offset of the instruction to get an absolute address. + """ + for match in FLOAT_INSTRUCTION_RE.finditer(raw): + inst = match.group(1) + opcode = (inst[0], inst[1]) + + if opcode in FLOAT_OPCODES: + (pointer,) = struct.unpack(" Iterator[FloatConstant]: + """Floating point instructions that refer to a memory address can + point to constant values. Search the code sections to find FP + instructions and check whether the pointer address refers to + read-only data.""" + + # Multiple instructions can refer to the same float. + # Return each float only once from this function. + seen = set() + + # TODO: Should check all code and const data sections. + code_sections = (image.get_section_by_name(".text"),) + const_sections = (image.get_section_by_name(".rdata"),) + + for sect in code_sections: + for inst in find_float_instructions_in_bytes( + bytes(sect.view), sect.virtual_address + ): + if inst.pointer in seen: + continue + + seen.add(inst.pointer) + + # Make sure that the address of the operand is a relocation. + # pylint: disable=protected-access + if inst.address + 2 not in image._relocations: + continue + + # Ignore instructions that point to variables + if any( + const_sect.contains_vaddr(inst.pointer) for const_sect in const_sections + ): + if inst.opcode in SINGLE_PRECISION_OPCODES: + # dword ptr -- single precision + (float_value,) = struct.unpack(" Date: Tue, 21 Jan 2025 23:35:38 -0500 Subject: [PATCH 2/5] Remove find_float_consts from PE class --- reccmp/isledecomp/formats/pe.py | 36 --------------------------------- 1 file changed, 36 deletions(-) diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py index 55c72a87..68fba05e 100644 --- a/reccmp/isledecomp/formats/pe.py +++ b/reccmp/isledecomp/formats/pe.py @@ -741,42 +741,6 @@ def _populate_relocations(self): (relocated_addr,) = struct.unpack(" Iterator[tuple[int, int, float]]: - """Floating point instructions that refer to a memory address can - point to constant values. Search the code sections to find FP - instructions and check whether the pointer address refers to - read-only data.""" - - # TODO: Should check any section that has code, not just .text - text = self.get_section_by_name(".text") - rdata = self.get_section_by_name(".rdata") - - # These are the addresses where a relocation occurs. - # Meaning: it points to an absolute address of something - for addr in self._relocations: - if not text.contains_vaddr(addr): - continue - - # Read the two bytes before the relocated address. - # We will check against possible float opcodes - raw = text.read_virtual(addr - 2, 6) - (opcode, opcode_ext, const_addr) = struct.unpack(" Date: Wed, 22 Jan 2025 14:38:11 -0500 Subject: [PATCH 3/5] Use collections.abc.Buffer, make pe.relocations public --- reccmp/isledecomp/analysis/float_const.py | 17 ++++++++--------- reccmp/isledecomp/formats/pe.py | 4 ++-- tests/test_analysis_float_const.py | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py index d3ee4a71..3b1cfc5e 100644 --- a/reccmp/isledecomp/analysis/float_const.py +++ b/reccmp/isledecomp/analysis/float_const.py @@ -11,6 +11,7 @@ """ import re import struct +from collections.abc import Buffer from typing import Iterator, NamedTuple from reccmp.isledecomp.formats import PEImage @@ -45,7 +46,8 @@ FLOAT_OPCODES = frozenset([*SINGLE_PRECISION_OPCODES, *DOUBLE_PRECISION_OPCODES]) -# Match floating point instructions. +# Match a superset of the floating point instructions above. +# Uses positive lookahead to support overlapping matches. FLOAT_INSTRUCTION_RE = re.compile( rb"(?=([\xd8\xd9\xdc\xdd][\x05\x0d\x15\x1d\x25\x2d\x35\x3d].{4}))", flags=re.S ) @@ -60,13 +62,13 @@ class FloatInstruction(NamedTuple): pointer: int -def find_float_instructions_in_bytes( - raw: bytes, base_addr: int = 0 +def find_float_instructions_in_buffer( + buf: Buffer, base_addr: int = 0 ) -> Iterator[FloatInstruction]: """Search the given binary blob for floating-point instructions that reference a pointer. If the base addr is given, add it to the offset of the instruction to get an absolute address. """ - for match in FLOAT_INSTRUCTION_RE.finditer(raw): + for match in FLOAT_INSTRUCTION_RE.finditer(buf): inst = match.group(1) opcode = (inst[0], inst[1]) @@ -96,17 +98,14 @@ def find_float_consts(image: PEImage) -> Iterator[FloatConstant]: const_sections = (image.get_section_by_name(".rdata"),) for sect in code_sections: - for inst in find_float_instructions_in_bytes( - bytes(sect.view), sect.virtual_address - ): + for inst in find_float_instructions_in_buffer(sect.view, sect.virtual_address): if inst.pointer in seen: continue seen.add(inst.pointer) # Make sure that the address of the operand is a relocation. - # pylint: disable=protected-access - if inst.address + 2 not in image._relocations: + if inst.address + 2 not in image.relocations: continue # Ignore instructions that point to variables diff --git a/reccmp/isledecomp/formats/pe.py b/reccmp/isledecomp/formats/pe.py index 68fba05e..677c79cf 100644 --- a/reccmp/isledecomp/formats/pe.py +++ b/reccmp/isledecomp/formats/pe.py @@ -492,7 +492,7 @@ class PEImage(Image): # FIXME: do these belong to PEImage? Shouldn't the loade apply these to the data? _relocated_addrs: set[int] = dataclasses.field(default_factory=set, repr=False) - _relocations: set[int] = dataclasses.field(default_factory=set, repr=False) + relocations: set[int] = dataclasses.field(default_factory=set, repr=False) # find_str: bool = dataclasses.field(default=False, repr=False) imports: list[tuple[str, str, int]] = dataclasses.field( default_factory=list, repr=False @@ -734,7 +734,7 @@ def _populate_relocations(self): # We are now interested in the relocated addresses themselves. Seek to the # address where there is a relocation, then read the four bytes into our set. reloc_addrs.sort() - self._relocations = set(reloc_addrs) + self.relocations = set(reloc_addrs) for section_id, offset in map(self.get_relative_addr, reloc_addrs): section = self.get_section_by_index(section_id) diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py index b2f1e7a9..c78722f5 100644 --- a/tests/test_analysis_float_const.py +++ b/tests/test_analysis_float_const.py @@ -2,7 +2,7 @@ from reccmp.isledecomp.formats import PEImage from reccmp.isledecomp.analysis.float_const import ( - find_float_instructions_in_bytes, + find_float_instructions_in_buffer, find_float_consts, ) @@ -12,7 +12,7 @@ def test_float_detect_overlap(): Because we are not disassembling, we don't know whether a given byte is the start of an instruction.""" code = b"\xd8\x05\xd8\x05\x00\x10\x00\x10" - floats = list(find_float_instructions_in_bytes(code)) + floats = list(find_float_instructions_in_buffer(code)) assert len(floats) == 2 From e6786e9b3dd50c12790073c4379260b54e4bfa75 Mon Sep 17 00:00:00 2001 From: disinvite Date: Wed, 22 Jan 2025 14:43:27 -0500 Subject: [PATCH 4/5] Remove collections abc.Buffer for pre 3.12 --- reccmp/isledecomp/analysis/float_const.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reccmp/isledecomp/analysis/float_const.py b/reccmp/isledecomp/analysis/float_const.py index 3b1cfc5e..95093769 100644 --- a/reccmp/isledecomp/analysis/float_const.py +++ b/reccmp/isledecomp/analysis/float_const.py @@ -11,7 +11,6 @@ """ import re import struct -from collections.abc import Buffer from typing import Iterator, NamedTuple from reccmp.isledecomp.formats import PEImage @@ -63,10 +62,11 @@ class FloatInstruction(NamedTuple): def find_float_instructions_in_buffer( - buf: Buffer, base_addr: int = 0 + buf: bytes, base_addr: int = 0 ) -> Iterator[FloatInstruction]: """Search the given binary blob for floating-point instructions that reference a pointer. If the base addr is given, add it to the offset of the instruction to get an absolute address. + TODO: Uses `bytes` as the generic type for the Buffer protocol. See PEP 688 added in Python 3.12. """ for match in FLOAT_INSTRUCTION_RE.finditer(buf): inst = match.group(1) From 7e8a6a9823b3202b20da8c96c483af675f074e8a Mon Sep 17 00:00:00 2001 From: disinvite Date: Thu, 23 Jan 2025 23:17:20 -0500 Subject: [PATCH 5/5] Should ignore float variable --- tests/test_analysis_float_const.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_analysis_float_const.py b/tests/test_analysis_float_const.py index c78722f5..5ab3fd53 100644 --- a/tests/test_analysis_float_const.py +++ b/tests/test_analysis_float_const.py @@ -31,6 +31,9 @@ def test_basic_float_detection(binfile: PEImage): assert (0x100DB8F0, 8, 3.141592653589793) in floats assert (0x100DBD50, 8, 3.14159265359) in floats + # Ignore float variable from .data + assert (0x100F7500, 4, 0.1) not in floats + def test_floats_appear_once(binfile: PEImage): """Multiple instructions may point at the same constant.