diff --git a/decompiler/backend/cexpressiongenerator.py b/decompiler/backend/cexpressiongenerator.py index d6eda361..d7a5a33e 100644 --- a/decompiler/backend/cexpressiongenerator.py +++ b/decompiler/backend/cexpressiongenerator.py @@ -15,20 +15,67 @@ ) from decompiler.structures.pseudo import instructions as instructions from decompiler.structures.pseudo import operations as operations +from decompiler.structures.pseudo.complextypes import Struct from decompiler.structures.pseudo.operations import MemberAccess from decompiler.structures.visitors.interfaces import DataflowObjectVisitorInterface from decompiler.util.integer_util import normalize_int MAX_GLOBAL_INIT_LENGTH = 128 +INLINE_STRUCT_STRINGS = True +DETECT_STRUCT_STRINGS = True + + +def get_struct_string_address_offset(vartype) -> int | None: + """This function returns the offset of its address field if the vartype is a "struct string". + Otherwise it returns None. + + Struct strings are structs consisting of a length and a pointer to string data. + The code does not assume whether data or length comes first. The loop is for determining the order. + """ + if not isinstance(vartype, Struct): + return None + if len(vartype.members) != 2: + return None + address_offset = None + length_offset = None + for offset, member in vartype.members.items(): + match member.type: + case Pointer(type=Integer(size=8)): + address_offset = offset + case Integer(): + length_offset = offset + case _: + return None + if address_offset is None or length_offset is None: + return None + return address_offset + + +def is_struct_string(vartype) -> bool: + """Checks if a vartype represents a "struct string" (i.e. a struct consisting of a length and a pointer to string data) or not.""" + if not DETECT_STRUCT_STRINGS: + return False + return get_struct_string_address_offset(vartype) is not None + + +def get_data_of_struct_string(variable) -> GlobalVariable: + """Returns the data of a "struct string" (i.e.
a struct comprising of a length and a pointer to string data).""" + address_offset = get_struct_string_address_offset(variable.type) + address = variable.initial_value.value[address_offset] + return address def inline_global_variable(var) -> bool: + """Decides whether or not to inline a global variable.""" if not var.is_constant: return False match var.type: case ArrayType(): if var.type.type in [Integer.char(), CustomType.wchar16(), CustomType.wchar32()]: return True + case Struct(): + if INLINE_STRUCT_STRINGS and is_struct_string(var.type): + return True case _: return False return False @@ -219,6 +266,8 @@ def visit_variable(self, expr: expressions.Variable) -> str: def visit_global_variable(self, expr: expressions.GlobalVariable): """Inline a global variable if its initial value is constant and not of void type""" if inline_global_variable(expr): + if is_struct_string(expr.type): + return self.visit(get_data_of_struct_string(expr)) return self.visit(expr.initial_value) return expr.name diff --git a/decompiler/backend/variabledeclarations.py b/decompiler/backend/variabledeclarations.py index 55639f48..efd0cc70 100644 --- a/decompiler/backend/variabledeclarations.py +++ b/decompiler/backend/variabledeclarations.py @@ -3,9 +3,16 @@ from collections import defaultdict from typing import Iterable, Iterator, List -from decompiler.backend.cexpressiongenerator import CExpressionGenerator, inline_global_variable +from decompiler.backend.cexpressiongenerator import ( + CExpressionGenerator, + get_data_of_struct_string, + inline_global_variable, + is_struct_string, +) from decompiler.structures.ast.syntaxtree import AbstractSyntaxTree from decompiler.structures.pseudo import GlobalVariable, Integer, Variable +from decompiler.structures.pseudo.complextypes import Struct +from decompiler.structures.pseudo.expressions import StructConstant from decompiler.structures.pseudo.typing import ArrayType, CustomType, Pointer from 
decompiler.structures.visitors.ast_dataflowobjectvisitor import BaseAstDataflowObjectVisitor from decompiler.task import DecompilerTask @@ -66,6 +73,16 @@ def _generate_definitions(global_variables: set[GlobalVariable]) -> Iterator[str if not variable.type.type in [Integer.char(), Integer.uint8_t(), CustomType.wchar16(), CustomType.wchar32()]: br, bl = "{", "}" yield f"{base}{variable.type.type} {variable.name}[{hex(variable.type.elements)}] = {br}{CExpressionGenerator().visit(variable.initial_value)}{bl};" + case Struct(): + if is_struct_string(variable.type): + yield base + f"struct {variable.type.name} {variable.name} = {CExpressionGenerator().visit(get_data_of_struct_string(variable))};" + continue + string = f"struct {variable.type.name} {variable.name}" + "{\n" + for m_type, m_value in zip(variable.type.members.values(), variable.initial_value.value.values()): + value = CExpressionGenerator().visit(m_value) + string += f"\t.{m_type.name} = {value};\n" + string += "}" + yield base + string case _: yield f"{base}{variable.type} {variable.name} = {CExpressionGenerator().visit(variable.initial_value)};" @@ -88,3 +105,6 @@ def visit_global_variable(self, expr: GlobalVariable): self._global_vars.add(expr.copy(ssa_label=0, ssa_name=None)) if not expr.is_constant or expr.type == Pointer(CustomType.void()): self._global_vars.add(expr.copy(ssa_label=0, ssa_name=None)) + if isinstance(expr.initial_value, StructConstant): + for member_value in expr.initial_value.value.values(): + self.visit(member_value) diff --git a/decompiler/frontend/binaryninja/handlers/globals.py b/decompiler/frontend/binaryninja/handlers/globals.py index a3b71256..ae25747c 100644 --- a/decompiler/frontend/binaryninja/handlers/globals.py +++ b/decompiler/frontend/binaryninja/handlers/globals.py @@ -3,6 +3,7 @@ from typing import Callable, Optional, Tuple, Union from binaryninja import BinaryView, DataVariable, Endianness, MediumLevelILInstruction, SectionSemantics, Type +from binaryninja.enums 
import NamedTypeReferenceClass from binaryninja.types import ( ArrayType, BoolType, @@ -12,6 +13,7 @@ IntegerType, NamedTypeReferenceType, PointerType, + StructureType, Type, VoidType, ) @@ -19,6 +21,7 @@ from decompiler.frontend.lifter import Handler from decompiler.structures.pseudo import ArrayType as PseudoArrayType from decompiler.structures.pseudo import ( + ComplexTypeMember, Constant, ConstantComposition, CustomType, @@ -28,6 +31,8 @@ Integer, OperationType, Pointer, + Struct, + StructConstant, Symbol, UnaryOperation, ) @@ -59,9 +64,12 @@ ==> trust bninja lift normally => If a void*, then we try determine the value via get_unknown_pointer_value - NamedTypeReferenceType + - enum/structs + => lifts struct members recursively + => includes special handling of a BNinja bug when accessing certain PDB enum types + - StructType - enum/structs - => not supported currently - => has a BNinja bug when accessing certain PDB enum types + => implementation *very* similar to NamedTypeReferenceType MISC: - ._callers will be empty for each call of lift_global_variable @@ -88,8 +96,11 @@ def __init__(self, lifter): ArrayType: self._lift_array_type, PointerType: self._lift_pointer_type, NamedTypeReferenceType: self._lift_named_type_ref, + StructureType: self._lift_structure_type, } - self._lifted_globals: dict[int, GlobalVariable] = {} # Cache for already lifted global variables, keys are addresses + self._lifted_globals: dict[tuple, GlobalVariable] = ( + {} + ) # Cache for already lifted global variables, keys are addresses + type (required to distinguish struct from its first member) self._view: Optional[BinaryView] = None # Will be set in first call to lift_global_variable def register(self): @@ -101,9 +112,18 @@ def _get_gvar_name(self, bninjaName: Optional[str], addr: int) -> str: lifted_names = [v.name for v in self._lifted_globals.values()] if bninjaName is None: return GLOBAL_VARIABLE_PREFIX + f"{addr:x}" - if bninjaName in lifted_names: - return bninjaName + "_" +
f"{addr:x}" - return bninjaName + name = bninjaName.translate( + { + ord(" "): "_", + ord("'"): "", + ord("."): "_", + ord("`"): "", + ord('"'): "", + } + ).strip() + if name in lifted_names: + return name + "_" + f"{addr:x}" + return name def _build_global_variable(self, name: Optional[str], type: Type, addr: int, init_value, ssa_label: Optional[int]) -> GlobalVariable: """Wrapper for building global variables.""" @@ -117,10 +137,10 @@ def _build_global_variable(self, name: Optional[str], type: Type, addr: int, ini case _: raise TypeError(f"Type violation: '{init_value}'") - self._lifted_globals[addr] = GlobalVariable( + self._lifted_globals[(addr, type)] = GlobalVariable( name=vname, vartype=type, initial_value=vinit_value, ssa_label=ssa_label, is_constant=addr_in_ro_section(self._view, addr) ) - return self._lifted_globals[addr] + return self._lifted_globals[(addr, type)] def lift_global_variable( self, @@ -136,11 +156,12 @@ def lift_global_variable( self._view = view # If addr was already lifted: Return lifted GlobalVariable with updated SSA - if variable.address in self._lifted_globals.keys(): + variable_identifier = (variable.address, self._lifter.lift(variable.type)) + if variable_identifier in self._lifted_globals.keys(): return ( - self._lifted_globals[variable.address].copy(ssa_label=parent.ssa_memory_version) + self._lifted_globals[variable_identifier].copy(ssa_label=parent.ssa_memory_version) if parent - else self._lifted_globals[variable.address] + else self._lifted_globals[variable_identifier] ) # BNinja error cases: nullptr/small numbers (0, -12...) 
@@ -237,9 +258,46 @@ def _lift_pointer_type( def _lift_named_type_ref(self, variable: DataVariable, parent: Optional[MediumLevelILInstruction] = None, **_): """Lift a named custom type (Enum, Structs)""" - return Constant( - "Unknown value", self._lifter.lift(variable.type) - ) # BNinja error, need to check with the issue to get the correct value + entry for structs + match variable.type.named_type_class: + case NamedTypeReferenceClass.StructNamedTypeClass: + struct_type = self._view.get_type_by_id(variable.type.type_id) + return self._lift_struct_helper(variable, parent, struct_type) + + case NamedTypeReferenceClass.EnumNamedTypeClass: + try: + value = Constant(variable.value, self._lifter.lift(variable.type)) + return self._build_global_variable( + variable.name, + value.type, + variable.address, + value, + parent.ssa_memory_version if parent else 0, + ) + except Exception: + return Constant("Unknown value", self._lifter.lift(variable.type)) # BNinja error + case _: + raise NotImplementedError(f"No handler for '{variable.type.named_type_class}' in lifter") + + def _lift_structure_type(self, variable: DataVariable, parent: Optional[MediumLevelILInstruction] = None, **_): + """Lift a struct""" + struct_type = variable.type + return self._lift_struct_helper(variable, parent, struct_type) + + def _lift_struct_helper(self, variable, parent, struct_type): + """This helper method for lifting structs does the heavy lifting. + A struct's initial value is composed of its members' initial values. + This method iterates over all struct members, interprets the corresponding memory locations as new data variables + and lifts them (recursively) to gain access to the members' initial values.
+ """ + values = {} + s_type = self._lifter.lift(struct_type) + for member_type in struct_type.members: + dv = DataVariable(self._view, variable.address + member_type.offset, member_type.type, False) + lift = self._lifter.lift(dv, view=self._view) + values[member_type.offset] = lift.initial_value + return self._build_global_variable( + variable.name, s_type, variable.address, StructConstant(values, s_type), parent.ssa_memory_version if parent else 0 + ) def _get_unknown_value(self, variable: DataVariable): """Return string or bytes at dv.address(!) (dv.type must be void)""" diff --git a/decompiler/frontend/binaryninja/handlers/types.py b/decompiler/frontend/binaryninja/handlers/types.py index daf2902f..929162a7 100644 --- a/decompiler/frontend/binaryninja/handlers/types.py +++ b/decompiler/frontend/binaryninja/handlers/types.py @@ -75,8 +75,12 @@ def lift_named_type_reference_type(self, custom: NamedTypeReferenceType, **kwarg return CustomType(str(custom), custom.width * self.BYTE_SIZE) def lift_enum(self, binja_enum: EnumerationType, name: str = None, **kwargs) -> Enum: - """Lift enum type.""" + """Lift enum type. + The cache lookup uses the hash of the Binary Ninja object instead of names, as names might collide.""" type_id = hash(binja_enum) + cached_type = self._lifter.complex_types.retrieve_by_id(type_id) + if cached_type is not None: + return cached_type enum_name = self._get_data_type_name(binja_enum, keyword="enum", provided_name=name) enum = Enum(binja_enum.width * self.BYTE_SIZE, enum_name, {}) for member in binja_enum.members: @@ -89,6 +93,8 @@ def lift_enum_member(self, enum_member: EnumerationMember, **kwargs) -> ComplexT return ComplexTypeMember(size=0, name=enum_member.name, offset=-1, type=Integer(32), value=int(enum_member.value)) def lift_struct(self, struct: StructureType, name: str = None, **kwargs) -> Union[Struct, Union_, Class, ComplexTypeName]: + """lift struct/class and union types. 
+ The cache lookup uses the hash of the Binary Ninja object instead of names, as names might collide.""" type_id = hash(struct) cached_type = self._lifter.complex_types.retrieve_by_id(type_id) if cached_type is not None: diff --git a/decompiler/structures/pseudo/__init__.py b/decompiler/structures/pseudo/__init__.py index 2024faef..54e3148b 100644 --- a/decompiler/structures/pseudo/__init__.py +++ b/decompiler/structures/pseudo/__init__.py @@ -11,6 +11,7 @@ IntrinsicSymbol, NotUseableConstant, RegisterPair, + StructConstant, Symbol, Tag, UnknownExpression, diff --git a/decompiler/structures/pseudo/expressions.py b/decompiler/structures/pseudo/expressions.py index 3ace5ae8..268eeb05 100644 --- a/decompiler/structures/pseudo/expressions.py +++ b/decompiler/structures/pseudo/expressions.py @@ -34,7 +34,7 @@ from typing import TYPE_CHECKING, Generic, Iterator, List, Optional, Tuple, TypeVar, Union, final from ...util.insertion_ordered_set import InsertionOrderedSet -from .complextypes import Enum +from .complextypes import Enum, Struct from .typing import CustomType, Type, UnknownType T = TypeVar("T") @@ -560,12 +560,14 @@ def accept(self, visitor: DataflowObjectVisitorInterface[T]) -> T: class ConstantComposition(Constant): + """This class stores multiple constants of the same type in a list. 
+ It is used to represent arrays and string constants""" + def __init__(self, value: list[Constant], vartype: DecompiledType = UnknownType(), tags: Optional[Tuple[Tag, ...]] = None): super().__init__( value, - vartype, - None, - tags, + vartype=vartype, + tags=tags, ) def __eq__(self, __value): @@ -580,8 +582,39 @@ def __str__(self) -> str: def copy(self) -> ConstantComposition: """Generate a copy of the UnknownExpression with the same message.""" - return ConstantComposition([x.copy() for x in self.value], self._type.copy()) + return ConstantComposition(self.value, self._type) def accept(self, visitor: DataflowObjectVisitorInterface[T]) -> T: """Invoke the appropriate visitor for this Expression.""" return visitor.visit_constant_composition(self) + + +class StructConstant(Constant): + """This class represents constant structs. + The value is a dictionary mapping offsets to the corresponding fields' value. + The vartype is a 'Struct' (a special ComplexType), which provides a mapping from offsets to field names.""" + + def __init__(self, value: dict[int, Expression], vartype: Struct, tags: Optional[Tuple[Tag, ...]] = None): + super().__init__( + value, + vartype=vartype, + tags=tags, + ) + + def __eq__(self, __value): + return isinstance(__value, StructConstant) and super().__eq__(__value) + + def __hash__(self): + return hash(tuple(sorted(self.value.items()))) + + def __str__(self) -> str: + """Return a string representation of the struct""" + + return str(self.value) + + def __iter__(self) -> Iterator[Expression]: + yield from self.value.values() + + def copy(self) -> StructConstant: + """Generate a copy of the StructConstant sharing the same value mapping and type.""" + return StructConstant(self.value, self._type)