Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial struct support #418

Merged
merged 30 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9ee0ff3
Pop
NeoQuix Apr 24, 2024
4014f07
Merge branch 'main' into initial_struct_support
blattm May 16, 2024
fe92ec4
fix bug related to duplicate lifiting of enum types
blattm May 21, 2024
e590264
stabilize struct handling
blattm May 21, 2024
728ab55
format
blattm May 21, 2024
fffb1b4
add type names to struct defs and detect complex strings
blattm May 21, 2024
b0fb77a
enum fix for #214
blattm Jun 18, 2024
a921506
fix format
blattm Jun 18, 2024
c3b91a6
rename StructTesting to StructConstant
blattm Jul 3, 2024
fc95d20
update globals.py doc
blattm Jul 3, 2024
972b58b
rename functions and fix signature
blattm Jul 3, 2024
3e51bae
inline string structs by default
blattm Jul 3, 2024
7079a47
Merge branch 'main' into initial_struct_support
blattm Jul 3, 2024
2596f02
fix format
blattm Jul 3, 2024
2373c7b
rename flags
blattm Jul 3, 2024
1feaa21
fix StructConstant's hash
blattm Jul 5, 2024
6dd8778
Merge branch 'main' into initial_struct_support
blattm Aug 1, 2024
cf29eec
move constants in cexpressingenerator together
blattm Jul 24, 2024
3bcc961
remove unused variables
blattm Aug 15, 2024
63fa269
remove duplicate code
blattm Aug 15, 2024
d5c040b
add docstrings
blattm Aug 15, 2024
74b4318
remove TODO
blattm Aug 15, 2024
88fb831
removed unnecessary parameters for clarity
blattm Aug 16, 2024
12c9352
add/improve docstrings for enum/struct/union type lifting
blattm Aug 16, 2024
89810b6
improve struct hash calculation
blattm Aug 19, 2024
7f220f8
add docstrings
blattm Aug 19, 2024
346df6f
shallow copy structs and arrays
blattm Aug 20, 2024
e7a0832
change struct __str__ to output dictionary
blattm Aug 21, 2024
350b6a8
add comment
blattm Aug 21, 2024
ad95495
Merge branch 'main' into initial_struct_support
0x6e62 Aug 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions decompiler/backend/cexpressiongenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,67 @@
)
from decompiler.structures.pseudo import instructions as instructions
from decompiler.structures.pseudo import operations as operations
from decompiler.structures.pseudo.complextypes import Struct
from decompiler.structures.pseudo.operations import MemberAccess
from decompiler.structures.visitors.interfaces import DataflowObjectVisitorInterface
from decompiler.util.integer_util import normalize_int

# NOTE(review): presumably caps the length of emitted global initializers; its use is outside the visible code — confirm.
MAX_GLOBAL_INIT_LENGTH = 128
# When True, inline_global_variable() inlines constant globals whose type is a "struct string".
INLINE_STRUCT_STRINGS = True
# When False, is_struct_string() always returns False, i.e. "struct string" detection is switched off.
DETECT_STRUCT_STRINGS = True
0x6e62 marked this conversation as resolved.
Show resolved Hide resolved


def get_struct_string_address_offset(vartype) -> int | None:
blattm marked this conversation as resolved.
Show resolved Hide resolved
"""This function return the offset of its address field if the vartype is a "struct string".
Otherwise it returns None.

struct strings are structs comprising of a length and a pointer to string data.
The code does not assume whether data or length comes first. The loop is for determining the order.
"""
if not isinstance(vartype, Struct):
return None
if len(vartype.members) != 2:
return None
address_offset = None
length_offset = None
for offset, member in vartype.members.items():
match member.type:
case Pointer(type=Integer(size=8)):
address_offset = offset
case Integer():
length_offset = offset
case _:
return None
blattm marked this conversation as resolved.
Show resolved Hide resolved
if address_offset is None or length_offset is None:
return None
return address_offset


def is_struct_string(vartype) -> bool:
    """Tell whether vartype is a "struct string", i.e. a struct made of a length and a pointer to string data.

    Always False while DETECT_STRUCT_STRINGS is disabled.
    """
    return bool(DETECT_STRUCT_STRINGS) and get_struct_string_address_offset(vartype) is not None


def get_data_of_struct_string(variable) -> GlobalVariable:
    """Return the initial value stored in the data-pointer member of a "struct string" variable.

    The member is looked up in the variable's initial value by the offset that
    get_struct_string_address_offset reports for the variable's type.
    """
    data_offset = get_struct_string_address_offset(variable.type)
    return variable.initial_value.value[data_offset]


def inline_global_variable(var) -> bool:
    """Decide whether a global variable is emitted inline instead of by name.

    Only constant globals qualify, and only if they are either arrays of
    char / wchar16 / wchar32 elements or (with INLINE_STRUCT_STRINGS enabled) "struct strings".
    """
    if not var.is_constant:
        return False
    vartype = var.type
    if isinstance(vartype, ArrayType):
        return vartype.type in [Integer.char(), CustomType.wchar16(), CustomType.wchar32()]
    if isinstance(vartype, Struct):
        return bool(INLINE_STRUCT_STRINGS and is_struct_string(vartype))
    return False
Expand Down Expand Up @@ -219,6 +266,8 @@ def visit_variable(self, expr: expressions.Variable) -> str:
def visit_global_variable(self, expr: expressions.GlobalVariable):
    """Emit a global variable: its inlined value if inline_global_variable allows it, otherwise its name.

    For "struct strings" the inlined value is the string data the struct points to;
    for every other inlined global it is the variable's initial value.
    """
    if not inline_global_variable(expr):
        return expr.name
    inlined = get_data_of_struct_string(expr) if is_struct_string(expr.type) else expr.initial_value
    return self.visit(inlined)

Expand Down
22 changes: 21 additions & 1 deletion decompiler/backend/variabledeclarations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,16 @@
from collections import defaultdict
from typing import Iterable, Iterator, List

from decompiler.backend.cexpressiongenerator import CExpressionGenerator, inline_global_variable
from decompiler.backend.cexpressiongenerator import (
CExpressionGenerator,
get_data_of_struct_string,
inline_global_variable,
is_struct_string,
)
from decompiler.structures.ast.syntaxtree import AbstractSyntaxTree
from decompiler.structures.pseudo import GlobalVariable, Integer, Variable
from decompiler.structures.pseudo.complextypes import Struct
from decompiler.structures.pseudo.expressions import StructConstant
from decompiler.structures.pseudo.typing import ArrayType, CustomType, Pointer
from decompiler.structures.visitors.ast_dataflowobjectvisitor import BaseAstDataflowObjectVisitor
from decompiler.task import DecompilerTask
Expand Down Expand Up @@ -66,6 +73,16 @@ def _generate_definitions(global_variables: set[GlobalVariable]) -> Iterator[str
if not variable.type.type in [Integer.char(), Integer.uint8_t(), CustomType.wchar16(), CustomType.wchar32()]:
br, bl = "{", "}"
yield f"{base}{variable.type.type} {variable.name}[{hex(variable.type.elements)}] = {br}{CExpressionGenerator().visit(variable.initial_value)}{bl};"
case Struct():
if is_struct_string(variable.type):
yield base + f"struct {variable.type.name} {variable.name} = {CExpressionGenerator().visit(get_data_of_struct_string(variable))};"
continue
string = f"struct {variable.type.name} {variable.name}" + "{\n"
0x6e62 marked this conversation as resolved.
Show resolved Hide resolved
for m_type, m_value in zip(variable.type.members.values(), variable.initial_value.value.values()):
value = CExpressionGenerator().visit(m_value)
string += f"\t.{m_type.name} = {value};\n"
string += "}"
yield base + string
case _:
yield f"{base}{variable.type} {variable.name} = {CExpressionGenerator().visit(variable.initial_value)};"

Expand All @@ -88,3 +105,6 @@ def visit_global_variable(self, expr: GlobalVariable):
self._global_vars.add(expr.copy(ssa_label=0, ssa_name=None))
if not expr.is_constant or expr.type == Pointer(CustomType.void()):
self._global_vars.add(expr.copy(ssa_label=0, ssa_name=None))
if isinstance(expr.initial_value, StructConstant):
for member_value in expr.initial_value.value.values():
self.visit(member_value)
86 changes: 72 additions & 14 deletions decompiler/frontend/binaryninja/handlers/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Callable, Optional, Tuple, Union

from binaryninja import BinaryView, DataVariable, Endianness, MediumLevelILInstruction, SectionSemantics, Type
from binaryninja.enums import NamedTypeReferenceClass
from binaryninja.types import (
ArrayType,
BoolType,
Expand All @@ -12,13 +13,15 @@
IntegerType,
NamedTypeReferenceType,
PointerType,
StructureType,
Type,
VoidType,
)
from decompiler.frontend.binaryninja.handlers.symbols import GLOBAL_VARIABLE_PREFIX
from decompiler.frontend.lifter import Handler
from decompiler.structures.pseudo import ArrayType as PseudoArrayType
from decompiler.structures.pseudo import (
ComplexTypeMember,
Constant,
ConstantComposition,
CustomType,
Expand All @@ -28,6 +31,8 @@
Integer,
OperationType,
Pointer,
Struct,
StructConstant,
Symbol,
UnaryOperation,
)
Expand Down Expand Up @@ -59,9 +64,12 @@
==> trust bninja lift normally
=> If a void*, then we try determine the value via get_unknown_pointer_value
- NamedTypeReferenceType
- (enum/structs
=> lifts struct members recursively
=> includes special handling of a BNinja bug when accessing certain PDB enum types
- StructType
- enum/structs
=> not supported currently
=> has a BNinja bug when accessing certain PDB enum types
=> implementation *very* similar to NamedTypeReferenceType

MISC:
- ._callers will be empty for each call of lift_global_variable
Expand All @@ -88,8 +96,11 @@ def __init__(self, lifter):
ArrayType: self._lift_array_type,
PointerType: self._lift_pointer_type,
NamedTypeReferenceType: self._lift_named_type_ref,
StructureType: self._lift_structure_type,
}
self._lifted_globals: dict[int, GlobalVariable] = {} # Cache for already lifted global variables, keys are addresses
self._lifted_globals: dict[tuple, GlobalVariable] = (
{}
) # Cache for already lifted global variables, keys are addresses + type (required to distinguish struct from its first member)
self._view: Optional[BinaryView] = None # Will be set in first call to lift_global_variable

def register(self):
Expand All @@ -101,9 +112,18 @@ def _get_gvar_name(self, bninjaName: Optional[str], addr: int) -> str:
lifted_names = [v.name for v in self._lifted_globals.values()]
if bninjaName is None:
return GLOBAL_VARIABLE_PREFIX + f"{addr:x}"
if bninjaName in lifted_names:
return bninjaName + "_" + f"{addr:x}"
return bninjaName
name = bninjaName.translate(
{
ord(" "): "_",
ord("'"): "",
ord("."): "_",
ord("`"): "",
ord('"'): "",
}
).strip()
if name in lifted_names:
return name + "_" + f"{addr:x}"
return name

def _build_global_variable(self, name: Optional[str], type: Type, addr: int, init_value, ssa_label: Optional[int]) -> GlobalVariable:
"""Wrapper for building global variables."""
Expand All @@ -117,10 +137,10 @@ def _build_global_variable(self, name: Optional[str], type: Type, addr: int, ini
case _:
raise TypeError(f"Type violation: '{init_value}'")

self._lifted_globals[addr] = GlobalVariable(
self._lifted_globals[(addr, type)] = GlobalVariable(
name=vname, vartype=type, initial_value=vinit_value, ssa_label=ssa_label, is_constant=addr_in_ro_section(self._view, addr)
)
return self._lifted_globals[addr]
return self._lifted_globals[(addr, type)]

def lift_global_variable(
self,
Expand All @@ -136,11 +156,12 @@ def lift_global_variable(
self._view = view

# If addr was already lifted: Return lifted GlobalVariable with updated SSA
if variable.address in self._lifted_globals.keys():
variable_identifier = (variable.address, self._lifter.lift(variable.type))
if variable_identifier in self._lifted_globals.keys():
return (
self._lifted_globals[variable.address].copy(ssa_label=parent.ssa_memory_version)
self._lifted_globals[variable_identifier].copy(ssa_label=parent.ssa_memory_version)
if parent
else self._lifted_globals[variable.address]
else self._lifted_globals[variable_identifier]
)

# BNinja error cases: nullptr/small numbers (0, -12...)
Expand Down Expand Up @@ -237,9 +258,46 @@ def _lift_pointer_type(

def _lift_named_type_ref(self, variable: DataVariable, parent: Optional[MediumLevelILInstruction] = None, **_):
"""Lift a named custom type (Enum, Structs)"""
return Constant(
"Unknown value", self._lifter.lift(variable.type)
) # BNinja error, need to check with the issue to get the correct value + entry for structs
match variable.type.named_type_class:
case NamedTypeReferenceClass.StructNamedTypeClass:
struct_type = self._view.get_type_by_id(variable.type.type_id)
return self._lift_struct_helper(variable, parent, struct_type)

case NamedTypeReferenceClass.EnumNamedTypeClass:
try:
value = Constant(variable.value, self._lifter.lift(variable.type))
return self._build_global_variable(
variable.name,
value.type,
variable.address,
value,
parent.ssa_memory_version if parent else 0,
)
except Exception:
return Constant("Unknown value", self._lifter.lift(variable.type)) # BNinja error
case _:
raise NotImplementedError(f"No handler for '{variable.type.named_type_class}' in lifter")

def _lift_structure_type(self, variable: DataVariable, parent: Optional[MediumLevelILInstruction] = None, **_):
    """Lift a global variable of struct type; the variable's own type is the struct to lift."""
    return self._lift_struct_helper(variable, parent, variable.type)

def _lift_struct_helper(self, variable, parent, struct_type):
    """Lift a struct global variable together with its initial value.

    A struct's initial value is composed of its members' initial values. Each member's
    memory location (struct address + member offset) is interpreted as a new data variable
    and lifted (recursively); the resulting initial values are collected by member offset.
    """
    lifted_struct = self._lifter.lift(struct_type)
    member_values = {
        member.offset: self._lifter.lift(
            DataVariable(self._view, variable.address + member.offset, member.type, False),
            view=self._view,
        ).initial_value
        for member in struct_type.members
    }
    ssa_label = parent.ssa_memory_version if parent else 0
    return self._build_global_variable(
        variable.name, lifted_struct, variable.address, StructConstant(member_values, lifted_struct), ssa_label
    )

def _get_unknown_value(self, variable: DataVariable):
"""Return string or bytes at dv.address(!) (dv.type must be void)"""
Expand Down
8 changes: 7 additions & 1 deletion decompiler/frontend/binaryninja/handlers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,12 @@ def lift_named_type_reference_type(self, custom: NamedTypeReferenceType, **kwarg
return CustomType(str(custom), custom.width * self.BYTE_SIZE)

def lift_enum(self, binja_enum: EnumerationType, name: str = None, **kwargs) -> Enum:
"""Lift enum type."""
"""Lift enum type.
The cache lookup uses the hash of the Binary Ninja object instead of names, as names might collide."""
type_id = hash(binja_enum)
cached_type = self._lifter.complex_types.retrieve_by_id(type_id)
if cached_type is not None:
return cached_type
blattm marked this conversation as resolved.
Show resolved Hide resolved
enum_name = self._get_data_type_name(binja_enum, keyword="enum", provided_name=name)
enum = Enum(binja_enum.width * self.BYTE_SIZE, enum_name, {})
for member in binja_enum.members:
Expand All @@ -89,6 +93,8 @@ def lift_enum_member(self, enum_member: EnumerationMember, **kwargs) -> ComplexT
return ComplexTypeMember(size=0, name=enum_member.name, offset=-1, type=Integer(32), value=int(enum_member.value))

def lift_struct(self, struct: StructureType, name: str = None, **kwargs) -> Union[Struct, Union_, Class, ComplexTypeName]:
"""lift struct/class and union types.
The cache lookup uses the hash of the Binary Ninja object instead of names, as names might collide."""
type_id = hash(struct)
cached_type = self._lifter.complex_types.retrieve_by_id(type_id)
if cached_type is not None:
Expand Down
1 change: 1 addition & 0 deletions decompiler/structures/pseudo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
IntrinsicSymbol,
NotUseableConstant,
RegisterPair,
StructConstant,
Symbol,
Tag,
UnknownExpression,
Expand Down
43 changes: 38 additions & 5 deletions decompiler/structures/pseudo/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from typing import TYPE_CHECKING, Generic, Iterator, List, Optional, Tuple, TypeVar, Union, final

from ...util.insertion_ordered_set import InsertionOrderedSet
from .complextypes import Enum
from .complextypes import Enum, Struct
from .typing import CustomType, Type, UnknownType

T = TypeVar("T")
Expand Down Expand Up @@ -560,12 +560,14 @@ def accept(self, visitor: DataflowObjectVisitorInterface[T]) -> T:


class ConstantComposition(Constant):
"""This class stores multiple constants of the same type in a list.
It is used to represent arrays and string constants"""

def __init__(self, value: list[Constant], vartype: DecompiledType = UnknownType(), tags: Optional[Tuple[Tag, ...]] = None):
super().__init__(
value,
vartype,
None,
tags,
vartype=vartype,
tags=tags,
)

def __eq__(self, __value):
Expand All @@ -580,8 +582,39 @@ def __str__(self) -> str:

def copy(self) -> ConstantComposition:
"""Generate a copy of the UnknownExpression with the same message."""
return ConstantComposition([x.copy() for x in self.value], self._type.copy())
return ConstantComposition(self.value, self._type)

def accept(self, visitor: DataflowObjectVisitorInterface[T]) -> T:
"""Invoke the appropriate visitor for this Expression."""
return visitor.visit_constant_composition(self)


class StructConstant(Constant):
    """A constant struct.

    The value is a dictionary mapping member offsets to the corresponding fields' values.
    The vartype is a 'Struct' (a special ComplexType), which provides the mapping from
    offsets to field names.
    """

    def __init__(self, value: dict[int, Expression], vartype: Struct, tags: Optional[Tuple[Tag, ...]] = None):
        """Create a struct constant from an offset -> value mapping and its struct type."""
        super().__init__(value, vartype=vartype, tags=tags)

    def __eq__(self, __value):
        """Equal only to other StructConstants that the base class also considers equal."""
        if not isinstance(__value, StructConstant):
            return False
        return super().__eq__(__value)

    def __hash__(self):
        """Hash the (offset, value) pairs in sorted order."""
        ordered_fields = sorted(self.value.items())
        return hash(tuple(ordered_fields))

    def __str__(self) -> str:
        """Return the string form of the offset -> value dictionary."""
        return str(self.value)

    def __iter__(self) -> Iterator[Expression]:
        """Iterate over the field values."""
        for field_value in self.value.values():
            yield field_value

    def copy(self) -> StructConstant:
        """Return a shallow copy: the new StructConstant shares the value dictionary and the type."""
        return StructConstant(self.value, self._type)
Loading