From 127fb736d13879fc328fbad42ca65898caaddcb8 Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:14:26 +0100 Subject: [PATCH 1/6] Fix ordering when inserting cse --- .../common_subexpression_elimination.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py index 91848bbbb..a83d86d4c 100644 --- a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py +++ b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py @@ -36,7 +36,10 @@ class CfgInstruction: instruction: Instruction block: BasicBlock - index: int + + @property + def index(self): + return self.block.instructions.index(self.instruction) @dataclass() @@ -221,7 +224,7 @@ def from_cfg(cls, cfg: ControlFlowGraph) -> DefinitionGenerator: usages: DefaultDict[Expression, Counter[CfgInstruction]] = defaultdict(Counter) for basic_block in cfg: for index, instruction in enumerate(basic_block.instructions): - instruction_with_position = CfgInstruction(instruction, basic_block, index) + instruction_with_position = CfgInstruction(instruction, basic_block) for subexpression in _subexpression_dfs(instruction): usages[subexpression][instruction_with_position] += 1 return cls(usages, cfg.dominator_tree) @@ -236,7 +239,7 @@ def define(self, expression: Expression, variable: Variable): basic_block, index = self._find_location_for_insertion(expression) for usage in self._usages[expression]: usage.instruction.substitute(expression, variable.copy()) - self._insert_definition(CfgInstruction(Assignment(variable, expression), basic_block, index)) + self._insert_definition(CfgInstruction(Assignment(variable, expression), basic_block), index) def _find_location_for_insertion(self, expression) -> Tuple[BasicBlock, int]: """ @@ -265,9 +268,9 @@ def _is_invalid_dominator(self, basic_block: BasicBlock, expression: Expression) usages_in_the_same_block = [usage for usage in self._usages[expression] if usage.block == basic_block] return any([isinstance(usage.instruction, Phi) for usage in usages_in_the_same_block]) - def _insert_definition(self, definition: CfgInstruction): + def _insert_definition(self, definition: CfgInstruction, index: int): """Insert a new intermediate definition for the given expression at the given location.""" - definition.block.instructions.insert(definition.index, definition.instruction) + definition.block.instructions.insert(index, definition.instruction) for subexpression in _subexpression_dfs(definition.instruction): self._usages[subexpression][definition] += 1 From 77f0fd62fb6d9431be3924d3926a8b9f41f616e7 Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:09:53 +0100 Subject: [PATCH 2/6] Fix potential bugs where two instructions are equal --- .../common_subexpression_elimination.py | 62 ++++++++++--------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py index a83d86d4c..a33f619a4 100644 --- a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py +++ b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py @@ -2,6 +2,7 @@ from __future__ import annotations +import dataclasses from collections import Counter, defaultdict, deque from dataclasses import dataclass from itertools import chain @@ -18,28 +19,20 @@ from networkx import dfs_postorder_nodes -@dataclass(frozen=True, eq=False) -class CfgInstruction: +@dataclass(frozen=True) +class CfgInstructionLocation: """ dataclass in charge of tracking the location of Instruction objects in the cfg -> The considered instruction, where block is the basic block where it is contained and index the position in the basic block. - - Note: Two instances with the same data will not be equal (because of eq=False). - This way, eq and hash are way more performant, because at the time of writing this, eq and hash are very - expensive on big instructions. - - eq=True would probably be nicer to use, but we don't actually create instances with the same data - multiple times. (Rationale: initially just one instance is created per (block, index) pair. - All further instances with the same (block, index) will have a less complex instruction than before) """ - instruction: Instruction block: BasicBlock + index: int @property - def index(self): - return self.block.instructions.index(self.instruction) + def instruction(self): + return self.block.instructions[self.index] @dataclass() @@ -211,7 +204,7 @@ class DefinitionGenerator: def __init__( self, - expression_usages: DefaultDict[Expression, Counter[CfgInstruction]], + expression_usages: DefaultDict[Expression, Counter[CfgInstructionLocation]], dominator_tree: NetworkXGraph, ): """Generate a new instance based on data parsed from a cfg.""" @@ -221,16 +214,16 @@ def __init__( @classmethod def from_cfg(cls, cfg: ControlFlowGraph) -> DefinitionGenerator: """Initialize a DefinitionGenerator utilizing the data of the given cfg.""" - usages: DefaultDict[Expression, Counter[CfgInstruction]] = defaultdict(Counter) + usages: DefaultDict[Expression, Counter[CfgInstructionLocation]] = defaultdict(Counter) for basic_block in cfg: for index, instruction in enumerate(basic_block.instructions): - instruction_with_position = CfgInstruction(instruction, basic_block) + instruction_with_position = CfgInstructionLocation(basic_block, index) for subexpression in _subexpression_dfs(instruction): usages[subexpression][instruction_with_position] += 1 return cls(usages, cfg.dominator_tree) @property - def usages(self) -> DefaultDict[Expression, Counter[CfgInstruction]]: + def usages(self) -> DefaultDict[Expression, Counter[CfgInstructionLocation]]: """Return a mapping from expressions to a set of instructions using them.""" return self._usages @@ -239,7 +232,7 @@ def define(self, expression: Expression, variable: Variable): basic_block, index = self._find_location_for_insertion(expression) for usage in self._usages[expression]: usage.instruction.substitute(expression, variable.copy()) - self._insert_definition(CfgInstruction(Assignment(variable, expression), basic_block), index) + self._insert_definition(Assignment(variable, expression), basic_block, index) def _find_location_for_insertion(self, expression) -> Tuple[BasicBlock, int]: """ @@ -268,18 +261,31 @@ def _is_invalid_dominator(self, basic_block: BasicBlock, expression: Expression) usages_in_the_same_block = [usage for usage in self._usages[expression] if usage.block == basic_block] return any([isinstance(usage.instruction, Phi) for usage in usages_in_the_same_block]) - def _insert_definition(self, definition: CfgInstruction, index: int): + def _insert_definition(self, instruction: Instruction, block: BasicBlock, index: int): """Insert a new intermediate definition for the given expression at the given location.""" - definition.block.instructions.insert(index, definition.instruction) - for subexpression in _subexpression_dfs(definition.instruction): - self._usages[subexpression][definition] += 1 + block.instructions.insert(index, instruction) + + def update_location(location: CfgInstructionLocation) -> CfgInstructionLocation: + if location.block == block and location.index < index: + return dataclasses.replace(location, index=location.index + 1) + else: + return location + + # update positions of expression usages + for occurrences in self._usages.values(): + for location in list(occurrences): + if location.block == block and location.index >= index: + occurrences[dataclasses.replace(location, index=location.index + 1)] = occurrences.pop(location) + + for subexpression in _subexpression_dfs(instruction): + self._usages[subexpression][CfgInstructionLocation(block, index)] += 1 @staticmethod - def _find_insertion_index(basic_block: BasicBlock, usages: Iterable[CfgInstruction]) -> int: + def _find_insertion_index(basic_block: BasicBlock, usages: Iterable[CfgInstructionLocation]) -> int: """Find the first index in the given basic block where a definition could be inserted.""" usage = min((usage for usage in usages if usage.block == basic_block), default=None, key=lambda x: x.index) if usage: - return basic_block.instructions.index(usage.instruction, usage.index) + return usage.index if not basic_block.instructions: return 0 if isinstance(basic_block.instructions[-1], GenericBranch): @@ -327,7 +333,7 @@ def eliminate_common_subexpressions(self, definition_generator: DefinitionGenera except StopIteration: warning(f"[{self.name}] No dominating basic block could be found for {replacee}") - def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[CfgInstruction]]) -> Iterator[Expression]: + def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[CfgInstructionLocation]]) -> Iterator[Expression]: """ Iterate all expressions, yielding the expressions which should be eliminated. @@ -341,7 +347,7 @@ def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[C usages[subexpression].subtract(expression_usage) yield expression - def _is_cse_candidate(self, expression: Expression, usages: DefaultDict[Expression, Counter[CfgInstruction]]): + def _is_cse_candidate(self, expression: Expression, usages: DefaultDict[Expression, Counter[CfgInstructionLocation]]): """Checks that we can add a common subexpression for the given expression.""" return ( self._is_elimination_candidate(expression, usages[expression]) @@ -359,14 +365,14 @@ def _is_complex_string(self, expression: Expression) -> bool: return isinstance(expression.value, str) and len(expression.value) >= self._min_string_length return False - def _check_inter_instruction(self, expression: Expression, instructions: Counter[CfgInstruction]) -> bool: + def _check_inter_instruction(self, expression: Expression, instructions: Counter[CfgInstructionLocation]) -> bool: """Check if the given expressions should be eliminated based on its global occurrences.""" referencing_instructions_count = sum(1 for _, count in instructions.items() if count > 0) return (expression.complexity >= 2 and referencing_instructions_count >= self._threshold) or ( self._is_complex_string(expression) and referencing_instructions_count >= self._string_threshold ) - def _check_intra_instruction(self, expression: Expression, instructions: Counter[CfgInstruction]) -> bool: + def _check_intra_instruction(self, expression: Expression, instructions: Counter[CfgInstructionLocation]) -> bool: """Check if this expression should be eliminated based on the amount of unique instructions utilizing it.""" referencing_count = instructions.total() return (expression.complexity >= 2 and referencing_count >= self._threshold) or ( From 24f85f4f26af182d32e363604b88441fbe047abd Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:34:31 +0200 Subject: [PATCH 3/6] Remove unused function --- .../dataflowanalysis/common_subexpression_elimination.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py index a33f619a4..733365090 100644 --- a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py +++ b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py @@ -265,12 +265,6 @@ def _insert_definition(self, instruction: Instruction, block: BasicBlock, index: """Insert a new intermediate definition for the given expression at the given location.""" block.instructions.insert(index, instruction) - def update_location(location: CfgInstructionLocation) -> CfgInstructionLocation: - if location.block == block and location.index < index: - return dataclasses.replace(location, index=location.index + 1) - else: - return location - # update positions of expression usages for occurrences in self._usages.values(): for location in list(occurrences): From d9b4578ed1ca10d3034d64f617d3ec540e78f857 Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Wed, 3 Apr 2024 15:10:28 +0200 Subject: [PATCH 4/6] Change back to previous approach --- .../common_subexpression_elimination.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py index 733365090..0184bf74f 100644 --- a/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py +++ b/decompiler/pipeline/dataflowanalysis/common_subexpression_elimination.py @@ -2,7 +2,6 @@ from __future__ import annotations -import dataclasses from collections import Counter, defaultdict, deque from dataclasses import dataclass from itertools import chain @@ -19,20 +18,28 @@ from networkx import dfs_postorder_nodes -@dataclass(frozen=True) -class CfgInstructionLocation: +@dataclass(frozen=True, eq=False) +class CfgInstruction: """ dataclass in charge of tracking the location of Instruction objects in the cfg -> The considered instruction, where block is the basic block where it is contained and index the position in the basic block. + + Note: Two instances with the same data will not be equal (because of eq=False). + This way, eq and hash are way more performant, because at the time of writing this, eq and hash are very + expensive on big instructions. + + eq=True would probably be nicer to use, but we don't actually create instances with the same data + multiple times. (Rationale: initially just one instance is created per (block, index) pair. + All further instances with the same (block, index) will have a less complex instruction than before) """ + instruction: Instruction block: BasicBlock - index: int @property - def instruction(self): - return self.block.instructions[self.index] + def index(self): + return next(index for index, instruction in enumerate(self.block.instructions) if id(instruction) == id(self.instruction)) @dataclass() @@ -204,7 +211,7 @@ class DefinitionGenerator: def __init__( self, - expression_usages: DefaultDict[Expression, Counter[CfgInstructionLocation]], + expression_usages: DefaultDict[Expression, Counter[CfgInstruction]], dominator_tree: NetworkXGraph, ): """Generate a new instance based on data parsed from a cfg.""" @@ -214,16 +221,16 @@ def __init__( @classmethod def from_cfg(cls, cfg: ControlFlowGraph) -> DefinitionGenerator: """Initialize a DefinitionGenerator utilizing the data of the given cfg.""" - usages: DefaultDict[Expression, Counter[CfgInstructionLocation]] = defaultdict(Counter) + usages: DefaultDict[Expression, Counter[CfgInstruction]] = defaultdict(Counter) for basic_block in cfg: for index, instruction in enumerate(basic_block.instructions): - instruction_with_position = CfgInstructionLocation(basic_block, index) + instruction_with_position = CfgInstruction(instruction, basic_block) for subexpression in _subexpression_dfs(instruction): usages[subexpression][instruction_with_position] += 1 return cls(usages, cfg.dominator_tree) @property - def usages(self) -> DefaultDict[Expression, Counter[CfgInstructionLocation]]: + def usages(self) -> DefaultDict[Expression, Counter[CfgInstruction]]: """Return a mapping from expressions to a set of instructions using them.""" return self._usages @@ -264,22 +271,16 @@ def _is_invalid_dominator(self, basic_block: BasicBlock, expression: Expression) def _insert_definition(self, instruction: Instruction, block: BasicBlock, index: int): """Insert a new intermediate definition for the given expression at the given location.""" block.instructions.insert(index, instruction) - - # update positions of expression usages - for occurrences in self._usages.values(): - for location in list(occurrences): - if location.block == block and location.index >= index: - occurrences[dataclasses.replace(location, index=location.index + 1)] = occurrences.pop(location) - + cfg_instruction = CfgInstruction(instruction, block) for subexpression in _subexpression_dfs(instruction): - self._usages[subexpression][CfgInstructionLocation(block, index)] += 1 + self._usages[subexpression][cfg_instruction] += 1 @staticmethod - def _find_insertion_index(basic_block: BasicBlock, usages: Iterable[CfgInstructionLocation]) -> int: + def _find_insertion_index(basic_block: BasicBlock, usages: Iterable[CfgInstruction]) -> int: """Find the first index in the given basic block where a definition could be inserted.""" - usage = min((usage for usage in usages if usage.block == basic_block), default=None, key=lambda x: x.index) - if usage: - return usage.index + first_usage_index = min((usage.index for usage in usages if usage.block == basic_block), default=None) + if first_usage_index is not None: + return first_usage_index if not basic_block.instructions: return 0 if isinstance(basic_block.instructions[-1], GenericBranch): @@ -327,7 +328,7 @@ def eliminate_common_subexpressions(self, definition_generator: DefinitionGenera except StopIteration: warning(f"[{self.name}] No dominating basic block could be found for {replacee}") - def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[CfgInstructionLocation]]) -> Iterator[Expression]: + def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[CfgInstruction]]) -> Iterator[Expression]: """ Iterate all expressions, yielding the expressions which should be eliminated. @@ -341,7 +342,7 @@ def _find_elimination_candidates(self, usages: DefaultDict[Expression, Counter[C usages[subexpression].subtract(expression_usage) yield expression - def _is_cse_candidate(self, expression: Expression, usages: DefaultDict[Expression, Counter[CfgInstructionLocation]]): + def _is_cse_candidate(self, expression: Expression, usages: DefaultDict[Expression, Counter[CfgInstruction]]): """Checks that we can add a common subexpression for the given expression.""" return ( self._is_elimination_candidate(expression, usages[expression]) @@ -359,14 +360,14 @@ def _is_complex_string(self, expression: Expression) -> bool: return isinstance(expression.value, str) and len(expression.value) >= self._min_string_length return False - def _check_inter_instruction(self, expression: Expression, instructions: Counter[CfgInstructionLocation]) -> bool: + def _check_inter_instruction(self, expression: Expression, instructions: Counter[CfgInstruction]) -> bool: """Check if the given expressions should be eliminated based on its global occurrences.""" referencing_instructions_count = sum(1 for _, count in instructions.items() if count > 0) return (expression.complexity >= 2 and referencing_instructions_count >= self._threshold) or ( self._is_complex_string(expression) and referencing_instructions_count >= self._string_threshold ) - def _check_intra_instruction(self, expression: Expression, instructions: Counter[CfgInstructionLocation]) -> bool: + def _check_intra_instruction(self, expression: Expression, instructions: Counter[CfgInstruction]) -> bool: """Check if this expression should be eliminated based on the amount of unique instructions utilizing it.""" referencing_count = instructions.total() return (expression.complexity >= 2 and referencing_count >= self._threshold) or ( From 55cdbd0fc6f0e9b2b601e301bdde00a3e97e1703 Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Wed, 3 Apr 2024 15:46:38 +0200 Subject: [PATCH 5/6] Add test --- .../test_common_subexpression_elimination.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py b/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py index 009f70a8a..33bcce1d1 100644 --- a/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py +++ b/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py @@ -1283,3 +1283,32 @@ def test_common_subexpression_elimination_correct_place(): Assignment(Variable("d", ssa_label=4), BinaryOperation(OperationType.plus, [Variable("e", ssa_label=2), replacement0])), Assignment(Variable("f", ssa_label=1), BinaryOperation(OperationType.minus, [Variable("g", ssa_label=4), replacement1])), ] + + +def test_common_subexpression_elimination_correct_place2(): + """Check that the instruction is inserted at the correct position""" + expr0 = BinaryOperation(OperationType.multiply, [Variable("a", Integer.int32_t()), Constant(2, Integer.int32_t())]) + expr1 = BinaryOperation(OperationType.plus, [expr0.copy(), Constant(5, Integer.int32_t())]) + + cfg = ControlFlowGraph() + cfg.add_node( + node := BasicBlock( + 0, + instructions=[ + Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [expr0.copy(), expr1.copy()])), + Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [expr1.copy()])) + ], + ) + ) + _run_cse(cfg, _generate_options(threshold=2)) + + replacement0 = Variable("c0", Integer.int32_t(), ssa_label=0) + replacement1 = Variable("c1", Integer.int32_t(), ssa_label=0) + expr_new = BinaryOperation(OperationType.plus, [replacement1.copy(), Constant(5, Integer.int32_t())]) + + assert node.instructions == [ + Assignment(replacement1.copy(), expr0.copy()), + Assignment(replacement0.copy(), expr_new.copy()), + Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [replacement1.copy(), replacement0.copy()])), + Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [replacement0.copy()])), + ] From 593c29eda542fc10d0044f0bd9efceb4f25dac1e Mon Sep 17 00:00:00 2001 From: rihi <19492038+rihi@users.noreply.github.com> Date: Wed, 3 Apr 2024 15:53:38 +0200 Subject: [PATCH 6/6] black --- .../dataflowanalysis/test_common_subexpression_elimination.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py b/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py index 33bcce1d1..f5e3bc0bf 100644 --- a/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py +++ b/tests/pipeline/dataflowanalysis/test_common_subexpression_elimination.py @@ -1296,7 +1296,7 @@ def test_common_subexpression_elimination_correct_place2(): 0, instructions=[ Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [expr0.copy(), expr1.copy()])), - Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [expr1.copy()])) + Assignment(ListOperation([]), Call(FunctionSymbol("func", 0), [expr1.copy()])), ], ) )