From adc1f4132712ab1d0de178d1c5e6e7fdbd3f5ba0 Mon Sep 17 00:00:00 2001 From: Philipp Schaad Date: Fri, 5 Jul 2024 16:58:16 +0200 Subject: [PATCH] Loop Region Code Generation (#1597) This PR adapts code generation to make use of hierarchical control flow regions, and by extension `LoopRegion`s. This forms the fourth core element of the [plan to make loops first class citizens of SDFGs](https://github.com/orgs/spcl/projects/10) and marks the last element in the architecture. Extending codegen with the capability of handling hierarchical control flow graphs and SDFGs circumvents a myriad of complexities that come with control flow detection and that currently lead to significant issues for certain SDFGs (e.g., #635 and #1586). Making use of control flow regions such as `LoopRegion`s instead allows codegen to be much less 'smart' and behave more akin to a lookup table that decides what code to generate for what SDFG element, making it significantly less error-prone. --- dace/codegen/codegen.py | 14 +- dace/codegen/control_flow.py | 546 ++++++++++++++---- dace/codegen/dispatcher.py | 166 ++++-- dace/codegen/instrumentation/gpu_events.py | 42 +- dace/codegen/instrumentation/likwid.py | 66 ++- dace/codegen/instrumentation/provider.py | 43 +- dace/codegen/prettycode.py | 10 +- dace/codegen/targets/cpp.py | 41 +- dace/codegen/targets/cpu.py | 457 ++++++++------- dace/codegen/targets/cuda.py | 358 ++++++------ dace/codegen/targets/fpga.py | 223 +++---- dace/codegen/targets/framecode.py | 144 ++--- dace/codegen/targets/intel_fpga.py | 172 +++--- dace/codegen/targets/mlir/mlir.py | 15 +- dace/codegen/targets/mpi.py | 25 +- dace/codegen/targets/rtl.py | 57 +- dace/codegen/targets/snitch.py | 120 ++-- dace/codegen/targets/sve/codegen.py | 45 +- dace/codegen/targets/target.py | 29 +- dace/codegen/targets/unroller.py | 10 +- dace/codegen/targets/xilinx.py | 203 ++++--- dace/sdfg/analysis/cfg.py | 389 +++++++------ dace/sdfg/analysis/cutout.py | 4 +- 
.../analysis/schedule_tree/sdfg_to_tree.py | 14 +- dace/sdfg/analysis/schedule_tree/treenodes.py | 32 + dace/sdfg/replace.py | 19 +- dace/sdfg/scope.py | 29 +- dace/sdfg/sdfg.py | 45 +- dace/sdfg/state.py | 189 +++--- dace/sdfg/utils.py | 4 +- dace/transformation/dataflow/map_fission.py | 1 + dace/transformation/dataflow/map_for_loop.py | 1 + .../dataflow/prune_connectors.py | 2 +- dace/transformation/helpers.py | 36 +- dace/transformation/interstate/loop_to_map.py | 4 +- .../passes/array_elimination.py | 2 +- .../passes/constant_propagation.py | 2 +- .../passes/dead_dataflow_elimination.py | 2 +- dace/transformation/transformation.py | 10 +- doc/general/errors.rst | 2 +- doc/sdfg/ir.rst | 2 +- samples/codegen/tensor_cores.py | 37 +- tests/python_frontend/loop_regions_test.py | 16 - tests/python_frontend/loops_test.py | 10 - tests/sdfg/loop_region_test.py | 185 ++++-- tests/transformations/nest_subgraph_test.py | 4 +- .../block_allreduce_cudatest.py | 2 - 47 files changed, 2284 insertions(+), 1545 deletions(-) diff --git a/dace/codegen/codegen.py b/dace/codegen/codegen.py index f73e3f8d11..d1427bf037 100644 --- a/dace/codegen/codegen.py +++ b/dace/codegen/codegen.py @@ -1,12 +1,11 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import functools -import os -from typing import List, Set +from typing import List import dace from dace import dtypes from dace import data -from dace.sdfg import SDFG, utils as sdutils +from dace.sdfg import SDFG from dace.codegen.targets import framecode from dace.codegen.codeobject import CodeObject from dace.config import Config @@ -95,7 +94,7 @@ def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator): for node, parent in sdfg.all_nodes_recursive(): # Query nodes and scopes if isinstance(node, SDFGState): - frame.targets.add(disp.get_state_dispatcher(parent, node)) + frame.targets.add(disp.get_state_dispatcher(node.sdfg, node)) elif isinstance(node, dace.nodes.EntryNode): frame.targets.add(disp.get_scope_dispatcher(node.schedule)) elif isinstance(node, dace.nodes.Node): @@ -149,7 +148,7 @@ def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator): disp.instrumentation[sdfg.instrument] = provider_mapping[sdfg.instrument] -def generate_code(sdfg, validate=True) -> List[CodeObject]: +def generate_code(sdfg: SDFG, validate=True) -> List[CodeObject]: """ Generates code as a list of code objects for a given SDFG. @@ -186,11 +185,6 @@ def generate_code(sdfg, validate=True) -> List[CodeObject]: shutil.move(f'{tmp_dir}/test2.sdfg', 'test2.sdfg') raise RuntimeError(f'SDFG serialization failed - files do not match:\n{diff}') - # Convert any loop constructs with hierarchical loop regions into simple 1-level state machine loops. - # TODO (later): Adapt codegen to deal with hierarchical CFGs instead. 
- sdutils.inline_loop_blocks(sdfg) - sdutils.inline_control_flow_regions(sdfg) - # Before generating the code, run type inference on the SDFG connectors infer_types.infer_connector_types(sdfg) diff --git a/dace/codegen/control_flow.py b/dace/codegen/control_flow.py index 9f7e19ea9a..82b3bb47cf 100644 --- a/dace/codegen/control_flow.py +++ b/dace/codegen/control_flow.py @@ -57,19 +57,20 @@ import ast from dataclasses import dataclass -from typing import (Callable, Dict, Iterator, List, Optional, Sequence, Set, Tuple, Union) +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union import sympy as sp -import dace from dace import dtypes -from dace.sdfg.state import SDFGState +from dace.sdfg.analysis import cfg as cfg_analysis +from dace.sdfg.state import (BreakBlock, ContinueBlock, ControlFlowBlock, ControlFlowRegion, LoopRegion, + ReturnBlock, SDFGState) from dace.sdfg.sdfg import SDFG, InterstateEdge from dace.sdfg.graph import Edge from dace.properties import CodeBlock from dace.codegen import cppunparse from dace.codegen.common import unparse_interstate_edge, sym2cpp -DaCeCodeGenerator = 'dace.codegen.targets.framecode.DaCeCodeGenerator' -############################################################################### +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator @dataclass @@ -78,19 +79,22 @@ class ControlFlow: Abstract class representing a control flow block. """ - # A callback to the code generator that receives an SDFGState and returns - # a string with its generated code. + # A callback to the code generator that receives an SDFGState and returns a string with its generated code. dispatch_state: Callable[[SDFGState], str] # The parent control flow block of this one, used to avoid generating extraneous ``goto``s parent: Optional['ControlFlow'] + # Set to true if this is the last block in the parent control flow block, in order to avoid generating an + # extraneous "goto exit" statement. 
+ last_block: bool + @property - def first_state(self) -> SDFGState: + def first_block(self) -> ControlFlowBlock: """ - Returns the first or initializing state in this control flow block. - Used to determine which will be the next state in a control flow block - to avoid generating extraneous ``goto`` calls. + Returns the first or initializing block in this control flow block. + Used to determine which will be the next block in a control flow block to avoid generating extraneous + ``goto`` calls. """ return None @@ -101,7 +105,7 @@ def children(self) -> List['ControlFlow']: """ return [] - def as_cpp(self, codegen: DaCeCodeGenerator, symbols: Dict[str, dtypes.typeclass]) -> str: + def as_cpp(self, codegen: 'DaCeCodeGenerator', symbols: Dict[str, dtypes.typeclass]) -> str: """ Returns C++ code for this control flow block. @@ -111,53 +115,21 @@ def as_cpp(self, codegen: DaCeCodeGenerator, symbols: Dict[str, dtypes.typeclass """ raise NotImplementedError - -@dataclass -class SingleState(ControlFlow): - """ A control flow element containing a single state. """ - - # The state in this element. - state: SDFGState - - # Set to true if this is the last state in the parent control flow block, - # in order to avoid generating an extraneous "goto exit" statement. 
- last_state: bool = False - - def as_cpp(self, codegen, symbols) -> str: - sdfg = self.state.parent - - expr = '__state_{}_{}:;\n'.format(sdfg.cfg_id, self.state.label) - if self.state.number_of_nodes() > 0: - expr += '{\n' - expr += self.dispatch_state(self.state) - expr += '\n}\n' - else: - # Dispatch empty state in any case in order to register that the - # state was dispatched - expr += self.dispatch_state(self.state) - - # If any state has no children, it should jump to the end of the SDFG - if not self.last_state and sdfg.out_degree(self.state) == 0: - expr += 'goto __state_exit_{};\n'.format(sdfg.cfg_id) - return expr - def generate_transition(self, sdfg: SDFG, + cfg: ControlFlowRegion, edge: Edge[InterstateEdge], - successor: SDFGState = None, + successor: Optional[ControlFlowBlock] = None, assignments_only: bool = False, - framecode: DaCeCodeGenerator = None) -> str: + framecode: 'DaCeCodeGenerator' = None) -> str: """ - Helper function that generates a state transition (conditional goto) - from a state and an SDFG edge. + Helper function that generates a state transition (conditional goto) from a control flow block and an SDFG edge. :param sdfg: The parent SDFG. :param edge: The state transition edge to generate. - :param successor: If not None, the state that will be generated right - after the current state (used to avoid extraneous - gotos). - :param assignments_only: If True, generates only the assignments - of the inter-state edge. + :param successor: If not None, the state that will be generated right after the current state (used to avoid + extraneous gotos). + :param assignments_only: If True, generates only the assignments of the inter-state edge. :param framecode: Code generator object (used for allocation information). :return: A c++ string representing the state transition code. 
""" @@ -173,39 +145,117 @@ def generate_transition(self, for variable, value in edge.data.assignments.items() ] + ['']) - if (not edge.data.is_unconditional() - or ((successor is None or edge.dst is not successor) and not assignments_only)): - expr += 'goto __state_{}_{};\n'.format(sdfg.cfg_id, edge.dst.label) + generate_goto = False + if not edge.data.is_unconditional(): + generate_goto = True + elif not assignments_only: + if successor is None: + generate_goto = True + elif isinstance(edge.dst, SDFGState) and edge.dst is not successor: + generate_goto = True + elif isinstance(edge.dst, ControlFlowRegion) and edge.dst.start_block is not successor: + generate_goto = True + if generate_goto and not assignments_only: + expr += 'goto __state_{}_{};\n'.format(cfg.cfg_id, edge.dst.label) if not edge.data.is_unconditional() and not assignments_only: expr += '}\n' return expr + +@dataclass +class BasicCFBlock(ControlFlow): + """ A CFG basic block, representing a single dataflow state """ + + # The state in this element. + state: SDFGState + + def as_cpp(self, codegen, symbols) -> str: + cfg = self.state.parent_graph + + expr = '__state_{}_{}:;\n'.format(cfg.cfg_id, self.state.label) + if self.state.number_of_nodes() > 0: + expr += '{\n' + expr += self.dispatch_state(self.state) + expr += '\n}\n' + else: + # Dispatch empty state in any case in order to register that the state was dispatched. + expr += self.dispatch_state(self.state) + + # If any state has no children, it should jump to the end of the SDFG + if not self.last_block and cfg.out_degree(self.state) == 0: + expr += 'goto __state_exit_{};\n'.format(cfg.cfg_id) + return expr + @property - def first_state(self) -> SDFGState: + def first_block(self) -> SDFGState: return self.state +@dataclass +class BreakCFBlock(ControlFlow): + """ A CFG block that generates a 'break' statement. 
""" + + block: BreakBlock + + def as_cpp(self, codegen, symbols) -> str: + return 'break;\n' + + @property + def first_block(self) -> BreakBlock: + return self.block + + +@dataclass +class ContinueCFBlock(ControlFlow): + """ A CFG block that generates a 'continue' statement. """ + + block: ContinueBlock + + def as_cpp(self, codegen, symbols) -> str: + return 'continue;\n' + + @property + def first_block(self) -> ContinueBlock: + return self.block + + +@dataclass +class ReturnCFBlock(ControlFlow): + """ A CFG block that generates a 'return' statement. """ + + block: ReturnBlock + + def as_cpp(self, codegen, symbols) -> str: + return 'return;\n' + + @property + def first_block(self) -> ReturnBlock: + return self.block + + @dataclass class GeneralBlock(ControlFlow): """ - General (or unrecognized) control flow block with gotos between states. + General (or unrecognized) control flow block with gotos between blocks. """ + # The control flow region that this block corresponds to (may be the SDFG in the absence of hierarchical regions). + region: Optional[ControlFlowRegion] + # List of children control flow blocks elements: List[ControlFlow] - # List or set of edges to not generate conditional gotos for. This is used - # to avoid generating extra assignments or gotos before entering a for - # loop, for example. + # List or set of edges to not generate conditional gotos for. This is used to avoid generating extra assignments or + # gotos before entering a for loop, for example. gotos_to_ignore: Sequence[Edge[InterstateEdge]] - # List or set of edges to generate `continue;` statements in lieu of goto. - # This is used for loop blocks. + # List or set of edges to generate `continue;` statements in lieu of goto. This is used for loop blocks. + # NOTE: Can be removed after a full conversion to only using hierarchical control flow and ditching CF detection. 
gotos_to_continue: Sequence[Edge[InterstateEdge]] - # List or set of edges to generate `break;` statements in lieu of goto. - # This is used for loop blocks. + # List or set of edges to generate `break;` statements in lieu of goto. This is used for loop blocks. + # NOTE: Can be removed after a full conversion to only using hierarchical control flow and ditching CF detection. gotos_to_break: Sequence[Edge[InterstateEdge]] # List or set of edges to not generate inter-state assignments for. @@ -218,11 +268,11 @@ def as_cpp(self, codegen, symbols) -> str: expr = '' for i, elem in enumerate(self.elements): expr += elem.as_cpp(codegen, symbols) - # In a general block, emit transitions and assignments after each - # individual state - if isinstance(elem, SingleState): - sdfg = elem.state.parent - out_edges = sdfg.out_edges(elem.state) + # In a general block, emit transitions and assignments after each individual block or region. + if isinstance(elem, BasicCFBlock) or (isinstance(elem, GeneralBlock) and elem.region): + cfg = elem.state.parent_graph if isinstance(elem, BasicCFBlock) else elem.region.parent_graph + sdfg = cfg if isinstance(cfg, SDFG) else cfg.sdfg + out_edges = cfg.out_edges(elem.state) if isinstance(elem, BasicCFBlock) else cfg.out_edges(elem.region) for j, e in enumerate(out_edges): if e not in self.gotos_to_ignore: # Skip gotos to immediate successors @@ -231,24 +281,24 @@ def as_cpp(self, codegen, symbols) -> str: if j == (len(out_edges) - 1): if (i + 1) < len(self.elements): # If last edge leads to next state in block - successor = self.elements[i + 1].first_state + successor = self.elements[i + 1].first_block elif i == len(self.elements) - 1: # If last edge leads to first state in next block - next_block = _find_next_block(self) + next_block = _find_next_block(self) if next_block is not None: - successor = next_block.first_state + successor = next_block.first_block - expr += elem.generate_transition(sdfg, e, successor) + expr += 
elem.generate_transition(sdfg, cfg, e, successor) else: if e not in self.assignments_to_ignore: # Need to generate assignments but not gotos - expr += elem.generate_transition(sdfg, e, assignments_only=True) + expr += elem.generate_transition(sdfg, cfg, e, assignments_only=True) if e in self.gotos_to_break: expr += 'break;\n' elif e in self.gotos_to_continue: expr += 'continue;\n' # Add exit goto as necessary - if elem.last_state: + if elem.last_block: continue # Two negating conditions if (len(out_edges) == 2 @@ -262,10 +312,10 @@ def as_cpp(self, codegen, symbols) -> str: return expr @property - def first_state(self) -> SDFGState: + def first_block(self) -> Optional[ControlFlowBlock]: if not self.elements: return None - return self.elements[0].first_state + return self.elements[0].first_block @property def children(self) -> List[ControlFlow]: @@ -276,14 +326,13 @@ def children(self) -> List[ControlFlow]: class IfScope(ControlFlow): """ A control flow scope of an if (else) block. """ - sdfg: SDFG #: Parent SDFG - branch_state: SDFGState #: State that branches out to if/else scopes + branch_block: ControlFlowBlock #: Block that branches out to if/else scopes condition: CodeBlock #: If-condition body: GeneralBlock #: Body of if condition orelse: Optional[GeneralBlock] = None #: Optional body of else condition def as_cpp(self, codegen, symbols) -> str: - condition_string = unparse_interstate_edge(self.condition.code[0], self.sdfg, codegen=codegen) + condition_string = unparse_interstate_edge(self.condition.code[0], self.branch_block.sdfg, codegen=codegen) expr = f'if ({condition_string}) {{\n' expr += self.body.as_cpp(codegen, symbols) expr += '\n}' @@ -295,8 +344,8 @@ def as_cpp(self, codegen, symbols) -> str: return expr @property - def first_state(self) -> SDFGState: - return self.branch_state + def first_block(self) -> ControlFlowBlock: + return self.branch_block @property def children(self) -> List[ControlFlow]: @@ -306,8 +355,8 @@ def children(self) -> 
List[ControlFlow]: @dataclass class IfElseChain(ControlFlow): """ A control flow scope of "if, else if, ..., else" chain of blocks. """ - sdfg: SDFG #: Parent SDFG - branch_state: SDFGState #: State that branches out to all blocks + + branch_block: ControlFlowBlock #: Block that branches out to all blocks body: List[Tuple[CodeBlock, GeneralBlock]] #: List of (condition, block) def as_cpp(self, codegen, symbols) -> str: @@ -316,7 +365,7 @@ def as_cpp(self, codegen, symbols) -> str: # First block in the chain is just "if", rest are "else if" prefix = '' if i == 0 else ' else ' - condition_string = unparse_interstate_edge(condition.code[0], self.sdfg, codegen=codegen) + condition_string = unparse_interstate_edge(condition.code[0], self.branch_block.sdfg, codegen=codegen) expr += f'{prefix}if ({condition_string}) {{\n' expr += body.as_cpp(codegen, symbols) expr += '\n}' @@ -326,14 +375,14 @@ def as_cpp(self, codegen, symbols) -> str: # execution should end, so we emit an "else goto exit" here. if len(self.body) > 0: expr += ' else {\n' - expr += 'goto __state_exit_{};\n'.format(self.sdfg.cfg_id) + expr += 'goto __state_exit_{};\n'.format(self.branch_block.sdfg.cfg_id) if len(self.body) > 0: expr += '\n}' return expr @property - def first_state(self) -> SDFGState: - return self.branch_state + def first_block(self) -> ControlFlowBlock: + return self.branch_block @property def children(self) -> List[ControlFlow]: @@ -351,6 +400,7 @@ def _clean_loop_body(body: str) -> str: @dataclass class ForScope(ControlFlow): """ For loop block (without break or continue statements). 
""" + itervar: str #: Name of iteration variable guard: SDFGState #: Loop guard state init: str #: C++ code for initializing iteration variable @@ -372,8 +422,8 @@ def as_cpp(self, codegen, symbols) -> str: init = self.itervar else: init = f'{symbols[self.itervar]} {self.itervar}' - init += ' = ' + unparse_interstate_edge(self.init_edges[0].data.assignments[self.itervar], - sdfg, codegen=codegen) + init += ' = ' + unparse_interstate_edge( + self.init_edges[0].data.assignments[self.itervar], sdfg, codegen=codegen) preinit = '' if self.init_edges: @@ -399,7 +449,7 @@ def as_cpp(self, codegen, symbols) -> str: return expr @property - def first_state(self) -> SDFGState: + def first_block(self) -> SDFGState: return self.guard @property @@ -427,7 +477,7 @@ def as_cpp(self, codegen, symbols) -> str: return expr @property - def first_state(self) -> SDFGState: + def first_block(self) -> SDFGState: return self.guard @property @@ -454,7 +504,7 @@ def as_cpp(self, codegen, symbols) -> str: return expr @property - def first_state(self) -> SDFGState: + def first_block(self) -> SDFGState: return self.body[0].first_state @property @@ -462,11 +512,72 @@ def children(self) -> List[ControlFlow]: return [self.body] +@dataclass +class GeneralLoopScope(ControlFlow): + """ General loop block based on a loop control flow region. """ + + loop: LoopRegion + body: ControlFlow + + def as_cpp(self, codegen, symbols) -> str: + sdfg = self.loop.sdfg + + cond = unparse_interstate_edge(self.loop.loop_condition.code[0], sdfg, codegen=codegen, symbols=symbols) + cond = cond.strip(';') + + expr = '' + + if self.loop.update_statement and self.loop.init_statement and self.loop.loop_variable: + # Initialize to either "int i = 0" or "i = 0" depending on whether the type has been defined. 
+ defined_vars = codegen.dispatcher.defined_vars + if not defined_vars.has(self.loop.loop_variable): + try: + init = f'{symbols[self.loop.loop_variable]} ' + except KeyError: + init = 'auto ' + symbols[self.loop.loop_variable] = None + init += unparse_interstate_edge(self.loop.init_statement.code[0], sdfg, codegen=codegen, symbols=symbols) + init = init.strip(';') + + update = unparse_interstate_edge(self.loop.update_statement.code[0], sdfg, codegen=codegen, symbols=symbols) + update = update.strip(';') + + if self.loop.inverted: + expr += f'{init};\n' + expr += 'do {\n' + expr += _clean_loop_body(self.body.as_cpp(codegen, symbols)) + expr += f'{update};\n' + expr += f'\n}} while({cond});\n' + else: + expr += f'for ({init}; {cond}; {update}) {{\n' + expr += _clean_loop_body(self.body.as_cpp(codegen, symbols)) + expr += '\n}\n' + else: + if self.loop.inverted: + expr += 'do {\n' + expr += _clean_loop_body(self.body.as_cpp(codegen, symbols)) + expr += f'\n}} while({cond});\n' + else: + expr += f'while ({cond}) {{\n' + expr += _clean_loop_body(self.body.as_cpp(codegen, symbols)) + expr += '\n}\n' + + return expr + + @property + def first_block(self) -> ControlFlowBlock: + return self.loop.start_block + + @property + def children(self) -> List[ControlFlow]: + return [self.body] + + @dataclass class SwitchCaseScope(ControlFlow): """ Simple switch-case scope without fall-through cases. 
""" - sdfg: SDFG #: Parent SDFG - branch_state: SDFGState #: Branching state + + branch_block: ControlFlowBlock #: Branching block switchvar: str #: C++ code for switch expression cases: Dict[str, GeneralBlock] #: Mapping of cases to control flow blocks @@ -476,13 +587,13 @@ def as_cpp(self, codegen, symbols) -> str: expr += f'case {case}: {{\n' expr += body.as_cpp(codegen, symbols) expr += 'break;\n}\n' - expr += f'default: goto __state_exit_{self.sdfg.cfg_id};' + expr += f'default: goto __state_exit_{self.branch_block.sdfg.cfg_id};' expr += '\n}\n' return expr @property - def first_state(self) -> SDFGState: - return self.branch_state + def first_block(self) -> ControlFlowBlock: + return self.branch_block @property def children(self) -> List[ControlFlow]: @@ -498,7 +609,16 @@ def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[Intersta set of states. Can construct for or while loops. """ - body = GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) + body = GeneralBlock(dispatch_state=dispatch_state, + parent=parent_block, + last_block=False, + region=None, + elements=[], + gotos_to_ignore=[], + gotos_to_continue=[], + gotos_to_break=[], + assignments_to_ignore=[], + sequential=True) guard_inedges = sdfg.in_edges(guard) increment_edges = [e for e in guard_inedges if e in back_edges] @@ -549,10 +669,11 @@ def _loop_from_structure(sdfg: SDFG, guard: SDFGState, enter_edge: Edge[Intersta # Also ignore assignments in increment edge (handled in for stmt) body.assignments_to_ignore.append(increment_edge) - return ForScope(dispatch_state, parent_block, itvar, guard, init, condition, update, body, init_edges) + return ForScope(dispatch_state, parent_block, False, itvar, guard, init, condition, update, body, + init_edges) # Otherwise, it is a while loop - return WhileScope(dispatch_state, parent_block, guard, condition, body) + return WhileScope(dispatch_state, parent_block, False, guard, condition, body) def _cases_from_branches( @@ 
-684,7 +805,16 @@ def _structured_control_flow_traversal(sdfg: SDFG, """ def make_empty_block(): - return GeneralBlock(dispatch_state, parent_block, [], [], [], [], [], True) + return GeneralBlock(dispatch_state=dispatch_state, + last_block=False, + parent=parent_block, + region=None, + elements=[], + gotos_to_ignore=[], + gotos_to_continue=[], + gotos_to_break=[], + assignments_to_ignore=[], + sequential=True) # Traverse states in custom order visited = set() if visited is None else visited @@ -696,14 +826,14 @@ def make_empty_block(): if node in visited or node is stop: continue visited.add(node) - stateblock = SingleState(dispatch_state, parent_block, node) + stateblock = BasicCFBlock(dispatch_state=dispatch_state, parent=parent_block, last_block=False, state=node) oe = sdfg.out_edges(node) if len(oe) == 0: # End state # If there are no remaining nodes, this is the last state and it can # be marked as such if len(stack) == 0: - stateblock.last_state = True + stateblock.last_block = True parent_block.elements.append(stateblock) continue elif len(oe) == 1: # No traversal change @@ -719,7 +849,7 @@ def make_empty_block(): parent_block.elements.append(stateblock) parent_block.gotos_to_ignore.extend(oe) parent_block.assignments_to_ignore.extend(oe) - stateblock.last_state = True + stateblock.last_block = True # Parse all outgoing edges recursively first cblocks: Dict[Edge[InterstateEdge], GeneralBlock] = {} @@ -747,13 +877,13 @@ def make_empty_block(): if (len(oe) == 2 and oe[0].data.condition_sympy() == sp.Not(oe[1].data.condition_sympy())): # If without else if oe[0].dst is mergestate: - branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[1].data.condition, + branch_block = IfScope(dispatch_state, parent_block, False, node, oe[1].data.condition, cblocks[oe[1]]) elif oe[1].dst is mergestate: - branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + branch_block = IfScope(dispatch_state, parent_block, False, node, 
oe[0].data.condition, cblocks[oe[0]]) else: - branch_block = IfScope(dispatch_state, parent_block, sdfg, node, oe[0].data.condition, + branch_block = IfScope(dispatch_state, parent_block, False, node, oe[0].data.condition, cblocks[oe[0]], cblocks[oe[1]]) else: # If there are 2 or more edges (one is not the negation of the @@ -762,10 +892,10 @@ def make_empty_block(): if switch: # If all edges are of form "x == y" for a single x and # integer y, it is a switch/case - branch_block = SwitchCaseScope(dispatch_state, parent_block, sdfg, node, switch[0], switch[1]) + branch_block = SwitchCaseScope(dispatch_state, parent_block, False, node, switch[0], switch[1]) else: # Otherwise, create if/else if/.../else goto exit chain - branch_block = IfElseChain(dispatch_state, parent_block, sdfg, node, + branch_block = IfElseChain(dispatch_state, parent_block, False, node, [(e.data.condition, cblocks[e] if e in cblocks else make_empty_block()) for e in oe]) # End of branch classification @@ -829,6 +959,188 @@ def make_empty_block(): return visited - {stop} +def _structured_control_flow_traversal_with_regions(cfg: ControlFlowRegion, + dispatch_state: Callable[[SDFGState], str], + parent_block: GeneralBlock, + start: Optional[ControlFlowBlock] = None, + stop: Optional[ControlFlowBlock] = None, + generate_children_of: Optional[ControlFlowBlock] = None, + branch_merges: Optional[Dict[ControlFlowBlock, + ControlFlowBlock]] = None, + ptree: Optional[Dict[ControlFlowBlock, ControlFlowBlock]] = None, + visited: Optional[Set[ControlFlowBlock]] = None): + if branch_merges is None: + # Avoid import loops + from dace.sdfg import utils as sdutil + + # Annotate branches + branch_merges: Dict[ControlFlowBlock, ControlFlowBlock] = {} + adf = cfg_analysis.acyclic_dominance_frontier(cfg) + ipostdom = sdutil.postdominators(cfg) + + for block in cfg.nodes(): + oedges = cfg.out_edges(block) + # Skip if not branch + if len(oedges) <= 1: + continue + # Try to obtain the common dominance frontier to find 
merge state. + common_frontier = set() + for oedge in oedges: + frontier = adf[oedge.dst] + if not frontier: + frontier = {oedge.dst} + common_frontier |= frontier + if len(common_frontier) == 1: + branch_merges[block] = next(iter(common_frontier)) + elif len(common_frontier) > 1 and ipostdom and ipostdom[block] in common_frontier: + branch_merges[block] = ipostdom[block] + + if ptree is None: + ptree = cfg_analysis.block_parent_tree(cfg, with_loops=False) + + start = start if start is not None else cfg.start_block + + def make_empty_block(): + return GeneralBlock(dispatch_state, parent_block, + last_block=False, region=None, elements=[], gotos_to_ignore=[], + gotos_to_break=[], gotos_to_continue=[], assignments_to_ignore=[], sequential=True) + + # Traverse states in custom order + visited = set() if visited is None else visited + stack = [start] + while stack: + node = stack.pop() + if (generate_children_of is not None and not _child_of(node, generate_children_of, ptree)): + continue + if node in visited or node is stop: + continue + visited.add(node) + + cfg_block: ControlFlow + if isinstance(node, SDFGState): + cfg_block = BasicCFBlock(dispatch_state, parent_block, False, node) + elif isinstance(node, BreakBlock): + cfg_block = BreakCFBlock(dispatch_state, parent_block, True, node) + elif isinstance(node, ContinueBlock): + cfg_block = ContinueCFBlock(dispatch_state, parent_block, True, node) + elif isinstance(node, ReturnBlock): + cfg_block = ReturnCFBlock(dispatch_state, parent_block, True, node) + elif isinstance(node, ControlFlowRegion): + if isinstance(node, LoopRegion): + body = make_empty_block() + cfg_block = GeneralLoopScope(dispatch_state, parent_block, False, node, body) + body.parent = cfg_block + _structured_control_flow_traversal_with_regions(node, dispatch_state, body) + else: + cfg_block = make_empty_block() + cfg_block.region = node + _structured_control_flow_traversal_with_regions(node, dispatch_state, cfg_block) + + oe = cfg.out_edges(node) + 
if len(oe) == 0: # End state + # If there are no remaining nodes, this is the last state and it can + # be marked as such + if len(stack) == 0: + cfg_block.last_block = True + parent_block.elements.append(cfg_block) + continue + elif len(oe) == 1: # No traversal change + stack.append(oe[0].dst) + parent_block.elements.append(cfg_block) + continue + + # Potential branch or loop + if node in branch_merges: + mergeblock = branch_merges[node] + + # Add branching node and ignore outgoing edges + parent_block.elements.append(cfg_block) + parent_block.gotos_to_ignore.extend(oe) # TODO: why? + parent_block.assignments_to_ignore.extend(oe) # TODO: why? + cfg_block.last_block = True + + # Parse all outgoing edges recursively first + cblocks: Dict[Edge[InterstateEdge], GeneralBlock] = {} + for branch in oe: + if branch.dst is mergeblock: + # If we hit the merge state (if without else), defer to end of branch traversal + continue + cblocks[branch] = make_empty_block() + _structured_control_flow_traversal_with_regions(cfg=cfg, + dispatch_state=dispatch_state, + parent_block=cblocks[branch], + start=branch.dst, + stop=mergeblock, + generate_children_of=node, + branch_merges=branch_merges, + ptree=ptree, + visited=visited) + + # Classify branch type: + branch_block = None + # If there are 2 out edges, one negation of the other: + # * if/else in case both branches are not merge state + # * if without else in case one branch is merge state + if (len(oe) == 2 and oe[0].data.condition_sympy() == sp.Not(oe[1].data.condition_sympy())): + if oe[0].dst is mergeblock: + # If without else + branch_block = IfScope(dispatch_state, parent_block, False, node, oe[1].data.condition, + cblocks[oe[1]]) + elif oe[1].dst is mergeblock: + branch_block = IfScope(dispatch_state, parent_block, False, node, oe[0].data.condition, + cblocks[oe[0]]) + else: + branch_block = IfScope(dispatch_state, parent_block, False, node, oe[0].data.condition, + cblocks[oe[0]], cblocks[oe[1]]) + else: + # If there are 2 
or more edges (one is not the negation of the + # other): + switch = _cases_from_branches(oe, cblocks) + if switch: + # If all edges are of form "x == y" for a single x and + # integer y, it is a switch/case + branch_block = SwitchCaseScope(dispatch_state, parent_block, False, node, switch[0], switch[1]) + else: + # Otherwise, create if/else if/.../else goto exit chain + branch_block = IfElseChain(dispatch_state, parent_block, False, node, + [(e.data.condition, cblocks[e] if e in cblocks else make_empty_block()) + for e in oe]) + # End of branch classification + parent_block.elements.append(branch_block) + if mergeblock != stop: + stack.append(mergeblock) + + else: # No merge state: Unstructured control flow + parent_block.sequential = False + parent_block.elements.append(cfg_block) + stack.extend([e.dst for e in oe]) + + return visited - {stop} + + +def structured_control_flow_tree_with_regions(sdfg: SDFG, dispatch_state: Callable[[SDFGState], str]) -> ControlFlow: + """ + Returns a structured control-flow tree (i.e., with constructs such as branches and loops) from an SDFG based on the + control flow regions it contains. + + :param sdfg: The SDFG to iterate over. + :return: Control-flow block representing the entire SDFG. + """ + root_block = GeneralBlock(dispatch_state=dispatch_state, + parent=None, + last_block=False, + region=None, + elements=[], + gotos_to_ignore=[], + gotos_to_continue=[], + gotos_to_break=[], + assignments_to_ignore=[], + sequential=True) + _structured_control_flow_traversal_with_regions(sdfg, dispatch_state, root_block) + _reset_block_parents(root_block) + return root_block + + def structured_control_flow_tree(sdfg: SDFG, dispatch_state: Callable[[SDFGState], str]) -> ControlFlow: """ Returns a structured control-flow tree (i.e., with constructs such as @@ -838,11 +1150,14 @@ def structured_control_flow_tree(sdfg: SDFG, dispatch_state: Callable[[SDFGState :param sdfg: The SDFG to iterate over. 
:return: Control-flow block representing the entire SDFG. """ + if sdfg.root_sdfg.using_experimental_blocks: + return structured_control_flow_tree_with_regions(sdfg, dispatch_state) + # Avoid import loops from dace.sdfg.analysis import cfg # Get parent states and back-edges - ptree = cfg.state_parent_tree(sdfg) + ptree = cfg.block_parent_tree(sdfg) back_edges = cfg.back_edges(sdfg) # Annotate branches @@ -877,7 +1192,16 @@ def structured_control_flow_tree(sdfg: SDFG, dispatch_state: Callable[[SDFGState if len(common_frontier) == 1: branch_merges[state] = next(iter(common_frontier)) - root_block = GeneralBlock(dispatch_state, None, [], [], [], [], [], True) + root_block = GeneralBlock(dispatch_state=dispatch_state, + parent=None, + last_block=False, + region=None, + elements=[], + gotos_to_ignore=[], + gotos_to_continue=[], + gotos_to_break=[], + assignments_to_ignore=[], + sequential=True) _structured_control_flow_traversal(sdfg, sdfg.start_state, ptree, branch_merges, back_edges, dispatch_state, root_block) _reset_block_parents(root_block) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index be032556a0..3ac9e097f8 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -7,10 +7,14 @@ from dace.codegen.prettycode import CodeIOStream import aenum from dace import config, data as dt, dtypes, nodes, registry +from dace.memlet import Memlet from dace.codegen import exceptions as cgx, prettycode from dace.codegen.targets import target from dace.sdfg import utils as sdutil, SDFG, SDFGState, ScopeSubgraphView -from typing import Dict, Set, Tuple, Union +from dace.sdfg.graph import MultiConnectorEdge +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView @registry.extensible_enum @@ -53,10 +57,8 @@ def has(self, name, ancestor: int = 0): return False def get(self, name: str, ancestor: int = 0, is_global: bool = False) -> Tuple[DefinedType, str]: - 
last_visited_scope = None for parent, scope, can_access_parent in reversed(self._scopes): last_parent = parent - last_visited_scope = scope if ancestor > 0: ancestor -= 1 continue @@ -101,7 +103,7 @@ def add(self, name: str, dtype: DefinedType, ctype: str, ancestor: int = 0, allo break self._scopes[-1 - ancestor][1][name] = (dtype, ctype) - def add_global(self, name: str, dtype: DefinedType, ctype: str): + def add_global(self, name: str, dtype: DefinedType, ctype: str) -> None: """ Adds a global variable (top scope) """ @@ -110,11 +112,9 @@ def add_global(self, name: str, dtype: DefinedType, ctype: str): self._scopes[0][1][name] = (dtype, ctype) - def remove(self, name: str, ancestor: int = 0, is_global: bool = False) -> Tuple[DefinedType, str]: - last_visited_scope = None + def remove(self, name: str, ancestor: int = 0, is_global: bool = False) -> None: for parent, scope, can_access_parent in reversed(self._scopes): last_parent = parent - last_visited_scope = scope if ancestor > 0: ancestor -= 1 continue @@ -145,6 +145,23 @@ class TargetDispatcher(object): """ Dispatches sub-SDFG generation (according to scope), storage<->storage copies, and storage<->tasklet copies to targets. 
""" + _array_dispatchers: Dict[dtypes.StorageType, target.TargetCodeGenerator] + _map_dispatchers: Dict[dtypes.ScheduleType, target.TargetCodeGenerator] + + _copy_dispatchers: Dict[Tuple[dtypes.StorageType, dtypes.StorageType, dtypes.ScheduleType], + List[Tuple[Callable, target.TargetCodeGenerator]]] + _generic_copy_dispatcher: Dict[Tuple[dtypes.StorageType, dtypes.StorageType, dtypes.ScheduleType], + target.TargetCodeGenerator] + + _node_dispatchers: List[Tuple[Callable, target.TargetCodeGenerator]] + _generic_node_dispatcher: Optional[target.TargetCodeGenerator] + + _state_dispatchers: List[Tuple[Callable, target.TargetCodeGenerator]] + _generic_state_dispatcher: Optional[target.TargetCodeGenerator] + + _declared_arrays: DefinedMemlets + _defined_vars: DefinedMemlets + def __init__(self, framecode): # Avoid import loop from dace.codegen.targets import framecode as fc @@ -157,20 +174,14 @@ def __init__(self, framecode): self.instrumentation: Dict[Union[dtypes.InstrumentationType, dtypes.DataInstrumentationType], instrumentation.InstrumentationProvider] = {} - self._array_dispatchers: Dict[dtypes.StorageType, target.TargetCodeGenerator] = {} - self._map_dispatchers: Dict[dtypes.ScheduleType, target.TargetCodeGenerator] = {} - self._copy_dispatchers = {} # Type: (dtypes.StorageType src, - # dtypes.StorageType dst, - # dtypes.ScheduleType dst_schedule) - # -> List[(predicate, TargetCodeGenerator)] - self._generic_copy_dispatchers = {} # Type: (dtypes.StorageType src, - # dtypes.StorageType dst, - # dtypes.ScheduleType dst_schedule) - # -> TargetCodeGenerator - self._node_dispatchers = [] # [(predicate, dispatcher)] - self._generic_node_dispatcher = None # Type: TargetCodeGenerator - self._state_dispatchers = [] # [(predicate, dispatcher)] - self._generic_state_dispatcher = None # Type: TargetCodeGenerator + self._array_dispatchers = {} + self._map_dispatchers = {} + self._copy_dispatchers = {} + self._generic_copy_dispatchers = {} + self._node_dispatchers = [] + 
self._generic_node_dispatcher = None + self._state_dispatchers = [] + self._generic_state_dispatcher = None self._declared_arrays = DefinedMemlets() self._defined_vars = DefinedMemlets() @@ -233,7 +244,8 @@ def get_predicated_state_dispatchers(self): """ Returns a list of state dispatchers with predicates. """ return list(self._state_dispatchers) - def register_node_dispatcher(self, dispatcher, predicate=None): + def register_node_dispatcher(self, dispatcher: target.TargetCodeGenerator, + predicate: Optional[Callable] = None) -> None: """ Registers a code generator that processes a single node, calling ``generate_node``. @@ -260,7 +272,9 @@ def get_predicated_node_dispatchers(self): """ Returns a list of node dispatchers with predicates. """ return list(self._node_dispatchers) - def register_map_dispatcher(self, schedule_type, func): + def register_map_dispatcher(self, + schedule_type: Union[List[dtypes.ScheduleType], dtypes.ScheduleType], + func: target.TargetCodeGenerator) -> None: """ Registers a function that processes a scope, used when calling ``dispatch_subgraph`` and ``dispatch_scope``. @@ -274,13 +288,15 @@ def register_map_dispatcher(self, schedule_type, func): self.register_map_dispatcher(stype, func) return - if not isinstance(schedule_type, dtypes.ScheduleType): raise TypeError - if not isinstance(func, target.TargetCodeGenerator): raise TypeError + if not isinstance(schedule_type, dtypes.ScheduleType): + raise TypeError + if not isinstance(func, target.TargetCodeGenerator): + raise TypeError if schedule_type in self._map_dispatchers: raise ValueError('Schedule already mapped to ' + str(self._map_dispatchers[schedule_type])) self._map_dispatchers[schedule_type] = func - def register_array_dispatcher(self, storage_type, func): + def register_array_dispatcher(self, storage_type: dtypes.StorageType, func: target.TargetCodeGenerator) -> None: """ Registers a function that processes data allocation, initialization, and deinitialization. 
Used when calling ``dispatch_allocate/deallocate/initialize``. @@ -299,7 +315,9 @@ def register_array_dispatcher(self, storage_type, func): if not isinstance(func, target.TargetCodeGenerator): raise TypeError self._array_dispatchers[storage_type] = func - def register_copy_dispatcher(self, src_storage, dst_storage, dst_schedule, func, predicate=None): + def register_copy_dispatcher(self, src_storage: dtypes.StorageType, dst_storage: dtypes.StorageType, + dst_schedule: dtypes.ScheduleType, func: target.TargetCodeGenerator, + predicate: Optional[Callable] = None) -> None: """ Registers code generation of data-to-data (or data from/to tasklet, if src/dst storage is StorageType.Register) copy functions. Can also be target-schedule specific, or @@ -336,7 +354,7 @@ def register_copy_dispatcher(self, src_storage, dst_storage, dst_schedule, func, self._copy_dispatchers[dispatcher].append((predicate, func)) - def get_state_dispatcher(self, sdfg, state): + def get_state_dispatcher(self, sdfg: SDFG, state: SDFGState) -> target.TargetCodeGenerator: # Check if the state satisfies any predicates that delegate to a # specific code generator satisfied_dispatchers = [ @@ -351,22 +369,23 @@ def get_state_dispatcher(self, sdfg, state): return self._generic_state_dispatcher - def dispatch_state(self, sdfg, state, function_stream, callsite_stream): + def dispatch_state(self, state: SDFGState, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Dispatches a code generator for an SDFG state. 
""" self.defined_vars.enter_scope(state) - disp = self.get_state_dispatcher(sdfg, state) - disp.generate_state(sdfg, state, function_stream, callsite_stream) + disp = self.get_state_dispatcher(state.sdfg, state) + disp.generate_state(state.sdfg, state.parent_graph, state, function_stream, callsite_stream) self.defined_vars.exit_scope(state) def dispatch_subgraph(self, - sdfg, - dfg, - state_id, - function_stream, - callsite_stream, - skip_entry_node=False, - skip_exit_node=False): + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + skip_entry_node: bool = False, + skip_exit_node: bool = False): """ Dispatches a code generator for a scope subgraph of an `SDFGState`. """ @@ -388,16 +407,18 @@ def dispatch_subgraph(self, continue if isinstance(v, nodes.MapEntry): - scope_subgraph = sdfg.node(state_id).scope_subgraph(v) + state = cfg.state(state_id) + scope_subgraph = state.scope_subgraph(v) - self.dispatch_scope(v.map.schedule, sdfg, scope_subgraph, state_id, function_stream, callsite_stream) + self.dispatch_scope(v.map.schedule, sdfg, cfg, scope_subgraph, state_id, function_stream, + callsite_stream) # Skip scope subgraph nodes nodes_to_skip.update(scope_subgraph.nodes()) else: - self.dispatch_node(sdfg, dfg, state_id, v, function_stream, callsite_stream) + self.dispatch_node(sdfg, cfg, dfg, state_id, v, function_stream, callsite_stream) - def get_node_dispatcher(self, sdfg, state, node): + def get_node_dispatcher(self, sdfg: SDFG, state: SDFGState, node: nodes.Node): satisfied_dispatchers = [dispatcher for pred, dispatcher in self._node_dispatchers if pred(sdfg, state, node)] num_satisfied = len(satisfied_dispatchers) if num_satisfied > 1: @@ -409,7 +430,8 @@ def get_node_dispatcher(self, sdfg, state, node): # Otherwise use the generic code generator return self._generic_node_dispatcher - def dispatch_node(self, sdfg, dfg, state_id, node, function_stream, 
callsite_stream): + def dispatch_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Dispatches a code generator for a single node. """ # If this node depends on any environments, register this for @@ -419,29 +441,38 @@ def dispatch_node(self, sdfg, dfg, state_id, node, function_stream, callsite_str # Check if the node satisfies any predicates that delegate to a # specific code generator - state = sdfg.node(state_id) + state = cfg.state(state_id) disp = self.get_node_dispatcher(sdfg, state, node) self._used_targets.add(disp) - disp.generate_node(sdfg, dfg, state_id, node, function_stream, callsite_stream) + disp.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - def get_scope_dispatcher(self, schedule): + def get_scope_dispatcher(self, schedule: dtypes.ScheduleType) -> target.TargetCodeGenerator: return self._map_dispatchers[schedule] - def dispatch_scope(self, map_schedule, sdfg, sub_dfg, state_id, function_stream, callsite_stream): + def dispatch_scope(self, + map_schedule: dtypes.ScheduleType, + sdfg: SDFG, + cfg: ControlFlowRegion, + sub_dfg: StateSubgraphView, + state_id: int, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: """ Dispatches a code generator function for a scope in an SDFG state. 
""" entry_node = sub_dfg.source_nodes()[0] self.defined_vars.enter_scope(entry_node) self._used_targets.add(self._map_dispatchers[map_schedule]) - self._map_dispatchers[map_schedule].generate_scope(sdfg, sub_dfg, state_id, function_stream, callsite_stream) + self._map_dispatchers[map_schedule].generate_scope(sdfg, cfg, sub_dfg, state_id, function_stream, + callsite_stream) self.defined_vars.exit_scope(entry_node) - def get_array_dispatcher(self, storage: dtypes.StorageType): + def get_array_dispatcher(self, storage: dtypes.StorageType) -> target.TargetCodeGenerator: return self._array_dispatchers[storage] def dispatch_allocate(self, sdfg: SDFG, + cfg: ControlFlowRegion, dfg: ScopeSubgraphView, state_id: int, node: nodes.AccessNode, @@ -449,7 +480,7 @@ def dispatch_allocate(self, function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream, declare: bool = True, - allocate: bool = True): + allocate: bool = True) -> None: """ Dispatches a code generator for data allocation. 
""" self._used_targets.add(self._array_dispatchers[datadesc.storage]) @@ -463,16 +494,16 @@ def dispatch_allocate(self, declaration_stream = callsite_stream if declare and not allocate: - self._array_dispatchers[datadesc.storage].declare_array(sdfg, dfg, state_id, node, datadesc, + self._array_dispatchers[datadesc.storage].declare_array(sdfg, cfg, dfg, state_id, node, datadesc, function_stream, declaration_stream) elif allocate: - self._array_dispatchers[datadesc.storage].allocate_array(sdfg, dfg, state_id, node, datadesc, + self._array_dispatchers[datadesc.storage].allocate_array(sdfg, cfg, dfg, state_id, node, datadesc, function_stream, declaration_stream, callsite_stream) - def dispatch_deallocate(self, sdfg: SDFG, dfg: ScopeSubgraphView, state_id: int, node: nodes.AccessNode, - datadesc: dt.Data, function_stream: prettycode.CodeIOStream, - callsite_stream: prettycode.CodeIOStream): + def dispatch_deallocate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: ScopeSubgraphView, state_id: int, + node: nodes.AccessNode, datadesc: dt.Data, function_stream: prettycode.CodeIOStream, + callsite_stream: prettycode.CodeIOStream) -> None: """ Dispatches a code generator for a data deallocation. 
""" self._used_targets.add(self._array_dispatchers[datadesc.storage]) @@ -481,11 +512,14 @@ def dispatch_deallocate(self, sdfg: SDFG, dfg: ScopeSubgraphView, state_id: int, elif datadesc.lifetime == dtypes.AllocationLifetime.External: return - self._array_dispatchers[datadesc.storage].deallocate_array(sdfg, dfg, state_id, node, datadesc, function_stream, - callsite_stream) + self._array_dispatchers[datadesc.storage].deallocate_array(sdfg, cfg, dfg, state_id, node, datadesc, + function_stream, callsite_stream) # Dispatches copy code for a memlet - def get_copy_dispatcher(self, src_node, dst_node, edge, sdfg, state): + def get_copy_dispatcher(self, src_node: Union[nodes.CodeNode, nodes.AccessNode], + dst_node: Union[nodes.CodeNode, nodes.AccessNode, nodes.EntryNode], + edge: MultiConnectorEdge[Memlet], + sdfg: SDFG, state: SDFGState) -> Optional[target.TargetCodeGenerator]: """ (Internal) Returns a code generator that should be dispatched for a memory copy operation. @@ -560,25 +594,29 @@ def get_copy_dispatcher(self, src_node, dst_node, edge, sdfg, state): return target - def dispatch_copy(self, src_node, dst_node, edge, sdfg, dfg, state_id, function_stream, output_stream): + def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, + cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, + output_stream: CodeIOStream) -> None: """ Dispatches a code generator for a memory copy operation. 
""" - state = sdfg.node(state_id) + state = cfg.state(state_id) target = self.get_copy_dispatcher(src_node, dst_node, edge, sdfg, state) if target is None: return # Dispatch copy self._used_targets.add(target) - target.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) + target.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) # Dispatches definition code for a memlet that is outgoing from a tasklet - def dispatch_output_definition(self, src_node, dst_node, edge, sdfg, dfg, state_id, function_stream, output_stream): + def dispatch_output_definition(self, src_node: nodes.Node, dst_node: nodes.Node, edge, sdfg: SDFG, + cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + function_stream: CodeIOStream, output_stream: CodeIOStream) -> None: """ Dispatches a code generator for an output memlet definition in a tasklet. """ - state = sdfg.node(state_id) + state = cfg.state(state_id) target = self.get_copy_dispatcher(src_node, dst_node, edge, sdfg, state) # Dispatch self._used_targets.add(target) - target.define_out_memlet(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) + target.define_out_memlet(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py index d6fc21f305..cfd5a1cbb3 100644 --- a/dace/codegen/instrumentation/gpu_events.py +++ b/dace/codegen/instrumentation/gpu_events.py @@ -1,8 +1,12 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Union from dace import config, dtypes, registry +from dace.codegen.prettycode import CodeIOStream from dace.sdfg import nodes, is_devicelevel_gpu from dace.codegen import common from dace.codegen.instrumentation.provider import InstrumentationProvider +from dace.sdfg.sdfg import SDFG +from dace.sdfg.state import SDFGState @registry.autoregister_params(type=dtypes.InstrumentationType.GPU_Events) @@ -12,7 +16,7 @@ def __init__(self): self.backend = common.get_gpu_backend() super().__init__() - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): + def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream, codegen) -> None: if self.backend == 'cuda': header_name = 'cuda_runtime.h' elif self.backend == 'hip': @@ -27,7 +31,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): sdfg.append_global_code('\n#include ', None) sdfg.append_global_code('\n#include <%s>' % header_name, None) - def _get_sobj(self, node): + def _get_sobj(self, node: Union[nodes.EntryNode, nodes.ExitNode]): # Get object behind scope if hasattr(node, 'consume'): return node.consume @@ -49,13 +53,13 @@ def _record_event(self, id, stream): streamstr = f'__state->gpu_context->streams[{stream}]' return '%sEventRecord(__dace_ev_%s, %s);' % (self.backend, id, streamstr) - def _report(self, timer_name: str, sdfg=None, state=None, node=None): + def _report(self, timer_name: str, sdfg: SDFG = None, state: SDFGState = None, node: nodes.Node = None): idstr = self._idstr(sdfg, state, node) state_id = -1 node_id = -1 if state is not None: - state_id = sdfg.node_id(state) + state_id = state.block_id if node is not None: node_id = state.node_id(node) @@ -74,8 +78,9 @@ def _report(self, timer_name: str, sdfg=None, state=None, node=None): node_id=node_id) # Code generation hooks - def on_state_begin(self, sdfg, state, local_stream, global_stream): - state_id = sdfg.node_id(state) + def on_state_begin(self, sdfg: SDFG, state: 
SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: + state_id = state.parent_graph.node_id(state) # Create GPU events for each instrumented scope in the state for node in state.nodes(): if isinstance(node, (nodes.CodeNode, nodes.EntryNode)): @@ -93,8 +98,9 @@ def on_state_begin(self, sdfg, state, local_stream, global_stream): idstr = 'e' + self._idstr(sdfg, state, None) local_stream.write(self._create_event(idstr), sdfg, state_id) - def on_state_end(self, sdfg, state, local_stream, global_stream): - state_id = sdfg.node_id(state) + def on_state_end(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: + state_id = state.parent_graph.node_id(state) # Record and measure state stream event if state.instrument == dtypes.InstrumentationType.GPU_Events: idstr = self._idstr(sdfg, state, None) @@ -112,8 +118,9 @@ def on_state_end(self, sdfg, state, local_stream, global_stream): local_stream.write(self._destroy_event('b' + idstr), sdfg, state_id, node) local_stream.write(self._destroy_event('e' + idstr), sdfg, state_id, node) - def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_stream): - state_id = sdfg.node_id(state) + def on_scope_entry(self, sdfg: SDFG, state: SDFGState, node: nodes.EntryNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + state_id = state.parent_graph.node_id(state) s = self._get_sobj(node) if s.instrument == dtypes.InstrumentationType.GPU_Events: if s.schedule != dtypes.ScheduleType.GPU_Device: @@ -123,8 +130,9 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s stream = getattr(node, '_cuda_stream', -1) outer_stream.write(self._record_event(idstr, stream), sdfg, state_id, node) - def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_stream): - state_id = sdfg.node_id(state) + def on_scope_exit(self, sdfg: SDFG, state: SDFGState, node: 
nodes.ExitNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: + state_id = state.parent_graph.node_id(state) entry_node = state.entry_node(node) s = self._get_sobj(node) if s.instrument == dtypes.InstrumentationType.GPU_Events: @@ -134,24 +142,26 @@ def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_st outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), sdfg, state, entry_node), sdfg, state_id, node) - def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_node_begin(self, sdfg: SDFG, state: SDFGState, node: nodes.Node, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: if (not isinstance(node, nodes.CodeNode) or is_devicelevel_gpu(sdfg, state, node)): return # Only run for host nodes # TODO(later): Implement "clock64"-based GPU counters if node.instrument == dtypes.InstrumentationType.GPU_Events: - state_id = sdfg.node_id(state) + state_id = state.parent_graph.node_id(state) idstr = 'b' + self._idstr(sdfg, state, node) stream = getattr(node, '_cuda_stream', -1) outer_stream.write(self._record_event(idstr, stream), sdfg, state_id, node) - def on_node_end(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.Node, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: if (not isinstance(node, nodes.Tasklet) or is_devicelevel_gpu(sdfg, state, node)): return # Only run for host nodes # TODO(later): Implement "clock64"-based GPU counters if node.instrument == dtypes.InstrumentationType.GPU_Events: - state_id = sdfg.node_id(state) + state_id = state.parent_graph.node_id(state) idstr = 'e' + self._idstr(sdfg, state, node) stream = getattr(node, '_cuda_stream', -1) outer_stream.write(self._record_event(idstr, stream), sdfg, state_id, node) diff --git 
a/dace/codegen/instrumentation/likwid.py b/dace/codegen/instrumentation/likwid.py index efbd6da934..8d1c9e3b71 100644 --- a/dace/codegen/instrumentation/likwid.py +++ b/dace/codegen/instrumentation/likwid.py @@ -11,7 +11,11 @@ from dace import dtypes, registry, library from dace.codegen.instrumentation.provider import InstrumentationProvider +from dace.codegen.prettycode import CodeIOStream from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.sdfg import SDFG +from dace.sdfg.state import SDFGState from dace.transformation import helpers as xfh @@ -80,7 +84,7 @@ def __init__(self): except KeyError: self._default_events = "CLOCK" - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): + def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream, codegen) -> None: if sdfg.parent is not None: return @@ -209,13 +213,14 @@ def on_sdfg_end(self, sdfg, local_stream, global_stream): ''' self.codegen._exitcode.write(exit_code, sdfg) - def on_state_begin(self, sdfg, state, local_stream, global_stream): + def on_state_begin(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: if not self._likwid_used: return if state.instrument == dace.InstrumentationType.LIKWID_CPU: - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = -1 region = f"state_{cfg_id}_{state_id}_{node_id}" self._regions.append((region, cfg_id, state_id, node_id)) @@ -245,13 +250,14 @@ def on_state_begin(self, sdfg, state, local_stream, global_stream): ''' local_stream.write(marker_code) - def on_state_end(self, sdfg, state, local_stream, global_stream): + def on_state_end(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: if not self._likwid_used: return if state.instrument == dace.InstrumentationType.LIKWID_CPU: - cfg_id = sdfg.cfg_id - state_id = 
sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = -1 region = f"state_{cfg_id}_{state_id}_{node_id}" @@ -263,7 +269,8 @@ def on_state_end(self, sdfg, state, local_stream, global_stream): ''' local_stream.write(marker_code) - def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_entry(self, sdfg: SDFG, state: SDFGState, node: nodes.EntryNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: if not self._likwid_used or node.instrument != dace.InstrumentationType.LIKWID_CPU: return @@ -272,8 +279,8 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s elif node.schedule not in LIKWIDInstrumentationCPU.perf_whitelist_schedules: raise TypeError("Unsupported schedule on scope") - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = state.node_id(node) region = f"scope_{cfg_id}_{state_id}_{node_id}" @@ -289,13 +296,14 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s ''' outer_stream.write(marker_code) - def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_exit(self, sdfg: SDFG, state: SDFGState, node: nodes.ExitNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: entry_node = state.entry_node(node) if not self._likwid_used or entry_node.instrument != dace.InstrumentationType.LIKWID_CPU: return - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = state.node_id(entry_node) region = f"scope_{cfg_id}_{state_id}_{node_id}" @@ -325,7 +333,7 @@ def __init__(self): except KeyError: self._default_events = "FLOPS_SP" - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): + def on_sdfg_begin(self, sdfg: SDFG, 
local_stream: CodeIOStream, global_stream: CodeIOStream, codegen) -> None: if sdfg.parent is not None: return @@ -362,7 +370,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): ''' codegen._initcode.write(init_code) - def on_sdfg_end(self, sdfg, local_stream, global_stream): + def on_sdfg_end(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: if not self._likwid_used or sdfg.parent is not None: return @@ -397,13 +405,14 @@ def on_sdfg_end(self, sdfg, local_stream, global_stream): ''' self.codegen._exitcode.write(exit_code, sdfg) - def on_state_begin(self, sdfg, state, local_stream, global_stream): + def on_state_begin(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: if not self._likwid_used: return if state.instrument == dace.InstrumentationType.LIKWID_GPU: - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = -1 region = f"state_{cfg_id}_{state_id}_{node_id}" self._regions.append((region, cfg_id, state_id, node_id)) @@ -419,13 +428,14 @@ def on_state_begin(self, sdfg, state, local_stream, global_stream): ''' local_stream.write(marker_code) - def on_state_end(self, sdfg, state, local_stream, global_stream): + def on_state_end(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: if not self._likwid_used: return if state.instrument == dace.InstrumentationType.LIKWID_GPU: - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = -1 region = f"state_{cfg_id}_{state_id}_{node_id}" @@ -434,7 +444,8 @@ def on_state_end(self, sdfg, state, local_stream, global_stream): ''' local_stream.write(marker_code) - def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_entry(self, sdfg: SDFG, state: SDFGState, node: 
nodes.EntryNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: if not self._likwid_used or node.instrument != dace.InstrumentationType.LIKWID_GPU: return @@ -443,8 +454,8 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s elif node.schedule not in LIKWIDInstrumentationGPU.perf_whitelist_schedules: raise TypeError("Unsupported schedule on scope") - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = state.node_id(node) region = f"scope_{cfg_id}_{state_id}_{node_id}" @@ -460,13 +471,14 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s ''' outer_stream.write(marker_code) - def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_exit(self, sdfg: SDFG, state: SDFGState, node: nodes.ExitNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: entry_node = state.entry_node(node) if not self._likwid_used or entry_node.instrument != dace.InstrumentationType.LIKWID_GPU: return - cfg_id = sdfg.cfg_id - state_id = sdfg.node_id(state) + cfg_id = state.parent_graph.cfg_id + state_id = state.block_id node_id = state.node_id(entry_node) region = f"scope_{cfg_id}_{state_id}_{node_id}" diff --git a/dace/codegen/instrumentation/provider.py b/dace/codegen/instrumentation/provider.py index d05e8b001d..a3748b241b 100644 --- a/dace/codegen/instrumentation/provider.py +++ b/dace/codegen/instrumentation/provider.py @@ -1,13 +1,20 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+from dace.codegen.prettycode import CodeIOStream from dace.dtypes import DataInstrumentationType, InstrumentationType from dace.registry import make_registry from typing import Dict, Type, Union +from dace.memlet import Memlet +from dace.sdfg import nodes, SDFG +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, SDFGState + @make_registry class InstrumentationProvider(object): """ Instrumentation provider for SDFGs, states, scopes, and memlets. Emits code on event. """ + @staticmethod def get_provider_mapping( ) -> Dict[Union[InstrumentationType, DataInstrumentationType], Type['InstrumentationProvider']]: @@ -25,16 +32,16 @@ class types, given the currently-registered extensions of this class. return result - def _idstr(self, sdfg, state, node): + def _idstr(self, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node) -> str: """ Returns a unique identifier string from a node or state. """ - result = str(sdfg.cfg_id) + result = str(cfg.cfg_id) if state is not None: - result += '_' + str(sdfg.node_id(state)) + result += '_' + str(cfg.node_id(state)) if node is not None: result += '_' + str(state.node_id(node)) return result - def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): + def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream, codegen) -> None: """ Event called at the beginning of SDFG code generation. :param sdfg: The generated SDFG object. @@ -44,7 +51,7 @@ def on_sdfg_begin(self, sdfg, local_stream, global_stream, codegen): """ pass - def on_sdfg_end(self, sdfg, local_stream, global_stream): + def on_sdfg_end(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the end of SDFG code generation. :param sdfg: The generated SDFG object. 
@@ -53,7 +60,8 @@ def on_sdfg_end(self, sdfg, local_stream, global_stream): """ pass - def on_state_begin(self, sdfg, state, local_stream, global_stream): + def on_state_begin(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: """ Event called at the beginning of SDFG state code generation. :param sdfg: The generated SDFG object. @@ -63,7 +71,8 @@ def on_state_begin(self, sdfg, state, local_stream, global_stream): """ pass - def on_state_end(self, sdfg, state, local_stream, global_stream): + def on_state_end(self, sdfg: SDFG, state: SDFGState, local_stream: CodeIOStream, + global_stream: CodeIOStream) -> None: """ Event called at the end of SDFG state code generation. :param sdfg: The generated SDFG object. @@ -73,7 +82,8 @@ def on_state_end(self, sdfg, state, local_stream, global_stream): """ pass - def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_entry(self, sdfg: SDFG, state: SDFGState, node: nodes.EntryNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the beginning of a scope (on generating an EntryNode). @@ -88,7 +98,8 @@ def on_scope_entry(self, sdfg, state, node, outer_stream, inner_stream, global_s """ pass - def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_scope_exit(self, sdfg: SDFG, state: SDFGState, node: nodes.ExitNode, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the end of a scope (on generating an ExitNode). :param sdfg: The generated SDFG object. 
@@ -102,8 +113,9 @@ def on_scope_exit(self, sdfg, state, node, outer_stream, inner_stream, global_st """ pass - def on_copy_begin(self, sdfg, state, src_node, dst_node, edge, local_stream, global_stream, copy_shape, src_strides, - dst_strides): + def on_copy_begin(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[Memlet], local_stream: CodeIOStream, global_stream: CodeIOStream, + copy_shape, src_strides, dst_strides) -> None: """ Event called at the beginning of generating a copy operation. :param sdfg: The generated SDFG object. @@ -119,7 +131,8 @@ def on_copy_begin(self, sdfg, state, src_node, dst_node, edge, local_stream, glo """ pass - def on_copy_end(self, sdfg, state, src_node, dst_node, edge, local_stream, global_stream): + def on_copy_end(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[Memlet], local_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the end of generating a copy operation. :param sdfg: The generated SDFG object. @@ -132,7 +145,8 @@ def on_copy_end(self, sdfg, state, src_node, dst_node, edge, local_stream, globa """ pass - def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_node_begin(self, sdfg: SDFG, state: SDFGState, node: nodes.Node, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the beginning of generating a node. :param sdfg: The generated SDFG object. @@ -146,7 +160,8 @@ def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream, global_st """ pass - def on_node_end(self, sdfg, state, node, outer_stream, inner_stream, global_stream): + def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.Node, outer_stream: CodeIOStream, + inner_stream: CodeIOStream, global_stream: CodeIOStream) -> None: """ Event called at the end of generating a node. 
:param sdfg: The generated SDFG object. diff --git a/dace/codegen/prettycode.py b/dace/codegen/prettycode.py index 72096ca819..de143f5e86 100644 --- a/dace/codegen/prettycode.py +++ b/dace/codegen/prettycode.py @@ -6,6 +6,7 @@ from six import StringIO from dace.config import Config from dace.sdfg.graph import NodeNotFoundError +from dace.sdfg.state import ControlFlowRegion, SDFGState class CodeIOStream(StringIO): @@ -17,7 +18,7 @@ def __init__(self, base_indentation=0): self._spaces = int(Config.get('compiler', 'indentation_spaces')) self._lineinfo = Config.get_bool('compiler', 'codegen_lineinfo') - def write(self, contents, sdfg=None, state_id=None, node_id=None): + def write(self, contents, cfg: ControlFlowRegion = None, state_id: int = None, node_id: int = None) -> None: # Delete single trailing newline, as this will be implicitly inserted # anyway if contents: @@ -29,8 +30,8 @@ def write(self, contents, sdfg=None, state_id=None, node_id=None): lines = contents # If SDFG/state/node location is given, annotate this line - if sdfg is not None: - location_identifier = ' ////__DACE:%d' % sdfg.cfg_id + if cfg is not None: + location_identifier = ' ////__DACE:%d' % cfg.cfg_id if state_id is not None: location_identifier += ':' + str(state_id) if node_id is not None: @@ -39,7 +40,8 @@ def write(self, contents, sdfg=None, state_id=None, node_id=None): for i, nid in enumerate(node_id): if not isinstance(nid, int): try: - node_id[i] = sdfg.node(state_id).node_id(nid) + state = cfg.state(state_id) + node_id[i] = state.node_id(nid) except NodeNotFoundError: node_id[i] = -1 location_identifier += ':' + ','.join([str(nid) for nid in node_id]) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index db00dccb19..c34c829c31 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -21,6 +21,7 @@ from dace.codegen import common, cppunparse from dace.codegen.common import (sym2cpp, find_incoming_edges, codeblock_to_cpp) from 
dace.codegen.dispatcher import DefinedType +from dace.codegen.prettycode import CodeIOStream from dace.config import Config from dace.frontend import operations from dace.frontend.python import astutils @@ -29,6 +30,7 @@ from dace.properties import LambdaProperty from dace.sdfg import SDFG, is_devicelevel_gpu, SDFGState from dace.codegen.targets import fpga +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView if TYPE_CHECKING: from dace.codegen.dispatcher import TargetDispatcher @@ -267,7 +269,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: return name -def emit_memlet_reference(dispatcher, +def emit_memlet_reference(dispatcher: 'TargetDispatcher', sdfg: SDFG, memlet: mmlt.Memlet, pointer_name: str, @@ -858,13 +860,13 @@ def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG): return False -def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, locals, ldepth, toplevel_schedule, +def unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, callsite_stream, locals, ldepth, toplevel_schedule, codegen): if node.label is None or node.label == "": return "" - state_dfg = sdfg.nodes()[state_id] + state_dfg = cfg.state(state_id) # Not [], "" or None if not node.code: @@ -874,11 +876,11 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, if node.code_global and node.code_global.code: function_stream.write( codeblock_to_cpp(node.code_global), - sdfg, + cfg, state_id, node, ) - function_stream.write("\n", sdfg, state_id, node) + function_stream.write("\n", cfg, state_id, node) # add node state_fields to the statestruct codegen._frame.statestruct.extend(node.state_fields) @@ -894,14 +896,14 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, callsite_stream.write( 'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = __state->gpu_context->streams[__dace_current_stream_id];' % 
(node._cuda_stream, common.get_gpu_backend()), - sdfg, + cfg, state_id, node, ) else: callsite_stream.write( '%sStream_t __dace_current_stream = nullptr;' % common.get_gpu_backend(), - sdfg, + cfg, state_id, node, ) @@ -914,7 +916,7 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, # Doesn't cause crashes due to missing pyMLIR if a MLIR tasklet is not present from dace.codegen.targets.mlir import utils - mlir_func_uid = "_" + str(sdfg.cfg_id) + "_" + str(state_id) + "_" + str(dfg.node_id(node)) + mlir_func_uid = "_" + str(cfg.cfg_id) + "_" + str(state_id) + "_" + str(dfg.node_id(node)) mlir_ast = utils.get_ast(node.code.code) mlir_is_generic = utils.is_generic(mlir_ast) @@ -943,7 +945,7 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, callsite_stream.write(mlir_out_name + " = mlir_entry" + mlir_func_uid + "(" + mlir_in_untyped + ");") if node.language == dtypes.Language.CPP: - callsite_stream.write(type(node).__properties__["code"].to_string(node.code), sdfg, state_id, node) + callsite_stream.write(type(node).__properties__["code"].to_string(node.code), cfg, state_id, node) if not is_devicelevel_gpu(sdfg, state_dfg, node) and hasattr(node, "_cuda_stream"): # Get GPU codegen @@ -952,7 +954,7 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, gpu_codegen = next(cg for cg in codegen._dispatcher.used_targets if isinstance(cg, cuda.CUDACodeGen)) except StopIteration: return - synchronize_streams(sdfg, state_dfg, state_id, node, node, callsite_stream, gpu_codegen) + synchronize_streams(sdfg, cfg, state_dfg, state_id, node, node, callsite_stream, gpu_codegen) return body = node.code.code @@ -989,7 +991,7 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, if connector is not None: defined_symbols.update({connector: conntype}) - callsite_stream.write("// Tasklet code (%s)\n" % node.label, sdfg, state_id, node) + callsite_stream.write("// 
Tasklet code (%s)\n" % node.label, cfg, state_id, node) for stmt in body: stmt = copy.deepcopy(stmt) rk = StructInitializer(sdfg).visit(stmt) @@ -1002,7 +1004,7 @@ def unparse_tasklet(sdfg, state_id, dfg, node, function_stream, callsite_stream, # Unparse to C++ and add 'auto' declarations if locals not declared result = StringIO() cppunparse.CPPUnparser(rk, ldepth + 1, locals, result, defined_symbols=defined_symbols) - callsite_stream.write(result.getvalue(), sdfg, state_id, node) + callsite_stream.write(result.getvalue(), cfg, state_id, node) def shape_to_strides(shape): @@ -1366,8 +1368,9 @@ def visit_Call(self, node): # TODO: This should be in the CUDA code generator. Add appropriate conditions to node dispatch predicate -def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream): - state_dfg = sdfg.nodes()[state_id] +def presynchronize_streams(sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, callsite_stream: CodeIOStream): + state_dfg: SDFGState = cfg.nodes()[state_id] if hasattr(node, "_cuda_stream") or is_devicelevel_gpu(sdfg, state_dfg, node): return for e in state_dfg.in_edges(node): @@ -1382,7 +1385,7 @@ def presynchronize_streams(sdfg, dfg, state_id, node, callsite_stream): # TODO: This should be in the CUDA code generator. 
Add appropriate conditions to node dispatch predicate -def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, codegen): +def synchronize_streams(sdfg, cfg, dfg, state_id, node, scope_exit, callsite_stream, codegen): # Post-kernel stream synchronization (with host or other streams) max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams")) if max_streams >= 0: @@ -1412,11 +1415,11 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, if isinstance(desc, data.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {sym2cpp(desc.start_offset)})' if Config.get_bool('compiler', 'cuda', 'syncdebug'): - callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', sdfg, state_id, + callsite_stream.write(f'DACE_GPU_CHECK({backend}FreeAsync({ptrname}, {cudastream}));\n', cfg, state_id, scope_exit) callsite_stream.write(f'DACE_GPU_CHECK({backend}DeviceSynchronize());') else: - callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', sdfg, state_id, scope_exit) + callsite_stream.write(f'{backend}FreeAsync({ptrname}, {cudastream});\n', cfg, state_id, scope_exit) to_remove.add((sd, name)) # Clear all released memory from tracking @@ -1444,7 +1447,7 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, dst_stream=edge.dst._cuda_stream, backend=backend, ), - sdfg, + cfg, state_id, [edge.src, edge.dst], ) @@ -1476,7 +1479,7 @@ def synchronize_streams(sdfg, dfg, state_id, node, scope_exit, callsite_stream, dst_stream=e.dst._cuda_stream, backend=backend, ), - sdfg, + cfg, state_id, [e.src, e.dst], ) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 0d153fb332..a77f8147aa 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1,13 +1,12 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
from copy import deepcopy -from dace.sdfg.state import SDFGState +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView import functools import itertools import warnings -from sympy.functions.elementary.complexes import arg - -from dace import data, dtypes, registry, memlet as mmlt, sdfg as sd, subsets, symbolic, Config +from dace import data, dtypes, registry, memlet as mmlt, subsets, symbolic, Config from dace.codegen import cppunparse, exceptions as cgx from dace.codegen.prettycode import CodeIOStream from dace.codegen.targets import cpp @@ -17,13 +16,17 @@ from dace.frontend import operations from dace.sdfg import nodes, utils as sdutils from dace.sdfg import (ScopeSubgraphView, SDFG, scope_contains_scope, is_array_stream_view, NodeNotExpandedError, - dynamic_map_inputs, local_transients) -from dace.sdfg.scope import is_devicelevel_gpu, is_devicelevel_fpga, is_in_scope + dynamic_map_inputs) +from dace.sdfg.scope import is_devicelevel_gpu, is_in_scope from dace.sdfg.validation import validate_memlet_data -from typing import Union +from typing import TYPE_CHECKING, Optional, Tuple, Union from dace.codegen.targets import fpga +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + + @registry.autoregister_params(name='cpu') class CPUCodeGen(TargetCodeGenerator): """ SDFG CPU code generator. 
""" @@ -78,7 +81,7 @@ def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''): raise TypeError("Unrecognized argument type: {t} (value {v})".format(t=type(arg_type).__name__, v=str(arg_type))) - def __init__(self, frame_codegen, sdfg): + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen self._dispatcher: TargetDispatcher = frame_codegen.dispatcher self.calling_codegen = self @@ -145,26 +148,33 @@ def has_initializer(self): def has_finalizer(self): return False - def generate_scope( - self, - sdfg: SDFG, - dfg_scope: ScopeSubgraphView, - state_id, - function_stream, - callsite_stream, - ): + def generate_scope(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, + state_id: int, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: entry_node = dfg_scope.source_nodes()[0] - cpp.presynchronize_streams(sdfg, dfg_scope, state_id, entry_node, callsite_stream) + cpp.presynchronize_streams(sdfg, cfg, dfg_scope, state_id, entry_node, callsite_stream) - self.generate_node(sdfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) + self.generate_node(sdfg, cfg, dfg_scope, state_id, entry_node, function_stream, callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, + cfg, dfg_scope, state_id, function_stream, callsite_stream, skip_entry_node=True) - def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def generate_node(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: ScopeSubgraphView, + state_id: int, + node: nodes.Node, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: # Dynamically obtain node generator according to class name try: gen = getattr(self, "_generate_" + type(node).__name__) @@ -173,14 +183,15 @@ def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_str raise NodeNotExpandedError(sdfg, state_id, dfg.node_id(node)) raise - gen(sdfg, dfg, state_id, 
node, function_stream, callsite_stream) + gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) # Mark node as "generated" self._generated_nodes.add(node) self._locals.clear_scope(self._ldepth + 1) - def allocate_view(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.AccessNode, - global_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + def allocate_view(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.AccessNode, + global_stream: CodeIOStream, declaration_stream: CodeIOStream, + allocation_stream: CodeIOStream) -> None: """ Allocates (creates pointer and refers to original) a view of an existing array, scalar, or view. @@ -202,9 +213,9 @@ def allocate_view(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.A # Allocate the viewed data before the view, if necessary mpath = dfg.memlet_path(edge) - viewed_dnode = mpath[-1].dst if is_write else mpath[0].src - self._dispatcher.dispatch_allocate(sdfg, dfg, state_id, viewed_dnode, viewed_dnode.desc(sdfg), global_stream, - allocation_stream) + viewed_dnode: nodes.AccessNode = mpath[-1].dst if is_write else mpath[0].src + self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, viewed_dnode, viewed_dnode.desc(sdfg), + global_stream, allocation_stream) # Memlet points to view, construct mirror memlet memlet = edge.data @@ -267,12 +278,12 @@ def allocate_view(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.A atype = atype[:-1] if value.startswith('&'): value = value[1:] - declaration_stream.write(f'{atype} {aname};', sdfg, state_id, node) - allocation_stream.write(f'{aname} = {value};', sdfg, state_id, node) + declaration_stream.write(f'{atype} {aname};', cfg, state_id, node) + allocation_stream.write(f'{aname} = {value};', cfg, state_id, node) - def allocate_reference(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.AccessNode, - global_stream: CodeIOStream, declaration_stream: 
CodeIOStream, - allocation_stream: CodeIOStream): + def allocate_reference(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, + node: nodes.AccessNode, global_stream: CodeIOStream, declaration_stream: CodeIOStream, + allocation_stream: CodeIOStream) -> None: name = node.data nodedesc = node.desc(sdfg) ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) @@ -281,13 +292,20 @@ def allocate_reference(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: no declared = self._dispatcher.declared_arrays.has(ptrname) if not declared: - declaration_stream.write(f'{nodedesc.dtype.ctype} *{ptrname};', sdfg, state_id, node) + declaration_stream.write(f'{nodedesc.dtype.ctype} *{ptrname};', cfg, state_id, node) ctypedef = dtypes.pointer(nodedesc.dtype).ctype self._dispatcher.declared_arrays.add(ptrname, DefinedType.Pointer, ctypedef) self._dispatcher.defined_vars.add(ptrname, DefinedType.Pointer, ctypedef) - def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream): - + def declare_array(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + node: nodes.Node, + nodedesc: data.Data, + function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: fsymbols = self._frame.symbols_and_constants(sdfg) # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). 
@@ -316,7 +334,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de ctypedef = dtypes.pointer(nodedesc.dtype).ctype - declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', sdfg, state_id, node) + declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) return elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: @@ -325,7 +343,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de function_stream.write( "{ctype} *{name} = nullptr;\n" "#pragma omp threadprivate({name})".format(ctype=nodedesc.dtype.ctype, name=name), - sdfg, + cfg, state_id, node, ) @@ -333,8 +351,10 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de else: raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - allocation_stream, allocate_nested_data: bool = True): + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream, + allocate_nested_data: bool = True) -> None: alloc_name = cpp.ptr(node.data, nodedesc, sdfg, self._frame) name = alloc_name @@ -358,7 +378,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d tmp_name = '.'.join(tokens[:i + 1]) tmp_alloc_name = cpp.ptr(tmp_name, sdfg.arrays[tmp_name], sdfg, self._frame) if not self._dispatcher.defined_vars.has(tmp_alloc_name): - self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(tmp_name), sdfg.arrays[tmp_name], + self.allocate_array(sdfg, cfg, dfg, state_id, nodes.AccessNode(tmp_name), sdfg.arrays[tmp_name], function_stream, declaration_stream, allocation_stream, 
allocate_nested_data=False) declared = True @@ -386,19 +406,20 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer self._dispatcher.declared_arrays.add(f"{name}->{k}", defined_type, ctypedef) - self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream, - declaration_stream, allocation_stream) + self.allocate_array(sdfg, cfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, + function_stream, declaration_stream, allocation_stream) return if isinstance(nodedesc, data.View): - return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) + return self.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, + allocation_stream) if isinstance(nodedesc, data.Reference): - return self.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, + return self.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) if isinstance(nodedesc, data.Scalar): if node.setzero: - declaration_stream.write("%s %s = 0;\n" % (nodedesc.dtype.ctype, name), sdfg, state_id, node) + declaration_stream.write("%s %s = 0;\n" % (nodedesc.dtype.ctype, name), cfg, state_id, node) else: - declaration_stream.write("%s %s;\n" % (nodedesc.dtype.ctype, name), sdfg, state_id, node) + declaration_stream.write("%s %s;\n" % (nodedesc.dtype.ctype, name), cfg, state_id, node) define_var(name, DefinedType.Scalar, nodedesc.dtype.ctype) elif isinstance(nodedesc, data.Stream): ################################################################### @@ -409,14 +430,14 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d raise SyntaxError("Stream-view of array may not be defined in more than one 
state") arrnode = sdfg.arrays[nodedesc.sink] - state = sdfg.nodes()[state_id] + state: SDFGState = cfg.nodes()[state_id] edges = state.out_edges(node) if len(edges) > 1: raise NotImplementedError("Cannot handle streams writing to multiple arrays.") memlet_path = state.memlet_path(edges[0]) # Allocate the array before its stream view, if necessary - self.allocate_array(sdfg, dfg, state_id, memlet_path[-1].dst, memlet_path[-1].dst.desc(sdfg), + self.allocate_array(sdfg, cfg, dfg, state_id, memlet_path[-1].dst, memlet_path[-1].dst.desc(sdfg), function_stream, declaration_stream, allocation_stream) array_expr = cpp.copy_expr(self._dispatcher, @@ -431,7 +452,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d ctype = 'dace::ArrayStreamView%s<%s>' % (threadlocal, arrnode.dtype.ctype) declaration_stream.write( "%s %s (%s);\n" % (ctype, name, array_expr), - sdfg, + cfg, state_id, node, ) @@ -448,7 +469,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d else: definition = "{} {};".format(ctypedef, name) - declaration_stream.write(definition, sdfg, state_id, node) + declaration_stream.write(definition, cfg, state_id, node) define_var(name, DefinedType.Stream, ctypedef) elif (nodedesc.storage == dtypes.StorageType.CPU_Heap @@ -471,9 +492,9 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d ctypedef = dtypes.pointer(nodedesc.dtype).ctype if not declared: - declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', sdfg, state_id, node) + declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) allocation_stream.write( - "%s = new %s DACE_ALIGN(64)[%s];\n" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize)), sdfg, + "%s = new %s DACE_ALIGN(64)[%s];\n" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize)), cfg, state_id, node) define_var(name, DefinedType.Pointer, ctypedef) @@ -481,7 +502,7 @@ def allocate_array(self, sdfg, dfg, 
state_id, node, nodedesc, function_stream, d allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize))) if nodedesc.start_offset != 0: - allocation_stream.write(f'{alloc_name} += {cpp.sym2cpp(nodedesc.start_offset)};\n', sdfg, state_id, + allocation_stream.write(f'{alloc_name} += {cpp.sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) return @@ -492,7 +513,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if node.setzero: declaration_stream.write( "%s %s[%s] DACE_ALIGN(64) = {0};\n" % (nodedesc.dtype.ctype, name, cpp.sym2cpp(arrsize)), - sdfg, + cfg, state_id, node, ) @@ -500,7 +521,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d return declaration_stream.write( "%s %s[%s] DACE_ALIGN(64);\n" % (nodedesc.dtype.ctype, name, cpp.sym2cpp(arrsize)), - sdfg, + cfg, state_id, node, ) @@ -512,7 +533,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d if not declared: function_stream.write( "{ctype} *{name};\n#pragma omp threadprivate({name})".format(ctype=nodedesc.dtype.ctype, name=name), - sdfg, + cfg, state_id, node, ) @@ -526,7 +547,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d {name} = new {ctype} DACE_ALIGN(64)[{arrsize}];""".format(ctype=nodedesc.dtype.ctype, name=alloc_name, arrsize=cpp.sym2cpp(arrsize)), - sdfg, + cfg, state_id, node, ) @@ -534,7 +555,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize))) if nodedesc.start_offset != 0: - allocation_stream.write(f'{alloc_name} += {cpp.sym2cpp(nodedesc.start_offset)};\n', sdfg, state_id, + allocation_stream.write(f'{alloc_name} += {cpp.sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) # Close OpenMP parallel section @@ -543,7 +564,9 @@ def 
allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d else: raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream): + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: arrsize = nodedesc.total_size alloc_name = cpp.ptr(node.data, nodedesc, sdfg, self._frame) if isinstance(nodedesc, data.Array) and nodedesc.start_offset != 0: @@ -558,7 +581,7 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, return elif (nodedesc.storage == dtypes.StorageType.CPU_Heap or (nodedesc.storage == dtypes.StorageType.Register and symbolic.issymbolic(arrsize, sdfg.constants))): - callsite_stream.write("delete[] %s;\n" % alloc_name, sdfg, state_id, node) + callsite_stream.write("delete[] %s;\n" % alloc_name, cfg, state_id, node) elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: # Deallocate in each OpenMP thread callsite_stream.write( @@ -566,7 +589,7 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, {{ delete[] {name}; }}""".format(name=alloc_name), - sdfg, + cfg, state_id, node, ) @@ -575,15 +598,16 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, def copy_memory( self, - sdfg, - dfg, - state_id, - src_node, - dst_node, - edge, - function_stream, - callsite_stream, - ): + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], + dst_node: Union[nodes.Tasklet, nodes.AccessNode], + edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], mmlt.Memlet], + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ) -> None: if isinstance(src_node, nodes.Tasklet): src_storage 
= dtypes.StorageType.Register try: @@ -605,11 +629,12 @@ def copy_memory( dst_parent = None dst_schedule = None if dst_parent is None else dst_parent.map.schedule - state_dfg = sdfg.node(state_id) + state_dfg = cfg.node(state_id) # Emit actual copy self._emit_copy( sdfg, + cfg, state_id, src_node, src_storage, @@ -624,17 +649,18 @@ def copy_memory( def _emit_copy( self, - sdfg, - state_id, - src_node, - src_storage, - dst_node, - dst_storage, - dst_schedule, - edge, - dfg, - stream, - ): + sdfg: SDFG, + cfg: ControlFlowRegion, + state_id: int, + src_node: nodes.Node, + src_storage: dtypes.StorageType, + dst_node: nodes.Node, + dst_storage: dtypes.StorageType, + dst_schedule: dtypes.ScheduleType, + edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], mmlt.Memlet], + dfg: StateSubgraphView, + stream: CodeIOStream, + ) -> None: u, uconn, v, vconn, memlet = edge orig_vconn = vconn @@ -656,7 +682,7 @@ def _emit_copy( # Copy into tasklet stream.write( " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn]), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -665,7 +691,7 @@ def _emit_copy( # Copy out of tasklet stream.write( " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn]), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -687,7 +713,7 @@ def _emit_copy( defined_type, _ = self._dispatcher.defined_vars.get(srcptr) stream.write( "%s = %s;" % (vconn, cpp.cpp_ptr_expr(sdfg, memlet, defined_type)), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -723,7 +749,7 @@ def _emit_copy( self._frame), aexpr=array_expr, maxsize=cpp.sym2cpp(array_subset.num_elements())), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -734,7 +760,7 @@ def _emit_copy( stream.write( "{s}.push({arr});".format(s=cpp.ptr(dst_node.data, dst_nodedesc, sdfg, self._frame), arr=cpp.ptr(src_node.data, src_nodedesc, sdfg, self._frame)), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -743,7 +769,7 @@ def _emit_copy( 
"{s}.push({arr});".format(s=cpp.ptr(dst_node.data, dst_nodedesc, sdfg, self._frame), arr=cpp.ptr(src_nodedesc.src, sdfg.arrays[src_nodedesc.src], sdfg, self._frame)), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -754,7 +780,7 @@ def _emit_copy( arr=cpp.ptr(src_node.data, src_nodedesc, sdfg, self._frame), size=copysize), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -765,7 +791,7 @@ def _emit_copy( ############################################# - state_dfg = sdfg.nodes()[state_id] + state_dfg: SDFGState = cfg.nodes()[state_id] copy_shape, src_strides, dst_strides, src_expr, dst_expr = cpp.memlet_copy_to_absolute_strides( self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._packed_types) @@ -840,7 +866,7 @@ def _emit_copy( copy_func="Copy" if memlet.wcr is None else "Accumulate", copy_args=", ".join(copy_args), ), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -856,7 +882,7 @@ def _emit_copy( shape_tmpl=shape_tmpl, copy_args=", ".join(copy_args), ), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -864,7 +890,7 @@ def _emit_copy( dst_expr = self.memlet_view_ctor(sdfg, memlet, dst_nodedesc.dtype, True) stream.write( self.write_and_resolve_expr( - sdfg, memlet, nc, dst_expr, '*(' + src_expr + ')', dtype=dst_nodedesc.dtype) + ';', sdfg, + sdfg, memlet, nc, dst_expr, '*(' + src_expr + ')', dtype=dst_nodedesc.dtype) + ';', cfg, state_id, [src_node, dst_node]) else: warnings.warn('Minor performance warning: Emitting statically-' @@ -877,7 +903,7 @@ def _emit_copy( shape_tmpl=shape_tmpl, copy_args=", ".join(copy_args), ), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -892,7 +918,8 @@ def _emit_copy( ########################################################################### # Memlet handling - def write_and_resolve_expr(self, sdfg, memlet, nc, outname, inname, indices=None, dtype=None): + def write_and_resolve_expr(self, sdfg: SDFG, memlet: mmlt.Memlet, nc: bool, outname: str, inname: str, + indices=None, dtype=None): """ Emits a 
conflict resolution call from a memlet. """ @@ -935,18 +962,20 @@ def write_and_resolve_expr(self, sdfg, memlet, nc, outname, inname, indices=None return (f'dace::wcr_custom<{dtype.ctype}>:: template {func}({custom_reduction}, {ptr}, {inname})') def process_out_memlets(self, - sdfg, - state_id, - node, - dfg, - dispatcher, - result, - locals_defined, - function_stream, - skip_wcr=False, - codegen=None): - codegen = codegen or self - scope_dict = sdfg.nodes()[state_id].scope_dict() + sdfg: SDFG, + cfg: ControlFlowRegion, + state_id: int, + node: nodes.Node, + dfg: StateSubgraphView, + dispatcher: TargetDispatcher, + result: CodeIOStream, + locals_defined: bool, + function_stream: CodeIOStream, + skip_wcr: bool = False, + codegen: Optional[TargetCodeGenerator] = None): + codegen = codegen if codegen is not None else self + state: SDFGState = cfg.nodes()[state_id] + scope_dict = state.scope_dict() for edge in dfg.out_edges(node): _, uconn, v, _, memlet = edge @@ -969,12 +998,12 @@ def process_out_memlets(self, shared_data_name = edge.data.data if not shared_data_name: # Very unique name. 
TODO: Make more intuitive - shared_data_name = '__dace_%d_%d_%d_%d_%s' % (sdfg.cfg_id, state_id, dfg.node_id(node), + shared_data_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, dfg.node_id(node), dfg.node_id(dst_node), edge.src_conn) result.write( "%s = %s;" % (shared_data_name, edge.src_conn), - sdfg, + cfg, state_id, [edge.src, edge.dst], ) @@ -1015,8 +1044,6 @@ def process_out_memlets(self, assert len(in_memlets) == 1 in_local_name = self.memlet_ctor(sdfg, in_memlets[0], node.out_connectors[uconn], False) - state_dfg = sdfg.nodes()[state_id] - if memlet.wcr is not None: nc = not cpp.is_write_conflicted(dfg, edge, sdfg_schedule=self._toplevel_schedule) write_expr = codegen.write_and_resolve_expr( @@ -1071,7 +1098,7 @@ def process_out_memlets(self, write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) # Write out - result.write(write_expr, sdfg, state_id, node) + result.write(write_expr, cfg, state_id, node) # Dispatch array-to-array outgoing copies here elif isinstance(node, nodes.AccessNode): @@ -1081,6 +1108,7 @@ def process_out_memlets(self, dst_node, edge, sdfg, + cfg, dfg, state_id, function_stream, @@ -1099,7 +1127,7 @@ def make_ptr_assignment(self, src_expr, src_dtype, dst_expr, dst_dtype, codegen= dst_expr = codegen.make_ptr_vector_cast(dst_expr, dst_dtype, src_dtype, True, DefinedType.Pointer) return f"{dst_expr} = {src_expr};" - def memlet_view_ctor(self, sdfg, memlet, dtype, is_output): + def memlet_view_ctor(self, sdfg: SDFG, memlet: mmlt.Memlet, dtype, is_output: bool) -> str: memlet_params = [] memlet_name = cpp.ptr(memlet.data, sdfg.arrays[memlet.data], sdfg, self._frame) @@ -1112,8 +1140,6 @@ def memlet_view_ctor(self, sdfg, memlet, dtype, is_output): else: raise TypeError("Unsupported connector type {}".format(def_type)) - pointer = '' - if isinstance(memlet.subset, subsets.Indices): # FIXME: _packed_types influences how this offset is @@ -1200,8 +1226,8 @@ def memlet_definition(self, output: bool, 
local_name: str, conntype: Union[data.Data, dtypes.typeclass] = None, - allow_shadowing=False, - codegen=None): + allow_shadowing: bool = False, + codegen: 'CPUCodeGen' = None): # TODO: Robust rule set if conntype is None: raise ValueError('Cannot define memlet for "%s" without connector type' % local_name) @@ -1311,16 +1337,12 @@ def memlet_definition(self, return result - def memlet_stream_ctor(self, sdfg, memlet): + def memlet_stream_ctor(self, sdfg: SDFG, memlet: mmlt.Memlet) -> str: stream = sdfg.arrays[memlet.data] - ptrname = cpp.ptr(memlet.data, stream, sdfg, self._frame) - - def_type, _ = self._dispatcher.defined_vars.get(ptrname) - return memlet.data + ("[{}]".format(cpp.cpp_offset_expr(stream, memlet.subset)) if isinstance(stream, data.Stream) and stream.is_stream_array() else "") - def memlet_ctor(self, sdfg, memlet, dtype, is_output): + def memlet_ctor(self, sdfg: SDFG, memlet: mmlt.Memlet, dtype, is_output: bool) -> str: ptrname = cpp.ptr(memlet.data, sdfg.arrays[memlet.data], sdfg, self._frame) def_type, _ = self._dispatcher.defined_vars.get(ptrname) @@ -1336,7 +1358,9 @@ def memlet_ctor(self, sdfg, memlet, dtype, is_output): ######################################################################### # Dynamically-called node dispatchers - def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite_stream, codegen=None): + def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream, + codegen=None): # Allow other code generators to call this with a callback codegen = codegen or self @@ -1349,16 +1373,16 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite self._frame._initcode.write(codeblock_to_cpp(node.code_init), sdfg) self._frame._exitcode.write(codeblock_to_cpp(node.code_exit), sdfg) - state_dfg: SDFGState = sdfg.nodes()[state_id] + state_dfg: SDFGState = 
cfg.nodes()[state_id] # Free tasklets need to be presynchronized (e.g., CPU tasklet after # GPU->CPU copy) if state_dfg.entry_node(node) is None: - cpp.presynchronize_streams(sdfg, state_dfg, state_id, node, callsite_stream) + cpp.presynchronize_streams(sdfg, cfg, state_dfg, state_id, node, callsite_stream) # Prepare preamble and code for after memlets after_memlets_stream = CodeIOStream() - codegen.generate_tasklet_preamble(sdfg, dfg, state_id, node, function_stream, callsite_stream, + codegen.generate_tasklet_preamble(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream, after_memlets_stream) self._dispatcher.defined_vars.enter_scope(node) @@ -1378,7 +1402,7 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite shared_data_name = edge.data.data if not shared_data_name: # Very unique name. TODO: Make more intuitive - shared_data_name = '__dace_%d_%d_%d_%d_%s' % (sdfg.cfg_id, state_id, dfg.node_id(src_node), + shared_data_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, dfg.node_id(src_node), dfg.node_id(node), edge.src_conn) # Read variable from shared storage @@ -1387,7 +1411,7 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite assign_str = (f"const {ctype} {edge.dst_conn} = {shared_data_name};") else: assign_str = (f"const {ctype} &{edge.dst_conn} = {shared_data_name};") - inner_stream.write(assign_str, sdfg, state_id, [edge.src, edge.dst]) + inner_stream.write(assign_str, cfg, state_id, [edge.src, edge.dst]) self._dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}") else: @@ -1396,6 +1420,7 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite node, edge, sdfg, + cfg, dfg, state_id, function_stream, @@ -1420,8 +1445,8 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite if edge.src_conn in tasklet_out_connectors: # Disallow duplicates continue - self._dispatcher.dispatch_output_definition(node, 
dst_node, edge, sdfg, dfg, state_id, function_stream, - inner_stream) + self._dispatcher.dispatch_output_definition(node, dst_node, edge, sdfg, cfg, dfg, state_id, + function_stream, inner_stream) # Also define variables in the C++ unparser scope self._locals.define(edge.src_conn, -1, self._ldepth + 1, node.out_connectors[edge.src_conn].ctype) @@ -1447,12 +1472,12 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite local_name = edge.data.data if not local_name: # Very unique name. TODO: Make more intuitive - local_name = '__dace_%d_%d_%d_%d_%s' % (sdfg.cfg_id, state_id, dfg.node_id(node), + local_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, dfg.node_id(node), dfg.node_id(dst_node), edge.src_conn) # Allocate variable type code = "%s %s;" % (ctype, local_name) - outer_stream_begin.write(code, sdfg, state_id, [edge.src, dst_node]) + outer_stream_begin.write(code, cfg, state_id, [edge.src, dst_node]) if (isinstance(arg_type, data.Scalar) or isinstance(arg_type, dtypes.typeclass)): self._dispatcher.defined_vars.add(local_name, DefinedType.Scalar, ctype, ancestor=1) elif isinstance(arg_type, data.Array): @@ -1465,7 +1490,7 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite else: raise TypeError("Unrecognized argument type: {}".format(type(arg_type).__name__)) - inner_stream.write("%s %s;" % (ctype, edge.src_conn), sdfg, state_id, [edge.src, edge.dst]) + inner_stream.write("%s %s;" % (ctype, edge.src_conn), cfg, state_id, [edge.src, edge.dst]) tasklet_out_connectors.add(edge.src_conn) self._dispatcher.defined_vars.add(edge.src_conn, DefinedType.Scalar, ctype) self._locals.define(edge.src_conn, -1, self._ldepth + 1, ctype) @@ -1479,21 +1504,22 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite if instr is not None: instr.on_node_begin(sdfg, state_dfg, node, outer_stream_begin, inner_stream, function_stream) - inner_stream.write("\n ///////////////////\n", sdfg, 
state_id, node) + inner_stream.write("\n ///////////////////\n", cfg, state_id, node) - codegen.unparse_tasklet(sdfg, state_id, dfg, node, function_stream, inner_stream, self._locals, self._ldepth, - self._toplevel_schedule) + codegen.unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, self._locals, + self._ldepth, self._toplevel_schedule) - inner_stream.write(" ///////////////////\n\n", sdfg, state_id, node) + inner_stream.write(" ///////////////////\n\n", cfg, state_id, node) # Generate pre-memlet tasklet postamble after_memlets_stream = CodeIOStream() - codegen.generate_tasklet_postamble(sdfg, dfg, state_id, node, function_stream, inner_stream, + codegen.generate_tasklet_postamble(sdfg, cfg, dfg, state_id, node, function_stream, inner_stream, after_memlets_stream) # Process outgoing memlets codegen.process_out_memlets( sdfg, + cfg, state_id, node, dfg, @@ -1507,23 +1533,25 @@ def _generate_Tasklet(self, sdfg, dfg, state_id, node, function_stream, callsite if instr is not None: instr.on_node_end(sdfg, state_dfg, node, outer_stream_end, inner_stream, function_stream) - callsite_stream.write(outer_stream_begin.getvalue(), sdfg, state_id, node) - callsite_stream.write('{', sdfg, state_id, node) - callsite_stream.write(inner_stream.getvalue(), sdfg, state_id, node) + callsite_stream.write(outer_stream_begin.getvalue(), cfg, state_id, node) + callsite_stream.write('{', cfg, state_id, node) + callsite_stream.write(inner_stream.getvalue(), cfg, state_id, node) callsite_stream.write(after_memlets_stream.getvalue()) - callsite_stream.write('}', sdfg, state_id, node) - callsite_stream.write(outer_stream_end.getvalue(), sdfg, state_id, node) + callsite_stream.write('}', cfg, state_id, node) + callsite_stream.write(outer_stream_end.getvalue(), cfg, state_id, node) self._locals.clear_scope(self._ldepth + 1) self._dispatcher.defined_vars.exit_scope(node) - def unparse_tasklet(self, sdfg, state_id, dfg, node, function_stream, inner_stream, locals, 
ldepth, + def unparse_tasklet(self, sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, toplevel_schedule): # Call the generic CPP unparse_tasklet method - cpp.unparse_tasklet(sdfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, toplevel_schedule, - self) + cpp.unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, inner_stream, locals, ldepth, + toplevel_schedule, self) - def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream): + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mmlt.Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: cdtype = src_node.out_connectors[edge.src_conn] if isinstance(sdfg.arrays[edge.data.data], data.Stream): pass @@ -1539,13 +1567,13 @@ def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, dtypes.AllocationLifetime.External) defined_type, _ = self._dispatcher.defined_vars.get(ptrname, is_global=is_global) base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type, codegen=self._frame) - callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node) + callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', cfg, state_id, src_node) else: - callsite_stream.write(f'{cdtype.as_arg(edge.src_conn)};', sdfg, state_id, src_node) + callsite_stream.write(f'{cdtype.as_arg(edge.src_conn)};', cfg, state_id, src_node) else: - callsite_stream.write(f'{cdtype.ctype} {edge.src_conn};', sdfg, state_id, src_node) + callsite_stream.write(f'{cdtype.ctype} {edge.src_conn};', cfg, state_id, src_node) - def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, sdfg_label, state_struct=True): + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, 
state_struct=True): # TODO: Use a single method for GPU kernels, FPGA modules, and NSDFGs arguments = [] @@ -1580,7 +1608,7 @@ def make_restrict(expr: str) -> str: arguments = ', '.join(arguments) return f'void {sdfg_label}({arguments}) {{' - def generate_nsdfg_call(self, sdfg, state, node, memlet_references, sdfg_label, state_struct=True): + def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label, state_struct=True): prepend = [] if state_struct: prepend = ['__state'] @@ -1591,7 +1619,7 @@ def generate_nsdfg_call(self, sdfg, state, node, memlet_references, sdfg_label, ]) return f'{sdfg_label}({args});' - def generate_nsdfg_arguments(self, sdfg, dfg, state, node): + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): # Connectors that are both input and output share the same name inout = set(node.in_connectors.keys() & node.out_connectors.keys()) @@ -1625,16 +1653,17 @@ def generate_nsdfg_arguments(self, sdfg, dfg, state, node): def _generate_NestedSDFG( self, - sdfg, + sdfg: SDFG, + cfg: ControlFlowRegion, dfg: ScopeSubgraphView, - state_id, + state_id: int, node: nodes.NestedSDFG, function_stream: CodeIOStream, callsite_stream: CodeIOStream, ): inline = Config.get_bool('compiler', 'inline_sdfgs') self._dispatcher.defined_vars.enter_scope(sdfg, can_access_parent=inline) - state_dfg = sdfg.nodes()[state_id] + state_dfg = cfg.nodes()[state_id] fsyms = self._frame.free_symbols(node.sdfg) arglist = node.sdfg.arglist(scalars_only=False, free_symbols=fsyms) @@ -1705,21 +1734,21 @@ def _generate_NestedSDFG( # Take care of nested SDFG I/O (arguments) # Arguments are input connectors, output connectors, and symbols codegen = self.calling_codegen - memlet_references = codegen.generate_nsdfg_arguments(sdfg, dfg, state_dfg, node) + memlet_references = codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state_dfg, node) if not inline and (not unique_functions or not code_already_generated): nested_stream.write( ('inline ' if codegen is 
self else '') + - codegen.generate_nsdfg_header(sdfg, state_dfg, state_id, node, memlet_references, sdfg_label), sdfg, - state_id, node) + codegen.generate_nsdfg_header(sdfg, cfg, state_dfg, state_id, node, memlet_references, sdfg_label), + cfg, state_id, node) ############################# # Generate function contents if inline: - callsite_stream.write('{', sdfg, state_id, node) + callsite_stream.write('{', cfg, state_id, node) for ref in memlet_references: - callsite_stream.write('%s %s = %s;' % ref, sdfg, state_id, node) + callsite_stream.write('%s %s = %s;' % ref, cfg, state_id, node) # Emit symbol mappings # We first emit variables of the form __dacesym_X = Y to avoid # overriding symbolic expressions when the symbol names match @@ -1729,14 +1758,14 @@ def _generate_NestedSDFG( callsite_stream.write( '{dtype} __dacesym_{symname} = {symval};\n'.format(dtype=node.sdfg.symbols[symname], symname=symname, - symval=cpp.sym2cpp(symval)), sdfg, state_id, + symval=cpp.sym2cpp(symval)), cfg, state_id, node) for symname in sorted(node.symbol_mapping.keys()): if symname in sdfg.constants: continue callsite_stream.write( '{dtype} {symname} = __dacesym_{symname};\n'.format(symname=symname, - dtype=node.sdfg.symbols[symname]), sdfg, + dtype=node.sdfg.symbols[symname]), cfg, state_id, node) ## End of symbol mappings ############################# @@ -1761,6 +1790,7 @@ def _generate_NestedSDFG( # Process outgoing memlets with the internal SDFG codegen.process_out_memlets(sdfg, + cfg, state_id, node, state_dfg, @@ -1770,13 +1800,14 @@ def _generate_NestedSDFG( nested_global_stream, skip_wcr=True) - nested_stream.write('}\n\n', sdfg, state_id, node) + nested_stream.write('}\n\n', cfg, state_id, node) ######################## if not inline: # Generate function call - callsite_stream.write(codegen.generate_nsdfg_call(sdfg, state_dfg, node, memlet_references, sdfg_label), - sdfg, state_id, node) + callsite_stream.write(codegen.generate_nsdfg_call(sdfg, cfg, state_dfg, node, 
memlet_references, + sdfg_label), + cfg, state_id, node) ############################################################### # Write generated code in the proper places (nested SDFG writes @@ -1790,29 +1821,29 @@ def _generate_NestedSDFG( def _generate_MapEntry( self, - sdfg, - dfg, - state_id, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, node: nodes.MapEntry, - function_stream, - callsite_stream, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, ): - state_dfg = sdfg.node(state_id) + state_dfg = cfg.state(state_id) map_params = node.map.params - map_name = "__DACEMAP_" + str(state_id) + "_" + str(dfg.node_id(node)) result = callsite_stream map_header = "" # Encapsulate map with a C scope # TODO: Refactor out of MapEntry generation (generate_scope_header?) - callsite_stream.write('{', sdfg, state_id, node) + callsite_stream.write('{', cfg, state_id, node) # Define all input connectors of this map entry for e in dynamic_map_inputs(state_dfg, node): if e.data.data != e.dst_conn: callsite_stream.write( - self.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), sdfg, + self.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id, node) inner_stream = CodeIOStream() @@ -1868,10 +1899,10 @@ def _generate_MapEntry( if node.map.schedule in (dtypes.ScheduleType.CPU_Multicore, dtypes.ScheduleType.CPU_Persistent): raise ValueError("An OpenMP map cannot be unrolled (" + node.map.label + ")") - result.write(map_header, sdfg, state_id, node) + result.write(map_header, cfg, state_id, node) if node.map.schedule == dtypes.ScheduleType.CPU_Persistent: - result.write('{\n', sdfg, state_id, node) + result.write('{\n', cfg, state_id, node) # Find if bounds are used within the scope scope = state_dfg.scope_subgraph(node, False, False) @@ -1885,11 +1916,11 @@ def _generate_MapEntry( ntid_is_used = '__omp_num_threads' in fsyms tid_is_used = node.map.params[0] in 
fsyms if tid_is_used or ntid_is_used: - function_stream.write('#include ', sdfg, state_id, node) + function_stream.write('#include ', cfg, state_id, node) if tid_is_used: - result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', sdfg, state_id, node) + result.write(f'auto {node.map.params[0]} = omp_get_thread_num();', cfg, state_id, node) if ntid_is_used: - result.write(f'auto __omp_num_threads = omp_get_num_threads();', sdfg, state_id, node) + result.write(f'auto __omp_num_threads = omp_get_num_threads();', cfg, state_id, node) else: # Emit nested loops for i, r in enumerate(node.map.range): @@ -1897,12 +1928,12 @@ def _generate_MapEntry( begin, end, skip = r if node.map.unroll: - result.write("#pragma unroll", sdfg, state_id, node) + result.write("#pragma unroll", cfg, state_id, node) result.write( "for (auto %s = %s; %s < %s; %s += %s) {\n" % (var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, cpp.sym2cpp(skip)), - sdfg, + cfg, state_id, node, ) @@ -1910,21 +1941,22 @@ def _generate_MapEntry( callsite_stream.write(inner_stream.getvalue()) # Emit internal transient array allocation - self._frame.allocate_arrays_in_scope(sdfg, node, function_stream, result) + self._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, result) - def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: result = callsite_stream # Obtain start of map scope_dict = dfg.scope_dict() map_node = scope_dict[node] - state_dfg = sdfg.node(state_id) + state_dfg = cfg.state(state_id) if map_node is None: raise ValueError("Exit node " + str(node.map.label) + " is not dominated by a scope entry node") # Emit internal transient array deallocation - self._frame.deallocate_arrays_in_scope(sdfg, map_node, function_stream, result) + 
self._frame.deallocate_arrays_in_scope(sdfg, cfg, map_node, function_stream, result) outer_stream = CodeIOStream() @@ -1936,28 +1968,28 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) if map_node.map.schedule == dtypes.ScheduleType.CPU_Persistent: - result.write("}", sdfg, state_id, node) + result.write("}", cfg, state_id, node) else: for _ in map_node.map.range: - result.write("}", sdfg, state_id, node) + result.write("}", cfg, state_id, node) result.write(outer_stream.getvalue()) - callsite_stream.write('}', sdfg, state_id, node) + callsite_stream.write('}', cfg, state_id, node) def _generate_ConsumeEntry( self, - sdfg, - dfg, - state_id, - node: nodes.MapEntry, - function_stream, - callsite_stream, - ): + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + node: nodes.ConsumeEntry, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ) -> None: result = callsite_stream - constsize = all([not symbolic.issymbolic(v, sdfg.constants) for r in node.map.range for v in r]) - state_dfg = sdfg.nodes()[state_id] + state_dfg: SDFGState = cfg.nodes()[state_id] input_sedge = next(e for e in state_dfg.in_edges(node) if e.dst_conn == "IN_stream") output_sedge = next(e for e in state_dfg.out_edges(node) if e.src_conn == "OUT_stream") @@ -2004,7 +2036,7 @@ def _generate_ConsumeEntry( num_pes=cpp.sym2cpp(node.consume.num_pes), pe_index=node.consume.pe_index, ), - sdfg, + cfg, state_id, node, ) @@ -2050,7 +2082,7 @@ def _generate_ConsumeEntry( result.write(inner_stream.getvalue()) # Emit internal transient array allocation - self._frame.allocate_arrays_in_scope(sdfg, node, function_stream, result) + self._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, result) # Generate register definitions for inter-tasklet memlets scope_dict = dfg.scope_dict() @@ -2068,27 +2100,29 @@ def 
_generate_ConsumeEntry( ctype = node.out_connectors[edge.src_conn].ctype if not local_name: # Very unique name. TODO: Make more intuitive - local_name = '__dace_%d_%d_%d_%d_%s' % (sdfg.cfg_id, state_id, dfg.node_id( + local_name = '__dace_%d_%d_%d_%d_%s' % (cfg.cfg_id, state_id, dfg.node_id( edge.src), dfg.node_id(edge.dst), edge.src_conn) # Allocate variable type code = '%s %s;' % (ctype, local_name) - result.write(code, sdfg, state_id, [edge.src, edge.dst]) + result.write(code, cfg, state_id, [edge.src, edge.dst]) self._dispatcher.defined_vars.add(local_name, DefinedType.Scalar, ctype) - def _generate_ConsumeExit(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def _generate_ConsumeExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.ConsumeExit, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: result = callsite_stream # Obtain start of map scope_dict = dfg.scope_dict() entry_node = scope_dict[node] - state_dfg = sdfg.node(state_id) + state_dfg: SDFGState = cfg.node(state_id) if entry_node is None: raise ValueError("Exit node " + str(node.consume.label) + " is not dominated by a scope entry node") # Emit internal transient array deallocation - self._frame.deallocate_arrays_in_scope(sdfg, entry_node, function_stream, result) + self._frame.deallocate_arrays_in_scope(sdfg, cfg, entry_node, function_stream, result) outer_stream = CodeIOStream() @@ -2099,16 +2133,17 @@ def _generate_ConsumeExit(self, sdfg, dfg, state_id, node, function_stream, call self.generate_scope_postamble(sdfg, dfg, state_id, function_stream, outer_stream, callsite_stream) - result.write("});", sdfg, state_id, node) + result.write("});", cfg, state_id, node) result.write(outer_stream.getvalue()) - def _generate_AccessNode(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): - state_dfg = sdfg.nodes()[state_id] + def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: 
StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state_dfg: SDFGState = cfg.nodes()[state_id] if node not in state_dfg.sink_nodes(): # NOTE: sink nodes are synchronized at the end of a state - cpp.presynchronize_streams(sdfg, state_dfg, state_id, node, callsite_stream) + cpp.presynchronize_streams(sdfg, cfg, state_dfg, state_id, node, callsite_stream) # Instrumentation: Pre-node instr = self._dispatcher.instrumentation[node.instrument] @@ -2133,6 +2168,7 @@ def _generate_AccessNode(self, sdfg, dfg, state_id, node, function_stream, calls node, edge, sdfg, + cfg, dfg, state_id, function_stream, @@ -2143,6 +2179,7 @@ def _generate_AccessNode(self, sdfg, dfg, state_id, node, function_stream, calls # from the first leading edge out of the array) self.process_out_memlets( sdfg, + cfg, state_id, node, dfg, @@ -2199,7 +2236,7 @@ def generate_scope_postamble(self, sdfg, dfg_scope, state_id, function_stream, o """ pass - def generate_tasklet_preamble(self, sdfg, dfg_scope, state_id, node, function_stream, before_memlets_stream, + def generate_tasklet_preamble(self, sdfg, cfg, dfg_scope, state_id, node, function_stream, before_memlets_stream, after_memlets_stream): """ Generates code for the beginning of a tasklet. This method is @@ -2219,7 +2256,7 @@ def generate_tasklet_preamble(self, sdfg, dfg_scope, state_id, node, function_st """ pass - def generate_tasklet_postamble(self, sdfg, dfg_scope, state_id, node, function_stream, before_memlets_stream, + def generate_tasklet_postamble(self, sdfg, cfg, dfg_scope, state_id, node, function_stream, before_memlets_stream, after_memlets_stream): """ Generates code for the end of a tasklet. 
This method is intended to be diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 4731165309..f080f2cc62 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -2,14 +2,14 @@ import ctypes import functools import warnings -from typing import Dict, List, Set, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union import networkx as nx import sympy from six import StringIO import dace -from dace import data as dt +from dace import data as dt, Memlet from dace import dtypes, registry from dace import subsets, symbolic from dace.codegen import common, cppunparse @@ -23,12 +23,18 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute from dace.config import Config from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, dynamic_map_inputs, has_dynamic_map_inputs, +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, is_devicelevel_gpu, nodes, scope_contains_scope) from dace.sdfg import utils as sdutil +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView from dace.transformation import helpers as xfh from dace.transformation.passes import analysis as ap +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + from dace.codegen.targets.cpu import CPUCodeGen + def prod(iterable): return functools.reduce(sympy.Mul, iterable, 1) @@ -56,7 +62,7 @@ class CUDACodeGen(TargetCodeGenerator): title = 'CUDA' _in_device_code = False - def __init__(self, frame_codegen, sdfg: SDFG): + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher dispatcher = self._dispatcher @@ -64,7 +70,7 @@ def __init__(self, frame_codegen, sdfg: SDFG): self.create_grid_barrier = False self.extra_nsdfg_args = [] 
CUDACodeGen._in_device_code = False - self._cpu_codegen = None + self._cpu_codegen: Optional['CPUCodeGen'] = None self._block_dims = None self._grid_dims = None self._kernel_map = None @@ -506,8 +512,9 @@ def cmake_options(): return options - def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream): - + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: fsymbols = self._frame.symbols_and_constants(sdfg) # NOTE: `dfg` (state) will be None iff `nodedesc` is non-free symbol dependent # (see `DaCeCodeGenerator.determine_allocation_lifetime` in `dace.codegen.targets.framecode`). @@ -538,10 +545,11 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de else: raise NotImplementedError("CUDA: Unimplemented storage type " + str(nodedesc.storage)) - declaration_stream.write(result_decl.getvalue(), sdfg, state_id, node) + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) - def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - allocation_stream): + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: dataname = cpp.ptr(node.data, nodedesc, sdfg, self._frame) try: @@ -559,14 +567,14 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d pass if isinstance(nodedesc, dace.data.Stream): - return self.allocate_stream(sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, + return self.allocate_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, 
dace.data.View): - return self._cpu_codegen.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, + return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, dace.data.Reference): - return self._cpu_codegen.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, - allocation_stream) + return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, + declaration_stream, allocation_stream) if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) @@ -636,11 +644,12 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d else: raise NotImplementedError("CUDA: Unimplemented storage type " + str(nodedesc.storage)) - declaration_stream.write(result_decl.getvalue(), sdfg, state_id, node) - allocation_stream.write(result_alloc.getvalue(), sdfg, state_id, node) + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) - def allocate_stream(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - allocation_stream): + def allocate_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: dataname = node.data allocname = cpp.ptr(dataname, nodedesc, sdfg, self._frame) if nodedesc.storage == dtypes.StorageType.GPU_Global: @@ -649,7 +658,7 @@ def allocate_stream(self, sdfg, dfg, state_id, node, nodedesc, function_stream, 'allocname': allocname, 'type': nodedesc.dtype.ctype, 'is_pow2': sym2cpp(sympy.log(nodedesc.buffer_size, 2).is_Integer), - 'location': '%s_%s_%s' % (sdfg.cfg_id, state_id, 
dfg.node_id(node)) + 'location': '%s_%s_%s' % (cfg.cfg_id, state_id, dfg.node_id(node)) } ctypedef = 'dace::GPUStream<{type}, {is_pow2}>'.format(**fmtargs) @@ -670,48 +679,52 @@ def allocate_stream(self, sdfg, dfg, state_id, node, nodedesc, function_stream, # (important) Ensure GPU array is allocated before the stream datanode = dfg.out_edges(node)[0].dst sinkdesc = sdfg.arrays[datanode.data] - self._dispatcher.dispatch_allocate(sdfg, dfg, state_id, datanode, sinkdesc, function_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, datanode, sinkdesc, function_stream, allocation_stream) function_stream.write( 'DACE_EXPORTED void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result);' - .format(**fmtargs), sdfg, state_id, node) + .format(**fmtargs), cfg, state_id, node) self._globalcode.write( """ DACE_EXPORTED void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result); void __dace_alloc_{location}({type} *ptr, uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result) {{ result = dace::AllocGPUArrayStreamView<{type}, {is_pow2}>(ptr, size); -}}""".format(**fmtargs), sdfg, state_id, node) - declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), sdfg, state_id, +}}""".format(**fmtargs), cfg, state_id, node) + declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), cfg, state_id, node) - allocation_stream.write('__dace_alloc_{location}({ptr}, {size}, {allocname});'.format(**fmtargs), sdfg, + allocation_stream.write('__dace_alloc_{location}({ptr}, {size}, {allocname});'.format(**fmtargs), cfg, state_id, node) else: fmtargs['size'] = sym2cpp(nodedesc.buffer_size) function_stream.write( 'DACE_EXPORTED void __dace_alloc_{location}(uint32_t size, dace::GPUStream<{type}, {is_pow2}>& result);' - .format(**fmtargs), sdfg, state_id, node) + .format(**fmtargs), cfg, state_id, node) self._globalcode.write( """ 
DACE_EXPORTED void __dace_alloc_{location}(uint32_t {size}, dace::GPUStream<{type}, {is_pow2}>& result); void __dace_alloc_{location}(uint32_t {size}, dace::GPUStream<{type}, {is_pow2}>& result) {{ result = dace::AllocGPUStream<{type}, {is_pow2}>({size}); -}}""".format(**fmtargs), sdfg, state_id, node) - declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), sdfg, state_id, +}}""".format(**fmtargs), cfg, state_id, node) + declaration_stream.write('dace::GPUStream<{type}, {is_pow2}> {name};'.format(**fmtargs), cfg, state_id, node) - allocation_stream.write('__dace_alloc_{location}({size}, {allocname});'.format(**fmtargs), sdfg, + allocation_stream.write('__dace_alloc_{location}({size}, {allocname});'.format(**fmtargs), cfg, state_id, node) - def deallocate_stream(self, sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream): + def deallocate_stream(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: dataname = cpp.ptr(node.data, nodedesc, sdfg, self._frame) if nodedesc.storage == dtypes.StorageType.GPU_Global: if is_array_stream_view(sdfg, dfg, node): - callsite_stream.write('dace::FreeGPUArrayStreamView(%s);' % dataname, sdfg, state_id, node) + callsite_stream.write('dace::FreeGPUArrayStreamView(%s);' % dataname, cfg, state_id, node) else: - callsite_stream.write('dace::FreeGPUStream(%s);' % dataname, sdfg, state_id, node) + callsite_stream.write('dace::FreeGPUStream(%s);' % dataname, cfg, state_id, node) - def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream): + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: dataname = cpp.ptr(node.data, nodedesc, sdfg, 
self._frame) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: dataname = f'({dataname} - {cpp.sym2cpp(nodedesc.start_offset)})' @@ -722,15 +735,15 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, self._dispatcher.declared_arrays.remove(dataname, is_global=is_global) if isinstance(nodedesc, dace.data.Stream): - return self.deallocate_stream(sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream) + return self.deallocate_stream(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream) elif isinstance(nodedesc, dace.data.View): return if nodedesc.storage == dtypes.StorageType.GPU_Global: if not nodedesc.pool: # If pooled, will be freed somewhere else - callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), sdfg, state_id, node) + callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: - callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), sdfg, state_id, node) + callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \ nodedesc.storage == dtypes.StorageType.Register: pass # Do nothing @@ -879,10 +892,12 @@ def increment(streams): return max_streams, max_events - def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, edge, sdfg, dfg, - callsite_stream): + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, + dst_node: nodes.Node, dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, + dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: u, uconn, v, vconn, memlet = edge - state_dfg = sdfg.nodes()[state_id] + state_dfg = 
cfg.state(state_id) cpu_storage_types = [ dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.CPU_Pinned @@ -1004,7 +1019,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst _topy(dst_strides[-2]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, current_src_expr, sym2cpp(src_strides[-2]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, sym2cpp(copy_shape[-1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, - sym2cpp(copy_shape[-2]), self.backend, src_location, dst_location, cudastream), sdfg, state_id, + sym2cpp(copy_shape[-2]), self.backend, src_location, dst_location, cudastream), cfg, state_id, [src_node, dst_node]) # Write for-loop footers for d in range(dims - 2): @@ -1018,7 +1033,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst callsite_stream.write( 'DACE_GPU_CHECK(%sMemcpyAsync(%s, %s, %s, %sMemcpy%sTo%s, %s));\n' % (self.backend, dst_expr, src_expr, copysize, self.backend, src_location, dst_location, cudastream), - sdfg, state_id, [src_node, dst_node]) + cfg, state_id, [src_node, dst_node]) node_dtype = dst_node.desc(sdfg).dtype if issubclass(node_dtype.type, ctypes.Structure): callsite_stream.write('for (size_t __idx = 0; __idx < {arrlen}; ++__idx) ' @@ -1045,7 +1060,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst sloc=src_location, dloc=dst_location, stream=cudastream, - backend=self.backend), sdfg, + backend=self.backend), cfg, state_id, [src_node, dst_node]) callsite_stream.write('}') elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)): @@ -1054,7 +1069,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, 'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( - 
copy_shape[0]), self.backend, src_location, dst_location, cudastream), sdfg, state_id, + copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, [src_node, dst_node]) elif dims == 2: callsite_stream.write( @@ -1062,7 +1077,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst (self.backend, dst_expr, _topy(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( - copy_shape[0]), self.backend, src_location, dst_location, cudastream), sdfg, state_id, + copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, [src_node, dst_node]) # Post-copy synchronization @@ -1077,7 +1092,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst ''' DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); DACE_GPU_CHECK({backend}StreamWaitEvent({dst_stream}, __state->gpu_context->events[{ev}], 0)); - '''.format(ev=event, src_stream=cudastream, dst_stream=syncstream, backend=self.backend), sdfg, + '''.format(ev=event, src_stream=cudastream, dst_stream=syncstream, backend=self.backend), cfg, state_id, [src_node, dst_node]) self._emit_sync(callsite_stream) @@ -1085,7 +1100,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst # Copy within the GPU elif (src_storage in gpu_storage_types and dst_storage in gpu_storage_types): - state_dfg = sdfg.nodes()[state_id] + state_dfg = cfg.state(state_id) sdict = state_dfg.scope_dict() schedule_node = src_node if scope_contains_scope(sdict, src_node, dst_node): @@ -1137,19 +1152,22 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum, args=', '.join([src_expr] + 
_topy(src_strides) + [dst_expr] + custom_reduction + - _topy(dst_strides) + _topy(copy_shape))), sdfg, state_id, [src_node, dst_node]) + _topy(dst_strides) + _topy(copy_shape))), cfg, state_id, [src_node, dst_node]) elif funcname == 'dace::SharedToGlobal1D': # special case: use a new template struct that provides functions for copy and reduction callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format( - func=funcname, - type=dst_node.desc(sdfg).dtype.ctype, - bdims=', '.join(_topy(self._block_dims)), - copysize=', '.join(_topy(copy_shape)), - is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', - accum=accum or '::Copy', - args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction)), sdfg, - state_id, [src_node, dst_node]) + func=funcname, + type=dst_node.desc(sdfg).dtype.ctype, + bdims=', '.join(_topy(self._block_dims)), + copysize=', '.join(_topy(copy_shape)), + is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', + accum=accum or '::Copy', + args=', '.join( + [src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction + ) + ), + cfg, state_id, [src_node, dst_node]) else: callsite_stream.write( (' {func}<{type}, {bdims}, {copysize}, ' + @@ -1161,16 +1179,18 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst dststrides=', '.join(_topy(dst_strides)), is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', accum=accum, - args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), sdfg, + args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), cfg, state_id, [src_node, dst_node]) # Per-thread load (same as CPU copies) else: - self._cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) else: - 
self._cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) - def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, memlet, function_stream, callsite_stream): - state = sdfg.node(state_id) + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + memlet: Memlet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state = cfg.state(state_id) if isinstance(src_node, nodes.Tasklet): src_storage = dtypes.StorageType.Register src_parent = state.entry_node(src_node) @@ -1187,11 +1207,13 @@ def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, memlet, function_ dst_schedule = None if dst_parent is None else dst_parent.map.schedule # Emit actual copy - self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, dfg, + self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg, dfg, callsite_stream) - def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream): - self._cpu_codegen.define_out_memlet(sdfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self._cpu_codegen.define_out_memlet(sdfg, cfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) def process_out_memlets(self, *args, **kwargs): @@ -1214,14 +1236,16 @@ def _begin_streams(self, sdfg, state): result.add(e.dst._cuda_stream) return result - 
def generate_state(self, sdfg, state, function_stream, callsite_stream): + def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, + function_stream: CodeIOStream, callsite_stream: CodeIOStream, + generate_state_footer: bool = False) -> None: # Two modes: device-level state and if this state has active streams if CUDACodeGen._in_device_code: - self.generate_devicelevel_state(sdfg, state, function_stream, callsite_stream) + self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) else: # Active streams found. Generate state normally and sync with the # streams in the end - self._frame.generate_state(sdfg, state, function_stream, callsite_stream, generate_state_footer=False) + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) # Reset thread-block-level information self._scope_has_collaborative_copy = False @@ -1269,30 +1293,29 @@ def generate_state(self, sdfg, state, function_stream, callsite_stream): for stream in streams_to_sync: callsite_stream.write( 'DACE_GPU_CHECK(%sStreamSynchronize(__state->gpu_context->streams[%d]));' % - (self.backend, stream), sdfg, sdfg.node_id(state)) + (self.backend, stream), cfg, state.block_id) # After synchronizing streams, generate state footer normally callsite_stream.write('\n') # Emit internal transient array deallocation - self._frame.deallocate_arrays_in_scope(sdfg, state, function_stream, callsite_stream) + self._frame.deallocate_arrays_in_scope(sdfg, cfg, state, function_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._frame._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, function_stream) - def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stream): - + def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, + function_stream: CodeIOStream, callsite_stream: 
CodeIOStream) -> None: # Special case: if this is a GPU grid state and something is reading # from a possible result of a collaborative write, sync first if self._toplevel_schedule == dtypes.ScheduleType.GPU_Device: - state_id = next(i for i, s in enumerate(sdfg.nodes()) if s == state) for node in state.nodes(): if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared and state.in_degree(node) == 0 and state.out_degree(node) > 0): if not self._scope_has_collaborative_copy: - callsite_stream.write('__syncthreads();', sdfg, state_id) + callsite_stream.write('__syncthreads();', cfg, state.block_id) break # In GPU_Persistent scopes, states need global barriers between them, @@ -1311,7 +1334,7 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre ] for stream in streams_to_reset: ptrname = cpp.ptr(stream.data, stream.desc(sdfg), sdfg, self._frame) - callsite_stream.write("{}.reset();".format(ptrname), sdfg, state.node_id) + callsite_stream.write("{}.reset();".format(ptrname), cfg, state.block_id) components = dace.sdfg.concurrent_subgraphs(state) for c in components: @@ -1336,14 +1359,14 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre if write_scope == 'grid': callsite_stream.write("if (blockIdx.x == 0 " "&& threadIdx.x == 0) " - "{ // sub-graph begin", sdfg, state.node_id) + "{ // sub-graph begin", cfg, state.block_id) elif write_scope == 'block': callsite_stream.write("if (threadIdx.x == 0) " - "{ // sub-graph begin", sdfg, state.node_id) + "{ // sub-graph begin", cfg, state.block_id) else: - callsite_stream.write("{ // subgraph begin", sdfg, state.node_id) + callsite_stream.write("{ // subgraph begin", cfg, state.block_id) else: - callsite_stream.write("{ // subgraph begin", sdfg, state.node_id) + callsite_stream.write("{ // subgraph begin", cfg, state.block_id) # Need to skip certain entry nodes to make sure that they are # not processed twice @@ -1354,32 
+1377,34 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre skip_entry = len(comp_same_entry) > 0 and has_map self._dispatcher.dispatch_subgraph(sdfg, + cfg, c, - sdfg.node_id(state), + state.block_id, function_stream, callsite_stream, skip_entry_node=skip_entry) - callsite_stream.write("} // subgraph end", sdfg, state.node_id) + callsite_stream.write("} // subgraph end", cfg, state.block_id) - callsite_stream.write('__gbar.Sync();', sdfg, state.node_id) + callsite_stream.write('__gbar.Sync();', cfg, state.block_id) # done here, code is generated return - self._frame.generate_state(sdfg, state, function_stream, callsite_stream) + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) # NOTE: This function is ONLY called from the CPU side. Therefore, any # schedule that is out of the ordinary will raise an exception - def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: scope_entry = dfg_scope.source_nodes()[0] scope_exit = dfg_scope.sink_nodes()[0] - state = sdfg.nodes()[state_id] + state = cfg.state(state_id) # If in device-level code, call appropriate function if (self._kernel_map is not None and self._kernel_map.map.schedule in dtypes.GPU_SCHEDULES): - self.generate_devicelevel_scope(sdfg, dfg_scope, state_id, function_stream, callsite_stream) + self.generate_devicelevel_scope(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) return # If not device-level code, ensure the schedule is correct @@ -1536,7 +1561,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st for k, v in prototype_kernel_args.items()] kernel_stream = CodeIOStream() - self.generate_kernel_scope(sdfg, dfg_scope, state_id, scope_entry.map, kernel_name, grid_dims, block_dims, + 
self.generate_kernel_scope(sdfg, cfg, dfg_scope, state_id, scope_entry.map, kernel_name, grid_dims, block_dims, tbmap, dtbmap, kernel_args_typed, self._globalcode, kernel_stream) self._dispatcher.defined_vars.exit_scope(scope_entry) @@ -1586,7 +1611,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); void __dace_runkernel_{fname}({fargs}) {{ -""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed)), sdfg, state_id, +""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, node) if is_persistent: @@ -1600,11 +1625,11 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if create_grid_barrier: gbar = '__gbar_' + kernel_name - self._localcode.write(' cub::GridBarrierLifetime %s;\n' % gbar, sdfg, state_id, node) + self._localcode.write(' cub::GridBarrierLifetime %s;\n' % gbar, cfg, state_id, node) self._localcode.write( '{}.Setup({});'.format(gbar, ' * '.join(_topy(grid_dims)) if not is_persistent else 'dace_number_blocks'), - sdfg, state_id, node) + cfg, state_id, node) extra_kernel_args.append('(void *)((cub::GridBarrier *)&%s)' % gbar) # Compute dynamic shared memory @@ -1635,7 +1660,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): self._localcode.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), - sdfg, state_id, scope_entry) + cfg, state_id, scope_entry) gdims = 'dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)) bdims = ', '.join(_topy(block_dims)) @@ -1663,7 +1688,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st if ({dimcheck}) {{ {emptygrid_warning} return; - }}''', sdfg, state_id, scope_entry) + }}''', cfg, state_id, 
scope_entry) self._localcode.write( ''' @@ -1675,7 +1700,7 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st bdims=bdims, dynsmem=_topy(dynsmem_size), stream=cudastream, - backend=self.backend), sdfg, state_id, scope_entry) + backend=self.backend), cfg, state_id, scope_entry) # Check kernel launch for errors self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') @@ -1688,12 +1713,12 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st # Add invocation to calling code (in another file) function_stream.write( 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed)), sdfg, state_id, + (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): - callsite_stream.write('{', sdfg, state_id, scope_entry) + callsite_stream.write('{', cfg, state_id, scope_entry) # Synchronize all events leading to dynamic map range connectors for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): @@ -1701,24 +1726,24 @@ def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_st ev = e._cuda_event callsite_stream.write( 'DACE_GPU_CHECK({backend}EventSynchronize(__state->gpu_context->events[{ev}]));'.format( - ev=ev, backend=self.backend), sdfg, state_id, [e.src, e.dst]) + ev=ev, backend=self.backend), cfg, state_id, [e.src, e.dst]) callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), - sdfg, state_id, node) + cfg, state_id, node) # Invoke kernel call callsite_stream.write( '__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(['__state'] + [cpp.ptr(aname, arg, sdfg, self._frame) - for aname, arg in 
kernel_args.items()] + extra_call_args)), sdfg, state_id, + for aname, arg in kernel_args.items()] + extra_call_args)), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): - callsite_stream.write('}', sdfg, state_id, scope_entry) + callsite_stream.write('}', cfg, state_id, scope_entry) - synchronize_streams(sdfg, state, state_id, scope_entry, scope_exit, callsite_stream, self) + synchronize_streams(sdfg, cfg, state, state_id, scope_entry, scope_exit, callsite_stream, self) # Instrumentation (post-kernel) if instr is not None: @@ -1945,9 +1970,10 @@ def get_kernel_dimensions(self, dfg_scope): return grid_size, block_size, len(tb_maps_sym_map) > 0, has_dtbmap, extra_dim_offsets - def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_id: int, kernel_map: nodes.Map, - kernel_name: str, grid_dims: list, block_dims: list, has_tbmap: bool, has_dtbmap: bool, - kernel_params: list, function_stream: CodeIOStream, kernel_stream: CodeIOStream): + def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + kernel_map: nodes.Map, kernel_name: str, grid_dims: list, block_dims: list, + has_tbmap: bool, has_dtbmap: bool, kernel_params: list, function_stream: CodeIOStream, + kernel_stream: CodeIOStream) -> None: node = dfg_scope.source_nodes()[0] # Get the thread/block index type @@ -1967,16 +1993,16 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ if Config.get_bool('compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), block_size=functools.reduce( (lambda x, y: x * y), - [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), sdfg, + [int(x) for x in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',')])), cfg, state_id, node) # Add extra opening brace (dynamic map ranges, closed 
in MapExit # generator) - kernel_stream.write('{', sdfg, state_id, node) + kernel_stream.write('{', cfg, state_id, node) # Add more opening braces for scope exit to close for dim in range(len(node.map.range) - 1): - kernel_stream.write('{', sdfg, state_id, node) + kernel_stream.write('{', cfg, state_id, node) # Generate all index arguments for kernel grid krange = subsets.Range(kernel_map.range[::-1]) @@ -1988,7 +2014,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ for e in dace.sdfg.dynamic_map_inputs(sdfg.states()[state_id], dfg_scope.source_nodes()[0]): kernel_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), - sdfg, state_id, + cfg, state_id, dfg_scope.source_nodes()[0]) # do not generate an index if the kernel map is persistent @@ -2028,7 +2054,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ ) expr = _topy(bidx[i]).replace('__DAPB%d' % i, block_expr) - kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', sdfg, state_id, node) + kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', cfg, state_id, node) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) # Dispatch internal code @@ -2040,7 +2066,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ self._grid_dims = grid_dims # Emit internal array allocation (deallocation handled at MapExit) - self._frame.allocate_arrays_in_scope(sdfg, node, function_stream, kernel_stream) + self._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, kernel_stream) scope_entry = dfg_scope.source_nodes()[0] @@ -2064,12 +2090,13 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ condition += '%s < %s' % (v, _topy(maxel + 1)) if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - kernel_stream.write('if (%s) {' % condition, sdfg, state_id, 
scope_entry) + kernel_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: self._kernel_grid_conditions.append('{') - kernel_stream.write('{', sdfg, state_id, scope_entry) + kernel_stream.write('{', cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, + cfg, dfg_scope, state_id, function_stream, @@ -2078,7 +2105,7 @@ def generate_kernel_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_ if (not has_tbmap and not has_dtbmap and node.map.schedule != dtypes.ScheduleType.GPU_Persistent): for _ in kernel_map.params: - kernel_stream.write('}', sdfg, state_id, node) + kernel_stream.write('}', cfg, state_id, node) self._block_dims = None self._kernel_map = None @@ -2104,12 +2131,12 @@ def get_next_scope_entries(self, dfg, scope_entry): return all_scopes[all_scopes.index(scope_entry) + 1:] - def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): + def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Sanity check assert CUDACodeGen._in_device_code == True - dfg = sdfg.nodes()[state_id] - sdict = dfg.scope_dict() + dfg = cfg.state(state_id) scope_entry = dfg_scope.source_nodes()[0] scope_exit = dfg_scope.sink_nodes()[0] scope_map = scope_entry.map @@ -2117,7 +2144,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) - callsite_stream.write('{', sdfg, state_id, scope_entry) + callsite_stream.write('{', cfg, state_id, scope_entry) if scope_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic: if self.backend == 'hip': @@ -2140,10 +2167,10 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # Define all input connectors of this map entry # Note: no need for a C scope around these, as there will not be # 
more than one dynamic thread-block map in a GPU device map - callsite_stream.write('unsigned int __dace_dynmap_begin = 0, __dace_dynmap_end = 0;', sdfg, state_id, + callsite_stream.write('unsigned int __dace_dynmap_begin = 0, __dace_dynmap_end = 0;', cfg, state_id, scope_entry) - outer_scope = sdfg.nodes()[state_id].entry_node(scope_entry) + outer_scope = dfg.entry_node(scope_entry) current_sdfg = sdfg while not outer_scope and current_sdfg: current_state = current_sdfg.parent @@ -2154,7 +2181,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, raise ValueError(f'Failed to find the outer scope of {scope_entry}') callsite_stream.write( 'if ({} < {}) {{'.format(outer_scope.map.params[0], - _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), sdfg, + _topy(subsets.Range(outer_scope.map.range[::-1]).max_element()[0] + 1)), cfg, state_id, scope_entry) # NOTE: Dynamic map inputs must be defined both outside and inside the dynamic Map schedule. 
@@ -2164,7 +2191,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, - e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) + e.dst.in_connectors[e.dst_conn]), cfg, state_id, scope_entry) dynmap_var = scope_map.params[0] dynmap_begin = scope_map.range[0][0] @@ -2176,10 +2203,10 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, dynmap_end = f'int_ceil({dynmap_end - dynmap_begin}, {dynmap_step})' callsite_stream.write( '__dace_dynmap_begin = {begin};\n' - '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), sdfg, state_id, scope_entry) + '__dace_dynmap_end = {end};'.format(begin=dynmap_begin, end=dynmap_end), cfg, state_id, scope_entry) # close if - callsite_stream.write('}', sdfg, state_id, scope_entry) + callsite_stream.write('}', cfg, state_id, scope_entry) callsite_stream.write( 'dace::DynamicMap<{fine_grained}, {bsize}>::' @@ -2189,16 +2216,16 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, 'compiler', 'cuda', 'dynamic_map_fine_grained') else 'false'), bsize=total_block_size, kmapIdx=outer_scope.map.params[0], - param=dynmap_var), sdfg, state_id, scope_entry) + param=dynmap_var), cfg, state_id, scope_entry) for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, - e.dst.in_connectors[e.dst_conn]), sdfg, state_id, scope_entry) + e.dst.in_connectors[e.dst_conn]), cfg, state_id, scope_entry) if dynmap_step != 1: callsite_stream.write( - f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', sdfg, + f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', cfg, state_id, scope_entry) elif scope_map.schedule == 
dtypes.ScheduleType.GPU_Device: @@ -2226,7 +2253,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, for e in dace.sdfg.dynamic_map_inputs(dfg, scope_entry): callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, - e.dst.in_connectors[e.dst_conn]), sdfg, state_id, + e.dst.in_connectors[e.dst_conn]), cfg, state_id, scope_entry) # variables that need to be declared + the value they need to be initialized with @@ -2311,7 +2338,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, cond=condition, stride=stride, pers=is_persistent, - ), sdfg, state_id, node) + ), cfg, state_id, node) else: # will only be entered once varname, expr = declarations.pop(0) @@ -2319,7 +2346,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, '{{'.format( varname=varname, expr=expr, - ), sdfg, state_id, node) + ), cfg, state_id, node) else: # Device map in Device map brange = subsets.Range(scope_map.range[::-1]) kdims = brange.size() @@ -2343,7 +2370,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, block_expr = f'(blockIdx.{idx} * {_topy(relevant_block_dims[i])} + threadIdx.{idx})' expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) - callsite_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, scope_entry) + callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') # Generate conditions for this subgrid's execution using min and max @@ -2375,23 +2402,23 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # Emit condition in code if len(condition) > 0: self._kernel_grid_conditions.append(f'if ({condition}) {{') - callsite_stream.write('if (%s) {' % condition, sdfg, state_id, scope_entry) + callsite_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: 
self._kernel_grid_conditions.append('{') - callsite_stream.write('{', sdfg, state_id, scope_entry) + callsite_stream.write('{', cfg, state_id, scope_entry) else: for dim in range(len(scope_map.range)): - callsite_stream.write('{', sdfg, state_id, scope_entry) + callsite_stream.write('{', cfg, state_id, scope_entry) # Emit internal array allocation (deallocation handled at MapExit) - self._frame.allocate_arrays_in_scope(sdfg, scope_entry, function_stream, callsite_stream) + self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) # Generate all index arguments for block if scope_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: if self._scope_has_collaborative_copy: # Emit post-copy synchronization - callsite_stream.write('__syncthreads();', sdfg, state_id, scope_entry) + callsite_stream.write('__syncthreads();', cfg, state_id, scope_entry) # Reset thread-block-level information self._scope_has_collaborative_copy = False @@ -2412,7 +2439,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, block_expr = 'threadIdx.%s' % _named_idx(i) expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) - callsite_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, scope_entry) + callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') # Delinearize beyond the third dimension @@ -2426,7 +2453,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, ) expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) - callsite_stream.write('int %s = %s;' % (varname, expr), sdfg, state_id, scope_entry) + callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') # Generate conditions for this block's execution using min and max @@ -2457,9 +2484,9 @@ def generate_devicelevel_scope(self, 
sdfg, dfg_scope, state_id, function_stream, # Emit condition in code if len(condition) > 0: - callsite_stream.write('if (%s) {' % condition, sdfg, state_id, scope_entry) + callsite_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) else: - callsite_stream.write('{', sdfg, state_id, scope_entry) + callsite_stream.write('{', cfg, state_id, scope_entry) ########################################################## @@ -2471,7 +2498,7 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, subgraphs = dace.sdfg.concurrent_subgraphs(dfg_scope) for subdfg in subgraphs: components = dace.sdfg.utils.separate_maps( - sdfg.nodes()[state_id], + cfg.state(state_id), subdfg, dtypes.ScheduleType.GPU_ThreadBlock_Dynamic, ) @@ -2481,9 +2508,10 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream.write( 'if ({} < {}) {{'.format(scope_map.params[0], _topy(subsets.Range(scope_map.range[::-1]).max_element()[0] + 1)), - sdfg, state_id, scope_entry) + cfg, state_id, scope_entry) self._dispatcher.dispatch_subgraph(sdfg, + cfg, c, state_id, function_stream, @@ -2495,11 +2523,12 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, # exit node gets lost in the process, thus needs to be # dispatched manually - self._dispatcher.dispatch_node(sdfg, dfg_scope, state_id, scope_exit, function_stream, callsite_stream) + self._dispatcher.dispatch_node(sdfg, cfg, dfg_scope, state_id, scope_exit, function_stream, callsite_stream) else: # Generate contents normally self._dispatcher.dispatch_subgraph(sdfg, + cfg, dfg_scope, state_id, function_stream, @@ -2512,31 +2541,32 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream, if (len(next_scopes) > 0 or parent_scope.schedule == dtypes.ScheduleType.Sequential): # Thread-block synchronization if scope_entry.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: - callsite_stream.write('__syncthreads();', sdfg, 
state_id, scope_entry) + callsite_stream.write('__syncthreads();', cfg, state_id, scope_entry) # Grid synchronization (kernel fusion) elif scope_entry.map.schedule == dtypes.ScheduleType.GPU_Device \ and self._kernel_map.schedule == dtypes.ScheduleType.GPU_Device: # Escape grid conditions for _ in self._kernel_grid_conditions: - callsite_stream.write('}', sdfg, state_id, scope_entry) + callsite_stream.write('}', cfg, state_id, scope_entry) # Synchronize entire grid - callsite_stream.write('__gbar.Sync();', sdfg, state_id, scope_entry) + callsite_stream.write('__gbar.Sync();', cfg, state_id, scope_entry) # Rewrite grid conditions for cond in self._kernel_grid_conditions: - callsite_stream.write(cond, sdfg, state_id, scope_entry) + callsite_stream.write(cond, cfg, state_id, scope_entry) - def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if self.node_dispatch_predicate(sdfg, dfg, node): # Dynamically obtain node generator according to class name gen = getattr(self, '_generate_' + type(node).__name__, False) if gen is not False: # Not every node type has a code generator here - gen(sdfg, dfg, state_id, node, function_stream, callsite_stream) + gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return if not CUDACodeGen._in_device_code: - self._cpu_codegen.generate_node(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) return if isinstance(node, nodes.ExitNode): @@ -2545,22 +2575,23 @@ def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_str if CUDACodeGen._in_device_code and isinstance(node, nodes.MapExit): return # skip - self._cpu_codegen.generate_node(sdfg, dfg, state_id, node, 
function_stream, callsite_stream) + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, sdfg_label): + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label): return 'DACE_DFI ' + self._cpu_codegen.generate_nsdfg_header( - sdfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) + sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) - def generate_nsdfg_call(self, sdfg, state, node, memlet_references, sdfg_label): + def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label): return self._cpu_codegen.generate_nsdfg_call(sdfg, + cfg, state, node, memlet_references, sdfg_label, state_struct=False) - def generate_nsdfg_arguments(self, sdfg, dfg, state, node): - result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, dfg, state, node) + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): + result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) if self.create_grid_barrier: result.append(('cub::GridBarrier&', '__gbar', '__gbar')) @@ -2572,18 +2603,21 @@ def generate_nsdfg_arguments(self, sdfg, dfg, state, node): return result - def _generate_NestedSDFG(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.NestedSDFG, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: old_schedule = self._toplevel_schedule self._toplevel_schedule = node.schedule old_codegen = self._cpu_codegen.calling_codegen self._cpu_codegen.calling_codegen = self - self._cpu_codegen._generate_NestedSDFG(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, 
callsite_stream) self._cpu_codegen.calling_codegen = old_codegen self._toplevel_schedule = old_schedule - def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if node.map.schedule == dtypes.ScheduleType.GPU_Device: # Remove grid invocation conditions for i in range(len(node.map.params)): @@ -2593,16 +2627,16 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite elif node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: # Close block invocation conditions for i in range(len(node.map.params)): - callsite_stream.write('}', sdfg, state_id, node) + callsite_stream.write('}', cfg, state_id, node) elif node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic: # Close lambda function - callsite_stream.write('});', sdfg, state_id, node) + callsite_stream.write('});', cfg, state_id, node) # Close block invocation - callsite_stream.write('}', sdfg, state_id, node) + callsite_stream.write('}', cfg, state_id, node) return - self._cpu_codegen._generate_MapExit(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen._generate_MapExit(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) def _get_thread_id(self) -> str: result = 'threadIdx.x' @@ -2652,8 +2686,8 @@ def _generate_condition_from_location(self, name: str, index_expr: str, node: no return 1 - def _generate_Tasklet(self, sdfg: SDFG, dfg, state_id: int, node: nodes.Tasklet, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: generated_preamble_scopes = 0 if self._in_device_code: # If location dictionary 
prescribes that the code should run on a certain group of threads/blocks, @@ -2668,13 +2702,13 @@ def _generate_Tasklet(self, sdfg: SDFG, dfg, state_id: int, node: nodes.Tasklet, # Call standard tasklet generation old_codegen = self._cpu_codegen.calling_codegen self._cpu_codegen.calling_codegen = self - self._cpu_codegen._generate_Tasklet(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) self._cpu_codegen.calling_codegen = old_codegen if generated_preamble_scopes > 0: # Generate appropriate postamble for i in range(generated_preamble_scopes): - callsite_stream.write('}', sdfg, state_id, node) + callsite_stream.write('}', cfg, state_id, node) def make_ptr_vector_cast(self, *args, **kwargs): return cpp.make_ptr_vector_cast(*args, **kwargs) diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index fb85bdb464..29150a5ed6 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -2,36 +2,35 @@ from six import StringIO import collections -import enum -import functools import itertools import re import warnings -import sympy as sp import numpy as np -from typing import Dict, Iterable, List, Set, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union import copy import dace from dace.codegen.targets import cpp -from dace import subsets, data as dt, dtypes, memlet, sdfg as sd, symbolic +from dace import subsets, data as dt, dtypes, memlet, symbolic from dace.config import Config -from dace.frontend import operations from dace.sdfg import SDFG, nodes, utils, dynamic_map_inputs -from dace.sdfg import ScopeSubgraphView, find_input_arraynode, find_output_arraynode +from dace.sdfg import ScopeSubgraphView +from dace.sdfg.graph import MultiConnectorEdge from dace.codegen import exceptions as cgx -from dace.codegen.codeobject import CodeObject from dace.codegen.dispatcher import DefinedType 
from dace.codegen.prettycode import CodeIOStream from dace.codegen.common import update_persistent_desc -from dace.codegen.targets.target import (TargetCodeGenerator, IllegalCopy, make_absolute) +from dace.codegen.targets.target import TargetCodeGenerator from dace.codegen import cppunparse -from dace.properties import Property, make_properties, indirect_properties -from dace.sdfg.state import SDFGState +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView from dace.sdfg.utils import is_fpga_kernel from dace.symbolic import evaluate from collections import defaultdict +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + from dace.codegen.targets.cpu import CPUCodeGen + _CPU_STORAGE_TYPES = {dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.CPU_Pinned} _FPGA_STORAGE_TYPES = { dtypes.StorageType.FPGA_Global, dtypes.StorageType.FPGA_Local, dtypes.StorageType.FPGA_Registers, @@ -325,12 +324,12 @@ class FPGACodeGen(TargetCodeGenerator): title = None language = None - def __init__(self, frame_codegen, sdfg: SDFG): + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # The inheriting class must set target_name, title and language. self._in_device_code = False - self._cpu_codegen = None + self._cpu_codegen: Optional['CPUCodeGen'] = None self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher self._kernel_count = 0 @@ -515,8 +514,8 @@ def _kernels_subgraphs(self, graph: Union[dace.sdfg.SDFGState, ScopeSubgraphView del kernels_graph return subgraph_views - def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def generate_state(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Generate an FPGA State, possibly comprising multiple Kernels and/or PEs. 
@@ -527,7 +526,7 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream :param callsite_stream: CPU code stream, contains the actual code (for creating global buffers, invoking device host functions, and so on). """ - state_id = sdfg.node_id(state) + state_id = state.block_id if not self._in_device_code: # Avoid import loop @@ -610,15 +609,15 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream continue if (data.storage == dtypes.StorageType.FPGA_Global and not isinstance(data, dt.View)): allocated.add(node.data) - self._dispatcher.dispatch_allocate(sdfg, kern, state_id, node, data, function_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, kern, state_id, node, data, function_stream, callsite_stream) # Create a unique kernel name to avoid name clashes # If this kernels comes from a Nested SDFG, use that name also if sdfg.parent_nsdfg_node is not None: - kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{sdfg.cfg_id}" + kernel_name = f"{sdfg.parent_nsdfg_node.label}_{state.label}_{kern_id}_{cfg.cfg_id}" else: - kernel_name = f"{state.label}_{kern_id}_{sdfg.cfg_id}" + kernel_name = f"{state.label}_{kern_id}_{cfg.cfg_id}" # Vitis HLS removes double underscores, which leads to a compilation # error down the road due to kernel name mismatch. Remove them here @@ -634,7 +633,7 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream self._num_kernels += 1 # Generate kernel code - self.generate_kernel(sdfg, state, kernel_name, single_sgs, function_stream, callsite_stream, + self.generate_kernel(sdfg, cfg, state, kernel_name, single_sgs, function_stream, callsite_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, state_parameters, kern_id) @@ -645,7 +644,7 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream # TODO should be able to generate multiple 'pumps'. e.g. 
pump b and d in # a > b > c > d > e # Currently, it only works if the subgraphs are directly chained - self.generate_kernel(sdfg, state, f'{kernel_name}_pumped', multi_sgs, func_stream, call_stream, + self.generate_kernel(sdfg, cfg, state, f'{kernel_name}_pumped', multi_sgs, func_stream, call_stream, state_host_header_stream, state_host_body_stream, ignore, state_parameters, 42) kernel_args_call_host = [] @@ -676,7 +675,7 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream ## Generate the global function here kernel_host_stream = CodeIOStream() - host_function_name = f"__dace_runstate_{sdfg.cfg_id}_{state.name}_{state_id}" + host_function_name = f"__dace_runstate_{cfg.cfg_id}_{state.name}_{state_id}" function_stream.write("\n\nDACE_EXPORTED void {}({});\n\n".format(host_function_name, ", ".join(kernel_args_opencl))) @@ -749,9 +748,10 @@ def generate_state(self, sdfg: dace.SDFG, state: dace.SDFGState, function_stream raise cgx.CodegenError("Cannot allocate global memory from device code.") allocated.add(node.data) # Allocate transients - self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, data, function_stream, callsite_stream) + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, data, function_stream, + callsite_stream) - self.generate_nested_state(sdfg, state, state.label, subgraphs, function_stream, callsite_stream) + self.generate_nested_state(sdfg, cfg, state, state.label, subgraphs, function_stream, callsite_stream) @staticmethod def shared_data(subgraphs): @@ -1097,32 +1097,40 @@ def make_parameters(self, sdfg: SDFG, state: SDFGState, subgraphs): return (global_data_parameters, top_level_local_data, subgraph_parameters, nested_global_transients, bank_assignments, external_streams) - def generate_nested_state(self, sdfg, state, nest_name, subgraphs, function_stream, callsite_stream): + def generate_nested_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, nest_name: str, + 
subgraphs: List[ScopeSubgraphView], function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: for sg in subgraphs: self._dispatcher.dispatch_subgraph(sdfg, + cfg, sg, sdfg.node_id(state), function_stream, callsite_stream, skip_entry_node=False) - def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if not self._in_device_code: # If we're not already generating kernel code, fail raise cgx.CodegenError('FPGA kernel needs to be generated inside a device state.') - self.generate_node(sdfg, dfg_scope, state_id, dfg_scope.source_nodes()[0], function_stream, callsite_stream) + self.generate_node(sdfg, cfg, dfg_scope, state_id, dfg_scope.source_nodes()[0], function_stream, + callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, + cfg, dfg_scope, state_id, function_stream, callsite_stream, skip_entry_node=True) - def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream): + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: fsymbols = self._frame.symbols_and_constants(sdfg) if not utils.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): @@ -1163,10 +1171,11 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de else: raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - declaration_stream.write(result_decl.getvalue(), sdfg, state_id, node) + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) - def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, - allocation_stream): + def allocate_array(self, sdfg: SDFG, cfg: 
ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: # NOTE: The code below fixes symbol-related issues with transient data originally defined in a NestedSDFG scope # but promoted to be persistent. These data must have their free symbols replaced with the corresponding @@ -1191,9 +1200,10 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d declared = self._dispatcher.declared_arrays.has(dataname) if isinstance(nodedesc, dt.View): - return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) + return self.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, + allocation_stream) elif isinstance(nodedesc, dt.Reference): - return self.allocate_reference(sdfg, dfg, state_id, node, function_stream, declaration_stream, + return self.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) elif isinstance(nodedesc, dt.Stream): @@ -1314,10 +1324,12 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d else: raise TypeError("Unhandled data type: {}".format(type(nodedesc).__name__)) - declaration_stream.write(result_decl.getvalue(), sdfg, state_id, node) - allocation_stream.write(result_alloc.getvalue(), sdfg, state_id, node) + declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + allocation_stream.write(result_alloc.getvalue(), cfg, state_id, node) - def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream): + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: pass # Handled by destructor def 
partition_kernels(self, state: dace.SDFGState, default_kernel: int = 0): @@ -1332,7 +1344,7 @@ def partition_kernels(self, state: dace.SDFGState, default_kernel: int = 0): """ concurrent_kernels = 0 # Max number of kernels - sdfg = state.parent + sdfg = state.sdfg def increment(kernel_id): if concurrent_kernels > 0: @@ -1453,7 +1465,7 @@ def increment(kernel_id): return max_kernels, dependencies def _trace_back_edge(self, - edge: dace.sdfg.sdfg.Edge, + edge: MultiConnectorEdge[dace.Memlet], state: dace.SDFGState, look_for_kernel_id: bool = False) -> Union[bool, int]: """ @@ -1497,7 +1509,7 @@ def _trace_back_edge(self, src_repr = utils.unique_node_repr(state, curedge.src) return self._node_to_kernel[src_repr] if src_repr in self._node_to_kernel else None - def _trace_forward_edge(self, edge: dace.sdfg.sdfg.Edge, state: dace.SDFGState) -> Tuple[bool, int]: + def _trace_forward_edge(self, edge: MultiConnectorEdge[dace.Memlet], state: dace.SDFGState) -> Tuple[bool, int]: """ Given an edge, this traverses the edges forward. 
It can be used either for: @@ -1530,8 +1542,10 @@ def _trace_forward_edge(self, edge: dace.sdfg.sdfg.Edge, state: dace.SDFGState) kernel_id = self._node_to_kernel[dst_repr] if dst_repr in self._node_to_kernel else None return contains_only_global_buffers, kernel_id - def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, edge, dfg, - function_stream, callsite_stream): + def _emit_copy(self, sdfg: SDFG, cfg: ControlFlowRegion, state_id: int, src_node: nodes.Node, + src_storage: dtypes.StorageType, dst_node: nodes.Node, dst_storage: dtypes.StorageType, + dst_schedule: dtypes.ScheduleType, edge: MultiConnectorEdge[memlet.Memlet], dfg: StateSubgraphView, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: u, v, memlet = edge.src, edge.dst, edge.data @@ -1647,7 +1661,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag sdfg, dst_subset, decouple_array_interfaces=self._decouple_array_interfaces), - (offset_dst if not outgoing_memlet else 0), copysize, ptr_str), sdfg, state_id, + (offset_dst if not outgoing_memlet else 0), copysize, ptr_str), cfg, state_id, [src_node, dst_node]) elif device_to_host: @@ -1668,7 +1682,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag sdfg, src_subset, decouple_array_interfaces=self._decouple_array_interfaces), - (offset_src if outgoing_memlet else 0), copysize, ptr_str), sdfg, state_id, + (offset_src if outgoing_memlet else 0), copysize, ptr_str), cfg, state_id, [src_node, dst_node]) elif device_to_device: @@ -1686,7 +1700,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag sdfg, dst_subset, decouple_array_interfaces=self._decouple_array_interfaces), - (offset_dst if not outgoing_memlet else 0)), sdfg, state_id, [src_node, dst_node]) + (offset_dst if not outgoing_memlet else 0)), cfg, state_id, [src_node, dst_node]) # Reject copying to/from local memory from/to outside the FPGA 
elif (data_to_data and @@ -1761,37 +1775,37 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag if has_pipelined_loops: # Language-specific - self.generate_pipeline_loop_pre(callsite_stream, sdfg, state_id, dst_node) + self.generate_pipeline_loop_pre(callsite_stream, sdfg, cfg, state_id, dst_node) if len(copy_shape) > 1: # Language-specific - self.generate_flatten_loop_pre(callsite_stream, sdfg, state_id, dst_node) + self.generate_flatten_loop_pre(callsite_stream, sdfg, cfg, state_id, dst_node) for node in dependency_pragma_nodes: # Inject dependence pragmas - self.generate_no_dependence_pre(callsite_stream, sdfg, state_id, dst_node, node.data) + self.generate_no_dependence_pre(callsite_stream, sdfg, cfg, state_id, dst_node, node.data) # Loop intro for i, copy_dim in enumerate(copy_shape): if copy_dim != 1: if register_to_register: # Language-specific - self.generate_unroll_loop_pre(callsite_stream, None, sdfg, state_id, dst_node) + self.generate_unroll_loop_pre(callsite_stream, None, sdfg, cfg, state_id, dst_node) callsite_stream.write( "for (int __dace_copy{} = 0; __dace_copy{} < {}; " - "++__dace_copy{}) {{".format(i, i, cpp.sym2cpp(copy_dim), i), sdfg, state_id, dst_node) + "++__dace_copy{}) {{".format(i, i, cpp.sym2cpp(copy_dim), i), cfg, state_id, dst_node) if register_to_register: # Language-specific - self.generate_unroll_loop_post(callsite_stream, None, sdfg, state_id, dst_node) + self.generate_unroll_loop_post(callsite_stream, None, sdfg, cfg, state_id, dst_node) # Pragmas if has_pipelined_loops: # Language-specific - self.generate_pipeline_loop_post(callsite_stream, sdfg, state_id, dst_node) - self.generate_flatten_loop_post(callsite_stream, sdfg, state_id, dst_node) + self.generate_pipeline_loop_post(callsite_stream, sdfg, cfg, state_id, dst_node) + self.generate_flatten_loop_post(callsite_stream, sdfg, cfg, state_id, dst_node) # Inject dependence pragmas for node in dependency_pragma_nodes: - 
self.generate_no_dependence_post(callsite_stream, sdfg, state_id, dst_node, node.data) + self.generate_no_dependence_post(callsite_stream, sdfg, cfg, state_id, dst_node, node.data) src_name = cpp.ptr(src_node.data, src_node.desc(sdfg), sdfg, self._frame) dst_name = cpp.ptr(dst_node.data, dst_node.desc(sdfg), sdfg, self._frame) @@ -1833,7 +1847,7 @@ def _emit_copy(self, sdfg, state_id, src_node, src_storage, dst_node, dst_storag else: - self.generate_memlet_definition(sdfg, dfg, state_id, src_node, dst_node, edge, callsite_stream) + self.generate_memlet_definition(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, callsite_stream) @staticmethod def make_opencl_parameter(name, desc): @@ -1852,7 +1866,8 @@ def get_next_scope_entries(self, sdfg, dfg, scope_entry): return all_scopes[all_scopes.index(scope_entry) + 1:] - def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: method_name = "_generate_" + type(node).__name__ # Fake inheritance... 
use this class' method if it exists, # otherwise fall back on CPU codegen @@ -1865,17 +1880,19 @@ def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_str "Ignoring.".format(node.schedule, type(node).__name__)) - getattr(self, method_name)(sdfg, dfg, state_id, node, function_stream, callsite_stream) + getattr(self, method_name)(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) else: old_codegen = self._cpu_codegen.calling_codegen self._cpu_codegen.calling_codegen = self - self._cpu_codegen.generate_node(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) self._cpu_codegen.calling_codegen = old_codegen - def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream): - + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.CodeNode, nodes.AccessNode], + dst_node: Union[nodes.CodeNode, nodes.AccessNode], edge: MultiConnectorEdge[memlet.Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: if isinstance(src_node, dace.sdfg.nodes.CodeNode): src_storage = dtypes.StorageType.Register try: @@ -1896,7 +1913,7 @@ def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, edge, function_st except KeyError: dst_parent = None dst_schedule = None if dst_parent is None else dst_parent.map.schedule - state_dfg = sdfg.nodes()[state_id] + state_dfg = cfg.state(state_id) # Check if this is a copy memlet using at least one multibank array edge_list = [] @@ -1934,8 +1951,8 @@ def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, edge, function_st # Emit actual copy for current_edge in edge_list: - self._emit_copy(sdfg, state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, current_edge, - state_dfg, function_stream, callsite_stream) + self._emit_copy(sdfg, cfg, state_id, 
src_node, src_storage, dst_node, dst_storage, dst_schedule, + current_edge, state_dfg, function_stream, callsite_stream) def _generate_PipelineEntry(self, *args, **kwargs): self._generate_MapEntry(*args, **kwargs) @@ -1973,8 +1990,8 @@ def _is_degenerate(begin, end, skip, sdfg): except TypeError: # Cannot statically evaluate expression return False, begin - def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): - + def _generate_MapEntry(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.MapEntry, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: result = callsite_stream scope_dict = dfg.scope_dict() @@ -1987,15 +2004,15 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit else: # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) - callsite_stream.write('{', sdfg, state_id, node) + callsite_stream.write('{', cfg, state_id, node) # Define dynamic loop bounds variables (dynamic input memlets to # the MapEntry node) - for e in dynamic_map_inputs(sdfg.node(state_id), node): + for e in dynamic_map_inputs(cfg.state(state_id), node): if e.data.data != e.dst_conn: callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, - e.dst.in_connectors[e.dst_conn]), sdfg, state_id, node) + e.dst.in_connectors[e.dst_conn]), cfg, state_id, node) # Pipeline innermost loops scope_children = dfg.scope_children() @@ -2024,7 +2041,7 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit # that is read/written inside this map, if there are no WCR. If there are no WCR at all, we can add # a more generic pragma to ignore all loop-carried dependencies. 
map_exit_node = dfg.exit_node(node) - state = sdfg.nodes()[state_id] + state = cfg.state(state_id) candidates_in = set() candidates_out = set() is_there_a_wcr = False @@ -2058,19 +2075,19 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit # Add pragmas if not fully_degenerate and not is_degenerate[i]: if node.map.unroll: - self.generate_unroll_loop_pre(result, None, sdfg, state_id, node) + self.generate_unroll_loop_pre(result, None, sdfg, cfg, state_id, node) elif is_innermost: - self.generate_pipeline_loop_pre(result, sdfg, state_id, node) + self.generate_pipeline_loop_pre(result, sdfg, cfg, state_id, node) # Do not put pragma if this is degenerate (loop does not exist) - self.generate_flatten_loop_pre(result, sdfg, state_id, node) + self.generate_flatten_loop_pre(result, sdfg, cfg, state_id, node) if not node.map.unroll: if len(in_out_data) > 0 and is_there_a_wcr == False: # add pragma to ignore all loop carried dependencies - self.generate_no_dependence_pre(result, sdfg, state_id, node) + self.generate_no_dependence_pre(result, sdfg, cfg, state_id, node) else: # add specific pragmas for candidate in in_out_data: - self.generate_no_dependence_pre(result, sdfg, state_id, node, candidate) + self.generate_no_dependence_pre(result, sdfg, cfg, state_id, node, candidate) var = node.map.params[i] begin, end, skip = r @@ -2119,11 +2136,11 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit result.write( "for ({} {} = {}; {} < {}; {} += {}) {{\n".format(loop_var_type, var, cpp.sym2cpp(begin), var, cpp.sym2cpp(end + 1), var, - cpp.sym2cpp(skip)), sdfg, state_id, node) + cpp.sym2cpp(skip)), cfg, state_id, node) #Add unroll pragma if not fully_degenerate and not is_degenerate[i] and node.map.unroll: - self.generate_unroll_loop_post(result, None, sdfg, state_id, node) + self.generate_unroll_loop_post(result, None, sdfg, cfg, state_id, node) else: pipeline = node.pipeline @@ -2133,11 +2150,11 @@ def 
_generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit if len(in_out_data) > 0: if is_there_a_wcr == False: # add pragma to ignore all loop carried dependencies - self.generate_no_dependence_pre(result, sdfg, state_id, node) + self.generate_no_dependence_pre(result, sdfg, cfg, state_id, node) else: # add specific pragmas for candidate in in_out_data: - self.generate_no_dependence_pre(result, sdfg, state_id, node, candidate) + self.generate_no_dependence_pre(result, sdfg, cfg, state_id, node, candidate) result.write("for (long {it} = 0; {it} < {bound}; ++{it}) {{\n".format( it=flat_it, bound=node.pipeline.loop_bound_str())) if pipeline.init_size != 0: @@ -2152,15 +2169,15 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit if not fully_degenerate: if not node.map.unroll: if is_innermost: - self.generate_pipeline_loop_post(result, sdfg, state_id, node) - self.generate_flatten_loop_post(result, sdfg, state_id, node) + self.generate_pipeline_loop_post(result, sdfg, cfg, state_id, node) + self.generate_flatten_loop_post(result, sdfg, cfg, state_id, node) # add pragmas for data read/written inside this map, but only for local arrays for candidate in in_out_data: if sdfg.arrays[candidate].storage != dtypes.StorageType.FPGA_Global: - self.generate_no_dependence_post(result, sdfg, state_id, node, candidate) + self.generate_no_dependence_post(result, sdfg, cfg, state_id, node, candidate) # Emit internal transient array allocation - to_allocate = dace.sdfg.local_transients(sdfg, sdfg.node(state_id), node) + to_allocate = dace.sdfg.local_transients(sdfg, cfg.state(state_id), node) allocated = set() for child in dfg.scope_children()[node]: if not isinstance(child, dace.sdfg.nodes.AccessNode): @@ -2168,12 +2185,13 @@ def _generate_MapEntry(self, sdfg, dfg, state_id, node, function_stream, callsit if child.data not in to_allocate or child.data in allocated: continue allocated.add(child.data) - 
self._dispatcher.dispatch_allocate(sdfg, dfg, state_id, child, child.desc(sdfg), None, result) + self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, child, child.desc(sdfg), None, result) def _generate_PipelineExit(self, *args, **kwargs): self._generate_MapExit(*args, **kwargs) - def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: scope_dict = dfg.scope_dict() entry_node = scope_dict[node] if entry_node.map in self._unrolled_pes: @@ -2206,10 +2224,11 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite callsite_stream.write("}\n") callsite_stream.write("}\n}\n") else: - self._cpu_codegen._generate_MapExit(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen._generate_MapExit(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) def generate_kernel(self, sdfg: dace.SDFG, + cfg: ControlFlowRegion, state: dace.SDFGState, kernel_name: str, subgraphs: list, @@ -2260,7 +2279,7 @@ def get_kernel_name(val): predecessors.append(get_kernel_name(pred)) # Actual kernel code generation - self.generate_kernel_internal(sdfg, state, kernel_name, predecessors, subgraphs, kernel_stream, + self.generate_kernel_internal(sdfg, cfg, state, kernel_name, predecessors, subgraphs, kernel_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream, function_stream, callsite_stream, state_parameters) self._kernel_count = self._kernel_count + 1 @@ -2306,18 +2325,20 @@ def _module_name(self, subgraph, state): raise RuntimeError("Expected at least one tasklet or data node.") return "_".join(labels) - def generate_modules(self, sdfg, state, kernel_name, subgraphs, subgraph_parameters, module_stream, entry_stream, + def generate_modules(self, sdfg: SDFG, cfg: 
ControlFlowRegion, state: SDFGState, kernel_name: str, + subgraphs, subgraph_parameters, module_stream, entry_stream, host_stream, instrumentation_stream): """ Generate all PEs inside an FPGA Kernel. """ for subgraph in subgraphs: module_name = self._module_name(subgraph, state) - self.generate_module(sdfg, state, kernel_name, module_name, subgraph, subgraph_parameters[subgraph], + self.generate_module(sdfg, cfg, state, kernel_name, module_name, subgraph, subgraph_parameters[subgraph], module_stream, entry_stream, host_stream, instrumentation_stream) - def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, sdfg_label): + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label): return self._cpu_codegen.generate_nsdfg_header(sdfg, + cfg, state, state_id, node, @@ -2325,18 +2346,19 @@ def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) - def generate_nsdfg_call(self, sdfg, state, node, memlet_references, sdfg_label): + def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label): return self._cpu_codegen.generate_nsdfg_call(sdfg, + cfg, state, node, memlet_references, sdfg_label, state_struct=False) - def generate_nsdfg_arguments(self, sdfg, dfg, state, node): - return self._cpu_codegen.generate_nsdfg_arguments(sdfg, state, dfg, node) + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): + return self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, state, dfg, node) - def generate_host_function_boilerplate(self, sdfg, state, nested_global_transients, host_code_stream): + def generate_host_function_boilerplate(self, sdfg, cfg, state, nested_global_transients, host_code_stream): """ Generates global transients that must be passed to the state (required by a kernel) """ @@ -2344,14 +2366,17 @@ def generate_host_function_boilerplate(self, sdfg, state, nested_global_transien # Any extra transients stored in 
global memory on the FPGA must now be # allocated and passed to the kernel for arr_node in nested_global_transients: - self._dispatcher.dispatch_allocate(sdfg, state, None, arr_node, arr_node.desc(sdfg), None, host_code_stream) + self._dispatcher.dispatch_allocate(sdfg, cfg, state, None, arr_node, arr_node.desc(sdfg), None, + host_code_stream) def _generate_Tasklet(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen._generate_Tasklet(*args, codegen=self, **kwargs) - def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream): - self._dispatcher.dispatch_copy(src_node, dst_node, edge, sdfg, state_dfg, state_id, function_stream, + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[memlet.Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self._dispatcher.dispatch_copy(src_node, dst_node, edge, sdfg, cfg, state_dfg, state_id, function_stream, callsite_stream) def process_out_memlets(self, *args, **kwargs): @@ -2362,8 +2387,9 @@ def generate_tasklet_preamble(self, *args, **kwargs): # Fall back on CPU implementation self._cpu_codegen.generate_tasklet_preamble(*args, **kwargs) - def generate_tasklet_postamble(self, sdfg, dfg, state_id, node, function_stream, before_memlets_stream, - after_memlets_stream): + def generate_tasklet_postamble(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, + before_memlets_stream: CodeIOStream, after_memlets_stream: CodeIOStream) -> None: # Inject dependency pragmas on memlets for edge in dfg.out_edges(node): dataname = edge.data.data @@ -2379,7 +2405,8 @@ def generate_tasklet_postamble(self, sdfg, dfg, state_id, node, function_stream, else: accessed_subset = 0 - 
self.generate_no_dependence_post(after_memlets_stream, sdfg, state_id, node, dataname, accessed_subset) + self.generate_no_dependence_post(after_memlets_stream, sdfg, cfg, state_id, node, dataname, + accessed_subset) def make_ptr_vector_cast(self, *args, **kwargs): return cpp.make_ptr_vector_cast(*args, **kwargs) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index d1e540c39e..5b756b413c 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -1,7 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import collections import copy -import functools import re from typing import Any, DefaultDict, Dict, List, Optional, Set, Tuple, Union @@ -14,12 +13,13 @@ from dace.codegen import control_flow as cflow from dace.codegen import dispatcher as disp from dace.codegen.prettycode import CodeIOStream -from dace.codegen.common import codeblock_to_cpp, sym2cpp, unparse_interstate_edge +from dace.codegen.common import codeblock_to_cpp, sym2cpp from dace.codegen.targets.target import TargetCodeGenerator -from dace.frontend.python import wrappers -from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes +from dace.sdfg import SDFG, SDFGState, nodes from dace.sdfg import scope as sdscope from dace.sdfg import utils +from dace.sdfg.analysis import cfg as cfg_analysis +from dace.sdfg.state import ControlFlowRegion from dace.transformation.passes.analysis import StateReachability @@ -43,7 +43,8 @@ def __init__(self, sdfg: SDFG): self.environments: List[Any] = [] self.targets: Set[TargetCodeGenerator] = set() self.to_allocate: DefaultDict[Union[SDFG, SDFGState, nodes.EntryNode], - List[Tuple[int, int, nodes.AccessNode]]] = collections.defaultdict(list) + List[Tuple[SDFG, Optional[SDFGState], Optional[nodes.AccessNode], bool, bool, + bool]]] = collections.defaultdict(list) self.where_allocated: Dict[Tuple[SDFG, str], SDFG] = {} self.fsyms: Dict[int, Set[str]] = {} 
self._symbols_and_constants: Dict[int, Set[str]] = {} @@ -131,7 +132,7 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: :param global_stream: Stream to write to (global). :param backend: Whose backend this header belongs to. """ - from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import + from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import # Hash file include if backend == 'frame': global_stream.write('#include "../../include/hash.h"\n', sdfg) @@ -154,9 +155,9 @@ def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) - + emitted = set() - + def _emit_definitions(dtype: dtypes.typeclass, wrote_something: bool) -> bool: if isinstance(dtype, dtypes.pointer): wrote_something = _emit_definitions(dtype._typeclass, wrote_something) @@ -232,7 +233,7 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre :param callsite_stream: Stream to write to (at call site). 
""" import dace.library - from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import + from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import fname = sdfg.name params = sdfg.signature(arglist=self.arglist) paramnames = sdfg.signature(False, for_call=True, arglist=self.arglist) @@ -270,10 +271,12 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( - f'DACE_EXPORTED int __dace_init_{target.target_name}({mangle_dace_state_struct_name(sdfg)} *__state{initparams_comma});\n', sdfg) + f'DACE_EXPORTED int __dace_init_{target.target_name}({mangle_dace_state_struct_name(sdfg)} *__state{initparams_comma});\n', + sdfg) if target.has_finalizer: callsite_stream.write( - f'DACE_EXPORTED int __dace_exit_{target.target_name}({mangle_dace_state_struct_name(sdfg)} *__state);\n', sdfg) + f'DACE_EXPORTED int __dace_exit_{target.target_name}({mangle_dace_state_struct_name(sdfg)} *__state);\n', + sdfg) callsite_stream.write( f""" @@ -358,8 +361,8 @@ def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeI can be ``CPU_Heap`` or any other ``dtypes.StorageType``); and (2) set the externally-allocated pointer to the generated code's internal state (``__dace_set_external_memory_``). 
""" - from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import - + from dace.codegen.targets.cpp import mangle_dace_state_struct_name # Avoid circular import + # Collect external arrays ext_arrays: Dict[dtypes.StorageType, List[Tuple[SDFG, str, data.Data]]] = collections.defaultdict(list) for subsdfg, aname, arr in sdfg.arrays_recursive(): @@ -392,22 +395,27 @@ def generate_external_memory_management(self, sdfg: SDFG, callsite_stream: CodeI f''' DACE_EXPORTED void __dace_set_external_memory_{storage.name}({mangle_dace_state_struct_name(sdfg)} *__state, char *ptr{initparams_comma}) {{''', sdfg) - + offset = 0 for subsdfg, aname, arr in arrays: allocname = f'__state->__{subsdfg.cfg_id}_{aname}' callsite_stream.write(f'{allocname} = decltype({allocname})(ptr + {sym2cpp(offset)});', subsdfg) offset += arr.total_size * arr.dtype.bytes - + # Footer callsite_stream.write('}', sdfg) - def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): - - sid = sdfg.node_id(state) + def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + global_stream: CodeIOStream, + callsite_stream: CodeIOStream, + generate_state_footer: bool = True): + sid = state.block_id # Emit internal transient array allocation - self.allocate_arrays_in_scope(sdfg, state, global_stream, callsite_stream) + self.allocate_arrays_in_scope(sdfg, cfg, state, global_stream, callsite_stream) callsite_stream.write('\n') @@ -426,14 +434,16 @@ def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_s components = dace.sdfg.concurrent_subgraphs(state) if len(components) <= 1: - self._dispatcher.dispatch_subgraph(sdfg, state, sid, global_stream, callsite_stream, skip_entry_node=False) + self._dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream, + skip_entry_node=False) else: if sdfg.openmp_sections: callsite_stream.write("#pragma omp parallel sections\n{") for c in 
components: if sdfg.openmp_sections: callsite_stream.write("#pragma omp section\n{") - self._dispatcher.dispatch_subgraph(sdfg, c, sid, global_stream, callsite_stream, skip_entry_node=False) + self._dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream, + skip_entry_node=False) if sdfg.openmp_sections: callsite_stream.write("} // End omp section") if sdfg.openmp_sections: @@ -444,28 +454,31 @@ def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_s if generate_state_footer: # Emit internal transient array deallocation - self.deallocate_arrays_in_scope(sdfg, state, global_stream, callsite_stream) + self.deallocate_arrays_in_scope(sdfg, state.parent_graph, state, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, global_stream) - def generate_states(self, sdfg, global_stream, callsite_stream): + def generate_states(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream) -> Set[SDFGState]: states_generated = set() - opbar = progress.OptionalProgressBar(sdfg.number_of_nodes(), title=f'Generating code (SDFG {sdfg.cfg_id})') + opbar = progress.OptionalProgressBar(len(sdfg.states()), title=f'Generating code (SDFG {sdfg.cfg_id})') # Create closure + function for state dispatcher def dispatch_state(state: SDFGState) -> str: stream = CodeIOStream() - self._dispatcher.dispatch_state(sdfg, state, global_stream, stream) + self._dispatcher.dispatch_state(state, global_stream, stream) opbar.next() states_generated.add(state) # For sanity check return stream.getvalue() - # Handle specialized control flow - if config.Config.get_bool('optimizer', 'detect_control_flow'): + if sdfg.root_sdfg.using_experimental_blocks: + # Use control flow blocks embedded in the SDFG to generate control flow. 
+ cft = cflow.structured_control_flow_tree_with_regions(sdfg, dispatch_state) + elif config.Config.get_bool('optimizer', 'detect_control_flow'): + # Handle specialized control flow # Avoid import loop from dace.transformation import helpers as xfh @@ -479,8 +492,8 @@ def dispatch_state(state: SDFGState) -> str: states_topological = list(sdfg.bfs_nodes(sdfg.start_state)) last = states_topological[-1] cft = cflow.GeneralBlock(dispatch_state, None, - [cflow.SingleState(dispatch_state, s, s is last) for s in states_topological], [], - [], [], [], False) + [cflow.BasicCFBlock(dispatch_state, s, s is last) for s in states_topological], + [], [], [], [], False) callsite_stream.write(cft.as_cpp(self, sdfg.symbols), sdfg) @@ -533,8 +546,7 @@ def _can_allocate(self, sdfg: SDFG, state: SDFGState, desc: data.Data, scope: Un def determine_allocation_lifetime(self, top_sdfg: SDFG): """ - Determines where (at which scope/state/SDFG) each data descriptor - will be allocated/deallocated. + Determines where (at which scope/state/SDFG) each data descriptor will be allocated/deallocated. :param top_sdfg: The top-level SDFG to determine for. 
""" @@ -553,7 +565,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): array_names = sdfg.arrays.keys( ) #set(k for k, v in sdfg.arrays.items() if v.lifetime == dtypes.AllocationLifetime.Scope) # Iterate topologically to get state-order - for state in sdfg.bfs_nodes(): + for state in cfg_analysis.blockorder_topological_sort(sdfg, ignore_nonstate_blocks=True): for node in state.data_nodes(): if node.data not in array_names: continue @@ -561,7 +573,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): # Look in the surrounding edges for usage edge_fsyms: Set[str] = set() - for e in sdfg.all_edges(state): + for e in state.parent_graph.all_edges(state): edge_fsyms |= e.data.free_symbols for edge_array in edge_fsyms & array_names: instances[edge_array].append((state, nodes.AccessNode(edge_array))) @@ -651,7 +663,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): # containing state or the SDFG (if used in more than one state) curstate: SDFGState = None multistate = False - for state in sdfg.nodes(): + for state in sdfg.states(): if any(n.data == name for n in state.data_nodes()): if curstate is not None: multistate = True @@ -671,11 +683,11 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): multistate = False # Does the array appear in inter-state edges? - for isedge in sdfg.edges(): + for isedge in sdfg.all_interstate_edges(): if name in self.free_symbols(isedge.data): multistate = True - for state in sdfg.nodes(): + for state in sdfg.states(): if multistate: break sdict = state.scope_dict() @@ -759,7 +771,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): if first_state_instance != last_state_instance: # If any state is not reachable from first state, find common denominators in the form of # dominator and postdominator. 
- instances = access_instances[sdfg.cfg_id][name] + instances: List[Tuple[SDFGState, nodes.AccessNode]] = access_instances[sdfg.cfg_id][name] # A view gets "allocated" everywhere it appears if isinstance(desc, data.View): @@ -797,34 +809,37 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): else: self.where_allocated[(sdfg, name)] = cursdfg - def allocate_arrays_in_scope(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], - function_stream: CodeIOStream, callsite_stream: CodeIOStream): + def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, + scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: """ Dispatches allocation of all arrays in the given scope. """ for tsdfg, state, node, declare, allocate, _ in self.to_allocate[scope]: if state is not None: - state_id = tsdfg.node_id(state) + state_id = state.block_id else: state_id = -1 desc = node.desc(tsdfg) - self._dispatcher.dispatch_allocate(tsdfg, state, state_id, node, desc, function_stream, callsite_stream, - declare, allocate) + self._dispatcher.dispatch_allocate(tsdfg, cfg if state is None else state.parent_graph, state, state_id, + node, desc, function_stream, callsite_stream, declare, allocate) - def deallocate_arrays_in_scope(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], - function_stream: CodeIOStream, callsite_stream: CodeIOStream): + def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, + scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, + callsite_stream: CodeIOStream): """ Dispatches deallocation of all arrays in the given scope. 
""" for tsdfg, state, node, _, _, deallocate in self.to_allocate[scope]: if not deallocate: continue if state is not None: - state_id = tsdfg.node_id(state) + state_id = state.block_id else: state_id = -1 desc = node.desc(tsdfg) - self._dispatcher.dispatch_deallocate(tsdfg, state, state_id, node, desc, function_stream, callsite_stream) + self._dispatcher.dispatch_deallocate(tsdfg, cfg, state, state_id, node, desc, function_stream, + callsite_stream) def generate_code(self, sdfg: SDFG, @@ -869,7 +884,7 @@ def generate_code(self, instr.on_sdfg_begin(sdfg, callsite_stream, global_stream, self) # Allocate outer-level transients - self.allocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) + self.allocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) # Define constants as top-level-allocated for cname, (ctype, _) in sdfg.constants_prop.items(): @@ -882,15 +897,16 @@ def generate_code(self, global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()}) interstate_symbols = {} - for e in sdfg.dfs_edges(sdfg.start_state): - symbols = e.data.new_symbols(sdfg, global_symbols) - # Inferred symbols only take precedence if global symbol not defined or None - symbols = { - k: v if (k not in global_symbols or global_symbols[k] is None) else global_symbols[k] - for k, v in symbols.items() - } - interstate_symbols.update(symbols) - global_symbols.update(symbols) + for cfr in sdfg.all_control_flow_regions(): + for e in cfr.dfs_edges(cfr.start_block): + symbols = e.data.new_symbols(sdfg, global_symbols) + # Inferred symbols only take precedence if global symbol not defined or None + symbols = { + k: v if (k not in global_symbols or global_symbols[k] is None) else global_symbols[k] + for k, v in symbols.items() + } + interstate_symbols.update(symbols) + global_symbols.update(symbols) for isvarName, isvarType in interstate_symbols.items(): if isvarType is None: @@ -916,14 +932,14 @@ def 
generate_code(self, ####################################################################### # Sanity check - if len(states_generated) != len(sdfg.nodes()): + if len(states_generated) != len(sdfg.states()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format(sdfg.label, [s.label for s in states_generated], - [s.label for s in (set(sdfg.nodes()) - states_generated)])) + [s.label for s in (set(sdfg.states()) - states_generated)])) # Deallocate transients - self.deallocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) + self.deallocate_arrays_in_scope(sdfg, sdfg, sdfg, global_stream, callsite_stream) # Now that we have all the information about dependencies, generate # header and footer @@ -982,22 +998,20 @@ def generate_code(self, return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments) -def _get_dominator_and_postdominator(sdfg: SDFG, accesses: List[Tuple[SDFGState, nodes.AccessNode]]): +def _get_dominator_and_postdominator(cfg: ControlFlowRegion, accesses: List[Tuple[SDFGState, nodes.AccessNode]]): """ Gets the closest common dominator and post-dominator for a list of states. Used for determining allocation of data used in branched states. 
""" - from dace.sdfg.analysis import cfg - # Get immediate dominators - idom = nx.immediate_dominators(sdfg.nx, sdfg.start_state) - alldoms = cfg.all_dominators(sdfg, idom) + idom = nx.immediate_dominators(cfg.nx, cfg.start_block) + alldoms = cfg_analysis.all_dominators(cfg, idom) states = [a for a, _ in accesses] data_name = accesses[0][1].data # Get immediate post-dominators - ipostdom, allpostdoms = utils.postdominators(sdfg, return_alldoms=True) + ipostdom, allpostdoms = utils.postdominators(cfg, return_alldoms=True) # All dominators and postdominators include the states themselves for state in states: diff --git a/dace/codegen/targets/intel_fpga.py b/dace/codegen/targets/intel_fpga.py index f44d84c76c..513dc0bbfc 100644 --- a/dace/codegen/targets/intel_fpga.py +++ b/dace/codegen/targets/intel_fpga.py @@ -7,14 +7,13 @@ import numpy as np import dace -from dace import registry, subsets, dtypes, symbolic +from dace import registry, dtypes, symbolic from dace.codegen import cppunparse from dace.config import Config from dace.codegen import exceptions as cgx from dace.codegen.codeobject import CodeObject from dace.codegen.dispatcher import DefinedType from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets.target import make_absolute from dace.codegen.targets import cpp, fpga from dace.codegen.common import codeblock_to_cpp from dace.codegen.tools.type_inference import infer_expr_type @@ -24,6 +23,8 @@ from dace.sdfg import nodes, utils as sdutils from dace.codegen.common import sym2cpp from dace.sdfg import SDFGState +from dace.sdfg.sdfg import SDFG +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView import dace.sdfg.utils as utils from dace.symbolic import evaluate from collections import defaultdict @@ -302,31 +303,31 @@ def make_kernel_argument(self, data, var_name, is_output, with_vectorization): return data.as_arg(with_types=True, name=var_name) @staticmethod - def generate_unroll_loop_pre(kernel_stream, factor, sdfg, 
state_id, node): + def generate_unroll_loop_pre(kernel_stream, factor, sdfg, cfg, state_id, node): if factor is not None: factor_str = " " + factor else: factor_str = "" - kernel_stream.write("#pragma unroll{}".format(factor_str), sdfg, state_id, node) + kernel_stream.write("#pragma unroll{}".format(factor_str), cfg, state_id, node) @staticmethod - def generate_unroll_loop_post(kernel_stream, factor, sdfg, state_id, node): + def generate_unroll_loop_post(kernel_stream, factor, sdfg, cfg, state_id, node): pass @staticmethod - def generate_pipeline_loop_pre(kernel_stream, sdfg, state_id, node): + def generate_pipeline_loop_pre(kernel_stream, sdfg, cfg, state_id, node): pass @staticmethod - def generate_pipeline_loop_post(kernel_stream, sdfg, state_id, node): + def generate_pipeline_loop_post(kernel_stream, sdfg, cfg, state_id, node): pass @staticmethod - def generate_flatten_loop_pre(kernel_stream, sdfg, state_id, node): + def generate_flatten_loop_pre(kernel_stream, sdfg, cfg, state_id, node): kernel_stream.write("#pragma loop_coalesce") @staticmethod - def generate_flatten_loop_post(kernel_stream, sdfg, state_id, node): + def generate_flatten_loop_post(kernel_stream, sdfg, cfg, state_id, node): pass def make_read(self, defined_type, dtype, var_name, expr, index, is_pack, packing_factor): @@ -431,24 +432,25 @@ def make_shift_register_write(self, defined_type, dtype, var_name, write_expr, i return res @staticmethod - def generate_no_dependence_pre(kernel_stream, sdfg, state_id, node, var_name=None): + def generate_no_dependence_pre(kernel_stream, sdfg, cfg, state_id, node, var_name=None): """ Adds pre-loop pragma for ignoring loop carried dependencies on a given variable (if var_name is provided) or all variables """ if var_name is None: - kernel_stream.write("#pragma ivdep", sdfg, state_id, node) + kernel_stream.write("#pragma ivdep", cfg, state_id, node) else: - kernel_stream.write("#pragma ivdep array({})".format(var_name), sdfg, state_id, node) + 
kernel_stream.write("#pragma ivdep array({})".format(var_name), cfg, state_id, node) @staticmethod - def generate_no_dependence_post(kernel_stream, sdfg, state_id, node, var_name=None, accessed_subset=None): + def generate_no_dependence_post(kernel_stream, sdfg, cfg, state_id, node, var_name=None, accessed_subset=None): pass - def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list, - subgraphs: list, kernel_stream: CodeIOStream, state_host_header_stream: CodeIOStream, - state_host_body_stream: CodeIOStream, instrumentation_stream: CodeIOStream, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, state_parameters: list): + def generate_kernel_internal(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, kernel_name: str, + predecessors: list, subgraphs: list, kernel_stream: CodeIOStream, + state_host_header_stream: CodeIOStream, state_host_body_stream: CodeIOStream, + instrumentation_stream: CodeIOStream, function_stream: CodeIOStream, + callsite_stream: CodeIOStream, state_parameters: list) -> None: """ Generates Kernel code, both device and host side. 
@@ -469,7 +471,7 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne """ # In xilinx one of them is not used because part of the code goes in another place (entry_stream) - state_id = sdfg.node_id(state) + state_id = state.block_id kernel_header_stream = CodeIOStream() kernel_body_stream = CodeIOStream() @@ -477,9 +479,9 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne #reset list of needed converters self.converters_to_generate = set() - kernel_header_stream.write("#include \n\n", sdfg) + kernel_header_stream.write("#include \n\n", cfg) self.generate_constants(sdfg, kernel_header_stream) - kernel_header_stream.write("\n", sdfg) + kernel_header_stream.write("\n", cfg) (global_data_parameters, top_level_local_data, subgraph_parameters, nested_global_transients, bank_assignments, external_streams) = self.make_parameters(sdfg, state, subgraphs) @@ -489,38 +491,38 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne # Emit allocations of inter-kernel memories for node in top_level_local_data: - self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), callsite_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, node.desc(sdfg), callsite_stream, kernel_body_stream) kernel_body_stream.write("\n") state_parameters.extend(global_data_parameters) # Generate host code (Global transients) - self.generate_host_function_boilerplate(sdfg, state, nested_global_transients, state_host_body_stream) + self.generate_host_function_boilerplate(sdfg, cfg, state, nested_global_transients, state_host_body_stream) - self.generate_host_function_prologue(sdfg, state, state_host_body_stream, kernel_name) + self.generate_host_function_prologue(sdfg, cfg, state, state_host_body_stream, kernel_name) # Generate PEs code - self.generate_modules(sdfg, state, kernel_name, subgraphs, subgraph_parameters, kernel_body_stream, + self.generate_modules(sdfg, 
cfg, state, kernel_name, subgraphs, subgraph_parameters, kernel_body_stream, state_host_header_stream, state_host_body_stream, instrumentation_stream) kernel_body_stream.write("\n") # Generate data width converters - self.generate_converters(sdfg, kernel_header_stream) + self.generate_converters(sdfg, cfg, kernel_header_stream) kernel_stream.write(kernel_header_stream.getvalue() + kernel_body_stream.getvalue()) # Generate host kernel invocation - self.generate_host_function_body(sdfg, state, state_host_body_stream, kernel_name, predecessors) + self.generate_host_function_body(sdfg, cfg, state, state_host_body_stream, kernel_name, predecessors) - def generate_host_function_prologue(self, sdfg, state, host_stream, kernel_name): + def generate_host_function_prologue(self, sdfg, cfg, state, host_stream, kernel_name): seperator = "/" * 59 host_stream.write(f"\n{seperator}\n// Kernel: {kernel_name}\n{seperator}\n\n") - host_stream.write(f"std::vector {kernel_name}_kernels;", sdfg, sdfg.node_id(state)) + host_stream.write(f"std::vector {kernel_name}_kernels;", cfg, state.block_id) - def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, host_stream: CodeIOStream, - kernel_name: str, predecessors: list): + def generate_host_function_body(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, + host_stream: CodeIOStream, kernel_name: str, predecessors: list) -> None: """ Generate the host-specific code for spawning and synchronizing the given kernel. 
@@ -530,7 +532,7 @@ def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, ho :param kernel_name: :param predecessors: list containing all the name of kernels that must be finished before starting this one """ - state_id = sdfg.node_id(state) + state_id = state.block_id # Check if this kernel depends from other kernels needs_synch = len(predecessors) > 0 @@ -552,12 +554,12 @@ def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, ho {kernel_name}_events.emplace_back(k.ExecuteTaskAsync({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''})); }} all_events.insert(all_events.end(), {kernel_name}_events.begin(), {kernel_name}_events.end()); -""", sdfg, state_id) +""", cfg, state_id) - def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, parameters, module_stream, + def generate_module(self, sdfg, cfg, state, kernel_name, module_name, subgraph, parameters, module_stream, host_header_stream, host_body_stream, instrumentation_stream): - state_id = sdfg.node_id(state) - dfg = sdfg.nodes()[state_id] + state_id = state.block_id + dfg = cfg.state(state_id) kernel_args_opencl = [] kernel_args_host = [] @@ -580,7 +582,7 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param is_autorun = len(kernel_args_opencl) == 0 # create a unique module name to prevent name clashes - module_function_name = "mod_" + str(sdfg.cfg_id) + "_" + module_name + module_function_name = "mod_" + str(cfg.cfg_id) + "_" + module_name # The official limit suggested by Intel for module name is 61. However, the compiler # can also append text to the module. 
Longest seen so far is # "_cra_slave_inst", which is 15 characters, so we restrict to @@ -614,9 +616,9 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param host_body_stream.write( "{}_kernels.emplace_back(program.MakeKernel(\"{}\"{}));".format( kernel_name, module_function_name, - ", ".join([""] + kernel_args_call) if len(kernel_args_call) > 0 else ""), sdfg, state_id) + ", ".join([""] + kernel_args_call) if len(kernel_args_call) > 0 else ""), cfg, state_id) if state.instrument == dtypes.InstrumentationType.FPGA: - self.instrument_opencl_kernel(module_function_name, state_id, sdfg.cfg_id, instrumentation_stream) + self.instrument_opencl_kernel(module_function_name, state_id, cfg.cfg_id, instrumentation_stream) else: # We will generate a separate kernel for each PE. Adds host call start, stop, skip = unrolled_loop.range.ranges[0] @@ -636,10 +638,10 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param host_body_stream.write( "{}_kernels.emplace_back(program.MakeKernel(\"{}\"{}));".format( kernel_name, unrolled_module_name, - ", ".join([""] + kernel_args_call[:-1]) if len(kernel_args_call) > 1 else ""), sdfg, + ", ".join([""] + kernel_args_call[:-1]) if len(kernel_args_call) > 1 else ""), cfg, state_id) if state.instrument == dtypes.InstrumentationType.FPGA: - self.instrument_opencl_kernel(unrolled_module_name, state_id, sdfg.cfg_id, + self.instrument_opencl_kernel(unrolled_module_name, state_id, cfg.cfg_id, instrumentation_stream) # ---------------------------------------------------------------------- @@ -657,15 +659,15 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param if unrolled_loop is None: module_body_stream.write( "{}__kernel void {}({}) {{".format(AUTORUN_STR if is_autorun else "", module_function_name, - ", ".join(kernel_args_opencl)), sdfg, state_id) + ", ".join(kernel_args_opencl)), cfg, state_id) else: # Unrolled PEs: we have to generate a kernel for each PE. 
We will generate # a function that will be used create a kernel multiple times # generate a unique name for this function - pe_function_name = "pe_" + str(sdfg.cfg_id) + "_" + module_name + "_func" + pe_function_name = "pe_" + str(cfg.cfg_id) + "_" + module_name + "_func" module_body_stream.write("inline void {}({}) {{".format(pe_function_name, ", ".join(kernel_args_opencl)), - sdfg, state_id) + cfg, state_id) # Allocate local transients data_to_allocate = (set(subgraph.top_level_transients()) - set(sdfg.shared_transients()) - @@ -677,17 +679,18 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) - self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), module_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, node.desc(sdfg), module_stream, module_body_stream) self._dispatcher.dispatch_subgraph(sdfg, + cfg, subgraph, state_id, module_stream, module_body_stream, skip_entry_node=False) - module_stream.write(module_body_stream.getvalue(), sdfg, state_id) + module_stream.write(module_body_stream.getvalue(), cfg, state_id) module_stream.write("}\n\n") if unrolled_loop is not None: @@ -725,7 +728,7 @@ def generate_module(self, sdfg, state, kernel_name, module_name, subgraph, param self._dispatcher.defined_vars.exit_scope(subgraph) - def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, sdfg_label): + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label): # Intel FPGA needs to deal with streams arguments = [f'{atype} {aname}' for atype, aname, _ in memlet_references] fsyms = node.sdfg.used_symbols(all_symbols=False, keep_defined_in_mapping=True) @@ -743,16 +746,18 @@ def generate_nsdfg_header(self, sdfg, state, state_id, node, memlet_references, desc = sdfg.arrays[edge.data.data] if isinstance(desc, dace.data.Stream): 
src_node = find_input_arraynode(state, edge) - self._dispatcher.dispatch_copy(src_node, node, edge, sdfg, state, state_id, None, nested_stream) + self._dispatcher.dispatch_copy(src_node, node, edge, sdfg, cfg, state, state_id, None, + nested_stream) for edge in state.out_edges(node): if edge.data.data is not None: # skip empty memlets desc = sdfg.arrays[edge.data.data] if isinstance(desc, dace.data.Stream): dst_node = find_output_arraynode(state, edge) - self._dispatcher.dispatch_copy(node, dst_node, edge, sdfg, state, state_id, None, nested_stream) + self._dispatcher.dispatch_copy(node, dst_node, edge, sdfg, cfg, state, state_id, None, + nested_stream) return function_header + "\n" + nested_stream.getvalue() - def generate_nsdfg_arguments(self, sdfg, dfg, state, node): + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): # Connectors that are both input and output share the same name inout = set(node.in_connectors.keys() & node.out_connectors.keys()) memlet_references = [] @@ -875,8 +880,9 @@ def generate_nsdfg_arguments(self, sdfg, dfg, state, node): memlet_references.append((typedef, p, p)) return memlet_references - def allocate_view(self, sdfg: dace.SDFG, dfg: SDFGState, state_id: int, node: dace.nodes.AccessNode, - global_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + def allocate_view(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, + node: dace.nodes.AccessNode, global_stream: CodeIOStream, declaration_stream: CodeIOStream, + allocation_stream: CodeIOStream) -> None: """ Allocates (creates pointer and refers to original) a view of an existing array, scalar, or view. 
Specifically tailored for Intel FPGA @@ -893,8 +899,8 @@ def allocate_view(self, sdfg: dace.SDFG, dfg: SDFGState, state_id: int, node: da # Allocate the viewed data before the view, if necessary mpath = dfg.memlet_path(edge) viewed_dnode = mpath[0].src if edge.dst is node else mpath[-1].dst - self._dispatcher.dispatch_allocate(sdfg, dfg, state_id, viewed_dnode, viewed_dnode.desc(sdfg), global_stream, - allocation_stream) + self._dispatcher.dispatch_allocate(sdfg, cfg, dfg, state_id, viewed_dnode, viewed_dnode.desc(sdfg), + global_stream, allocation_stream) # Emit memlet as a reference and register defined variable if nodedesc.storage == dace.dtypes.StorageType.FPGA_Global: @@ -924,9 +930,9 @@ def allocate_view(self, sdfg: dace.SDFG, dfg: SDFGState, state_id: int, node: da name, dtypes.pointer(nodedesc.dtype), ancestor=0) - declaration_stream.write(f'{qualifier}{atype} {aname} = {value};', sdfg, state_id, node) + declaration_stream.write(f'{qualifier}{atype} {aname} = {value};', cfg, state_id, node) - def generate_memlet_definition(self, sdfg, dfg, state_id, src_node, dst_node, edge, callsite_stream): + def generate_memlet_definition(self, sdfg, cfg, dfg, state_id, src_node, dst_node, edge, callsite_stream): if isinstance(edge.dst, dace.sdfg.nodes.CodeNode): # Input memlet @@ -971,7 +977,7 @@ def generate_memlet_definition(self, sdfg, dfg, state_id, src_node, dst_node, ed if isinstance(data_desc, dace.data.Stream): # Derive the name of the original stream, by tracing the memlet path through nested SDFGs outer_stream_node_trace = utils.trace_nested_access(dst_node if is_output else src_node, - sdfg.nodes()[state_id], sdfg) + cfg.state(state_id), sdfg) data_name = outer_stream_node_trace[0][0][1 if is_output else 0].label is_global = True @@ -1050,7 +1056,7 @@ def generate_memlet_definition(self, sdfg, dfg, state_id, src_node, dst_node, ed result += "{} {};".format(memlet_type, connector) else: global_node = utils.trace_nested_access(dst_node if is_output else 
src_node, - sdfg.nodes()[state_id], sdfg) + cfg.state(state_id), sdfg) data_name = global_node[0][0][1 if is_output else 0].label if outer_memlet is not None: @@ -1072,9 +1078,9 @@ def generate_memlet_definition(self, sdfg, dfg, state_id, src_node, dst_node, ed else: raise TypeError("Unknown variable type: {}".format(def_type)) - callsite_stream.write(result, sdfg, state_id, tasklet) + callsite_stream.write(result, cfg, state_id, tasklet) - def generate_channel_writes(self, sdfg, dfg, node, callsite_stream, state_id): + def generate_channel_writes(self, sdfg, cfg, dfg, node, callsite_stream, state_id): for edge in dfg.out_edges(node): connector = edge.src_conn memlet = edge.data @@ -1089,9 +1095,9 @@ def generate_channel_writes(self, sdfg, dfg, node, callsite_stream, state_id): target = f"{chan_name}[{offset}]" else: target = chan_name - callsite_stream.write(f"write_channel_intel({target}, {connector});", sdfg) + callsite_stream.write(f"write_channel_intel({target}, {connector});", cfg) - def generate_undefines(self, sdfg, dfg, node, callsite_stream): + def generate_undefines(self, sdfg, cfg, dfg, node, callsite_stream): for edge in itertools.chain(dfg.in_edges(node), dfg.out_edges(node)): memlet = edge.data data_name = memlet.data @@ -1104,9 +1110,9 @@ def generate_undefines(self, sdfg, dfg, node, callsite_stream): if data_name is not None: data_desc = sdfg.arrays[data_name] if (isinstance(data_desc, dace.data.Stream) and (memlet.dynamic or memlet.num_accesses != 1)): - callsite_stream.write("#undef {}".format(memlet_name), sdfg) + callsite_stream.write("#undef {}".format(memlet_name), cfg) - def _generate_converter(self, is_unpack, ctype, veclen, sdfg, function_stream): + def _generate_converter(self, is_unpack, ctype, veclen, sdfg, cfg, function_stream): # Get the file stream if "converters" not in self._other_codes: self._other_codes["converters"] = CodeIOStream() @@ -1130,10 +1136,10 @@ def _generate_converter(self, is_unpack, ctype, veclen, sdfg, 
function_stream): for (int u = 0; u < {veclen}; ++u) {{ ptr[u] = value[u]; }} -}}\n\n""".format(signature=signature, dtype=ctype, veclen=veclen), sdfg) +}}\n\n""".format(signature=signature, dtype=ctype, veclen=veclen), cfg) # add forward declaration - function_stream.write("extern {};".format(signature), sdfg) + function_stream.write("extern {};".format(signature), cfg) else: converter_name = "pack_{dtype}{veclen}".format(dtype=ctype, veclen=veclen) @@ -1152,21 +1158,22 @@ def _generate_converter(self, is_unpack, ctype, veclen, sdfg, function_stream): vec[u] = ptr[u]; }} return vec; -}}\n\n""".format(signature=signature, dtype=ctype, veclen=veclen), sdfg) +}}\n\n""".format(signature=signature, dtype=ctype, veclen=veclen), cfg) # add forward declaration - function_stream.write("extern {};".format(signature), sdfg, self) + function_stream.write("extern {};".format(signature), cfg, self) - def generate_converters(self, sdfg, function_stream): + def generate_converters(self, sdfg, cfg, function_stream): for unpack, ctype, veclen in self.converters_to_generate: - self._generate_converter(unpack, ctype, veclen, sdfg, function_stream) + self._generate_converter(unpack, ctype, veclen, sdfg, cfg, function_stream) - def unparse_tasklet(self, sdfg, state_id, dfg, node, function_stream, callsite_stream, locals, ldepth, - toplevel_schedule): + def unparse_tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_id: int, dfg: StateSubgraphView, + node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream, + locals, ldepth, toplevel_schedule) -> str: if node.label is None or node.label == "": return '' - state_dfg: SDFGState = sdfg.nodes()[state_id] + state_dfg = cfg.state(state_id) # Not [], "" or None if not node.code: @@ -1175,23 +1182,23 @@ def unparse_tasklet(self, sdfg, state_id, dfg, node, function_stream, callsite_s if node.code_global and node.code_global.code: function_stream.write( codeblock_to_cpp(node.code_global), - sdfg, + cfg, state_id, 
node, ) - function_stream.write("\n", sdfg, state_id, node) + function_stream.write("\n", cfg, state_id, node) # If raw C++ or OpenCL code, return the code directly if node.language != dtypes.Language.Python: if node.language != dtypes.Language.CPP and node.language != dtypes.Language.OpenCL: raise ValueError("Only Python, C++ and OpenCL code are supported in Intel FPGA codegen, got: {}".format( node.language)) - callsite_stream.write(type(node).__properties__["code"].to_string(node.code), sdfg, state_id, node) + callsite_stream.write(type(node).__properties__["code"].to_string(node.code), cfg, state_id, node) return body = node.code.code - callsite_stream.write('// Tasklet code (%s)\n' % node.label, sdfg, state_id, node) + callsite_stream.write('// Tasklet code (%s)\n' % node.label, cfg, state_id, node) # Map local names to memlets (for WCR detection) memlets = {} @@ -1246,7 +1253,7 @@ def unparse_tasklet(self, sdfg, state_id, dfg, node, function_stream, callsite_s defined_symbols=defined_symbols, type_inference=True, language=dtypes.Language.OpenCL) - callsite_stream.write(result.getvalue(), sdfg, state_id, node) + callsite_stream.write(result.getvalue(), cfg, state_id, node) def generate_constants(self, sdfg, callsite_stream): # To avoid a constant being multiple defined, define it once and @@ -1276,11 +1283,11 @@ def generate_constants(self, sdfg, callsite_stream): # allocate fast memory. 
Therefore, we will use a #define callsite_stream.write(f"#define {cstname} {sym2cpp(cstval)}\n", sdfg) - def generate_tasklet_postamble(self, sdfg, dfg, state_id, node, function_stream, callsite_stream, + def generate_tasklet_postamble(self, sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream, after_memlets_stream): - super().generate_tasklet_postamble(sdfg, dfg, state_id, node, function_stream, callsite_stream, + super().generate_tasklet_postamble(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream, after_memlets_stream) - self.generate_channel_writes(sdfg, dfg, node, after_memlets_stream, state_id) + self.generate_channel_writes(sdfg, cfg, dfg, node, after_memlets_stream, state_id) def write_and_resolve_expr(self, sdfg, memlet, nc, outname, inname, indices=None, dtype=None): desc = sdfg.arrays[memlet.data] @@ -1313,10 +1320,11 @@ def make_ptr_vector_cast(self, dst_expr, dst_dtype, src_dtype, is_scalar, define expr = "&" + expr return expr - def process_out_memlets(self, sdfg, state_id, node, dfg, dispatcher, result, locals_defined, function_stream, + def process_out_memlets(self, sdfg, cfg, state_id, node, dfg, dispatcher, result, locals_defined, function_stream, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(sdfg, + cfg, state_id, node, dfg, @@ -1327,7 +1335,7 @@ def process_out_memlets(self, sdfg, state_id, node, dfg, dispatcher, result, loc codegen=self, **kwargs) # Inject undefines - self.generate_undefines(sdfg, dfg, node, result) + self.generate_undefines(sdfg, cfg, dfg, node, result) class OpenCLDaceKeywordRemover(cpp.DaCeKeywordRemover): diff --git a/dace/codegen/targets/mlir/mlir.py b/dace/codegen/targets/mlir/mlir.py index 09cc69c72e..57a9924042 100644 --- a/dace/codegen/targets/mlir/mlir.py +++ b/dace/codegen/targets/mlir/mlir.py @@ -1,9 +1,14 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import TYPE_CHECKING from dace import registry, dtypes from dace.codegen.codeobject import CodeObject from dace.codegen.targets.target import TargetCodeGenerator from dace.codegen.targets.cpu import CPUCodeGen from dace.sdfg import nodes +from dace.sdfg.sdfg import SDFG + +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator @registry.autoregister_params(name='mlir') @@ -11,9 +16,9 @@ class MLIRCodeGen(TargetCodeGenerator): target_name = 'mlir' title = 'MLIR' - def __init__(self, frame_codegen, sdfg): + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._codeobjects = [] - self._cpu_codegen = frame_codegen.dispatcher.get_generic_node_dispatcher() + self._cpu_codegen: CPUCodeGen = frame_codegen.dispatcher.get_generic_node_dispatcher() frame_codegen.dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) def get_generated_codeobjects(self): @@ -22,14 +27,14 @@ def get_generated_codeobjects(self): def node_dispatch_predicate(self, sdfg, state, node): return isinstance(node, nodes.Tasklet) and node.language == dtypes.Language.MLIR - def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_stream): + def generate_node(self, sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream): if self.node_dispatch_predicate(sdfg, dfg, node): - function_uid = str(sdfg.cfg_id) + "_" + str(state_id) + "_" + str(dfg.node_id(node)) + function_uid = str(cfg.cfg_id) + "_" + str(state_id) + "_" + str(dfg.node_id(node)) node.code.code = node.code.code.replace("mlir_entry", "mlir_entry_" + function_uid) node.label = node.name + "_" + function_uid self._codeobjects.append(CodeObject(node.name, node.code.code, "mlir", MLIRCodeGen, node.name + "_Source")) - self._cpu_codegen.generate_node(sdfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @staticmethod def cmake_options(): diff 
--git a/dace/codegen/targets/mpi.py b/dace/codegen/targets/mpi.py index 0bb2b67a7e..d5278a32f0 100644 --- a/dace/codegen/targets/mpi.py +++ b/dace/codegen/targets/mpi.py @@ -1,4 +1,5 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from typing import TYPE_CHECKING import dace from dace import registry, symbolic, dtypes from dace.codegen.prettycode import CodeIOStream @@ -9,6 +10,10 @@ from dace.config import Config from dace.codegen import cppunparse +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView + +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator @registry.autoregister_params(name='mpi') @@ -18,7 +23,7 @@ class MPICodeGen(TargetCodeGenerator): title = 'MPI' language = 'cpp' - def __init__(self, frame_codegen, sdfg: SDFG): + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher self._global_sdfg = sdfg @@ -96,35 +101,37 @@ def has_initializer(self): def has_finalizer(self): return True - def generate_scope(self, sdfg, dfg_scope, state_id, function_stream, callsite_stream): + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Take care of map header assert len(dfg_scope.source_nodes()) == 1 - map_header = dfg_scope.source_nodes()[0] + map_header: nodes.MapEntry = dfg_scope.source_nodes()[0] - function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', sdfg, state_id, map_header) + function_stream.write('extern int __dace_comm_size, __dace_comm_rank;', cfg, state_id, map_header) # Add extra opening brace (dynamic map ranges, closed in MapExit # generator) - callsite_stream.write('{', sdfg, state_id, map_header) + callsite_stream.write('{', cfg, state_id, map_header) if len(map_header.map.params) > 1: raise NotImplementedError('Multi-dimensional MPI maps are not 
supported') - state = sdfg.node(state_id) + state = cfg.state(state_id) symtypes = map_header.new_symbols(sdfg, state, state.symbols_defined_at(map_header)) for var, r in zip(map_header.map.params, map_header.map.range): begin, end, skip = r - callsite_stream.write('{\n', sdfg, state_id, map_header) + callsite_stream.write('{\n', cfg, state_id, map_header) callsite_stream.write( '%s %s = %s + __dace_comm_rank * (%s);\n' % (symtypes[var], var, cppunparse.pyexpr2cpp(symbolic.symstr(begin, cpp_mode=True)), - cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), sdfg, state_id, map_header) + cppunparse.pyexpr2cpp(symbolic.symstr(skip, cpp_mode=True))), cfg, state_id, map_header) - self._frame.allocate_arrays_in_scope(sdfg, map_header, function_stream, callsite_stream) + self._frame.allocate_arrays_in_scope(sdfg, cfg, map_header, function_stream, callsite_stream) self._dispatcher.dispatch_subgraph(sdfg, + cfg, dfg_scope, state_id, function_stream, diff --git a/dace/codegen/targets/rtl.py b/dace/codegen/targets/rtl.py index c9d13f0395..406f29e84e 100644 --- a/dace/codegen/targets/rtl.py +++ b/dace/codegen/targets/rtl.py @@ -1,14 +1,15 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import itertools -from typing import List, Tuple, Dict +from typing import List, Dict import warnings -from dace import dtypes, config, registry, symbolic, nodes, sdfg, data -from dace.sdfg import graph, state, find_input_arraynode, find_output_arraynode +from dace import dtypes, config, registry, symbolic, nodes, data, SDFG +from dace.sdfg import graph, find_input_arraynode, find_output_arraynode from dace.codegen import codeobject, dispatcher, prettycode from dace.codegen.targets import target, framecode from dace.codegen.common import sym2cpp +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView @registry.autoregister_params(name='rtl') @@ -20,11 +21,11 @@ class RTLCodeGen(target.TargetCodeGenerator): languages = [dtypes.Language.SystemVerilog] n_unrolled: Dict[str, int] = {} - def __init__(self, frame_codegen: framecode.DaCeCodeGenerator, sdfg: sdfg.SDFG): + def __init__(self, frame_codegen: framecode.DaCeCodeGenerator, sdfg: SDFG): # store reference to sdfg - self.sdfg: sdfg.SDFG = sdfg + self.sdfg = sdfg # store reference to frame code generator - self.frame: framecode.DaCeCodeGenerator = frame_codegen + self.frame = frame_codegen self._frame = self.frame # get dispatcher to register callbacks for allocation/nodes/.. 
code generators self.dispatcher: dispatcher.TargetDispatcher = frame_codegen.dispatcher @@ -46,8 +47,9 @@ def __init__(self, frame_codegen: framecode.DaCeCodeGenerator, sdfg: sdfg.SDFG): self.hardware_target: bool = config.Config.get("compiler", "xilinx", "mode").startswith("hardware") self.frequencies: str = config.Config.get("compiler", "xilinx", "frequency") - def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node, - function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream): + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: prettycode.CodeIOStream, + callsite_stream: prettycode.CodeIOStream) -> None: # check instance type if isinstance(node, nodes.Tasklet): """ @@ -56,32 +58,32 @@ def generate_node(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: (2) generate tasklet->out (3) generate tasklet """ - callsite_stream.write('{', sdfg, state_id, dfg.node_id(node)) + callsite_stream.write('{', cfg, state_id, dfg.node_id(node)) # generate code to handle data input to the tasklet for edge in dfg.in_edges(node): # find input array src_node = find_input_arraynode(dfg, edge) # dispatch code gen (copy_memory) - self.dispatcher.dispatch_copy(src_node, node, edge, sdfg, dfg, state_id, function_stream, + self.dispatcher.dispatch_copy(src_node, node, edge, sdfg, cfg, dfg, state_id, function_stream, callsite_stream) # generate code to handle data output from the tasklet for edge in dfg.out_edges(node): # find output array dst_node = find_output_arraynode(dfg, edge) # dispatch code gen (define_out_memlet) - self.dispatcher.dispatch_output_definition(node, dst_node, edge, sdfg, dfg, state_id, function_stream, - callsite_stream) + self.dispatcher.dispatch_output_definition(node, dst_node, edge, sdfg, cfg, dfg, state_id, + function_stream, callsite_stream) # generate tasklet code - self.unparse_tasklet(sdfg, 
dfg, state_id, node, function_stream, callsite_stream) - callsite_stream.write('}', sdfg, state_id, dfg.node_id(node)) + self.unparse_tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + callsite_stream.write('}', cfg, state_id, dfg.node_id(node)) else: raise RuntimeError( "Only tasklets are handled here, not {}. This should have been filtered by the predicate".format( type(node))) - def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node, - dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: prettycode.CodeIOStream, - callsite_stream: prettycode.CodeIOStream): + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: graph.MultiConnectorEdge, + function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream) -> None: """ Generate input/output memory copies from the array references to local variables (i.e. for the tasklet code). 
""" @@ -118,7 +120,7 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i line += "{} {} = {}.pop();".format( dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn, edge.src.data) elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet): - rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg) + rtl_name = self.unique_name(edge.dst, cfg.state(state_id)) self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants) line: str = f'{dst_node.in_connectors[edge.dst_conn]} {edge.dst_conn} = &{edge.data.data}[{edge.src.map.params[0]}*{edge.data.volume}];' else: @@ -126,8 +128,8 @@ def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: i # write accessor to file callsite_stream.write(line) - def define_out_memlet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node, - dst_node: nodes.Node, edge: graph.MultiConnectorEdge, + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream): """ Generate output copy code (handled within the rtl tasklet code). 
@@ -494,18 +496,19 @@ def generate_running_condition(self, tasklet): evals = ' && '.join([f'out_ptr_{name} < num_elements_{name}' for name in tasklet.out_connectors]) return evals - def unique_name(self, node: nodes.RTLTasklet, state, sdfg): - return "{}_{}_{}_{}".format(node.name, sdfg.cfg_id, sdfg.node_id(state), state.node_id(node)) + def unique_name(self, node: nodes.RTLTasklet, state: SDFGState): + return "{}_{}_{}_{}".format(node.name, state.parent_graph.cfg_id, state.block_id, state.node_id(node)) - def unparse_tasklet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node, - function_stream: prettycode.CodeIOStream, callsite_stream: prettycode.CodeIOStream): + def unparse_tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: prettycode.CodeIOStream, + callsite_stream: prettycode.CodeIOStream): # extract data - state = sdfg.nodes()[state_id] + state = cfg.state(state_id) tasklet = node # construct variables paths - unique_name: str = self.unique_name(tasklet, state, sdfg) + unique_name: str = self.unique_name(tasklet, state) # Collect all of the input and output connectors into buses and scalars buses = {} # {tasklet_name: (array_name, output_from_rtl, bytes, veclen)} @@ -705,7 +708,7 @@ def unparse_tasklet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_i ''' if self.verilator_debug else '', debug_sim_end="\nstd::cout << \"SIM {name} END\" << std::endl;" if self.verilator_debug else "", ), - sdfg=sdfg, + cfg=cfg, state_id=state_id, node_id=node) diff --git a/dace/codegen/targets/snitch.py b/dace/codegen/targets/snitch.py index a5978a5582..5a62ca2995 100644 --- a/dace/codegen/targets/snitch.py +++ b/dace/codegen/targets/snitch.py @@ -1,14 +1,18 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-import copy +from typing import Union import dace import itertools import numpy as np import sympy as sp +from dace.memlet import Memlet +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.sdfg import SDFG +from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView from dace.transformation.dataflow.streaming_memory import _collect_map_ranges -from dace import registry, data, dtypes, config, sdfg as sd, symbolic +from dace import registry, data, dtypes, config, symbolic from dace.sdfg import nodes, utils as sdutils from dace.sdfg.scope import ScopeSubgraphView from dace.codegen.prettycode import CodeIOStream @@ -17,8 +21,7 @@ from dace.codegen.targets.target import TargetCodeGenerator from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.targets.cpp import sym2cpp -from dace.codegen.dispatcher import DefinedType, TargetDispatcher -from sympy.core.symbol import Symbol +from dace.codegen.dispatcher import DefinedType MAX_SSR_STREAMERS = 2 # number of snitch cores executing parallel regions @@ -83,7 +86,8 @@ def state_dispatch_predicate(self, sdfg, state): return True return False - def emit_ssr_setup(self, sdfg, state, para, global_stream, callsite_stream): + def emit_ssr_setup(self, sdfg: SDFG, state: SDFGState, para: bool, global_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: if sum([x is not None for x in self.ssrs]) == 0: return @@ -141,9 +145,10 @@ def try_simplify(expr): # if para: # callsite_stream.write(f'}}') - def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): - - sid = sdfg.node_id(state) + def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, global_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool = True): + sdfg = state.sdfg + sid = state.block_id dbg(f'-- generate state "{state}"') # analyze memlets for SSR candidates @@ -165,7 +170,7 @@ def generate_state(self, sdfg, state, 
global_stream, callsite_stream, generate_s if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) - self.dispatcher.dispatch_allocate(sdfg, state, sid, node, global_stream, callsite_stream) + self.dispatcher.dispatch_allocate(sdfg, cfg, state, sid, node, global_stream, callsite_stream) callsite_stream.write('\n') @@ -208,14 +213,16 @@ def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_s components = dace.sdfg.concurrent_subgraphs(state) if len(components) == 1: - self.dispatcher.dispatch_subgraph(sdfg, state, sid, global_stream, callsite_stream, skip_entry_node=False) + self.dispatcher.dispatch_subgraph(sdfg, cfg, state, sid, global_stream, callsite_stream, + skip_entry_node=False) else: if config.Config.get_bool('compiler', 'cpu', 'openmp_sections'): callsite_stream.write("#pragma omp parallel sections\n{") for c in components: if config.Config.get_bool('compiler', 'cpu', 'openmp_sections'): callsite_stream.write("#pragma omp section\n{") - self.dispatcher.dispatch_subgraph(sdfg, c, sid, global_stream, callsite_stream, skip_entry_node=False) + self.dispatcher.dispatch_subgraph(sdfg, cfg, c, sid, global_stream, callsite_stream, + skip_entry_node=False) if config.Config.get_bool('compiler', 'cpu', 'openmp_sections'): callsite_stream.write("} // End omp section") if config.Config.get_bool('compiler', 'cpu', 'openmp_sections'): @@ -256,14 +263,16 @@ def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_s or (node.data in sdfg.arrays and sdfg.arrays[node.data].transient == False)): continue deallocated.add(node.data) - self.dispatcher.dispatch_deallocate(sdfg, state, sid, node, global_stream, callsite_stream) + self.dispatcher.dispatch_deallocate(sdfg, cfg, state, sid, node, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self.dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, 
global_stream) - def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream): + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: cdtype = src_node.out_connectors[edge.src_conn] if isinstance(sdfg.arrays[edge.data.data], data.Stream): pass @@ -271,9 +280,9 @@ def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge, # If pointer, also point to output defined_type, _ = self.dispatcher.defined_vars.get(edge.data.data) base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type) - callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node) + callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', cfg, state_id, src_node) else: - callsite_stream.write(f'{cdtype.ctype} {edge.src_conn};', sdfg, state_id, src_node) + callsite_stream.write(f'{cdtype.ctype} {edge.src_conn};', cfg, state_id, src_node) def memlet_definition(self, sdfg, memlet, output, local_name, conntype=None, allow_shadowing=False, codegen=None): # TODO: Robust rule set @@ -357,8 +366,9 @@ def memlet_definition(self, sdfg, memlet, output, local_name, conntype=None, all dbg(f' memlet definition: "{result}"') return result - def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stream, declaration_stream, - allocation_stream) -> None: + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, global_stream: CodeIOStream, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: dbg('-- allocate_array') name = node.data nodedesc = node.desc(sdfg) @@ -387,7 +397,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, 
function_stre global_stream.write( "{ctype} *{name};\n#pragma omp threadprivate({name})".format(ctype=nodedesc.dtype.ctype, name=name), - sdfg, + cfg, state_id, node, ) @@ -401,7 +411,7 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre {name} = new {ctype} [{arrsize}];""".format(ctype=nodedesc.dtype.ctype, name=alloc_name, arrsize=cpp.sym2cpp(arrsize)), - sdfg, + cfg, state_id, node, ) @@ -414,30 +424,32 @@ def allocate_array(self, sdfg, dfg, state_id, node, global_stream, function_stre if node.desc(sdfg).lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): # Don't put a static if it is declared in the state struct for C compliance - declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', sdfg, + declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', cfg, state_id, node) else: - declaration_stream.write(f'static {nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', sdfg, + declaration_stream.write(f'static {nodedesc.dtype.ctype} {name}[{cpp.sym2cpp(arrsize)}];\n', cfg, state_id, node) self.dispatcher.defined_vars.add(name, DefinedType.Pointer, ctypedef) else: # malloc array declaration_stream.write(f'// allocate storage "{nodedesc.storage}"') - declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', sdfg, state_id, node) + declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) allocation_stream.write( f'''{alloc_name} = ({nodedesc.dtype.ctype}*)malloc(sizeof({nodedesc.dtype.ctype})*({cpp.sym2cpp(arrsize)}));\n''', - sdfg, state_id, node) + cfg, state_id, node) self.dispatcher.defined_vars.add(name, DefinedType.Pointer, ctypedef) else: if (nodedesc.storage is dtypes.StorageType.CPU_Heap or nodedesc.storage is dtypes.StorageType.Snitch_TCDM): ctypedef = dtypes.pointer(nodedesc.dtype).ctype declaration_stream.write(f'// allocate scalar storage "{nodedesc.storage}"') - 
declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[1];\n', sdfg, state_id, node) + declaration_stream.write(f'{nodedesc.dtype.ctype} {name}[1];\n', cfg, state_id, node) self.dispatcher.defined_vars.add(name, DefinedType.Pointer, ctypedef) else: raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage)) - def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream): + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: data.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: arrsize = nodedesc.total_size alloc_name = cpp.ptr(node.data, nodedesc) dbg(f'-- deallocate_array storate="{nodedesc.storage}" arrsize="{arrsize}" alloc_name="{alloc_name}"') @@ -455,12 +467,12 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, # free array if nodedesc.storage == dtypes.StorageType.Snitch_SSR: dbg(f'Check deallocation of SSR datatypes!!!') - callsite_stream.write(f"// free of an SSR type\n", sdfg, state_id, node) + callsite_stream.write(f"// free of an SSR type\n", cfg, state_id, node) if not symbolic.issymbolic(arrsize, sdfg.constants): # don't free static allocations return callsite_stream.write(f'// storage "{nodedesc.storage}"\n') - callsite_stream.write(f"free({alloc_name});\n", sdfg, state_id, node) + callsite_stream.write(f"free({alloc_name});\n", cfg, state_id, node) return elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: # Deallocate in each OpenMP thread @@ -469,7 +481,7 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, {{ delete[] {name}; }}""".format(name=alloc_name), - sdfg, + cfg, state_id, node, ) @@ -478,15 +490,16 @@ def deallocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, def copy_memory( self, - sdfg, - dfg, - state_id, - src_node, - dst_node, - edge, - function_stream, - 
callsite_stream, - ): + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], + dst_node: Union[nodes.Tasklet, nodes.AccessNode], + edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ) -> None: dbg(f'-- Copy dispatcher for {src_node}({type(src_node)})->{dst_node}({type(dst_node)})') # get source storage type @@ -513,10 +526,10 @@ def copy_memory( dst_parent = None dst_schedule = None if dst_parent is None else dst_parent.map.schedule - state_dfg = sdfg.node(state_id) + state_dfg = cfg.state(state_id) dbg(f' storage type {src_storage}->{dst_storage}') - callsite_stream.write(f'// storage type {src_storage}->{dst_storage}', sdfg, state_id, [src_node, dst_node]) + callsite_stream.write(f'// storage type {src_storage}->{dst_storage}', cfg, state_id, [src_node, dst_node]) u, uconn, v, vconn, memlet = edge @@ -548,12 +561,12 @@ def copy_memory( callsite_stream.write(f'// copy into tasklet SSR{streamer}') callsite_stream.write( "{} {} = __builtin_ssr_pop({});".format(dst_node.in_connectors[vconn].dtype.ctype, vconn, streamer), - sdfg, state_id, [src_node, dst_node]) + cfg, state_id, [src_node, dst_node]) else: callsite_stream.write('// copy into tasklet') callsite_stream.write( " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn]), - sdfg, + cfg, state_id, [src_node, dst_node], ) @@ -577,7 +590,7 @@ def copy_memory( if write: vconn = dst_node.data ctype = dst_nodedesc.dtype.ctype - state_dfg = sdfg.nodes()[state_id] + state_dfg = cfg.state(state_id) ############################################# # Corner cases ignored @@ -608,7 +621,7 @@ def copy_memory( if isinstance(copy_shape[0], int) and copy_shape[0] == 1: # if None: xfer = '''*({dst}) = *({src});'''.format(src=src_expr, dst=dst_expr) - callsite_stream.write(xfer, sdfg, state_id, [src_node, dst_node]) + callsite_stream.write(xfer, cfg, state_id, [src_node, 
dst_node]) return else: if src_strides[0] == 1 and dst_strides[0] == 1: @@ -635,14 +648,14 @@ def copy_memory( raise NotImplementedError('Unsupported dimnesions') # emit transfer - callsite_stream.write(xfer, sdfg, state_id, [src_node, dst_node]) + callsite_stream.write(xfer, cfg, state_id, [src_node, dst_node]) # emit wait for idle - callsite_stream.write('__builtin_sdma_wait_for_idle();', sdfg, state_id, [src_node, dst_node]) + callsite_stream.write('__builtin_sdma_wait_for_idle();', cfg, state_id, [src_node, dst_node]) # A scope dispatcher will trigger a method called generate_scope whenever # an SDFG has a scope with that schedule - def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def generate_scope(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # The parameters here are: # sdfg: The SDFG we are currently generating. 
# scope: The subgraph of the state containing only the scope (map contents) @@ -659,7 +672,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in dbg(f'-- generate scope entry_node="{entry_node}" type="{type(entry_node)}"') # Encapsulate map with a C scope - callsite_stream.write('{', sdfg, state_id, entry_node) + callsite_stream.write('{', cfg, state_id, entry_node) ssr_region = sum([x is not None and x["map"] == entry_node for x in self.ssrs]) != 0 para = entry_node.map.schedule == dace.ScheduleType.Snitch_Multicore @@ -673,7 +686,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in if ssr_region: non_null_ssrs = [x for x in self.ssrs if x] callsite_stream.write(f'// ssr allocated: {len(non_null_ssrs)}: {[x["data"] for x in non_null_ssrs]}') - self.emit_ssr_setup(sdfg, sdfg.states()[state_id], para, function_stream, callsite_stream) + self.emit_ssr_setup(sdfg, cfg.state(state_id), para, function_stream, callsite_stream) # loop over out edges which are the in edges to the tasklet # for e in scope.out_edges(entry_node): @@ -737,12 +750,13 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in continue allocated.add(child.data) dbg(f' calling allocate for {child.data}') - self.dispatcher.dispatch_allocate(sdfg, scope, state_id, child, function_stream, callsite_stream) + self.dispatcher.dispatch_allocate(sdfg, cfg, scope, state_id, child, function_stream, callsite_stream) # Now that the loops have been defined, use the dispatcher to invoke any # code generator (including this one) that is registered to deal with # the internal nodes in the subgraph. We skip the MapEntry node. 
self.dispatcher.dispatch_subgraph(sdfg, + cfg, scope, state_id, function_stream, @@ -759,7 +773,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in if child.data not in to_allocate or child.data in deallocated: continue deallocated.add(child.data) - self.dispatcher.dispatch_deallocate(sdfg, scope, state_id, child, None, callsite_stream) + self.dispatcher.dispatch_deallocate(sdfg, cfg, scope, state_id, child, None, callsite_stream) dbg(f' after dispatch_subgraph') @@ -773,7 +787,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in for param, rng in zip(entry_node.map.params, entry_node.map.range): dbg(f' closing for parameter {param}') callsite_stream.write(f'''// end loopy-loop - }}''', sdfg, state_id, entry_node) + }}''', cfg, state_id, entry_node) if ssr_region: # callsite_stream.write(f'// end ssr allocated: {len(self.ssr_configs)}') @@ -788,7 +802,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in self.ssrs[i] = None # End-encapsulate map with a C scope - callsite_stream.write('}', sdfg, state_id, entry_node) + callsite_stream.write('}', cfg, state_id, entry_node) # postamble code for disabling SSR comes here # for param, rng in zip(entry_node.map.params, entry_node.map.range): diff --git a/dace/codegen/targets/sve/codegen.py b/dace/codegen/targets/sve/codegen.py index 71b0a68c1f..d8a972ac97 100644 --- a/dace/codegen/targets/sve/codegen.py +++ b/dace/codegen/targets/sve/codegen.py @@ -76,8 +76,8 @@ def get_generated_codeobjects(self): res = super().get_generated_codeobjects() return res - def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, - edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream, + def copy_memory(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, src_node: nodes.Node, + dst_node: nodes.Node, edge: gr.MultiConnectorEdge[mm.Memlet], 
function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Check whether it is a known reduction that is possible in SVE @@ -103,7 +103,7 @@ def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream) - def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int, node: nodes.Node, + def generate_node(self, sdfg: SDFG, cfg: state.ControlFlowRegion, state: SDFGState, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream): self.add_header(function_stream) @@ -134,7 +134,7 @@ def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int, node: nodes requires_wb.append(edge) # Tasklet code - self.unparse_tasklet(sdfg, state, state_id, node, function_stream, callsite_stream) + self.unparse_tasklet(sdfg, cfg, state, state_id, node, function_stream, callsite_stream) # Writeback from temporary registers to memory for edge in requires_wb: @@ -365,12 +365,12 @@ def generate_writeback(self, sdfg: SDFG, state: SDFGState, map: nodes.Map, else: raise util.NotSupportedError('Only writeback to Tasklets and AccessNodes is supported') - def declare_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: data.Data, - global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: - self.cpu_codegen.declare_array(sdfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) + def declare_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: + self.cpu_codegen.declare_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream) - def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: data.Data, - global_stream: CodeIOStream, declaration_stream: 
CodeIOStream, + def allocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + nodedesc: data.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: if nodedesc.storage == dtypes.StorageType.SVE_Register: sve_type = util.TYPE_TO_SVE[nodedesc.dtype] @@ -388,15 +388,17 @@ def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes. declaration_stream.write(f'{sve_type} {ptrname};') return - self.cpu_codegen.allocate_array(sdfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, + self.cpu_codegen.allocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream, allocation_stream) - def deallocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: data.Data, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - return self.cpu_codegen.deallocate_array(sdfg, dfg, state_id, node, nodedesc, function_stream, callsite_stream) + def deallocate_array(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: SDFGState, state_id: int, + node: nodes.Node, nodedesc: data.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + return self.cpu_codegen.deallocate_array(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, + callsite_stream) - def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def generate_scope(self, sdfg: dace.SDFG, cfg: state.ControlFlowRegion, scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): entry_node = scope.source_nodes()[0] current_map = entry_node.map self.current_map = current_map @@ -420,12 +422,12 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in self.dispatcher.defined_vars.enter_scope(scope) # Define all dynamic input 
connectors of the map entry - state_dfg = sdfg.node(state_id) + state_dfg = cfg.state(state_id) for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node): if e.data.data != e.dst_conn: callsite_stream.write( self.cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, - e.dst.in_connectors[e.dst_conn]), sdfg, state_id, entry_node) + e.dst.in_connectors[e.dst_conn]), cfg, state_id, entry_node) param = current_map.params[0] rng = current_map.range[0] @@ -456,6 +458,7 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in # Dispatch the subgraph generation self.dispatcher.dispatch_subgraph(sdfg, + cfg, scope, state_id, function_stream, @@ -475,9 +478,9 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in self.dispatcher.defined_vars.exit_scope(scope) callsite_stream.write('}') - def unparse_tasklet(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: int, node: nodes.Node, - function_stream: CodeIOStream, callsite_stream: CodeIOStream): - state_dfg: SDFGState = sdfg.nodes()[state_id] + def unparse_tasklet(self, sdfg: SDFG, cfg: state.ControlFlowRegion, dfg: state.StateSubgraphView, state_id: int, + node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state_dfg = cfg.state(state_id) callsite_stream.write('\n///////////////////') callsite_stream.write(f'// Tasklet code ({node.label})') @@ -508,6 +511,6 @@ def unparse_tasklet(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: in util.get_loop_predicate(sdfg, dfg, node), self.counter_type, defined_symbols, self.stream_associations, self.wcr_associations) - callsite_stream.write(result.getvalue(), sdfg, state_id, node) + callsite_stream.write(result.getvalue(), cfg, state_id, node) callsite_stream.write('///////////////////\n\n') diff --git a/dace/codegen/targets/target.py b/dace/codegen/targets/target.py index 4af6cb9149..28c2c25378 100644 --- a/dace/codegen/targets/target.py +++ 
b/dace/codegen/targets/target.py @@ -9,6 +9,7 @@ from dace.registry import make_registry from dace.codegen.prettycode import CodeIOStream from dace.codegen.codeobject import CodeObject +from dace.sdfg.state import ControlFlowRegion @make_registry @@ -61,8 +62,8 @@ def has_finalizer(self) -> bool: function that should be called on finalization. """ return False - def generate_state(self, sdfg: SDFG, state: SDFGState, function_stream: CodeIOStream, - callsite_stream: CodeIOStream) -> None: + def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, function_stream: CodeIOStream, + callsite_stream: CodeIOStream, generate_state_footer: bool) -> None: """ Generates code for an SDFG state, outputting it to the given code streams. @@ -77,8 +78,8 @@ def generate_state(self, sdfg: SDFG, state: SDFGState, function_stream: CodeIOSt """ pass - def generate_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, - callsite_stream: CodeIOStream) -> None: + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Generates code for an SDFG state scope (from a scope-entry node to its corresponding scope-exit node), outputting it to the given code streams. @@ -95,8 +96,8 @@ def generate_scope(self, sdfg: SDFG, dfg_scope: ScopeSubgraphView, state_id: int """ raise NotImplementedError('Abstract class') - def generate_node(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, function_stream: CodeIOStream, - callsite_stream: CodeIOStream) -> None: + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Generates code for a single node, outputting it to the given code streams. 
@@ -113,8 +114,8 @@ def generate_node(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.N """ raise NotImplementedError('Abstract class') - def declare_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: dt.Data, - global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + nodedesc: dt.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: """ Generates code for declaring an array without allocating it, outputting to the given code streams. @@ -131,8 +132,8 @@ def declare_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.N """ raise NotImplementedError('Abstract class') - def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: dt.Data, - global_stream: CodeIOStream, declaration_stream: CodeIOStream, + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + nodedesc: dt.Data, global_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: """ Generates code for allocating an array, outputting to the given code streams. @@ -152,8 +153,8 @@ def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes. """ raise NotImplementedError('Abstract class') - def deallocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: dt.Data, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, node: nodes.Node, + nodedesc: dt.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Generates code for deallocating an array, outputting to the given code streams. 
@@ -171,8 +172,8 @@ def deallocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: node """ raise NotImplementedError('Abstract class') - def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, - edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream, + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: SDFGState, state_id: int, src_node: nodes.Node, + dst_node: nodes.Node, edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: """ Generates code for copying memory, either from a data access node (array/stream) to another, a code node (tasklet/nested diff --git a/dace/codegen/targets/unroller.py b/dace/codegen/targets/unroller.py index 65eb58dc7c..f4c2bdd2c0 100644 --- a/dace/codegen/targets/unroller.py +++ b/dace/codegen/targets/unroller.py @@ -1,6 +1,5 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import copy -from typing import Any, Dict, Tuple import dace from dace import registry @@ -14,7 +13,6 @@ import dace.sdfg from dace.sdfg import nodes as nd import dace.codegen.common -from dace import dtypes, data as dt @registry.autoregister_params(name='unroll') @@ -62,9 +60,8 @@ def nsdfg_after_unroll(self, backup: "list[tuple[str, str, dict, dict]]"): node.sdfg.constants_prop = constants #TODO: Expand the unroller so it can also generate openCL code - def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): - + def generate_scope(self, sdfg: dace.SDFG, cfg: state.ControlFlowRegion, scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: entry_node: nd.MapEntry = scope.source_nodes()[0] index_list = [] @@ -89,12 +86,13 @@ def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: in self.nsdfg_prepare_unroll(scope, 
str(param), str(index)) callsite_stream.write( f"constexpr {mapsymboltypes[param]} {param} = " - f"{dace.codegen.common.sym2cpp(index)};\n", sdfg) + f"{dace.codegen.common.sym2cpp(index)};\n", cfg) sdfg.add_constant(param, int(index)) callsite_stream.write('{') self._dispatcher.dispatch_subgraph( sdfg, + cfg, scope, state_id, function_stream, diff --git a/dace/codegen/targets/xilinx.py b/dace/codegen/targets/xilinx.py index 2c2802b615..8dac2720b6 100644 --- a/dace/codegen/targets/xilinx.py +++ b/dace/codegen/targets/xilinx.py @@ -1,31 +1,26 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -import collections import copy from dace.sdfg.sdfg import SDFG -import itertools -import os import re -import numpy as np import ast import dace from dace import data as dt, registry, dtypes, subsets from dace.config import Config from dace.frontend import operations -from dace.sdfg import nodes, utils -from dace.sdfg import find_input_arraynode, find_output_arraynode +from dace.sdfg import nodes from dace.codegen import exceptions as cgx from dace.codegen.codeobject import CodeObject from dace.codegen.dispatcher import DefinedType from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets.target import make_absolute from dace.codegen.targets import cpp, fpga from typing import List, Union, Tuple from dace.external.rtllib.templates.control import generate_from_config as rtllib_control from dace.external.rtllib.templates.package import generate_from_config as rtllib_package -from dace.external.rtllib.templates.top import data_packer, generate_from_config as rtllib_top +from dace.external.rtllib.templates.top import generate_from_config as rtllib_top from dace.external.rtllib.templates.synth import generate_from_config as rtllib_synth +from dace.sdfg.state import ControlFlowRegion REDUCTION_TYPE_TO_HLSLIB = { dace.dtypes.ReductionType.Min: "hlslib::op::Min", @@ -231,40 +226,38 @@ def _internal_preprocess(self, sdfg: dace.SDFG): # in 
the assignment string for graph in sdfg.all_sdfgs_recursive(): - for state in graph.states(): - out_edges = graph.out_edges(state) - for e in out_edges: - if len(e.data.assignments) > 0: - replace_dict = dict() - - for variable, value in e.data.assignments.items(): - expr = ast.parse(value) - # walk in the expression, get all array names and check whether we need to qualify them - for node in ast.walk(expr): - if isinstance(node, ast.Subscript) and isinstance(node.value, ast.Name): - arr_name = node.value.id - - if arr_name not in replace_dict and arr_name in graph.arrays and graph.arrays[ - arr_name].storage == dace.dtypes.StorageType.FPGA_Global: - repl = fpga.fpga_ptr( - arr_name, - graph.arrays[node.value.id], - sdfg, - None, - False, - None, - None, - True, - decouple_array_interfaces=self._decouple_array_interfaces) - replace_dict[arr_name] = repl - - # Perform replacement and update graph.arrays to allow type inference - # on interstate edges - for k, v in replace_dict.items(): - e.data.replace(k, v) - if v not in graph.arrays: - # Note: this redundancy occurs only during codegen - graph.arrays[v] = graph.arrays[k] + for e in graph.all_interstate_edges(): + if len(e.data.assignments) > 0: + replace_dict = dict() + + for variable, value in e.data.assignments.items(): + expr = ast.parse(value) + # walk in the expression, get all array names and check whether we need to qualify them + for node in ast.walk(expr): + if isinstance(node, ast.Subscript) and isinstance(node.value, ast.Name): + arr_name = node.value.id + + if arr_name not in replace_dict and arr_name in graph.arrays and graph.arrays[ + arr_name].storage == dace.dtypes.StorageType.FPGA_Global: + repl = fpga.fpga_ptr( + arr_name, + graph.arrays[node.value.id], + sdfg, + None, + False, + None, + None, + True, + decouple_array_interfaces=self._decouple_array_interfaces) + replace_dict[arr_name] = repl + + # Perform replacement and update graph.arrays to allow type inference + # on interstate edges + for 
k, v in replace_dict.items(): + e.data.replace(k, v) + if v not in graph.arrays: + # Note: this redundancy occurs only during codegen + graph.arrays[v] = graph.arrays[k] def define_stream(self, dtype, buffer_size, var_name, array_size, function_stream, kernel_stream, sdfg): """ @@ -341,33 +334,33 @@ def make_kernel_argument(data: dt.Data, else: return data.as_arg(with_types=True, name=var_name) - def generate_unroll_loop_pre(self, kernel_stream, factor, sdfg, state_id, node): + def generate_unroll_loop_pre(self, kernel_stream, factor, sdfg, cfg, state_id, node): pass @staticmethod - def generate_unroll_loop_post(kernel_stream, factor, sdfg, state_id, node): + def generate_unroll_loop_post(kernel_stream, factor, sdfg, cfg, state_id, node): if factor is None: - kernel_stream.write("#pragma HLS UNROLL", sdfg, state_id, node) + kernel_stream.write("#pragma HLS UNROLL", cfg, state_id, node) else: - kernel_stream.write("#pragma HLS UNROLL factor={}".format(factor), sdfg, state_id, node) + kernel_stream.write("#pragma HLS UNROLL factor={}".format(factor), cfg, state_id, node) @staticmethod - def generate_pipeline_loop_pre(kernel_stream, sdfg, state_id, node): + def generate_pipeline_loop_pre(kernel_stream, sdfg, cfg, state_id, node): pass @staticmethod - def generate_pipeline_loop_post(kernel_stream, sdfg, state_id, node): - kernel_stream.write("#pragma HLS PIPELINE II=1", sdfg, state_id, node) + def generate_pipeline_loop_post(kernel_stream, sdfg, cfg, state_id, node): + kernel_stream.write("#pragma HLS PIPELINE II=1", cfg, state_id, node) @staticmethod - def generate_flatten_loop_pre(kernel_stream, sdfg, state_id, node): + def generate_flatten_loop_pre(kernel_stream, sdfg, cfg, state_id, node): pass @staticmethod - def generate_flatten_loop_post(kernel_stream, sdfg, state_id, node): + def generate_flatten_loop_post(kernel_stream, sdfg, cfg, state_id, node): kernel_stream.write("#pragma HLS LOOP_FLATTEN") - def generate_nsdfg_header(self, sdfg, state, state_id, node, 
memlet_references, sdfg_label): + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label): # TODO: Use a single method for GPU kernels, FPGA modules, and NSDFGs arguments = [f'{atype} {aname}' for atype, aname, _ in memlet_references] fsyms = node.sdfg.used_symbols(all_symbols=False, keep_defined_in_mapping=True) @@ -473,12 +466,13 @@ def make_shift_register_write(self, defined_type, dtype, var_name, write_expr, i raise NotImplementedError("Xilinx shift registers NYI") @staticmethod - def generate_no_dependence_pre(kernel_stream, sdfg, state_id, node, var_name=None): + def generate_no_dependence_pre(kernel_stream, sdfg, cfg, state_id, node, var_name=None): pass def generate_no_dependence_post(self, kernel_stream, sdfg: SDFG, + cfg: ControlFlowRegion, state_id: int, node: nodes.Node, var_name: str, @@ -501,17 +495,17 @@ def generate_no_dependence_post(self, self._dispatcher, is_array_interface=(defined_type == DefinedType.ArrayInterface), decouple_array_interfaces=self._decouple_array_interfaces) - kernel_stream.write("#pragma HLS DEPENDENCE variable={} false".format(var_name), sdfg, state_id, node) + kernel_stream.write("#pragma HLS DEPENDENCE variable={} false".format(var_name), cfg, state_id, node) - def generate_kernel_boilerplate_pre(self, sdfg, state_id, kernel_name, parameters, bank_assignments, module_stream, - kernel_stream, external_streams, multi_pumped): + def generate_kernel_boilerplate_pre(self, sdfg, cfg, state_id, kernel_name, parameters, bank_assignments, + module_stream, kernel_stream, external_streams, multi_pumped): # Write header module_stream.write("""#include #include -#include """, sdfg) +#include """, cfg) self._frame.generate_fileheader(sdfg, module_stream, 'xilinx_device') - module_stream.write("\n", sdfg) + module_stream.write("\n", cfg) argname_to_bank_assignment = {} # Build kernel signature @@ -572,7 +566,7 @@ def generate_kernel_boilerplate_pre(self, sdfg, state_id, kernel_name, parameter # 
Write kernel signature kernel_stream.write("DACE_EXPORTED void {}({}) {{\n".format(kernel_name, ', '.join(kernel_args + stream_args)), - sdfg, state_id) + cfg, state_id) # Insert interface pragmas num_mapped_args = 0 @@ -585,7 +579,7 @@ def generate_kernel_boilerplate_pre(self, sdfg, state_id, kernel_name, parameter interface_name = "gmem{}".format(num_mapped_args) kernel_stream.write( "#pragma HLS INTERFACE m_axi port={} " - "offset=slave bundle={}".format(var_name, interface_name), sdfg, state_id) + "offset=slave bundle={}".format(var_name, interface_name), cfg, state_id) # Map this interface to the corresponding location # specification to be passed to the Xilinx compiler memory_bank = argname_to_bank_assignment[arg] @@ -617,12 +611,13 @@ def generate_kernel_boilerplate_pre(self, sdfg, state_id, kernel_name, parameter kernel_stream.write("\nHLSLIB_DATAFLOW_INIT();") @staticmethod - def generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id): - kernel_stream.write("HLSLIB_DATAFLOW_FINALIZE();\n}\n", sdfg, state_id) + def generate_kernel_boilerplate_post(kernel_stream, sdfg, cfg, state_id): + kernel_stream.write("HLSLIB_DATAFLOW_FINALIZE();\n}\n", cfg, state_id) - def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list, - parameters: list, rtl_tasklet_names: list, kernel_stream: CodeIOStream, - instrumentation_stream: CodeIOStream, multi_pumped: bool): + def generate_host_function_body(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, + kernel_name: str, predecessors: list, parameters: list, rtl_tasklet_names: list, + kernel_stream: CodeIOStream, instrumentation_stream: CodeIOStream, + multi_pumped: bool) -> None: """ Generate the host-specific code for spawning and synchronizing the given kernel. 
@@ -690,17 +685,17 @@ def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, ke kernel_stream.write( f"""\ hlslib::ocl::Event {kernel_name}_event = {kernel_name}_kernel.ExecuteTaskAsync({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''}); - all_events.push_back({kernel_name}_event);""", sdfg, sdfg.node_id(state)) + all_events.push_back({kernel_name}_event);""", cfg, state.block_id) if state.instrument == dtypes.InstrumentationType.FPGA: - self.instrument_opencl_kernel(kernel_name, sdfg.node_id(state), sdfg.cfg_id, instrumentation_stream) + self.instrument_opencl_kernel(kernel_name, state.block_id, cfg.cfg_id, instrumentation_stream) - def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, module_stream, entry_stream, + def generate_module(self, sdfg, cfg, state, kernel_name, name, subgraph, parameters, module_stream, entry_stream, host_stream, instrumentation_stream): """Generates a module that will run as a dataflow function in the FPGA kernel.""" - state_id = sdfg.node_id(state) - dfg = sdfg.nodes()[state_id] + state_id = state.block_id + dfg = cfg.state(state_id) kernel_args_call = [] kernel_args_module = [] @@ -759,7 +754,7 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, entry_stream.write(f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});') module_stream.write(f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n') - rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg) + rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, cfg) # _i in names are due to vitis source_accessors = [] @@ -825,6 +820,7 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, # appropriate files. 
ignore_stream = CodeIOStream() self._dispatcher.dispatch_subgraph(sdfg, + cfg, subgraph, state_id, ignore_stream, @@ -835,14 +831,14 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, # kernel arguments host_stream.write( f"all_events.push_back(program.MakeKernel(\"{rtl_name}_top\"{', '.join([''] + [name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskAsync());", - sdfg, state_id, rtl_tasklet) + cfg, state_id, rtl_tasklet) if state.instrument == dtypes.InstrumentationType.FPGA: - self.instrument_opencl_kernel(rtl_name, state_id, sdfg.cfg_id, instrumentation_stream) + self.instrument_opencl_kernel(rtl_name, state_id, cfg.cfg_id, instrumentation_stream) return # create a unique module name to prevent name clashes - module_function_name = f"module_{name}_{sdfg.cfg_id}" + module_function_name = f"module_{name}_{cfg.cfg_id}" # Unrolling processing elements: if there first scope of the subgraph # is an unrolled map, generate a processing element for each iteration @@ -869,7 +865,7 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, if not self._decouple_array_interfaces: kernel_args_call = dtypes.deduplicate(kernel_args_call) entry_stream.write( - "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(module_function_name, ", ".join(kernel_args_call)), sdfg, + "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(module_function_name, ", ".join(kernel_args_call)), cfg, state_id) for _ in range(unrolled_loops): @@ -886,7 +882,7 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, if not self._decouple_array_interfaces: kernel_args_module = dtypes.deduplicate(kernel_args_module) - module_body_stream.write("void {}({}) {{".format(module_function_name, ", ".join(kernel_args_module)), sdfg, + module_body_stream.write("void {}({}) {{".format(module_function_name, ", ".join(kernel_args_module)), cfg, state_id) # Register the array interface as a naked pointer for use inside the 
@@ -934,28 +930,30 @@ def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) - self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), module_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, node.desc(sdfg), module_stream, module_body_stream) self._dispatcher.dispatch_subgraph(sdfg, + cfg, subgraph, state_id, module_stream, module_body_stream, skip_entry_node=False) - module_stream.write(module_body_stream.getvalue(), sdfg, state_id) + module_stream.write(module_body_stream.getvalue(), cfg, state_id) module_stream.write("}\n\n") self._dispatcher.defined_vars.exit_scope(subgraph) - def rtl_tasklet_name(self, node: nodes.RTLTasklet, state, sdfg): - return "{}_{}_{}_{}".format(node.name, sdfg.cfg_id, sdfg.node_id(state), state.node_id(node)) + def rtl_tasklet_name(self, node: nodes.RTLTasklet, state, cfg): + return "{}_{}_{}_{}".format(node.name, cfg.cfg_id, state.block_id, state.node_id(node)) - def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list, - subgraphs: list, kernel_stream: CodeIOStream, state_host_header_stream: CodeIOStream, - state_host_body_stream: CodeIOStream, instrumentation_stream: CodeIOStream, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, state_parameters: list): + def generate_kernel_internal(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, state: dace.SDFGState, kernel_name: str, + predecessors: list, subgraphs: list, kernel_stream: CodeIOStream, + state_host_header_stream: CodeIOStream, state_host_body_stream: CodeIOStream, + instrumentation_stream: CodeIOStream, function_stream: CodeIOStream, + callsite_stream: CodeIOStream, state_parameters: list) -> None: """ Generates Kernel code, both device and host side. 
@@ -1040,28 +1038,28 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne # Detect RTL tasklets, which will be launched as individual kernels rtl_tasklet_names = [ - self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes() if isinstance(nd, nodes.RTLTasklet) + self.rtl_tasklet_name(nd, state, cfg) for nd in state.nodes() if isinstance(nd, nodes.RTLTasklet) ] multi_pumped = all([self.is_multi_pumped_subgraph(sg) for sg in subgraphs]) # Generate host code - self.generate_host_header(sdfg, kernel_name, global_data_parameters + external_streams, + self.generate_host_header(sdfg, cfg, kernel_name, global_data_parameters + external_streams, state_host_header_stream, multi_pumped) - self.generate_host_function_boilerplate(sdfg, state, nested_global_transients, state_host_body_stream) + self.generate_host_function_boilerplate(sdfg, cfg, state, nested_global_transients, state_host_body_stream) # Now we write the device code module_stream = CodeIOStream() entry_stream = CodeIOStream() - state_id = sdfg.node_id(state) + state_id = cfg.node_id(state) - self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name, global_data_parameters, bank_assignments, + self.generate_kernel_boilerplate_pre(sdfg, cfg, state_id, kernel_name, global_data_parameters, bank_assignments, module_stream, entry_stream, external_streams, multi_pumped) # Emit allocations for node in top_level_local_data: - self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), module_stream, + self._dispatcher.dispatch_allocate(sdfg, cfg, state, state_id, node, node.desc(sdfg), module_stream, entry_stream) for is_output, name, node, _ in external_streams: @@ -1095,8 +1093,8 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne val = '{}{}.{}{}'.format(kernel_name, kernel_postfix, stream_prefix, stream) self._stream_connections[stream][key] = val - self.generate_modules(sdfg, state, kernel_name, subgraphs, 
subgraph_parameters, module_stream, entry_stream, - state_host_body_stream, instrumentation_stream) + self.generate_modules(sdfg, cfg, state, kernel_name, subgraphs, subgraph_parameters, module_stream, + entry_stream, state_host_body_stream, instrumentation_stream) if multi_pumped: # We have to generate the rest of the RTL files for multi-pumping. In particular: @@ -1153,7 +1151,7 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne self._ip_codes.append((f'{kernel_name}_package', 'tcl', rtllib_package(rtllib_config))) self._ip_codes.append((f'{kernel_name}_synth', 'tcl', rtllib_synth(rtllib_config))) - self.generate_host_function_body(sdfg, state, kernel_name, predecessors, + self.generate_host_function_body(sdfg, cfg, state, kernel_name, predecessors, global_data_parameters + external_streams, rtl_tasklet_names, state_host_body_stream, instrumentation_stream, multi_pumped) @@ -1162,9 +1160,9 @@ def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kerne kernel_stream.write(module_stream.getvalue()) kernel_stream.write(entry_stream.getvalue()) - self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id) + self.generate_kernel_boilerplate_post(kernel_stream, sdfg, cfg, state_id) - def generate_host_header(self, sdfg, kernel_function_name, parameters, host_code_stream, multi_pumped): + def generate_host_header(self, sdfg, cfg, kernel_function_name, parameters, host_code_stream, multi_pumped): kernel_args = [] for is_output, name, arg, interface_ids in parameters: @@ -1203,23 +1201,24 @@ def generate_host_header(self, sdfg, kernel_function_name, parameters, host_code {ignore_signature}DACE_EXPORTED void {kernel_function_name}({kernel_args});\n\n""".format( kernel_function_name=kernel_function_name, ignore_signature=ignore_signature, - kernel_args=", ".join(kernel_args)), sdfg) + kernel_args=", ".join(kernel_args)), cfg) - def generate_memlet_definition(self, sdfg, dfg, state_id, src_node, dst_node, edge, 
callsite_stream): + def generate_memlet_definition(self, sdfg, cfg, dfg, state_id, src_node, dst_node, edge, callsite_stream): memlet = edge.data ptrname = cpp.ptr(memlet.data, sdfg.arrays[memlet.data], sdfg, self._frame) if (self._dispatcher.defined_vars.get(ptrname)[0] == DefinedType.FPGA_ShiftRegister): raise NotImplementedError("Shift register for Xilinx NYI") else: - self._cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) - def allocate_view(self, sdfg: dace.SDFG, dfg: dace.SDFGState, state_id: int, node: dace.nodes.AccessNode, - global_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): - return self._cpu_codegen.allocate_view(sdfg, dfg, state_id, node, global_stream, declaration_stream, + def allocate_view(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: dace.SDFGState, state_id: int, + node: dace.nodes.AccessNode, global_stream: CodeIOStream, declaration_stream: CodeIOStream, + allocation_stream: CodeIOStream) -> None: + return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, global_stream, declaration_stream, allocation_stream) - def generate_nsdfg_arguments(self, sdfg, dfg, state, node): + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): # Connectors that are both input and output share the same name, unless # they are pointers to global memory in device code, in which case they # are split into explicit input and output interfaces diff --git a/dace/sdfg/analysis/cfg.py b/dace/sdfg/analysis/cfg.py index b8d8739a7e..9ed28cc28f 100644 --- a/dace/sdfg/analysis/cfg.py +++ b/dace/sdfg/analysis/cfg.py @@ -1,31 +1,30 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -""" Various analyses related to control flow in SDFG states. """ +""" Various analyses related to control flow in SDFGs. 
""" from collections import defaultdict -from dace.sdfg import SDFG, SDFGState, InterstateEdge, graph as gr, utils as sdutil -from dace.symbolic import pystr_to_symbolic +from dace.sdfg import SDFGState, InterstateEdge, graph as gr, utils as sdutil import networkx as nx import sympy as sp from typing import Dict, Iterator, List, Optional, Set +from dace.sdfg.state import ControlFlowBlock, ControlFlowRegion -def acyclic_dominance_frontier(sdfg: SDFG, idom=None) -> Dict[SDFGState, Set[SDFGState]]: + +def acyclic_dominance_frontier(cfg: ControlFlowRegion, idom=None) -> Dict[ControlFlowBlock, Set[ControlFlowBlock]]: """ - Finds the dominance frontier for an SDFG while ignoring any back edges. + Finds the dominance frontier for a CFG while ignoring any back edges. - This is a modified version of the dominance frontiers algorithm as - implemented by networkx. + This is a modified version of the dominance frontiers algorithm as implemented by networkx. - :param sdfg: The SDFG for which to compute the acyclic dominance frontier. + :param cfg: The CFG for which to compute the acyclic dominance frontier. :param idom: Optional precomputed immediate dominators. - :return: A dictionary keyed by states, containing the dominance frontier - for each SDFG state. + :return: A dictionary keyed by control flow blocks, containing the dominance frontier for each control flow block. 
""" - idom = idom or nx.immediate_dominators(sdfg.nx, sdfg.start_state) + idom = idom or nx.immediate_dominators(cfg.nx, cfg.start_block) - dom_frontiers = {state: set() for state in sdfg.nodes()} + dom_frontiers = {block: set() for block in cfg.nodes()} for u in idom: - if len(sdfg.nx.pred[u]) >= 2: - for v in sdfg.nx.pred[u]: + if len(cfg.nx.pred[u]) >= 2: + for v in cfg.nx.pred[u]: if v in idom: df_candidates = set() while v != idom[u]: @@ -41,185 +40,193 @@ def acyclic_dominance_frontier(sdfg: SDFG, idom=None) -> Dict[SDFGState, Set[SDF return dom_frontiers -def all_dominators(sdfg: SDFG, idom: Dict[SDFGState, SDFGState] = None) -> Dict[SDFGState, Set[SDFGState]]: - """ Returns a mapping between each state and all its dominators. """ - idom = idom or nx.immediate_dominators(sdfg.nx, sdfg.start_state) - # Create a dictionary of all dominators of each node by using the - # transitive closure of the DAG induced by the idoms +def all_dominators( + cfg: ControlFlowRegion, + idom: Dict[ControlFlowBlock, ControlFlowBlock] = None) -> Dict[ControlFlowBlock, Set[ControlFlowBlock]]: + """ Returns a mapping between each control flow block and all its dominators. """ + idom = idom or nx.immediate_dominators(cfg.nx, cfg.start_block) + # Create a dictionary of all dominators of each node by using the transitive closure of the DAG induced by the idoms g = nx.DiGraph() for node, dom in idom.items(): if node is dom: # Skip root continue g.add_edge(node, dom) tc = nx.transitive_closure_dag(g) - alldoms: Dict[SDFGState, Set[SDFGState]] = {sdfg.start_state: set()} + alldoms: Dict[ControlFlowBlock, Set[ControlFlowBlock]] = {cfg.start_block: set()} for node in tc: alldoms[node] = set(dst for _, dst in tc.out_edges(node)) return alldoms -def back_edges(sdfg: SDFG, - idom: Dict[SDFGState, SDFGState] = None, - alldoms: Dict[SDFGState, SDFGState] = None) -> List[gr.Edge[InterstateEdge]]: - """ Returns a list of back-edges in an SDFG. 
""" - alldoms = alldoms or all_dominators(sdfg, idom) - return [e for e in sdfg.edges() if e.dst in alldoms[e.src]] +def back_edges(cfg: ControlFlowRegion, + idom: Dict[ControlFlowBlock, ControlFlowBlock] = None, + alldoms: Dict[ControlFlowBlock, ControlFlowBlock] = None) -> List[gr.Edge[InterstateEdge]]: + """ Returns a list of back-edges in a control flow graph. """ + alldoms = alldoms or all_dominators(cfg, idom) + return [e for e in cfg.edges() if e.dst in alldoms[e.src]] -def state_parent_tree(sdfg: SDFG, loopexits: Optional[Dict[SDFGState, SDFGState]] = None) -> Dict[SDFGState, SDFGState]: +def block_parent_tree(cfg: ControlFlowRegion, + loopexits: Optional[Dict[ControlFlowBlock, ControlFlowBlock]] = None, + idom: Dict[ControlFlowBlock, ControlFlowBlock] = None, + with_loops: bool = True) -> Dict[ControlFlowBlock, ControlFlowBlock]: """ - Computes an upward-pointing tree of each state, pointing to the "parent - state" it belongs to (in terms of structured control flow). More formally, - each state is either mapped to its immediate dominator with out degree > 2, - one state upwards if state occurs after a loop, or the start state if - no such states exist. + Computes an upward-pointing tree of each control flow block, pointing to the "parent block" it belongs to (in terms + of structured control flow). More formally, each block is either mapped to its immediate dominator with out + degree >= 2, one block upwards if the block occurs after a loop and `with_loops` is True, or the start block if + no such block exist. :param sdfg: The SDFG to analyze. - :return: A dictionary that maps each state to a parent state, or None - if the root (start) state. + :param idom: An optional, pre-computed immediate dominator dictionary. + :param with_loops: Respect loops in the parent computation, mapping blocks to a parent one block upwards of a loop + if the block occurs after a loop. Defaults to true. 
+ :return: A dictionary that maps each block to a parent block, or None if the root (start) block. """ - idom = nx.immediate_dominators(sdfg.nx, sdfg.start_state) - alldoms = all_dominators(sdfg, idom) - loopexits = loopexits if loopexits is not None else defaultdict(lambda: None) - - # First, annotate loops - for be in back_edges(sdfg, idom, alldoms): - guard = be.dst - laststate = be.src - if loopexits[guard] is not None: - continue + idom = idom or nx.immediate_dominators(cfg.nx, cfg.start_block) + if with_loops: + alldoms = all_dominators(cfg, idom) + loopexits = loopexits if loopexits is not None else defaultdict(lambda: None) + + # First, annotate loops + for be in back_edges(cfg, idom, alldoms): + guard = be.dst + laststate = be.src + if loopexits[guard] is not None: + continue - # Natural loops = one edge leads back to loop, another leads out - in_edges = sdfg.in_edges(guard) - out_edges = sdfg.out_edges(guard) + # Natural loops = one edge leads back to loop, another leads out + in_edges = cfg.in_edges(guard) + out_edges = cfg.out_edges(guard) - # A loop guard has at least one incoming edges (the backedge, performing the increment), and exactly two - # outgoing edges (loop and exit loop). - if len(in_edges) < 1 or len(out_edges) != 2: - continue + # A loop guard has at least one incoming edges (the backedge, performing the increment), and exactly two + # outgoing edges (loop and exit loop). + if len(in_edges) < 1 or len(out_edges) != 2: + continue - # The outgoing edges must be negations of one another. - if out_edges[0].data.condition_sympy() != (sp.Not(out_edges[1].data.condition_sympy())): - continue + # The outgoing edges must be negations of one another. + if out_edges[0].data.condition_sympy() != (sp.Not(out_edges[1].data.condition_sympy())): + continue - # Find all nodes that are between each branch and the guard. - # Condition makes sure the entire cycle is dominated by this node. 
- # If not, we're looking at a guard for a nested cycle, which we ignore for - # this cycle. - oa, ob = out_edges[0].dst, out_edges[1].dst - - reachable_a = False - a_reached_guard = False - - def cond_a(parent, child): - nonlocal reachable_a - nonlocal a_reached_guard - if reachable_a: # If last state has been reached, stop traversal - return False - if parent is laststate or child is laststate: # Reached back edge - reachable_a = True - a_reached_guard = True - return False - if oa not in alldoms[child]: # Traversed outside of the loop - return False - if child is guard: # Traversed back to guard - a_reached_guard = True - return False - return True # Keep traversing - - reachable_b = False - b_reached_guard = False - - def cond_b(parent, child): - nonlocal reachable_b - nonlocal b_reached_guard - if reachable_b: # If last state has been reached, stop traversal - return False - if parent is laststate or child is laststate: # Reached back edge - reachable_b = True - b_reached_guard = True - return False - if ob not in alldoms[child]: # Traversed outside of the loop - return False - if child is guard: # Traversed back to guard - b_reached_guard = True - return False - return True # Keep traversing - - list(sdutil.dfs_conditional(sdfg, (oa, ), cond_a)) - list(sdutil.dfs_conditional(sdfg, (ob, ), cond_b)) - - # Check which candidate states led back to guard - is_a_begin = a_reached_guard and reachable_a - is_b_begin = b_reached_guard and reachable_b - - loop_state = None - exit_state = None - if is_a_begin and not is_b_begin: - loop_state = oa - exit_state = ob - elif is_b_begin and not is_a_begin: - loop_state = ob - exit_state = oa - if loop_state is None or exit_state is None: - continue - loopexits[guard] = exit_state + # Find all nodes that are between each branch and the guard. + # Condition makes sure the entire cycle is dominated by this node. + # If not, we're looking at a guard for a nested cycle, which we ignore for + # this cycle. 
+ oa, ob = out_edges[0].dst, out_edges[1].dst + + reachable_a = False + a_reached_guard = False + + def cond_a(parent, child): + nonlocal reachable_a + nonlocal a_reached_guard + if reachable_a: # If last state has been reached, stop traversal + return False + if parent is laststate or child is laststate: # Reached back edge + reachable_a = True + a_reached_guard = True + return False + if oa not in alldoms[child]: # Traversed outside of the loop + return False + if child is guard: # Traversed back to guard + a_reached_guard = True + return False + return True # Keep traversing + + reachable_b = False + b_reached_guard = False + + def cond_b(parent, child): + nonlocal reachable_b + nonlocal b_reached_guard + if reachable_b: # If last state has been reached, stop traversal + return False + if parent is laststate or child is laststate: # Reached back edge + reachable_b = True + b_reached_guard = True + return False + if ob not in alldoms[child]: # Traversed outside of the loop + return False + if child is guard: # Traversed back to guard + b_reached_guard = True + return False + return True # Keep traversing + + list(sdutil.dfs_conditional(cfg, (oa, ), cond_a)) + list(sdutil.dfs_conditional(cfg, (ob, ), cond_b)) + + # Check which candidate states led back to guard + is_a_begin = a_reached_guard and reachable_a + is_b_begin = b_reached_guard and reachable_b + + loop_state = None + exit_state = None + if is_a_begin and not is_b_begin: + loop_state = oa + exit_state = ob + elif is_b_begin and not is_a_begin: + loop_state = ob + exit_state = oa + if loop_state is None or exit_state is None: + continue + loopexits[guard] = exit_state # Get dominators - parents: Dict[SDFGState, SDFGState] = {} - step_up: Set[SDFGState] = set() - for state in sdfg.nodes(): - curdom = idom[state] - if curdom == state: - parents[state] = None + parents: Dict[ControlFlowBlock, ControlFlowBlock] = {} + step_up: Set[ControlFlowBlock] = set() + for block in cfg.nodes(): + curdom = idom[block] + 
if curdom == block: + parents[block] = None continue while curdom != idom[curdom]: - if sdfg.out_degree(curdom) > 1: + if cfg.out_degree(curdom) > 1: break curdom = idom[curdom] - if sdfg.out_degree(curdom) == 2 and loopexits[curdom] is not None: - p = state + if with_loops and cfg.out_degree(curdom) == 2 and loopexits[curdom] is not None: + p = block while p != curdom and p != loopexits[curdom]: p = idom[p] if p == loopexits[curdom]: # Dominated by loop exit: do one more step up - step_up.add(state) + step_up.add(block) - parents[state] = curdom + parents[block] = curdom - # Step up - for state in step_up: - if parents[state] is not None and parents[parents[state]] is not None: - parents[state] = parents[parents[state]] + if with_loops: + # Step up for post-loop blocks. + for block in step_up: + if parents[block] is not None and parents[parents[block]] is not None: + parents[block] = parents[parents[block]] return parents -def _stateorder_topological_sort(sdfg: SDFG, - start: SDFGState, - ptree: Dict[SDFGState, SDFGState], - branch_merges: Dict[SDFGState, SDFGState], - stop: SDFGState = None, - visited: Set[SDFGState] = None, - loopexits: Optional[Dict[SDFGState, SDFGState]] = None) -> Iterator[SDFGState]: +def _blockorder_topological_sort( + cfg: ControlFlowRegion, + start: ControlFlowBlock, + ptree: Dict[ControlFlowBlock, ControlFlowBlock], + branch_merges: Dict[ControlFlowBlock, ControlFlowBlock], + stop: ControlFlowBlock = None, + visited: Set[ControlFlowBlock] = None, + loopexits: Optional[Dict[ControlFlowBlock, ControlFlowBlock]] = None) -> Iterator[ControlFlowBlock]: """ - Helper function for ``stateorder_topological_sort``. - - :param sdfg: SDFG. - :param start: Starting state for traversal. - :param ptree: State parent tree (computed from ``state_parent_tree``). - :param branch_merges: Dictionary mapping from branch state to its merge state. - :param stop: Stopping state to not traverse through (merge state of a - branch or guard state of a loop). 
- :return: Generator that yields states in state-order from ``start`` to - ``stop``. + Helper function for ``blockorder_topological_sort``. + + :param cfg: CFG. + :param start: Starting block for traversal. + :param ptree: Block parent tree (computed from ``block_parent_tree``). + :param branch_merges: Dictionary mapping from branch blocks to its merge block. + :param stop: Stopping blocks to not traverse through (e.g., merge blocks of a branch or guard block of a loop). + :param visited: Optionally, a set of already visited blocks. + :param loopexits: An optional dictionary of already identified loop guard to exit block mappings. + :return: Generator that yields control flow blocks in execution order from ``start`` to ``stop``. """ loopexits = loopexits if loopexits is not None else defaultdict(lambda: None) - # Traverse states in custom order + # Traverse blocks in custom order visited = visited or set() stack = [start] while stack: @@ -229,8 +236,8 @@ def _stateorder_topological_sort(sdfg: SDFG, yield node visited.add(node) - oe = sdfg.out_edges(node) - if len(oe) == 0: # End state + oe = cfg.out_edges(node) + if len(oe) == 0: # End block continue elif len(oe) == 1: # No traversal change stack.append(oe[0].dst) @@ -239,15 +246,25 @@ def _stateorder_topological_sort(sdfg: SDFG, # If loop, traverse body, then exit if node in loopexits: if oe[0].dst == loopexits[node]: - for s in _stateorder_topological_sort(sdfg, oe[1].dst, ptree, branch_merges, stop=node, - visited=visited, loopexits=loopexits): + for s in _blockorder_topological_sort(cfg, + oe[1].dst, + ptree, + branch_merges, + stop=node, + visited=visited, + loopexits=loopexits): yield s visited.add(s) stack.append(oe[0].dst) continue elif oe[1].dst == loopexits[node]: - for s in _stateorder_topological_sort(sdfg, oe[0].dst, ptree, branch_merges, stop=node, - visited=visited, loopexits=loopexits): + for s in _blockorder_topological_sort(cfg, + oe[0].dst, + ptree, + branch_merges, + stop=node, + 
visited=visited, + loopexits=loopexits): yield s visited.add(s) stack.append(oe[1].dst) @@ -255,52 +272,55 @@ def _stateorder_topological_sort(sdfg: SDFG, # Otherwise, passthrough to branch # Branch if node in branch_merges: - # Try to find merge state and traverse until reaching that - mergestate = branch_merges[node] + # Try to find merge block and traverse until reaching that + mergeblock = branch_merges[node] else: try: # Otherwise (e.g., with return/break statements), traverse through each branch, # stopping at the end of the current tree level. - mergestate = next(e.dst for e in sdfg.out_edges(stop) if ptree[e.dst] != stop) + mergeblock = next(e.dst for e in cfg.out_edges(stop) if ptree[e.dst] != stop) except (StopIteration, KeyError): # If that fails, simply traverse branches in arbitrary order - mergestate = stop + mergeblock = stop for branch in oe: - if branch.dst is mergestate: - # If we hit the merge state (if without else), defer to end of branch traversal + if branch.dst is mergeblock: + # If we hit the merge block (if without else), defer to end of branch traversal continue - for s in _stateorder_topological_sort(sdfg, + for s in _blockorder_topological_sort(cfg, branch.dst, ptree, branch_merges, - stop=mergestate, + stop=mergeblock, visited=visited, loopexits=loopexits): yield s visited.add(s) - stack.append(mergestate) + stack.append(mergeblock) -def stateorder_topological_sort(sdfg: SDFG) -> Iterator[SDFGState]: +def blockorder_topological_sort(cfg: ControlFlowRegion, + recursive: bool = True, + ignore_nonstate_blocks: bool = False) -> Iterator[ControlFlowBlock]: """ - Returns a generator that produces states in the order that they will be - executed, disregarding multiple loop iterations and employing topological - sort for branches. + Returns a generator that produces control flow blocks in the order that they will be executed, disregarding multiple + loop iterations and employing topological sort for branches. 
- :param sdfg: The SDFG to iterate over. - :return: Generator that yields states in state-order. + :param cfg: The CFG to iterate over. + :param recursive: Whether or not to recurse down hierarchies of control flow regions (not across Nested SDFGs). + :param ignore_nonstate_blocks: If true, only produce basic blocks / SDFGStates. Defaults to False. + :return: Generator that yields control flow blocks in execution-order. """ # Get parent states - loopexits: Dict[SDFGState, SDFGState] = defaultdict(lambda: None) - ptree = state_parent_tree(sdfg, loopexits) + loopexits: Dict[ControlFlowBlock, ControlFlowBlock] = defaultdict(lambda: None) + ptree = block_parent_tree(cfg, loopexits) # Annotate branches - branch_merges: Dict[SDFGState, SDFGState] = {} - adf = acyclic_dominance_frontier(sdfg) - ipostdom = sdutil.postdominators(sdfg) - for state in sdfg.nodes(): - oedges = sdfg.out_edges(state) + branch_merges: Dict[ControlFlowBlock, ControlFlowBlock] = {} + adf = acyclic_dominance_frontier(cfg) + ipostdom = sdutil.postdominators(cfg) + for state in cfg.nodes(): + oedges = cfg.out_edges(state) # Skip if not branch if len(oedges) <= 1: continue @@ -317,7 +337,18 @@ def stateorder_topological_sort(sdfg: SDFG) -> Iterator[SDFGState]: common_frontier |= frontier if len(common_frontier) == 1: branch_merges[state] = next(iter(common_frontier)) - elif len(common_frontier) > 1 and ipostdom[state] in common_frontier: + elif len(common_frontier) > 1 and ipostdom and ipostdom[state] in common_frontier: branch_merges[state] = ipostdom[state] - yield from _stateorder_topological_sort(sdfg, sdfg.start_state, ptree, branch_merges, loopexits=loopexits) + for block in _blockorder_topological_sort(cfg, cfg.start_block, ptree, branch_merges, loopexits=loopexits): + if isinstance(block, ControlFlowRegion): + if not ignore_nonstate_blocks: + yield block + if recursive: + yield from blockorder_topological_sort(block, recursive, ignore_nonstate_blocks) + elif isinstance(block, SDFGState): + 
yield block + else: + # Other control flow block. + if not ignore_nonstate_blocks: + yield block diff --git a/dace/sdfg/analysis/cutout.py b/dace/sdfg/analysis/cutout.py index 9d5437dbee..50272167bb 100644 --- a/dace/sdfg/analysis/cutout.py +++ b/dace/sdfg/analysis/cutout.py @@ -111,8 +111,8 @@ def to_json(self, hash=False): return cutout_json @classmethod - def from_json(cls, json_obj, context_info=None): - return super(SDFGCutout, cls).from_json(json_obj, context_info) + def from_json(cls, json_obj, context=None): + return super(SDFGCutout, cls).from_json(json_obj, context) @classmethod def from_transformation( diff --git a/dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py b/dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py index a5a2227d93..cf29466663 100644 --- a/dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py +++ b/dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py @@ -116,7 +116,7 @@ def dealias_sdfg(sdfg: SDFG): elif e.data.data == dst_data: e.data.data = new_dst_memlet.data - for e in nsdfg.edges(): + for e in nsdfg.all_interstate_edges(): repl_dict = dict() syms = e.data.read_symbols() for memlet in e.data.get_read_memlets(nsdfg.arrays): @@ -276,7 +276,7 @@ def remove_name_collisions(sdfg: SDFG): for nsdfg in sdfg.all_sdfgs_recursive(): # Rename duplicate states - for state in nsdfg.nodes(): + for state in nsdfg.states(): if state.label in state_names_seen: state.label = data.find_new_name(state.label, state_names_seen) state_names_seen.add(state.label) @@ -668,12 +668,12 @@ def totree(node: cf.ControlFlow, parent: cf.GeneralBlock = None) -> List[tn.Sche # Use the sub-nodes directly result = subnodes - elif isinstance(node, cf.SingleState): + elif isinstance(node, cf.BasicCFBlock): result = state_schedule_tree(node.state) # Add interstate assignments unrelated to structured control flow if parent is not None: - for e in sdfg.out_edges(node.state): + for e in node.state.parent_graph.out_edges(node.state): edge_body = [] if e not in 
parent.assignments_to_ignore: @@ -722,12 +722,14 @@ def totree(node: cf.ControlFlow, parent: cf.GeneralBlock = None) -> List[tn.Sche result.append(tn.WhileScope(header=node, children=totree(node.body))) elif isinstance(node, cf.DoWhileScope): result.append(tn.DoWhileScope(header=node, children=totree(node.body))) + elif isinstance(node, cf.GeneralLoopScope): + result.append(tn.GeneralLoopScope(header=node, children=totree(node.body))) else: # e.g., "SwitchCaseScope" raise tn.UnsupportedScopeException(type(node).__name__) - if node.first_state is not None: - result = [tn.StateLabel(state=node.first_state)] + result + if node.first_block is not None: + result = [tn.StateLabel(state=node.first_block)] + result return result diff --git a/dace/sdfg/analysis/schedule_tree/treenodes.py b/dace/sdfg/analysis/schedule_tree/treenodes.py index 5d3d2a6fa8..619b71b770 100644 --- a/dace/sdfg/analysis/schedule_tree/treenodes.py +++ b/dace/sdfg/analysis/schedule_tree/treenodes.py @@ -151,6 +151,38 @@ def as_string(self, indent: int = 0): return header + super().as_string(indent) + footer +@dataclass +class GeneralLoopScope(ControlFlowScope): + """ + General loop scope (representing a loop region). 
+ """ + header: cf.GeneralLoopScope + + def as_string(self, indent: int = 0): + loop = self.header.loop + if loop.update_statement and loop.init_statement and loop.loop_variable: + if loop.inverted: + pre_header = indent * INDENTATION + f'{loop.init_statement.as_string}\n' + header = indent * INDENTATION + 'do:\n' + pre_footer = (indent + 1) * INDENTATION + f'{loop.update_statement.as_string}\n' + footer = indent * INDENTATION + f'while {loop.loop_condition.as_string}' + return pre_header + header + super().as_string(indent) + '\n' + pre_footer + footer + else: + result = (indent * INDENTATION + + f'for {loop.init_statement.as_string}; ' + + f'{loop.loop_condition.as_string}; ' + + f'{loop.update_statement.as_string}:\n') + return result + super().as_string(indent) + else: + if loop.inverted: + header = indent * INDENTATION + 'do:\n' + footer = indent * INDENTATION + f'while {loop.loop_condition.as_string}' + return header + super().as_string(indent) + '\n' + footer + else: + result = indent * INDENTATION + f'while {loop.loop_condition.as_string}:\n' + return result + super().as_string(indent) + + @dataclass class IfScope(ControlFlowScope): """ diff --git a/dace/sdfg/replace.py b/dace/sdfg/replace.py index 0220fd990d..639f392535 100644 --- a/dace/sdfg/replace.py +++ b/dace/sdfg/replace.py @@ -3,7 +3,7 @@ import re import warnings -from typing import Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Union import sympy as sp @@ -12,6 +12,9 @@ from dace.codegen import cppunparse from dace.frontend.python.astutils import ASTFindReplace +if TYPE_CHECKING: + from dace.sdfg.state import StateSubgraphView + tokenize_cpp = re.compile(r'\b\w+\b') @@ -48,7 +51,7 @@ def _replsym(symlist, symrepl): return symlist -def replace_dict(subgraph: 'dace.sdfg.state.StateGraphView', +def replace_dict(subgraph: 'StateSubgraphView', repl: Dict[str, str], symrepl: Optional[Dict[symbolic.SymbolicType, symbolic.SymbolicType]] = None): """ @@ -80,7 +83,7 @@ 
def replace_dict(subgraph: 'dace.sdfg.state.StateGraphView', edge.data.volume = _replsym(edge.data.volume, symrepl) -def replace(subgraph: 'dace.sdfg.state.StateGraphView', name: str, new_name: str): +def replace(subgraph: 'StateSubgraphView', name: str, new_name: str): """ Finds and replaces all occurrences of a symbol or array in the given subgraph. @@ -95,7 +98,7 @@ def replace(subgraph: 'dace.sdfg.state.StateGraphView', name: str, new_name: str def replace_properties_dict(node: Any, repl: Dict[str, str], - symrepl: Dict[symbolic.SymbolicType, symbolic.SymbolicType] = None): + symrepl: Optional[Dict[symbolic.SymbolicType, symbolic.SymbolicType]] = None): symrepl = symrepl or { symbolic.pystr_to_symbolic(symname): symbolic.pystr_to_symbolic(new_name) if isinstance(new_name, str) else new_name @@ -163,15 +166,13 @@ def replace_properties_dict(node: Any, pass -def replace_properties(node: Any, symrepl: Dict[symbolic.symbol, symbolic.SymbolicType], name: str, new_name: str): +def replace_properties(node: Any, symrepl: Dict[symbolic.SymbolicType, symbolic.SymbolicType], + name: str, new_name: str): replace_properties_dict(node, {name: new_name}, symrepl) -def replace_datadesc_names(sdfg, repl: Dict[str, str]): +def replace_datadesc_names(sdfg: 'dace.SDFG', repl: Dict[str, str]): """ Reduced form of replace which only replaces data descriptor names. """ - from dace.sdfg import SDFG # Avoid import loop - sdfg: SDFG = sdfg - # Replace in descriptor repository for aname, aval in list(sdfg.arrays.items()): if aname in repl: diff --git a/dace/sdfg/scope.py b/dace/sdfg/scope.py index 95f278b06a..97fe12d7c2 100644 --- a/dace/sdfg/scope.py +++ b/dace/sdfg/scope.py @@ -1,6 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
import collections -from typing import Any, Dict, List, Tuple +from typing import Dict, List, Tuple import dace from dace import dtypes, symbolic @@ -8,21 +8,18 @@ from dace.sdfg import nodes as nd from dace.sdfg.state import StateSubgraphView -NodeType = 'dace.sdfg.nodes.Node' -EntryNodeType = 'dace.sdfg.nodes.EntryNode' -ExitNodeType = 'dace.sdfg.nodes.ExitNode' -ScopeDictType = Dict[NodeType, List[NodeType]] +ScopeDictType = Dict[nd.Node, List[nd.Node]] class ScopeTree(object): """ A class defining a scope, its parent and children scopes, and scope entry/exit nodes. """ - def __init__(self, entrynode: EntryNodeType, exitnode: ExitNodeType): + def __init__(self, entrynode: nd.EntryNode, exitnode: nd.ExitNode): self.parent: 'ScopeTree' = None self.children: List['ScopeTree'] = [] - self.entry: EntryNodeType = entrynode - self.exit: ExitNodeType = exitnode + self.entry: nd.EntryNode = entrynode + self.exit: nd.ExitNode = exitnode class ScopeSubgraphView(StateSubgraphView): @@ -127,7 +124,7 @@ def node_id_or_none(node): return {node_id_or_none(k): [node_id_or_none(vi) for vi in v] for k, v in scope_dict.items()} -def scope_contains_scope(sdict: ScopeDictType, node: NodeType, other_node: NodeType) -> bool: +def scope_contains_scope(sdict: ScopeDictType, node: nd.Node, other_node: nd.Node) -> bool: """ Returns true iff scope of `node` contains the scope of `other_node`. 
""" @@ -140,7 +137,7 @@ def scope_contains_scope(sdict: ScopeDictType, node: NodeType, other_node: NodeT return False -def _scope_path(sdict: ScopeDictType, scope: NodeType) -> List[NodeType]: +def _scope_path(sdict: ScopeDictType, scope: nd.Node) -> List[nd.Node]: result = [] curnode = scope while curnode is not None: @@ -149,7 +146,7 @@ def _scope_path(sdict: ScopeDictType, scope: NodeType) -> List[NodeType]: return result -def common_parent_scope(sdict: ScopeDictType, scope_a: NodeType, scope_b: NodeType) -> NodeType: +def common_parent_scope(sdict: ScopeDictType, scope_a: nd.Node, scope_b: nd.Node) -> nd.Node: """ Finds a common parent scope for both input scopes, or None if the scopes are in different connected components. @@ -181,7 +178,7 @@ def common_parent_scope(sdict: ScopeDictType, scope_a: NodeType, scope_b: NodeTy return common -def is_in_scope(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: NodeType, +def is_in_scope(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: nd.Node, schedules: List[dtypes.ScheduleType]) -> bool: """ Tests whether a node in an SDFG is contained within a certain set of scope schedules. @@ -214,7 +211,7 @@ def is_in_scope(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: Node def is_devicelevel_gpu(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', - node: NodeType, + node: nd.Node, with_gpu_default: bool = False) -> bool: """ Tests whether a node in an SDFG is contained within GPU device-level code. @@ -235,7 +232,7 @@ def is_devicelevel_gpu(sdfg: 'dace.sdfg.SDFG', ) -def is_devicelevel_gpu_kernel(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: NodeType) -> bool: +def is_devicelevel_gpu_kernel(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: nd.Node) -> bool: """ Tests whether a node in an SDFG is contained within an actual GPU kernel. 
The main difference from :func:`is_devicelevel_gpu` is that it returns False for NestedSDFGs that have a GPU device-level schedule, but are not within an actual GPU kernel. @@ -251,7 +248,7 @@ def is_devicelevel_gpu_kernel(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGStat return is_devicelevel_gpu(state.parent, state, node, with_gpu_default=True) -def is_devicelevel_fpga(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: NodeType) -> bool: +def is_devicelevel_fpga(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', node: nd.Node) -> bool: """ Tests whether a node in an SDFG is contained within FPGA device-level code. @@ -266,7 +263,7 @@ def is_devicelevel_fpga(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', no def devicelevel_block_size(sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', - node: NodeType) -> Tuple[symbolic.SymExpr]: + node: nd.Node) -> Tuple[symbolic.SymExpr]: """ Returns the current thread-block size if the given node is enclosed in a GPU kernel, or None otherwise. 
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 82d98c1e18..3e5f58a413 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -3,35 +3,26 @@ import collections import copy import ctypes -import itertools import gzip from numbers import Integral import os -import pickle, json +import json from hashlib import md5, sha256 -from pydoc import locate import random -import re import shutil import sys -import time -from typing import Any, AnyStr, Dict, Iterator, List, Optional, Sequence, Set, Tuple, Type, TYPE_CHECKING, Union +from typing import Any, AnyStr, Dict, List, Optional, Sequence, Set, Tuple, Type, TYPE_CHECKING, Union import warnings -import numpy as np -import sympy as sp import dace import dace.serialize -from dace import (data as dt, hooks, memlet as mm, subsets as sbs, dtypes, properties, symbolic) -from dace.sdfg.scope import ScopeTree -from dace.sdfg.replace import replace, replace_properties, replace_properties_dict +from dace import (data as dt, hooks, memlet as mm, subsets as sbs, dtypes, symbolic) +from dace.sdfg.replace import replace_properties_dict from dace.sdfg.validation import (InvalidSDFGError, validate_sdfg) from dace.config import Config -from dace.frontend.python import astutils, wrappers +from dace.frontend.python import astutils from dace.sdfg import nodes as nd -from dace.sdfg.graph import OrderedDiGraph, Edge, SubgraphView from dace.sdfg.state import ControlFlowBlock, SDFGState, ControlFlowRegion -from dace.sdfg.propagation import propagate_memlets_sdfg from dace.distr_types import ProcessGrid, SubArray, RedistrArray from dace.dtypes import validate_name from dace.properties import (DebugInfoProperty, EnumProperty, ListProperty, make_properties, Property, CodeProperty, @@ -183,7 +174,9 @@ class InterstateEdge(object): desc="Assignments to perform upon transition (e.g., 'x=x+1; y = 0')") condition = CodeProperty(desc="Transition condition", default=CodeBlock("1")) - def __init__(self, condition: Optional[Union[CodeBlock, str, 
ast.AST, list]] = None, assignments=None): + def __init__(self, + condition: Optional[Union[CodeBlock, str, ast.AST, list]] = None, + assignments: Optional[Dict] = None): if condition is None: condition = CodeBlock("1") @@ -584,8 +577,8 @@ def to_json(self, hash=False): return tmp @classmethod - def from_json(cls, json_obj, context_info=None): - context_info = context_info or {'sdfg': None} + def from_json(cls, json_obj, context=None): + context = context or {'sdfg': None} _type = json_obj['type'] if _type != cls.__name__: raise TypeError("Class type mismatch") @@ -599,7 +592,7 @@ def from_json(cls, json_obj, context_info=None): else: constants_prop = None - ret = SDFG(name=attrs['name'], constants=constants_prop, parent=context_info['sdfg']) + ret = SDFG(name=attrs['name'], constants=constants_prop, parent=context['sdfg']) dace.serialize.set_properties_from_json(ret, json_obj, @@ -607,12 +600,12 @@ def from_json(cls, json_obj, context_info=None): nodelist = [] for n in nodes: - nci = copy.copy(context_info) + nci = copy.copy(context) nci['sdfg'] = ret - state = SDFGState.from_json(n, context=nci) - ret.add_node(state) - nodelist.append(state) + block = dace.serialize.from_json(n, context=nci) + ret.add_node(block) + nodelist.append(block) for e in edges: e = dace.serialize.from_json(e) @@ -1234,7 +1227,7 @@ def _yield_nested_data(name, arr): if isinstance(arr, dt.Structure) and include_nested_data: yield from _yield_nested_data(aname, arr) yield self, aname, arr - for state in self.nodes(): + for state in self.states(): for node in state.nodes(): if isinstance(node, nd.NestedSDFG): yield from node.sdfg.arrays_recursive(include_nested_data=include_nested_data) @@ -2201,7 +2194,6 @@ def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': # Importing these outside creates an import loop from dace.codegen import codegen, compiler - from dace.sdfg import utils as sdutils # Compute build folder path before running codegen build_folder = self.build_folder 
@@ -2222,11 +2214,6 @@ def compile(self, output_file=None, validate=True) -> 'CompiledSDFG': # if the codegen modifies the SDFG (thereby changing its hash) sdfg.build_folder = build_folder - # Convert any loop constructs with hierarchical loop regions into simple 1-level state machine loops. - # TODO (later): Adapt codegen to deal with hierarchical CFGs instead. - sdutils.inline_loop_blocks(sdfg) - sdutils.inline_control_flow_regions(sdfg) - # Rename SDFG to avoid runtime issues with clashing names index = 0 while sdfg.is_loaded(): diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 45a7913f6a..1428564f4e 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -1107,6 +1107,10 @@ class ControlFlowBlock(BlockGraphView, abc.ABC): _label: str + _default_lineinfo: Optional[dace.dtypes.DebugInfo] = None + _sdfg: Optional['SDFG'] = None + _parent_graph: Optional['ControlFlowRegion'] = None + def __init__(self, label: str = '', sdfg: Optional['SDFG'] = None, parent: Optional['ControlFlowRegion'] = None): super(ControlFlowBlock, self).__init__() self._label = label @@ -1137,9 +1141,23 @@ def to_json(self, parent=None): 'collapsed': self.is_collapsed, 'label': self._label, 'id': parent.node_id(self) if parent is not None else None, + 'attributes': serialize.all_properties_to_json(self), } return tmp + @classmethod + def from_json(cls, json_obj, context=None): + context = context or {'sdfg': None, 'parent_graph': None} + _type = json_obj['type'] + if _type != cls.__name__: + raise TypeError("Class type mismatch") + + ret = cls(label=json_obj['label'], sdfg=context['sdfg']) + + dace.serialize.set_properties_from_json(ret, json_obj) + + return ret + def __str__(self): return self._label @@ -1353,7 +1371,7 @@ def to_json(self, parent=None): return ret @classmethod - def from_json(cls, json_obj, context={'sdfg': None}): + def from_json(cls, json_obj, context={'sdfg': None}, pre_ret=None): """ Loads the node properties, label and type into a dict. 
:param json_obj: The object containing information about this node. @@ -1369,7 +1387,9 @@ def from_json(cls, json_obj, context={'sdfg': None}): nodes = json_obj['nodes'] edges = json_obj['edges'] - ret = SDFGState(label=json_obj['label'], sdfg=context['sdfg'], debuginfo=None) + ret = pre_ret if pre_ret is not None else SDFGState(label=json_obj['label'], + sdfg=context['sdfg'], + debuginfo=None) rec_ci = { 'sdfg': context['sdfg'], @@ -2394,26 +2414,47 @@ def fill_scope_connectors(self): node.add_in_connector(edge.dst_conn) +@make_properties class ContinueBlock(ControlFlowBlock): """ Special control flow block to represent a continue inside of loops. """ def __repr__(self): return f'ContinueBlock ({self.label})' + def to_json(self, parent=None): + tmp = super().to_json(parent) + tmp['nodes'] = [] + tmp['edges'] = [] + return tmp + +@make_properties class BreakBlock(ControlFlowBlock): """ Special control flow block to represent a continue inside of loops or switch / select blocks. """ def __repr__(self): return f'BreakBlock ({self.label})' + def to_json(self, parent=None): + tmp = super().to_json(parent) + tmp['nodes'] = [] + tmp['edges'] = [] + return tmp + +@make_properties class ReturnBlock(ControlFlowBlock): """ Special control flow block to represent an early return out of the SDFG or a nested procedure / SDFG. """ def __repr__(self): return f'ReturnBlock ({self.label})' + def to_json(self, parent=None): + tmp = super().to_json(parent) + tmp['nodes'] = [] + tmp['edges'] = [] + return tmp + class StateSubgraphView(SubgraphView, DataflowGraphView): """ A read-only subgraph view of an SDFG state. 
""" @@ -2492,6 +2533,12 @@ def update_cfg_list(self, cfg_list): else: self._cfg_list = sub_cfg_list + def state(self, state_id: int) -> SDFGState: + node = self.node(state_id) + if not isinstance(node, SDFGState): + raise TypeError(f'The node with id {state_id} is not an SDFGState') + return node + def inline(self) -> Tuple[bool, Any]: """ Inlines the control flow region into its parent control flow region (if it exists). @@ -2544,6 +2591,9 @@ def inline(self) -> Tuple[bool, Any]: return False, None + ################################################################### + # CFG API methods + def add_return(self, label=None) -> ReturnBlock: label = self._ensure_unique_block_name(label) block = ReturnBlock(label) @@ -2601,7 +2651,7 @@ def add_node(self, self.start_block = len(self.nodes()) - 1 self._cached_start_block = node - def add_state(self, label=None, is_start_block=False, *, is_start_state: bool = None) -> SDFGState: + def add_state(self, label=None, is_start_block=False, *, is_start_state: Optional[bool] = None) -> SDFGState: label = self._ensure_unique_block_name(label) state = SDFGState(label) self._labels.add(label) @@ -2616,10 +2666,10 @@ def add_state_before(self, state: SDFGState, label=None, is_start_block=False, - condition: CodeBlock = None, - assignments=None, + condition: Optional[CodeBlock] = None, + assignments: Optional[Dict] = None, *, - is_start_state: bool = None) -> SDFGState: + is_start_state: Optional[bool] = None) -> SDFGState: """ Adds a new SDFG state before an existing state, reconnecting predecessors to it instead. :param state: The state to prepend the new state before. 
@@ -2642,10 +2692,10 @@ def add_state_after(self, state: SDFGState, label=None, is_start_block=False, - condition: CodeBlock = None, - assignments=None, + condition: Optional[CodeBlock] = None, + assignments: Optional[Dict] = None, *, - is_start_state: bool = None) -> SDFGState: + is_start_state: Optional[bool] = None) -> SDFGState: """ Adds a new SDFG state after an existing state, reconnecting it to the successors instead. :param state: The state to append the new state after. @@ -2664,6 +2714,49 @@ def add_state_after(self, self.add_edge(state, new_state, dace.sdfg.InterstateEdge(condition=condition, assignments=assignments)) return new_state + ################################################################### + # Traversal methods + + def all_control_flow_regions(self, recursive=False) -> Iterator['ControlFlowRegion']: + """ Iterate over this and all nested control flow regions. """ + yield self + for block in self.nodes(): + if isinstance(block, SDFGState) and recursive: + for node in block.nodes(): + if isinstance(node, nd.NestedSDFG): + yield from node.sdfg.all_control_flow_regions(recursive=recursive) + elif isinstance(block, ControlFlowRegion): + yield from block.all_control_flow_regions(recursive=recursive) + + def all_sdfgs_recursive(self) -> Iterator['SDFG']: + """ Iterate over this and all nested SDFGs. """ + for cfg in self.all_control_flow_regions(recursive=True): + if isinstance(cfg, dace.SDFG): + yield cfg + + def all_states(self) -> Iterator[SDFGState]: + """ Iterate over all states in this control flow graph. """ + for block in self.nodes(): + if isinstance(block, SDFGState): + yield block + elif isinstance(block, ControlFlowRegion): + yield from block.all_states() + + def all_control_flow_blocks(self, recursive=False) -> Iterator[ControlFlowBlock]: + """ Iterate over all control flow blocks in this control flow graph. 
""" + for cfg in self.all_control_flow_regions(recursive=recursive): + for block in cfg.nodes(): + yield block + + def all_interstate_edges(self, recursive=False) -> Iterator[Edge['dace.sdfg.InterstateEdge']]: + """ Iterate over all interstate edges in this control flow graph. """ + for cfg in self.all_control_flow_regions(recursive=recursive): + for edge in cfg.edges(): + yield edge + + ################################################################### + # Inherited / Overrides + def _used_symbols_internal(self, all_symbols: bool, defined_syms: Optional[Set] = None, @@ -2682,13 +2775,17 @@ def _used_symbols_internal(self, for block in ordered_blocks: state_symbols = set() if isinstance(block, ControlFlowRegion): - b_free_syms, b_defined_syms, b_used_before_syms = block._used_symbols_internal(all_symbols) + b_free_syms, b_defined_syms, b_used_before_syms = block._used_symbols_internal(all_symbols, + defined_syms, + free_syms, + used_before_assignment, + keep_defined_in_mapping) free_syms |= b_free_syms defined_syms |= b_defined_syms used_before_assignment |= b_used_before_syms state_symbols = b_free_syms else: - state_symbols = block.used_symbols(all_symbols) + state_symbols = block.used_symbols(all_symbols, keep_defined_in_mapping) free_syms |= state_symbols # Add free inter-state symbols @@ -2734,28 +2831,27 @@ def to_json(self, parent=None): return graph_json @classmethod - def from_json(cls, json_obj, context_info=None): - context_info = context_info or {'sdfg': None, 'parent_graph': None} + def from_json(cls, json_obj, context=None): + context = context or {'sdfg': None, 'parent_graph': None} _type = json_obj['type'] if _type != cls.__name__: raise TypeError("Class type mismatch") - attrs = json_obj['attributes'] nodes = json_obj['nodes'] edges = json_obj['edges'] - ret = ControlFlowRegion(label=attrs['label'], sdfg=context_info['sdfg']) + ret = cls(label=json_obj['label'], sdfg=context['sdfg']) dace.serialize.set_properties_from_json(ret, json_obj) nodelist 
= [] for n in nodes: - nci = copy.copy(context_info) + nci = copy.copy(context) nci['parent_graph'] = ret - state = SDFGState.from_json(n, context=nci) - ret.add_node(state) - nodelist.append(state) + block = dace.serialize.from_json(n, context=nci) + ret.add_node(block) + nodelist.append(block) for e in edges: e = dace.serialize.from_json(e) @@ -2767,47 +2863,7 @@ def from_json(cls, json_obj, context_info=None): return ret ################################################################### - # Traversal methods - - def all_control_flow_regions(self, recursive=False) -> Iterator['ControlFlowRegion']: - """ Iterate over this and all nested control flow regions. """ - yield self - for block in self.nodes(): - if isinstance(block, SDFGState) and recursive: - for node in block.nodes(): - if isinstance(node, nd.NestedSDFG): - yield from node.sdfg.all_control_flow_regions(recursive=recursive) - elif isinstance(block, ControlFlowRegion): - yield from block.all_control_flow_regions(recursive=recursive) - - def all_sdfgs_recursive(self) -> Iterator['SDFG']: - """ Iterate over this and all nested SDFGs. """ - for cfg in self.all_control_flow_regions(recursive=True): - if isinstance(cfg, dace.SDFG): - yield cfg - - def all_states(self) -> Iterator[SDFGState]: - """ Iterate over all states in this control flow graph. """ - for block in self.nodes(): - if isinstance(block, SDFGState): - yield block - elif isinstance(block, ControlFlowRegion): - yield from block.all_states() - - def all_control_flow_blocks(self, recursive=False) -> Iterator[ControlFlowBlock]: - """ Iterate over all control flow blocks in this control flow graph. """ - for cfg in self.all_control_flow_regions(recursive=recursive): - for block in cfg.nodes(): - yield block - - def all_interstate_edges(self, recursive=False) -> Iterator[Edge['dace.sdfg.InterstateEdge']]: - """ Iterate over all interstate edges in this control flow graph. 
""" - for cfg in self.all_control_flow_regions(recursive=recursive): - for edge in cfg.edges(): - yield edge - - ################################################################### - # Getters & setters, overrides + # Getters, setters, and builtins def __str__(self): return ControlFlowBlock.__str__(self) @@ -2895,12 +2951,13 @@ class LoopRegion(ControlFlowRegion): def __init__(self, label: str, - condition_expr: str, + condition_expr: Optional[str] = None, loop_var: Optional[str] = None, initialize_expr: Optional[str] = None, update_expr: Optional[str] = None, - inverted: bool = False): - super(LoopRegion, self).__init__(label) + inverted: bool = False, + sdfg: Optional['SDFG'] = None): + super(LoopRegion, self).__init__(label, sdfg) if initialize_expr is not None: self.init_statement = CodeBlock(initialize_expr) @@ -3059,9 +3116,10 @@ def _used_symbols_internal(self, b_free_symbols, b_defined_symbols, b_used_before_assignment = super()._used_symbols_internal( all_symbols, keep_defined_in_mapping=keep_defined_in_mapping) + outside_defined = defined_syms - used_before_assignment + used_before_assignment |= ((b_used_before_assignment - {self.loop_variable}) - outside_defined) free_syms |= b_free_symbols defined_syms |= b_defined_symbols - used_before_assignment |= (b_used_before_assignment - {self.loop_variable}) defined_syms -= used_before_assignment free_syms -= defined_syms @@ -3082,9 +3140,6 @@ def replace_dict(self, super().replace_dict(repl, symrepl, replace_in_graph) - def to_json(self, parent=None): - return super().to_json(parent) - def add_break(self, label=None) -> BreakBlock: label = self._ensure_unique_block_name(label) block = BreakBlock(label) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 12f66db85f..a90a232aeb 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -1577,7 +1577,7 @@ def is_fpga_kernel(sdfg, state): def postdominators( sdfg: SDFG, return_alldoms: bool = False -) -> Union[Dict[SDFGState, SDFGState], 
Tuple[Dict[SDFGState, SDFGState], Dict[SDFGState, Set[SDFGState]]]]: +) -> Optional[Union[Dict[SDFGState, SDFGState], Tuple[Dict[SDFGState, SDFGState], Dict[SDFGState, Set[SDFGState]]]]]: """ Return the immediate postdominators of an SDFG. This may require creating new nodes and removing them, which happens in-place on the SDFG. @@ -1594,6 +1594,8 @@ def postdominators( sink = sdfg.add_state() for snode in sink_nodes: sdfg.add_edge(snode, sink, dace.InterstateEdge()) + elif len(sink_nodes) == 0: + return None else: sink = sink_nodes[0] ipostdom: Dict[SDFGState, SDFGState] = nx.immediate_dominators(sdfg._nx.reverse(), sink) diff --git a/dace/transformation/dataflow/map_fission.py b/dace/transformation/dataflow/map_fission.py index a50605812b..89e3d2d90f 100644 --- a/dace/transformation/dataflow/map_fission.py +++ b/dace/transformation/dataflow/map_fission.py @@ -122,6 +122,7 @@ def can_be_applied(self, graph, expr_index, sdfg, permissive=False): return False # Get NestedSDFG control flow components + nsdfg_node.sdfg.reset_cfg_list() cf_comp = helpers.find_sdfg_control_flow(nsdfg_node.sdfg) if len(cf_comp) == 1: child = list(cf_comp.values())[0][1] diff --git a/dace/transformation/dataflow/map_for_loop.py b/dace/transformation/dataflow/map_for_loop.py index 4295e8a0eb..d7148fc651 100644 --- a/dace/transformation/dataflow/map_for_loop.py +++ b/dace/transformation/dataflow/map_for_loop.py @@ -111,6 +111,7 @@ def replace_param(param): self.nsdfg = nsdfg sdfg.reset_cfg_list() + # Ensure the SDFG is marked as containing CFG regions sdfg.root_sdfg.using_experimental_blocks = True return node, nstate diff --git a/dace/transformation/dataflow/prune_connectors.py b/dace/transformation/dataflow/prune_connectors.py index a2b48ec595..499f488448 100644 --- a/dace/transformation/dataflow/prune_connectors.py +++ b/dace/transformation/dataflow/prune_connectors.py @@ -124,7 +124,7 @@ def _candidates(nsdfg: nodes.NestedSDFG) -> Set[str]: candidates -= set(map(str, desc.free_symbols)) 
ignore = set() - for nstate in cfg.stateorder_topological_sort(nsdfg.sdfg): + for nstate in cfg.blockorder_topological_sort(nsdfg.sdfg): state_syms = nstate.free_symbols # Try to be conservative with C++ tasklets diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index cef0ca0fc6..f2b4ed622f 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -4,6 +4,7 @@ import itertools from networkx import MultiDiGraph +from dace.sdfg.state import ControlFlowRegion from dace.subsets import Range, Subset, union import dace.subsets as subsets from typing import Dict, List, Optional, Tuple, Set, Union @@ -270,7 +271,7 @@ def find_sdfg_control_flow(sdfg: SDFG) -> Dict[SDFGState, Set[SDFGState]]: components = {} visited = {} # Dict[SDFGState, bool]: True if SDFGState in Scope (non-SingleState) for i, child in enumerate(cft.children): - if isinstance(child, cf.SingleState): + if isinstance(child, cf.BasicCFBlock): if child.state in visited: continue components[child.state] = (set([child.state]), child) @@ -299,7 +300,7 @@ def find_sdfg_control_flow(sdfg: SDFG) -> Dict[SDFGState, Set[SDFGState]]: del components[guard] del visited[guard] - if not (i == len(cft.children) - 2 and isinstance(cft.children[i + 1], cf.SingleState) + if not (i == len(cft.children) - 2 and isinstance(cft.children[i + 1], cf.BasicCFBlock) and cft.children[i + 1].state is fexit): fexit_copy = _copy_state(sdfg, fexit, True, states) fexit.remove_nodes_from(fexit.nodes()) @@ -309,7 +310,7 @@ def find_sdfg_control_flow(sdfg: SDFG) -> Dict[SDFGState, Set[SDFGState]]: components[guard] = (states, child) visited.update({s: True for s in states}) elif isinstance(child, (cf.IfScope, cf.IfElseChain)): - guard = child.branch_state + guard = child.branch_block ifexit = ipostdom[guard] states = set(utils.dfs_conditional(sdfg, [guard], lambda p, _: p is not ifexit)) @@ -325,7 +326,7 @@ def find_sdfg_control_flow(sdfg: SDFG) -> Dict[SDFGState, Set[SDFGState]]: del 
components[guard] del visited[guard] - if not (i == len(cft.children) - 2 and isinstance(cft.children[i + 1], cf.SingleState) + if not (i == len(cft.children) - 2 and isinstance(cft.children[i + 1], cf.BasicCFBlock) and cft.children[i + 1].state is ifexit): ifexit_copy = _copy_state(sdfg, ifexit, True, states) ifexit.remove_nodes_from(ifexit.nodes()) @@ -644,6 +645,8 @@ def nest_state_subgraph(sdfg: SDFG, if state.in_degree(edge.dst) + state.out_degree(edge.dst) == 0: state.remove_node(edge.dst) + sdfg.reset_cfg_list() + return nested_sdfg @@ -954,20 +957,21 @@ def offset_map(state: SDFGState, subgraph.replace(param, f'({param} - {offset})') -def split_interstate_edges(sdfg: SDFG) -> None: +def split_interstate_edges(cfg: ControlFlowRegion) -> None: """ - Splits all inter-state edges into edges with conditions and edges with - assignments. This procedure helps in nested loop detection. + Splits all inter-state edges into edges with conditions and edges with assignments. + This procedure helps in nested loop detection. - :param sdfg: The SDFG to split - :note: Operates in-place on the SDFG. + :param cfg: The control flow graph to split + :note: Operates in-place on the graph. """ - for e in sdfg.edges(): - if e.data.assignments and not e.data.is_unconditional(): - tmpstate = sdfg.add_state() - sdfg.add_edge(e.src, tmpstate, InterstateEdge(condition=e.data.condition)) - sdfg.add_edge(tmpstate, e.dst, InterstateEdge(assignments=e.data.assignments)) - sdfg.remove_edge(e) + for cfg in cfg.all_control_flow_regions(): + for e in cfg.edges(): + if e.data.assignments and not e.data.is_unconditional(): + tmpstate = cfg.add_state() + cfg.add_edge(e.src, tmpstate, InterstateEdge(condition=e.data.condition)) + cfg.add_edge(tmpstate, e.dst, InterstateEdge(assignments=e.data.assignments)) + cfg.remove_edge(e) def is_symbol_unused(sdfg: SDFG, sym: str) -> bool: @@ -1392,7 +1396,7 @@ def replace_code_to_code_edges(sdfg: SDFG): :param sdfg: The SDFG to process. 
""" - for state in sdfg.nodes(): + for state in sdfg.states(): for edge in state.edges(): if not isinstance(edge.src, nodes.CodeNode) or not isinstance(edge.dst, nodes.CodeNode): continue diff --git a/dace/transformation/interstate/loop_to_map.py b/dace/transformation/interstate/loop_to_map.py index 7df057f1aa..7344b54161 100644 --- a/dace/transformation/interstate/loop_to_map.py +++ b/dace/transformation/interstate/loop_to_map.py @@ -115,7 +115,7 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi if symbolic.contains_sympy_functions(expr): return False - in_order_states = list(cfg.stateorder_topological_sort(sdfg)) + in_order_states = list(cfg.blockorder_topological_sort(sdfg)) loop_begin_idx = in_order_states.index(begin) loop_end_idx = in_order_states.index(body_end) @@ -138,7 +138,7 @@ def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissi for state in states: for e in sdfg.out_edges(state): # Collect read-before-assigned symbols (this works because the states are always in order, - # see above call to `stateorder_topological_sort`) + # see above call to `blockorder_topological_sort`) read_symbols = e.data.read_symbols() read_symbols -= symbols_that_may_be_used used_before_assignment |= read_symbols diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py index a25858b0d6..46411478d5 100644 --- a/dace/transformation/passes/array_elimination.py +++ b/dace/transformation/passes/array_elimination.py @@ -48,7 +48,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S # Traverse SDFG backwards try: - state_order = list(cfg.stateorder_topological_sort(sdfg)) + state_order = list(cfg.blockorder_topological_sort(sdfg)) except KeyError: return None for state in reversed(state_order): diff --git a/dace/transformation/passes/constant_propagation.py b/dace/transformation/passes/constant_propagation.py index b0a20f70d6..7c05b3ea38 
100644 --- a/dace/transformation/passes/constant_propagation.py +++ b/dace/transformation/passes/constant_propagation.py @@ -194,7 +194,7 @@ def _add_nested_datanames(name: str, desc: data.Structure): result[start_state].update(initial_symbols) # Traverse SDFG topologically - for state in optional_progressbar(cfg.stateorder_topological_sort(sdfg), 'Collecting constants', + for state in optional_progressbar(cfg.blockorder_topological_sort(sdfg), 'Collecting constants', sdfg.number_of_nodes(), self.progress): # NOTE: We must always check the start-state regardless if there are initial symbols. This is necessary # when the start-state is a scope's guard instead of a special initialization state, i.e., when the start- diff --git a/dace/transformation/passes/dead_dataflow_elimination.py b/dace/transformation/passes/dead_dataflow_elimination.py index fe181d01b4..856924abd2 100644 --- a/dace/transformation/passes/dead_dataflow_elimination.py +++ b/dace/transformation/passes/dead_dataflow_elimination.py @@ -65,7 +65,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[D # Traverse SDFG backwards try: - state_order = list(cfg.stateorder_topological_sort(sdfg)) + state_order = list(cfg.blockorder_topological_sort(sdfg)) except KeyError: return None for state in reversed(state_order): diff --git a/dace/transformation/transformation.py b/dace/transformation/transformation.py index bb4a730e24..25d61d1ce8 100644 --- a/dace/transformation/transformation.py +++ b/dace/transformation/transformation.py @@ -330,11 +330,13 @@ def apply_to(cls, sample_node = next(iter(where.values())) if isinstance(sample_node, SDFGState): - graph = sdfg + graph = sample_node.parent_graph state_id = -1 + cfg_id = graph.cfg_id elif isinstance(sample_node, nd.Node): - graph = next(s for s in sdfg.nodes() if sample_node in s.nodes()) - state_id = sdfg.node_id(graph) + graph = next(s for s in sdfg.states() if sample_node in s.nodes()) + state_id = graph.block_id + cfg_id = 
graph.parent_graph.cfg_id else: raise TypeError('Invalid node type "%s"' % type(sample_node).__name__) @@ -352,7 +354,7 @@ def apply_to(cls, # Construct subgraph and instantiate transformation subgraph = {required_node_names[k]: graph.node_id(where[k]) for k in required} instance = cls() - instance.setup_match(sdfg, sdfg.cfg_id, state_id, subgraph, expr_index) + instance.setup_match(sdfg, cfg_id, state_id, subgraph, expr_index) # Construct transformation parameters for optname, optval in options.items(): diff --git a/doc/general/errors.rst b/doc/general/errors.rst index f200cae5f8..d97420c590 100644 --- a/doc/general/errors.rst +++ b/doc/general/errors.rst @@ -11,7 +11,7 @@ The default traversal order of DaCe is not guaranteed to be deterministic. This write a transformation that depends on the order of nodes in the SDFG, it may not work as expected. To fix this, you can use the :func:`~dace.sdfg.utils.dfs_topological_sort` function to sort the nodes in a state. -For SDFG state machines, you can also use :func:`~dace.sdfg.analysis.cfg.stateorder_topological_sort`, which will +For SDFG state machines, you can also use :func:`~dace.sdfg.analysis.cfg.blockorder_topological_sort`, which will traverse the states in the approximate order of execution (i.e., preserving order and entering if/for scopes before continuing). diff --git a/doc/sdfg/ir.rst b/doc/sdfg/ir.rst index 9eb37153d5..61dc8d4858 100644 --- a/doc/sdfg/ir.rst +++ b/doc/sdfg/ir.rst @@ -744,7 +744,7 @@ can be added to the SDFG using the :meth:`~dace.sdfg.sdfg.SDFG.add_datadesc` met **Traversal**: Since nodes and edges are stored in arbitrary order, the API provides methods for traversing the graph by topological order. 
The method :func:`~dace.sdfg.utils.dfs_topological_sort` returns a list of nodes in a state, and -:func:`~dace.sdfg.analysis.cfg.stateorder_topological_sort` traverses the state machine in approximate order of execution +:func:`~dace.sdfg.analysis.cfg.blockorder_topological_sort` traverses the state machine in approximate order of execution (i.e., preserving order and entering if/for scopes before continuing). diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index eaad543e6c..2090002d03 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -25,7 +25,7 @@ # Type hints from dace.sdfg.graph import MultiConnectorEdge -from dace.sdfg.state import StateSubgraphView +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView from dace.codegen.prettycode import CodeIOStream from dace.codegen.dispatcher import DefinedType from typing import Any, List @@ -74,9 +74,9 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): self._dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) self._dispatcher.register_copy_dispatcher(dst_storage, src_storage, None, self) - def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, - nodedesc: dt.Array, function_stream: CodeIOStream, declaration_stream: CodeIOStream, - allocation_stream: CodeIOStream): + def allocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): # Make sure the codegen includes the appropriate header files _include_mma(sdfg) @@ -90,23 +90,24 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Write a fragment based on the storage type if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: ctype = 'wmma::fragment' - 
declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) else: ctype = 'wmma::fragment'.format( mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) - declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + declaration_stream.write(f'{ctype} {name};', cfg, state_id, node) # Add the ctype to defined_vars so that the codegen can properly pass # fragments to functions as an object reference. self._dispatcher.defined_vars.add(name, DefinedType.Object, ctype) - def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, - nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + def deallocate_array(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, + callsite_stream: CodeIOStream): pass # Nothing to deallocate (wmma::fragment is a C++ object) - def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, - dst_node: nodes.Node, edge: MultiConnectorEdge, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def copy_memory(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: # Obtain source and destination information, handle access<->tasklet # If copying from tensor core fragments to/from tasklets, we only need # to emit a reference, as the fragment contains the memory. 
@@ -114,14 +115,14 @@ def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, sr # Tasklet -> Array if not src_desc: local_name = dfg.memlet_path(edge)[0].src_conn - callsite_stream.write('auto& %s = %s;' % (local_name, dst_node.data), sdfg, state_id, [src_node, dst_node]) + callsite_stream.write('auto& %s = %s;' % (local_name, dst_node.data), cfg, state_id, [src_node, dst_node]) return dst_desc = (dst_node.desc(sdfg) if isinstance(dst_node, nodes.AccessNode) else None) # Array -> Tasklet if not dst_desc: local_name = dfg.memlet_path(edge)[-1].dst_conn - callsite_stream.write('auto& %s = %s;' % (local_name, src_node.data), sdfg, state_id, [src_node, dst_node]) + callsite_stream.write('auto& %s = %s;' % (local_name, src_node.data), cfg, state_id, [src_node, dst_node]) return nontc_desc = (dst_desc if 'TensorCore' in src_desc.storage.name else src_desc) @@ -147,7 +148,7 @@ def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, sr callsite_stream.write( 'wmma::load_matrix_sync({tc}, &{other}, ' '{stride});'.format(tc=dst_node.data, other=other_expr, stride=src_desc.strides[0 if row_major else 1]), - sdfg, state_id, [src_node, dst_node]) + cfg, state_id, [src_node, dst_node]) else: # Tensor Cores to GPU memory callsite_stream.write( @@ -155,12 +156,12 @@ def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, sr '{stride}, wmma::mem_{maj}_major);'.format(tc=src_node.data, other=other_expr, maj='row' if row_major else 'col', - stride=dst_desc.strides[0 if row_major else 1]), sdfg, + stride=dst_desc.strides[0 if row_major else 1]), cfg, state_id, [src_node, dst_node]) - def define_out_memlet(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, - dst_node: nodes.Node, edge: MultiConnectorEdge, function_stream: CodeIOStream, - callsite_stream: CodeIOStream): + def define_out_memlet(self, sdfg: dace.SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + 
src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): # Output memlets that are directed at WMMA fragments can use the "auto" # keyword for simplicity. callsite_stream.write(f'auto& {edge.src_conn} = {edge.data.data};') diff --git a/tests/python_frontend/loop_regions_test.py b/tests/python_frontend/loop_regions_test.py index b6509bb0c3..cb7fa30fd4 100644 --- a/tests/python_frontend/loop_regions_test.py +++ b/tests/python_frontend/loop_regions_test.py @@ -1,21 +1,10 @@ # Copyright 2019-2024 ETH Zurich and the DaCe authors. All rights reserved. -import pytest import dace import numpy as np from dace.frontend.python.common import DaceSyntaxError from dace.sdfg.state import LoopRegion -# NOTE: Some tests have been disabled due to issues with our control flow detection during codegen. -# The issue is documented in #1586, and in parts in #635. The problem causes the listed tests to fail when -# automatic simplification is turned off ONLY. There are several active efforts to address this issue. -# For one, there are fixes being made to the control flow detection itself (commits da7af41 and c830f92 -# are the start of that). Additionally, codegen is being adapted (in a separate, following PR) to make use -# of the control flow region constructs directly, circumventing this issue entirely. -# As such, disabling these tests is a very temporary solution that should not be longer lived than -# a few weeks at most. -# TODO: Re-enable after issues are addressed. 
- @dace.program def for_loop(): A = dace.ndarray([10], dtype=dace.int32) @@ -49,7 +38,6 @@ def for_loop_with_break_continue(): return A -@pytest.mark.skip(reason='Control flow detection issues through extraneous states, needs control flow detection fix') def test_for_loop_with_break_continue(): for_loop_with_break_continue.use_experimental_cfg_blocks = True @@ -79,7 +67,6 @@ def nested_for_loop(): return A -@pytest.mark.skip(reason='Control flow detection issues through extraneous states, needs control flow detection fix') def test_nested_for_loop(): nested_for_loop.use_experimental_cfg_blocks = True @@ -196,7 +183,6 @@ def nested_for_while_loop(): return A -@pytest.mark.skip(reason='Control flow detection issues through extraneous states, needs control flow detection fix') def test_nested_for_while_loop(): nested_for_while_loop.use_experimental_cfg_blocks = True @@ -230,7 +216,6 @@ def nested_while_for_loop(): return A -@pytest.mark.skip(reason='Control flow detection issues through extraneous states, needs control flow detection fix') def test_nested_while_for_loop(): nested_while_for_loop.use_experimental_cfg_blocks = True @@ -469,7 +454,6 @@ def test_nested_map_with_symbol(): assert (np.array_equal(val, ref)) -@pytest.mark.skip(reason='Control flow detection issues through extraneous states, needs control flow detection fix') def test_for_else(): @dace.program diff --git a/tests/python_frontend/loops_test.py b/tests/python_frontend/loops_test.py index 952d69b8fb..e0c869f20c 100644 --- a/tests/python_frontend/loops_test.py +++ b/tests/python_frontend/loops_test.py @@ -5,16 +5,6 @@ from dace.frontend.python.common import DaceSyntaxError -# NOTE: Some tests have been disabled due to issues with our control flow detection during codegen. -# The issue is documented in #1586, and in parts in #635. The problem causes the listed tests to fail when -# automatic simplification is turned off ONLY. There are several active efforts to address this issue. 
-# For one, there are fixes being made to the control flow detection itself (commits da7af41 and c830f92 -# are the start of that). Additionally, codegen is being adapted (in a separate, following PR) to make use -# of the control flow region constructs directly, circumventing this issue entirely. -# As such, disabling these tests is a very temporary solution that should not be longer lived than -# a few weeks at most. -# TODO: Re-enable after issues are addressed. - @dace.program def for_loop(): A = dace.ndarray([10], dtype=dace.int32) diff --git a/tests/sdfg/loop_region_test.py b/tests/sdfg/loop_region_test.py index 5742fc12ac..6aca54f40c 100644 --- a/tests/sdfg/loop_region_test.py +++ b/tests/sdfg/loop_region_test.py @@ -1,11 +1,14 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. import dace import numpy as np +from dace.sdfg.sdfg import SDFG from dace.sdfg.state import LoopRegion +from dace.sdfg.analysis.schedule_tree import sdfg_to_tree as s2t, treenodes as tn -def test_loop_regular_for(): +def _make_regular_for_loop() -> SDFG: sdfg = dace.SDFG('regular_for') + sdfg.using_experimental_blocks = True state0 = sdfg.add_state('state0', is_start_block=True) loop1 = LoopRegion(label='loop1', condition_expr='i < 10', loop_var='i', initialize_expr='i = 0', update_expr='i = i + 1', inverted=False) @@ -19,19 +22,12 @@ def test_loop_regular_for(): state3 = sdfg.add_state('state3') sdfg.add_edge(state0, loop1, dace.InterstateEdge()) sdfg.add_edge(loop1, state3, dace.InterstateEdge()) + return sdfg - assert sdfg.is_valid() - a_validation = np.zeros([10], dtype=np.float32) - a_test = np.zeros([10], dtype=np.float32) - sdfg(A=a_test) - for i in range(10): - a_validation[i] = i - assert np.allclose(a_validation, a_test) - - -def test_loop_regular_while(): +def _make_regular_while_loop() -> SDFG: sdfg = dace.SDFG('regular_while') + sdfg.using_experimental_blocks = True state0 = sdfg.add_state('state0', is_start_block=True) loop1 = 
LoopRegion(label='loop1', condition_expr='i < 10') sdfg.add_array('A', [10], dace.float32) @@ -46,19 +42,12 @@ def test_loop_regular_while(): state3 = sdfg.add_state('state3') sdfg.add_edge(state0, loop1, dace.InterstateEdge(assignments={'i': '0'})) sdfg.add_edge(loop1, state3, dace.InterstateEdge()) - - assert sdfg.is_valid() - - a_validation = np.zeros([10], dtype=np.float32) - a_test = np.zeros([10], dtype=np.float32) - sdfg(A=a_test) - for i in range(10): - a_validation[i] = i - assert np.allclose(a_validation, a_test) + return sdfg -def test_loop_do_while(): +def _make_do_while_loop() -> SDFG: sdfg = dace.SDFG('do_while') + sdfg.using_experimental_blocks = True sdfg.add_symbol('i', dace.int32) state0 = sdfg.add_state('state0', is_start_block=True) loop1 = LoopRegion(label='loop1', condition_expr='i < 10', inverted=True) @@ -73,18 +62,12 @@ def test_loop_do_while(): state3 = sdfg.add_state('state3') sdfg.add_edge(state0, loop1, dace.InterstateEdge(assignments={'i': '10'})) sdfg.add_edge(loop1, state3, dace.InterstateEdge()) + return sdfg - assert sdfg.is_valid() - a_validation = np.zeros([11], dtype=np.float32) - a_test = np.zeros([11], dtype=np.float32) - a_validation[10] = 10 - sdfg(A=a_test) - assert np.allclose(a_validation, a_test) - - -def test_loop_do_for(): +def _make_do_for_loop() -> SDFG: sdfg = dace.SDFG('do_for') + sdfg.using_experimental_blocks = True sdfg.add_symbol('i', dace.int32) sdfg.add_array('A', [10], dace.float32) state0 = sdfg.add_state('state0', is_start_block=True) @@ -100,19 +83,12 @@ def test_loop_do_for(): state3 = sdfg.add_state('state3') sdfg.add_edge(state0, loop1, dace.InterstateEdge()) sdfg.add_edge(loop1, state3, dace.InterstateEdge()) + return sdfg - assert sdfg.is_valid() - - a_validation = np.zeros([10], dtype=np.float32) - a_test = np.zeros([10], dtype=np.float32) - sdfg(A=a_test) - for i in range(10): - a_validation[i] = i - assert np.allclose(a_validation, a_test) - -def test_triple_nested_for(): +def 
_make_triple_nested_for_loop() -> SDFG: sdfg = dace.SDFG('gemm') + sdfg.using_experimental_blocks = True sdfg.add_symbol('i', dace.int32) sdfg.add_symbol('j', dace.int32) sdfg.add_symbol('k', dace.int32) @@ -146,6 +122,63 @@ def test_triple_nested_for(): red = reduce_state.add_reduce('lambda a, b: a + b', (2,), 0) reduce_state.add_edge(tmpnode2, None, red, None, dace.Memlet.simple('tmp', '0:N, 0:M, 0:K')) reduce_state.add_edge(red, None, cnode, None, dace.Memlet.simple('C', '0:N, 0:M')) + return sdfg + + +def test_loop_regular_for(): + sdfg = _make_regular_for_loop() + + assert sdfg.is_valid() + + a_validation = np.zeros([10], dtype=np.float32) + a_test = np.zeros([10], dtype=np.float32) + sdfg(A=a_test) + for i in range(10): + a_validation[i] = i + assert np.allclose(a_validation, a_test) + + +def test_loop_regular_while(): + sdfg = _make_regular_while_loop() + + assert sdfg.is_valid() + + a_validation = np.zeros([10], dtype=np.float32) + a_test = np.zeros([10], dtype=np.float32) + sdfg(A=a_test) + for i in range(10): + a_validation[i] = i + assert np.allclose(a_validation, a_test) + + +def test_loop_do_while(): + sdfg = _make_do_while_loop() + + assert sdfg.is_valid() + + a_validation = np.zeros([11], dtype=np.float32) + a_test = np.zeros([11], dtype=np.float32) + a_validation[10] = 10 + sdfg(A=a_test) + assert np.allclose(a_validation, a_test) + assert 'do {' in sdfg.generate_code()[0].code + + +def test_loop_do_for(): + sdfg = _make_do_for_loop() + + assert sdfg.is_valid() + + a_validation = np.zeros([10], dtype=np.float32) + a_test = np.zeros([10], dtype=np.float32) + sdfg(A=a_test) + for i in range(10): + a_validation[i] = i + assert np.allclose(a_validation, a_test) + + +def test_loop_triple_nested_for(): + sdfg = _make_triple_nested_for_loop() assert sdfg.is_valid() @@ -164,9 +197,79 @@ def test_triple_nested_for(): assert np.allclose(C_validation, C_test) +def test_loop_to_stree_regular_for(): + sdfg = _make_regular_for_loop() + + assert sdfg.is_valid() + 
+ stree = s2t.as_schedule_tree(sdfg) + + assert stree.as_string() == (f'{tn.INDENTATION}for i = 0; (i < 10); i = (i + 1):\n' + + f'{2 * tn.INDENTATION}A[i] = tasklet()') + + +def test_loop_to_stree_regular_while(): + sdfg = _make_regular_while_loop() + + assert sdfg.is_valid() + + stree = s2t.as_schedule_tree(sdfg) + + assert stree.as_string() == (f'{tn.INDENTATION}assign i = 0\n' + + f'{tn.INDENTATION}while (i < 10):\n' + + f'{2 * tn.INDENTATION}A[i] = tasklet()\n' + + f'{2 * tn.INDENTATION}assign i = (i + 1)') + + +def test_loop_to_stree_do_while(): + sdfg = _make_do_while_loop() + + assert sdfg.is_valid() + + stree = s2t.as_schedule_tree(sdfg) + + assert stree.as_string() == (f'{tn.INDENTATION}assign i = 10\n' + + f'{tn.INDENTATION}do:\n' + + f'{2 * tn.INDENTATION}A[i] = tasklet()\n' + + f'{2 * tn.INDENTATION}assign i = (i + 1)\n' + + f'{tn.INDENTATION}while (i < 10)') + + +def test_loop_to_stree_do_for(): + sdfg = _make_do_for_loop() + + assert sdfg.is_valid() + + stree = s2t.as_schedule_tree(sdfg) + + assert stree.as_string() == (f'{tn.INDENTATION}i = 0\n' + + f'{tn.INDENTATION}do:\n' + + f'{2 * tn.INDENTATION}A[i] = tasklet()\n' + + f'{2 * tn.INDENTATION}i = (i + 1)\n' + + f'{tn.INDENTATION}while (i < 10)') + + +def test_loop_to_stree_triple_nested_for(): + sdfg = _make_triple_nested_for_loop() + + assert sdfg.is_valid() + + stree = s2t.as_schedule_tree(sdfg) + + po_nodes = list(stree.preorder_traversal())[1:] + assert [type(n) for n in po_nodes] == [tn.GeneralLoopScope, tn.GeneralLoopScope, tn.GeneralLoopScope, + tn.TaskletNode, tn.LibraryCall] + + + if __name__ == '__main__': test_loop_regular_for() test_loop_regular_while() test_loop_do_while() test_loop_do_for() - test_triple_nested_for() + test_loop_triple_nested_for() + test_loop_to_stree_regular_for() + test_loop_to_stree_regular_while() + test_loop_to_stree_do_while() + test_loop_to_stree_do_for() + test_loop_to_stree_triple_nested_for() diff --git a/tests/transformations/nest_subgraph_test.py 
b/tests/transformations/nest_subgraph_test.py index 763bb3327d..623b029c3a 100644 --- a/tests/transformations/nest_subgraph_test.py +++ b/tests/transformations/nest_subgraph_test.py @@ -78,10 +78,10 @@ def symbolic_return(): assert i < len(cft.children) - 1 exit_scope = cft.children[i+1] - assert isinstance(exit_scope, cf.SingleState) + assert isinstance(exit_scope, cf.BasicCFBlock) guard = for_scope.guard - fexit = exit_scope.first_state + fexit = exit_scope.first_block states = list(utils.dfs_conditional(sdfg, [guard], lambda p, _: p is not fexit)) nest_sdfg_subgraph(sdfg, SubgraphView(sdfg, states), start=guard) diff --git a/tests/transformations/subgraph_fusion/block_allreduce_cudatest.py b/tests/transformations/subgraph_fusion/block_allreduce_cudatest.py index 7bb9055ade..f948d2032b 100644 --- a/tests/transformations/subgraph_fusion/block_allreduce_cudatest.py +++ b/tests/transformations/subgraph_fusion/block_allreduce_cudatest.py @@ -46,8 +46,6 @@ def test_blockallreduce(): print(np.linalg.norm(result2)) assert np.allclose(result1, result2) - print("PASS") - if __name__ == '__main__': test_blockallreduce()