Structure Support to NestedSDFGs and Python Frontend (#1366)
Adds basic support for nested data (Structures) to the Python frontend.
It also resolves issues with the use of Structures in nested SDFG scopes
(mostly code generation).

NOTE: This PR handles only CPU code generation and fixes issues with
libraries and transformations discovered during testing. It doesn't
handle GPU/FPGA code generation, which will be studied in subsequent
PRs.

---------

Co-authored-by: Tal Ben-Nun <[email protected]>
alexnick83 and tbennun authored Feb 22, 2024
1 parent 52d9f9f commit ab6647b
Showing 25 changed files with 729 additions and 262 deletions.
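
For context before the file-by-file changes: a minimal sketch of the kind of Python-frontend usage this PR enables, loosely modeled on the structure tests it adds. The container name, its members, and the kernel body are illustrative assumptions, not code taken from the diff.

import dace

M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz'))

# A nested data container (Structure) describing a CSR matrix with three member arrays.
CSR = dace.data.Structure(dict(indptr=dace.int32[M + 1],
                               indices=dace.int32[nnz],
                               data=dace.float32[nnz]),
                          name='CSRMatrix')

@dace.program
def csr_to_dense(A: CSR, B: dace.float32[M, N]):
    # Member accesses such as A.indptr become nested-data accesses ('A.indptr'),
    # which the CPU code generator below lowers to pointer dereferences ('A->indptr').
    for i in dace.map[0:M]:
        for j in dace.map[A.indptr[i]:A.indptr[i + 1]]:
            B[i, A.indices[j]] = A.data[j]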
23 changes: 20 additions & 3 deletions dace/codegen/targets/cpp.py
@@ -61,6 +61,13 @@ def copy_expr(
packed_types=False,
):
data_desc = sdfg.arrays[data_name]
# NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs?
# TODO: Study this when changing Structures to be (optionally?) non-pointers.
tokens = data_name.split('.')
if len(tokens) > 1 and tokens[0] in sdfg.arrays and isinstance(sdfg.arrays[tokens[0]], data.Structure):
name = data_name.replace('.', '->')
else:
name = data_name
ptrname = ptr(data_name, data_desc, sdfg, dispatcher.frame)
if relative_offset:
s = memlet.subset
@@ -99,6 +106,7 @@ def copy_expr(
# get conf flag
decouple_array_interfaces = Config.get_bool("compiler", "xilinx", "decouple_array_interfaces")

# TODO: Study structures on FPGAs. Should probably use 'name' instead of 'data_name' here.
expr = fpga.fpga_ptr(
data_name,
data_desc,
@@ -112,7 +120,7 @@
and not isinstance(data_desc, data.View),
decouple_array_interfaces=decouple_array_interfaces)
else:
expr = ptr(data_name, data_desc, sdfg, dispatcher.frame)
expr = ptr(name, data_desc, sdfg, dispatcher.frame)

add_offset = offset_cppstr != "0"

@@ -344,7 +352,7 @@ def make_const(expr: str) -> str:
is_scalar = False
elif defined_type == DefinedType.Scalar:
typedef = defined_ctype if is_scalar else (defined_ctype + '*')
if is_write is False:
if is_write is False and not isinstance(desc, data.Structure):
typedef = make_const(typedef)
ref = '&' if is_scalar else ''
defined_type = DefinedType.Scalar if is_scalar else DefinedType.Pointer
@@ -578,17 +586,26 @@ def cpp_array_expr(sdfg,
desc = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array)
offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices)

# NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs?
# TODO: Study this when changing Structures to be (optionally?) non-pointers.
tokens = memlet.data.split('.')
if len(tokens) > 1 and tokens[0] in sdfg.arrays and isinstance(sdfg.arrays[tokens[0]], data.Structure):
name = memlet.data.replace('.', '->')
else:
name = memlet.data

if with_brackets:
if fpga.is_fpga_array(desc):
# get conf flag
decouple_array_interfaces = Config.get_bool("compiler", "xilinx", "decouple_array_interfaces")
# TODO: Study structures on FPGAs. Should probably use 'name' instead of 'memlet.data' here.
ptrname = fpga.fpga_ptr(memlet.data,
desc,
sdfg,
subset,
decouple_array_interfaces=decouple_array_interfaces)
else:
ptrname = ptr(memlet.data, desc, sdfg, codegen)
ptrname = ptr(name, desc, sdfg, codegen)
return "%s[%s]" % (ptrname, offset_cppstr)
else:
return offset_cppstr
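
Both copy_expr and cpp_array_expr above now apply the same '.' to '->' translation before forming the pointer expression. A standalone sketch of that rule, with a simplified arrays dictionary standing in for sdfg.arrays:

from dace import data

def struct_access_name(data_name: str, arrays: dict) -> str:
    # Nested members of a Structure are reached through pointer dereferences in the
    # generated C++ (e.g., 'A.indptr' becomes 'A->indptr'); plain names pass through.
    tokens = data_name.split('.')
    if len(tokens) > 1 and tokens[0] in arrays and isinstance(arrays[tokens[0]], data.Structure):
        return data_name.replace('.', '->')
    return data_name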
131 changes: 75 additions & 56 deletions dace/codegen/targets/cpu.py
@@ -31,29 +31,7 @@ class CPUCodeGen(TargetCodeGenerator):
target_name = "cpu"
language = "cpp"

def __init__(self, frame_codegen, sdfg):
self._frame = frame_codegen
self._dispatcher: TargetDispatcher = frame_codegen.dispatcher
self.calling_codegen = self
dispatcher = self._dispatcher

self._locals = cppunparse.CPPLocals()
# Scope depth (for defining locals)
self._ldepth = 0

# Keep nested SDFG schedule when descending into it
self._toplevel_schedule = None

# FIXME: this allows other code generators to change the CPU
# behavior to assume that arrays point to packed types, thus dividing
# all addresses by the vector length.
self._packed_types = False

# Keep track of traversed nodes
self._generated_nodes = set()

# Keep track of generated NestedSDFGs, and the name of the assigned function
self._generated_nested_sdfg = dict()
def _define_sdfg_arguments(self, sdfg, arglist):

# NOTE: Multi-nesting with StructArrays must be further investigated.
def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''):
@@ -66,18 +44,18 @@ def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''):
args[f'{prefix}->{k}'] = v

# Keeps track of generated connectors, so we know how to access them in nested scopes
arglist = dict(self._frame.arglist)
for name, arg_type in self._frame.arglist.items():
args = dict(arglist)
for name, arg_type in arglist.items():
if isinstance(arg_type, data.Structure):
desc = sdfg.arrays[name]
_visit_structure(arg_type, arglist, name)
_visit_structure(arg_type, args, name)
elif isinstance(arg_type, data.StructArray):
desc = sdfg.arrays[name]
desc = desc.stype
_visit_structure(desc, arglist, name)
_visit_structure(desc, args, name)

for name, arg_type in arglist.items():
if isinstance(arg_type, (data.Scalar, data.Structure)):
for name, arg_type in args.items():
if isinstance(arg_type, data.Scalar):
# GPU global memory is only accessed via pointers
# TODO(later): Fix workaround somehow
if arg_type.storage is dtypes.StorageType.GPU_Global:
@@ -92,10 +70,40 @@ def _visit_structure(struct: data.Structure, args: dict, prefix: str = ''):
self._dispatcher.defined_vars.add(name, DefinedType.StreamArray, arg_type.as_arg(name=''))
else:
self._dispatcher.defined_vars.add(name, DefinedType.Stream, arg_type.as_arg(name=''))
elif isinstance(arg_type, data.Structure):
self._dispatcher.defined_vars.add(name, DefinedType.Pointer, arg_type.dtype.ctype)
else:
raise TypeError("Unrecognized argument type: {t} (value {v})".format(t=type(arg_type).__name__,
v=str(arg_type)))

def __init__(self, frame_codegen, sdfg):
self._frame = frame_codegen
self._dispatcher: TargetDispatcher = frame_codegen.dispatcher
self.calling_codegen = self
dispatcher = self._dispatcher

self._locals = cppunparse.CPPLocals()
# Scope depth (for defining locals)
self._ldepth = 0

# Keep nested SDFG schedule when descending into it
self._toplevel_schedule = None

# FIXME: this allows other code generators to change the CPU
# behavior to assume that arrays point to packed types, thus dividing
# all addresses by the vector length.
self._packed_types = False

# Keep track of traversed nodes
self._generated_nodes = set()

# Keep track of generated NestedSDFGs, and the name of the assigned function
self._generated_nested_sdfg = dict()

# Keeps track of generated connectors, so we know how to access them in nested scopes
arglist = dict(self._frame.arglist)
self._define_sdfg_arguments(sdfg, arglist)

# Register dispatchers
dispatcher.register_node_dispatcher(self)
dispatcher.register_map_dispatcher(
@@ -258,7 +266,7 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de
raise NotImplementedError("The declare_array method should only be used for variables "
"that must have their declaration and allocation separate.")

name = node.data
name = node.root_data
ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame)

if nodedesc.transient is False:
@@ -295,23 +303,40 @@ def declare_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, de
raise NotImplementedError("Unimplemented storage type " + str(nodedesc.storage))

def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream,
allocation_stream):
name = node.data
alloc_name = cpp.ptr(name, nodedesc, sdfg, self._frame)
allocation_stream, allocate_nested_data: bool = True):
alloc_name = cpp.ptr(node.data, nodedesc, sdfg, self._frame)
name = alloc_name

if nodedesc.transient is False:
tokens = node.data.split('.')
top_desc = sdfg.arrays[tokens[0]]
# NOTE: Assuming here that all Structure members share transient/storage/lifetime properties.
# TODO: Study what is needed in the DaCe stack to ensure this assumption is correct.
top_transient = top_desc.transient
top_storage = top_desc.storage
top_lifetime = top_desc.lifetime

if top_transient is False:
return

# Check if array is already allocated
if self._dispatcher.defined_vars.has(name):
return

# Check if array is already declared
declared = self._dispatcher.declared_arrays.has(name)

if len(tokens) > 1:
for i in range(len(tokens) - 1):
tmp_name = '.'.join(tokens[:i + 1])
tmp_alloc_name = cpp.ptr(tmp_name, sdfg.arrays[tmp_name], sdfg, self._frame)
if not self._dispatcher.defined_vars.has(tmp_alloc_name):
self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(tmp_name), sdfg.arrays[tmp_name],
function_stream, declaration_stream, allocation_stream,
allocate_nested_data=False)
declared = True
else:
# Check if array is already declared
declared = self._dispatcher.declared_arrays.has(name)

define_var = self._dispatcher.defined_vars.add
if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
if top_lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
define_var = self._dispatcher.defined_vars.add_global
nodedesc = update_persistent_desc(nodedesc, sdfg)

@@ -324,13 +349,14 @@ def allocate_array(self, sdfg, dfg, state_id, node, nodedesc, function_stream, d
if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView):
declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type};\n")
define_var(name, DefinedType.Pointer, nodedesc.ctype)
for k, v in nodedesc.members.items():
if isinstance(v, data.Data):
ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype
defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer
self._dispatcher.declared_arrays.add(f"{name}->{k}", defined_type, ctypedef)
self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream,
declaration_stream, allocation_stream)
if allocate_nested_data:
for k, v in nodedesc.members.items():
if isinstance(v, data.Data):
ctypedef = dtypes.pointer(v.dtype).ctype if isinstance(v, data.Array) else v.dtype.ctype
defined_type = DefinedType.Scalar if isinstance(v, data.Scalar) else DefinedType.Pointer
self._dispatcher.declared_arrays.add(f"{name}->{k}", defined_type, ctypedef)
self.allocate_array(sdfg, dfg, state_id, nodes.AccessNode(f"{name}.{k}"), v, function_stream,
declaration_stream, allocation_stream)
return
if isinstance(nodedesc, (data.StructureView, data.View)):
return self.allocate_view(sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream)
@@ -620,17 +646,6 @@ def _emit_copy(
#############################################
# Corner cases

# Writing one index
if (isinstance(memlet.subset, subsets.Indices) and memlet.wcr is None
and self._dispatcher.defined_vars.get(vconn)[0] == DefinedType.Scalar):
stream.write(
"%s = %s;" % (vconn, self.memlet_ctor(sdfg, memlet, dst_nodedesc.dtype, False)),
sdfg,
state_id,
[src_node, dst_node],
)
return

# Setting a reference
if isinstance(dst_nodedesc, data.Reference) and orig_vconn == 'set':
srcptr = cpp.ptr(src_node.data, src_nodedesc, sdfg, self._frame)
@@ -1586,6 +1601,10 @@ def _generate_NestedSDFG(
self._dispatcher.defined_vars.enter_scope(sdfg, can_access_parent=inline)
state_dfg = sdfg.nodes()[state_id]

fsyms = self._frame.free_symbols(node.sdfg)
arglist = node.sdfg.arglist(scalars_only=False, free_symbols=fsyms)
self._define_sdfg_arguments(node.sdfg, arglist)

# Quick sanity check.
# TODO(later): Is this necessary or "can_access_parent" should always be False?
if inline:
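
allocate_array above now walks a nested data name root-first, materializing each enclosing container before the requested member; the new allocate_nested_data flag keeps a parent-only call from also allocating every member. A small sketch of the visiting order, with the helper name being illustrative:

def nested_allocation_order(data_name: str) -> list:
    # Parents first: 'S.B.indices' is allocated as 'S', then 'S.B', then 'S.B.indices'.
    tokens = data_name.split('.')
    return ['.'.join(tokens[:i + 1]) for i in range(len(tokens))]

assert nested_allocation_order('S.B.indices') == ['S', 'S.B', 'S.B.indices']
assert nested_allocation_order('A') == ['A']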
6 changes: 4 additions & 2 deletions dace/codegen/targets/cuda.py
@@ -1023,10 +1023,12 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
if issubclass(node_dtype.type, ctypes.Structure):
callsite_stream.write('for (size_t __idx = 0; __idx < {arrlen}; ++__idx) '
'{{'.format(arrlen=array_length))
for field_name, field_type in node_dtype._data.items():
# TODO: Study further when tackling Structures on GPU.
for field_name, field_type in node_dtype._typeclass.fields.items():
if isinstance(field_type, dtypes.pointer):
tclass = field_type.type
length = node_dtype._length[field_name]

length = node_dtype._typeclass._length[field_name]
size = 'sizeof({})*{}[__idx].{}'.format(dtypes._CTYPES[tclass], str(src_node), length)
callsite_stream.write('DACE_GPU_CHECK({backend}Malloc(&{dst}[__idx].{fname}, '
'{sz}));'.format(dst=str(dst_node),
26 changes: 16 additions & 10 deletions dace/codegen/targets/framecode.py
@@ -539,7 +539,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
reachability = StateReachability().apply_pass(top_sdfg, {})
access_instances: Dict[int, Dict[str, List[Tuple[SDFGState, nodes.AccessNode]]]] = {}
for sdfg in top_sdfg.all_sdfgs_recursive():
shared_transients[sdfg.sdfg_id] = sdfg.shared_transients(check_toplevel=False)
shared_transients[sdfg.sdfg_id] = sdfg.shared_transients(check_toplevel=False, include_nested_data=True)
fsyms[sdfg.sdfg_id] = self.symbols_and_constants(sdfg)

#############################################
@@ -564,8 +564,14 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):

access_instances[sdfg.sdfg_id] = instances

for sdfg, name, desc in top_sdfg.arrays_recursive():
if not desc.transient:
for sdfg, name, desc in top_sdfg.arrays_recursive(include_nested_data=True):
# NOTE: Assuming here that all Structure members share transient/storage/lifetime properties.
# TODO: Study what is needed in the DaCe stack to ensure this assumption is correct.
top_desc = sdfg.arrays[name.split('.')[0]]
top_transient = top_desc.transient
top_storage = top_desc.storage
top_lifetime = top_desc.lifetime
if not top_transient:
continue
if name in sdfg.constants_prop:
# Constants do not need to be allocated
@@ -589,7 +595,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
access_instances[sdfg.sdfg_id].get(name, [(None, None)])[-1]

# Cases
if desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
if top_lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
# Persistent memory is allocated in initialization code and
# exists in the library state structure

@@ -599,13 +605,13 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):

definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'

if desc.storage != dtypes.StorageType.CPU_ThreadLocal: # If thread-local, skip struct entry
if top_storage != dtypes.StorageType.CPU_ThreadLocal: # If thread-local, skip struct entry
self.statestruct.append(definition)

self.to_allocate[top_sdfg].append((sdfg, first_state_instance, first_node_instance, True, True, True))
self.where_allocated[(sdfg, name)] = top_sdfg
continue
elif desc.lifetime is dtypes.AllocationLifetime.Global:
elif top_lifetime is dtypes.AllocationLifetime.Global:
# Global memory is allocated in the beginning of the program
# exists in the library state structure (to be passed along
# to the right SDFG)
@@ -627,15 +633,15 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
# a kernel).
alloc_scope: Union[nodes.EntryNode, SDFGState, SDFG] = None
alloc_state: SDFGState = None
if (name in shared_transients[sdfg.sdfg_id] or desc.lifetime is dtypes.AllocationLifetime.SDFG):
if (name in shared_transients[sdfg.sdfg_id] or top_lifetime is dtypes.AllocationLifetime.SDFG):
# SDFG descriptors are allocated in the beginning of their SDFG
alloc_scope = sdfg
if first_state_instance is not None:
alloc_state = first_state_instance
# If unused, skip
if first_node_instance is None:
continue
elif desc.lifetime == dtypes.AllocationLifetime.State:
elif top_lifetime == dtypes.AllocationLifetime.State:
# State memory is either allocated in the beginning of the
# containing state or the SDFG (if used in more than one state)
curstate: SDFGState = None
@@ -651,7 +657,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
else:
alloc_scope = curstate
alloc_state = curstate
elif desc.lifetime == dtypes.AllocationLifetime.Scope:
elif top_lifetime == dtypes.AllocationLifetime.Scope:
# Scope memory (default) is either allocated in the innermost
# scope (e.g., Map, Consume) it is used in (i.e., greatest
# common denominator), or in the SDFG if used in multiple states
@@ -671,7 +677,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG):
for node in state.nodes():
if not isinstance(node, nodes.AccessNode):
continue
if node.data != name:
if node.root_data != name:
continue

# If already found in another state, set scope to SDFG
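
determine_allocation_lifetime above now keys its decisions on the root container of a nested data name, under the stated assumption that all Structure members share transient/storage/lifetime properties. A minimal sketch of how the lifetime branches resolve; the helper name and return strings are illustrative:

from dace import dtypes

def allocation_site(sdfg, name: str) -> str:
    # For 'S.B.indices', allocation decisions are keyed on the descriptor of 'S'.
    top = sdfg.arrays[name.split('.')[0]]
    if not top.transient:
        return 'argument: never allocated by the frame code'
    if top.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
        return 'library state structure (initialization code)'
    if top.lifetime is dtypes.AllocationLifetime.Global:
        return 'beginning of the program'
    if top.lifetime is dtypes.AllocationLifetime.State:
        return 'containing state, or the SDFG if used in more than one state'
    return 'innermost scope it is used in, or the SDFG'  # Scope / SDFG lifetimes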