From 77c5c7205f5daabae4312cd975191fb48c67ac6f Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Fri, 29 Nov 2024 08:47:36 +0100 Subject: [PATCH] Minor Additions to Enable Tiling and Explicit Memory Movement Transformations (#1636) I made some minor additions to make implementing some transformations easier for me. I will explain each change and why I needed it. 1. Add gpu_force_syncthreads to force a call to __syncthreads in a map in dace/codegen/targets/cuda.py and dace/sdfg/nodes.py. - I preferred to tile work maps (e.g., K reduction for sum-of-inner-products matrix multiplication) of kernels in such a way that all new tiled maps are in the scope of the thread block map. However, when this is combined with shared memory, a `__syncthreads` call is necessary within the thread block map, and that call is not performed for sequential maps inside a thread-block-scheduled map. I would like to be able to force this behavior. 2. Add the skew option to the map tiling transformation. - Having every map start from 0 makes writing my transformations simpler. Therefore, I wanted the map tiling transformation to start the inner map at 0; I could only achieve this behavior by copying over the skew parameter from the strip mine transformation. I would still prefer to use the map tiling transformation instead of strip mine while having the skew parameter. 
--- dace/codegen/targets/cuda.py | 3 +++ dace/sdfg/nodes.py | 2 ++ dace/transformation/dataflow/tiling.py | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 1cf8919d74..6425f01688 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -2620,6 +2620,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads: + callsite_stream.write('__syncthreads();', cfg, state_id) + if node.map.schedule == dtypes.ScheduleType.GPU_Device: # Remove grid invocation conditions for i in range(len(node.map.params)): diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index d29b1a22e4..fe2d523e82 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -930,6 +930,8 @@ class Map(object): "(including tuples) sets it explicitly.", serialize_if=lambda m: m.schedule in dtypes.GPU_SCHEDULES) + gpu_force_syncthreads = Property(dtype=bool, desc="Force a call to the __syncthreads for the map", default=False) + def __init__(self, label, params, diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py index bfa899e71a..8a6d75f4db 100644 --- a/dace/transformation/dataflow/tiling.py +++ b/dace/transformation/dataflow/tiling.py @@ -33,6 +33,8 @@ class MapTiling(transformation.SingleStateTransformation): divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly") tile_trivial = Property(dtype=bool, default=False, desc="Tiles even if tile_size is 1") + skew = Property(dtype=bool, default=False, desc="If True, offsets inner tile back such that it starts with zero") + @staticmethod def annotates_memlets(): return True @@ -92,6 +94,7 @@ def 
apply(self, graph: SDFGState, sdfg: SDFG): stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = True stripmine.tile_offset = str(offset) + stripmine.skew = self.skew stripmine.apply(graph, sdfg) removed_maps += 1 else: @@ -101,6 +104,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = self.divides_evenly stripmine.tile_offset = str(offset) + stripmine.skew = self.skew stripmine.apply(graph, sdfg) # apply to the new map the schedule of the original one