diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 1cf8919d74..6425f01688 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -2620,6 +2620,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads: + callsite_stream.write('__syncthreads();', cfg, state_id) + if node.map.schedule == dtypes.ScheduleType.GPU_Device: # Remove grid invocation conditions for i in range(len(node.map.params)): diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index d29b1a22e4..fe2d523e82 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -930,6 +930,8 @@ class Map(object): "(including tuples) sets it explicitly.", serialize_if=lambda m: m.schedule in dtypes.GPU_SCHEDULES) + gpu_force_syncthreads = Property(dtype=bool, desc="Force a call to the __syncthreads for the map", default=False) + def __init__(self, label, params, diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py index bfa899e71a..8a6d75f4db 100644 --- a/dace/transformation/dataflow/tiling.py +++ b/dace/transformation/dataflow/tiling.py @@ -33,6 +33,8 @@ class MapTiling(transformation.SingleStateTransformation): divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly") tile_trivial = Property(dtype=bool, default=False, desc="Tiles even if tile_size is 1") + skew = Property(dtype=bool, default=False, desc="If True, offsets inner tile back such that it starts with zero") + @staticmethod def annotates_memlets(): return True @@ -92,6 +94,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = True stripmine.tile_offset = str(offset) + stripmine.skew = self.skew stripmine.apply(graph, sdfg) removed_maps += 1 else: @@ -101,6 +104,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG): stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = self.divides_evenly stripmine.tile_offset = str(offset) + stripmine.skew = self.skew stripmine.apply(graph, sdfg) # apply to the new map the schedule of the original one