From 77c5c7205f5daabae4312cd975191fb48c67ac6f Mon Sep 17 00:00:00 2001
From: Yakup Koray Budanaz <budanaz.yakup@gmail.com>
Date: Fri, 29 Nov 2024 08:47:36 +0100
Subject: [PATCH] Minor Additions to Enable Tiling and Explicit Memory Movement
 Transformations (#1636)

I made some minor additions to make implementing some transformations
easier for me. I will explain each change and why I needed it.

1. Add gpu_force_syncthreads to force a call to __syncthreads in a map
in dace/codegen/targets/cuda.py and dace/sdfg/nodes.py.
- I prefer to tile the work maps of kernels (e.g., the K reduction for
sum-of-inner-products matrix multiplication) in such a way that all new
tiled maps are in the scope of the thread block map. When this is
combined with shared memory, a `__syncthreads` call is necessary within
the thread block map, but it is not performed for sequential maps inside
a thread-block-scheduled map. I would like to be able to force this
behavior.

2. Add the skew option to the map tiling transformation.
- Having every map start from 0 makes writing my transformations
simpler. Therefore, I wanted the map tiling transformation to start the
inner map at 0; I could only achieve this behavior by copying over the
skew parameter from the strip mine transformation. I would still prefer
to use the map tiling transformation instead of strip mine while having
the skew parameter.
---
 dace/codegen/targets/cuda.py           | 3 +++
 dace/sdfg/nodes.py                     | 2 ++
 dace/transformation/dataflow/tiling.py | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 1cf8919d74..6425f01688 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -2620,6 +2620,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
+            callsite_stream.write('__syncthreads();', cfg, state_id)
+
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):
diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py
index d29b1a22e4..fe2d523e82 100644
--- a/dace/sdfg/nodes.py
+++ b/dace/sdfg/nodes.py
@@ -930,6 +930,8 @@ class Map(object):
                                  "(including tuples) sets it explicitly.",
                                  serialize_if=lambda m: m.schedule in dtypes.GPU_SCHEDULES)
 
+    gpu_force_syncthreads = Property(dtype=bool, desc="Force a call to the __syncthreads for the map", default=False)
+
     def __init__(self,
                  label,
                  params,
diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py
index bfa899e71a..8a6d75f4db 100644
--- a/dace/transformation/dataflow/tiling.py
+++ b/dace/transformation/dataflow/tiling.py
@@ -33,6 +33,8 @@ class MapTiling(transformation.SingleStateTransformation):
     divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly")
     tile_trivial = Property(dtype=bool, default=False, desc="Tiles even if tile_size is 1")
 
+    skew = Property(dtype=bool, default=False, desc="If True, offsets inner tile back such that it starts with zero")
+
     @staticmethod
     def annotates_memlets():
         return True
@@ -92,6 +94,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = True
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
                 removed_maps += 1
             else:
@@ -101,6 +104,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = self.divides_evenly
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
 
             # apply to the new map the schedule of the original one