From 7c06755142d29981e4d8e66edd7315df392400e9 Mon Sep 17 00:00:00 2001
From: edopao
Date: Mon, 18 Dec 2023 19:57:45 +0100
Subject: [PATCH] Fix for CUDA codegen (#1442)

This PR addresses #1388: fix the Python codegen and the `SharedToGlobal1D`
template to generate correct code for writes without reduction.
---
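Note: a minimal sketch of the write-without-reduction pattern that this change
targets, adapted from the test added in this PR (assumes a DaCe installation
with CUDA support; the final check mirrors the one in the new test):

    import dace
    from dace.transformation.auto import auto_optimize

    M = 32
    N = dace.symbol('N')

    @dace.program
    def transpose_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]):
        for i in dace.map[0:N]:
            # gather one column of A into shared memory ...
            local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared)
            for j in dace.map[0:M]:
                local_gather[j] = A[j, i]
            # ... then write it back to global memory without a reduction
            B[i, :] = local_gather

    sdfg = transpose_shared_to_global.to_sdfg()
    auto_optimize.apply_gpu_storage(sdfg)
    # The GPU code (second generated file) should now call
    # dace::SharedToGlobal1D<...>::Copy for the plain write instead of a reduction.
    print('dace::SharedToGlobal1D<' in sdfg.generate_code()[1].clean_code)
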
 dace/codegen/targets/cuda.py            | 16 ++++-
 dace/runtime/include/dace/cuda/copy.cuh | 53 ++++++++++------
 tests/codegen/cuda_memcopy_test.py      | 84 +++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 20 deletions(-)
 create mode 100644 tests/codegen/cuda_memcopy_test.py

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index b729b34088..5060339e18 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1132,10 +1132,22 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
                         func=funcname,
                         type=dst_node.desc(sdfg).dtype.ctype,
                         bdims=', '.join(_topy(self._block_dims)),
-                        is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                        is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
                         accum=accum,
                         args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction +
                                        _topy(dst_strides) + _topy(copy_shape))), sdfg, state_id, [src_node, dst_node])
+            elif funcname == 'dace::SharedToGlobal1D':
+                # special case: use a new template struct that provides functions for copy and reduction
+                callsite_stream.write(
+                    ('    {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format(
+                        func=funcname,
+                        type=dst_node.desc(sdfg).dtype.ctype,
+                        bdims=', '.join(_topy(self._block_dims)),
+                        copysize=', '.join(_topy(copy_shape)),
+                        is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
+                        accum=accum or '::Copy',
+                        args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + _topy(dst_strides) + custom_reduction)), sdfg,
+                    state_id, [src_node, dst_node])
             else:
                 callsite_stream.write(
                     ('    {func}<{type}, {bdims}, {copysize}, ' +
@@ -1145,7 +1157,7 @@ def _emit_copy(self, state_id, src_node, src_storage, dst_node, dst_storage, dst
                         bdims=', '.join(_topy(self._block_dims)),
                         copysize=', '.join(_topy(copy_shape)),
                         dststrides=', '.join(_topy(dst_strides)),
-                        is_async='true' if state_dfg.out_degree(dst_node) > 0 else 'true',
+                        is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false',
                         accum=accum,
                         args=', '.join([src_expr] + _topy(src_strides) + [dst_expr] + custom_reduction)), sdfg,
                         state_id, [src_node, dst_node])
diff --git a/dace/runtime/include/dace/cuda/copy.cuh b/dace/runtime/include/dace/cuda/copy.cuh
index 52462a906d..db3d715301 100644
--- a/dace/runtime/include/dace/cuda/copy.cuh
+++ b/dace/runtime/include/dace/cuda/copy.cuh
@@ -736,60 +736,77 @@ namespace dace
               int COPY_XLEN, bool ASYNC>
     struct SharedToGlobal1D
     {
-        template <typename WCR>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE, WCR wcr)
+        static constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
+        static constexpr int TOTAL = COPY_XLEN;
+        static constexpr int WRITES = TOTAL / BLOCK_SIZE;
+        static constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;
+
+        static DACE_DFI void Copy(const T *smem, int src_xstride, T *ptr, int dst_xstride)
         {
+            // Linear thread ID
+            int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
+
+            #pragma unroll
+            for (int i = 0; i < WRITES; ++i) {
+                *(ptr + (ltid + i * BLOCK_SIZE) * dst_xstride) =
+                    *(smem + (ltid + i * BLOCK_SIZE) * src_xstride);
+            }
+
+            if (REM_WRITES != 0 && ltid < REM_WRITES) {
+                *(ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride) =
+                    *(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride);
+            }
+            if (!ASYNC) __syncthreads();
+        }
+        template <typename WCR>
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride, WCR wcr)
+        {
             // Linear thread ID
             int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
-            constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
-            constexpr int TOTAL = COPY_XLEN;
-            constexpr int WRITES = TOTAL / BLOCK_SIZE;
-            constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;
 
             #pragma unroll
             for (int i = 0; i < WRITES; ++i) {
                 wcr_custom<T>::template reduce(
-                    wcr, ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    wcr, ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
                     *(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
             }
 
             if (REM_WRITES != 0) {
                 if (ltid < REM_WRITES)
                     wcr_custom<T>::template reduce(
-                        ptr + (ltid + WRITES * BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES * BLOCK_SIZE)* dst_xstride,
                         *(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
             }
+
+            if (!ASYNC)
+                __syncthreads();
         }
 
         template <ReductionType REDTYPE>
-        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int DST_XSTRIDE)
+        static DACE_DFI void Accum(const T *smem, int src_xstride, T *ptr, int dst_xstride)
         {
-            if (!ASYNC)
-                __syncthreads();
-
             // Linear thread ID
             int ltid = GetLinearTID<BLOCK_WIDTH, BLOCK_HEIGHT, BLOCK_DEPTH>();
-            constexpr int BLOCK_SIZE = BLOCK_WIDTH * BLOCK_HEIGHT * BLOCK_DEPTH;
-            constexpr int TOTAL = COPY_XLEN;
-            constexpr int WRITES = TOTAL / BLOCK_SIZE;
-            constexpr int REM_WRITES = TOTAL % BLOCK_SIZE;
 
             #pragma unroll
             for (int i = 0; i < WRITES; ++i) {
                 wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                    ptr + (ltid + i * BLOCK_SIZE) * DST_XSTRIDE,
+                    ptr + (ltid + i * BLOCK_SIZE) * dst_xstride,
                     *(smem + (ltid + i * BLOCK_SIZE) * src_xstride));
             }
 
             if (REM_WRITES != 0) {
                 if (ltid < REM_WRITES)
                     wcr_fixed<REDTYPE, T>::template reduce_atomic(
-                        ptr + (ltid + WRITES*BLOCK_SIZE)* DST_XSTRIDE,
+                        ptr + (ltid + WRITES*BLOCK_SIZE)* dst_xstride,
                         *(smem + (ltid + WRITES * BLOCK_SIZE) * src_xstride));
             }
+
+            if (!ASYNC)
+                __syncthreads();
         }
     };
 
diff --git a/tests/codegen/cuda_memcopy_test.py b/tests/codegen/cuda_memcopy_test.py
new file mode 100644
index 0000000000..a10f57eecd
--- /dev/null
+++ b/tests/codegen/cuda_memcopy_test.py
@@ -0,0 +1,84 @@
+""" Tests code generation for array copy on GPU target.
""" +import dace +from dace.transformation.auto import auto_optimize + +import pytest +import re + +# this test requires cupy module +cp = pytest.importorskip("cupy") + +# initialize random number generator +rng = cp.random.default_rng(42) + + +@pytest.mark.gpu +def test_gpu_shared_to_global_1D(): + M = 32 + N = dace.symbol('N') + + @dace.program + def transpose_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]): + for i in dace.map[0:N]: + local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared) + for j in dace.map[0:M]: + local_gather[j] = A[j, i] + B[i, :] = local_gather + + + sdfg = transpose_shared_to_global.to_sdfg() + auto_optimize.apply_gpu_storage(sdfg) + + size_M = M + size_N = 128 + + A = rng.random((size_M, size_N,)) + B = rng.random((size_N, size_M,)) + + ref = A.transpose() + + sdfg(A, B, N=size_N) + cp.allclose(ref, B) + + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + m = re.search('dace::SharedToGlobal1D<.+>::Copy', code) + assert m is not None + + +@pytest.mark.gpu +def test_gpu_shared_to_global_1D_accumulate(): + M = 32 + N = dace.symbol('N') + + @dace.program + def transpose_and_add_shared_to_global(A: dace.float64[M, N], B: dace.float64[N, M]): + for i in dace.map[0:N]: + local_gather = dace.define_local([M], A.dtype, storage=dace.StorageType.GPU_Shared) + for j in dace.map[0:M]: + local_gather[j] = A[j, i] + local_gather[:] >> B(M, lambda x, y: x + y)[i, :] + + + sdfg = transpose_and_add_shared_to_global.to_sdfg() + auto_optimize.apply_gpu_storage(sdfg) + + size_M = M + size_N = 128 + + A = rng.random((size_M, size_N,)) + B = rng.random((size_N, size_M,)) + + ref = A.transpose() + B + + sdfg(A, B, N=size_N) + cp.allclose(ref, B) + + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + m = re.search('dace::SharedToGlobal1D<.+>::template Accum', code) + assert m is not None + + +if __name__ == '__main__': + test_gpu_shared_to_global_1D() + test_gpu_shared_to_global_1D_accumulate() +