Increase block size used in FillGpu() to 1024

The maximum x- or y-dimension of a block is 1024 on devices with compute capability 2.x or higher. cuTENSOR and cuQuantum both require a compute capability much higher than 2.x. Furthermore, it is now hard to find a device that only supports compute capability 1.3 or lower.
Cytnx-dev · Nov 10, 2024 · 77936f6 · 77936f6
1 parent bf1aa8f
commit 77936f6
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/src/backend/utils_internal_gpu/cuFill_gpu.cu b/src/backend/utils_internal_gpu/cuFill_gpu.cu
@@ -31,8 +31,8 @@ namespace cytnx {
       using CudaDType = typename ToCudaDType<DType>::type;
 
       CudaDType* typed_first = reinterpret_cast<CudaDType*>(first);
-      cytnx_uint64 block_count = (count + 511) / 512;
-      FillGpuKernel<<<block_count, 512>>>(typed_first, static_cast<CudaDType>(value), count);
+      cytnx_uint64 block_count = (count + 1023) / 1024;
+      FillGpuKernel<<<block_count, 1024>>>(typed_first, static_cast<CudaDType>(value), count);
     }
 
     template void FillGpu<cytnx_complex128>(void*, const cytnx_complex128&, cytnx_uint64);