From 19a3c3917ea343cbe0b24a5228b568478a7509e0 Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Sun, 10 Nov 2024 16:10:47 +0800
Subject: [PATCH 1/3] Templatise `Fill()` functions used by `Storage`

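Replace the per-dtype `Fill_cpu_*()` and `cuFill_gpu_*()` helpers with the
`FillCpu()` and `FillGpu()` templates, in preparation for refactoring the
`Storage` classes.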
---
 src/backend/BoolStorage.cpp                   |  36 ++---
 src/backend/ComplexDoubleStorage.cpp          |  44 +++---
 src/backend/ComplexFloatStorage.cpp           |  44 +++---
 src/backend/DoubleStorage.cpp                 |  36 ++---
 src/backend/FloatStorage.cpp                  |  36 ++---
 src/backend/Int16Storage.cpp                  |  36 ++---
 src/backend/Int32Storage.cpp                  |  36 ++---
 src/backend/Int64Storage.cpp                  |  36 ++---
 src/backend/Uint16Storage.cpp                 |  36 ++---
 src/backend/Uint32Storage.cpp                 |  36 ++---
 src/backend/Uint64Storage.cpp                 |  36 ++---
 src/backend/utils_internal_cpu/CMakeLists.txt |   1 -
 src/backend/utils_internal_cpu/Fill_cpu.cpp   | 145 ------------------
 src/backend/utils_internal_cpu/Fill_cpu.hpp   |  52 ++++---
 src/backend/utils_internal_gpu/cuFill_gpu.cu  | 138 +++++------------
 src/backend/utils_internal_gpu/cuFill_gpu.hpp |  38 +++--
 16 files changed, 291 insertions(+), 495 deletions(-)
 delete mode 100644 src/backend/utils_internal_cpu/Fill_cpu.cpp

diff --git a/src/backend/BoolStorage.cpp b/src/backend/BoolStorage.cpp
index 6285101ae..b39a2c827 100644
--- a/src/backend/BoolStorage.cpp
+++ b/src/backend/BoolStorage.cpp
@@ -411,10 +411,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_double &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -424,10 +424,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_float &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -437,10 +437,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_int64 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -450,10 +450,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_uint64 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -463,10 +463,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_int32 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -476,10 +476,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_uint32 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -489,10 +489,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_int16 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -502,10 +502,10 @@ namespace cytnx {
   void BoolStorage::fill(const cytnx_uint16 &val) {
     cytnx_bool tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -514,10 +514,10 @@ namespace cytnx {
   }
   void BoolStorage::fill(const cytnx_bool &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_b(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_b(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/ComplexDoubleStorage.cpp b/src/backend/ComplexDoubleStorage.cpp
index e1f4d1b3d..9fe023b5b 100644
--- a/src/backend/ComplexDoubleStorage.cpp
+++ b/src/backend/ComplexDoubleStorage.cpp
@@ -395,11 +395,11 @@ namespace cytnx {
 
   void ComplexDoubleStorage::fill(const cytnx_complex128 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -409,11 +409,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_complex64 &val) {
     cytnx_complex128 tmp(val.real(), val.imag());
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -423,11 +423,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_double &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -437,11 +437,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_float &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -451,11 +451,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_int64 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -465,11 +465,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_uint64 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -479,11 +479,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_int32 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -493,11 +493,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_uint32 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -507,11 +507,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_int16 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -521,11 +521,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_uint16 &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -535,11 +535,11 @@ namespace cytnx {
   void ComplexDoubleStorage::fill(const cytnx_bool &val) {
     cytnx_complex128 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cd(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/ComplexFloatStorage.cpp b/src/backend/ComplexFloatStorage.cpp
index 219eb304f..06755ffdf 100644
--- a/src/backend/ComplexFloatStorage.cpp
+++ b/src/backend/ComplexFloatStorage.cpp
@@ -397,11 +397,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_complex128 &val) {
     cytnx_complex64 tmp(val.real(), val.imag());
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -410,11 +410,11 @@ namespace cytnx {
   }
   void ComplexFloatStorage::fill(const cytnx_complex64 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -424,11 +424,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_double &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -438,11 +438,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_float &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -452,11 +452,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_int64 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -466,11 +466,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_uint64 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -480,11 +480,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_int32 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -494,11 +494,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_uint32 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -508,11 +508,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_int16 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -522,11 +522,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_uint16 &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -536,11 +536,11 @@ namespace cytnx {
   void ComplexFloatStorage::fill(const cytnx_bool &val) {
     cytnx_complex64 tmp(val, 0);
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_cf(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/DoubleStorage.cpp b/src/backend/DoubleStorage.cpp
index 69000037f..994a05bc1 100644
--- a/src/backend/DoubleStorage.cpp
+++ b/src/backend/DoubleStorage.cpp
@@ -405,11 +405,11 @@ namespace cytnx {
   }
   void DoubleStorage::fill(const cytnx_double &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -419,11 +419,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_float &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -433,11 +433,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_int64 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -447,11 +447,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_uint64 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -461,11 +461,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_int32 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -475,11 +475,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_uint32 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -489,11 +489,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_int16 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -503,11 +503,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_uint16 &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -517,11 +517,11 @@ namespace cytnx {
   void DoubleStorage::fill(const cytnx_bool &val) {
     cytnx_double tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_d(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/FloatStorage.cpp b/src/backend/FloatStorage.cpp
index ef7239fba..1b35b87f7 100644
--- a/src/backend/FloatStorage.cpp
+++ b/src/backend/FloatStorage.cpp
@@ -397,10 +397,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_double &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -409,10 +409,10 @@ namespace cytnx {
   }
   void FloatStorage::fill(const cytnx_float &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -422,10 +422,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_int64 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -435,10 +435,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_uint64 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -448,10 +448,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_int32 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -461,10 +461,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_uint32 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -474,10 +474,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_int16 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -487,10 +487,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_uint16 &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -500,10 +500,10 @@ namespace cytnx {
   void FloatStorage::fill(const cytnx_bool &val) {
     cytnx_float tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
-      utils_internal::cuFill_gpu_f(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Int16Storage.cpp b/src/backend/Int16Storage.cpp
index e8e909a94..d21a3c286 100644
--- a/src/backend/Int16Storage.cpp
+++ b/src/backend/Int16Storage.cpp
@@ -394,11 +394,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_double &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -408,11 +408,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_float &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -422,11 +422,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_int64 &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -436,11 +436,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_uint64 &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -450,11 +450,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_int32 &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -464,11 +464,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_uint32 &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -478,11 +478,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_uint16 &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -491,11 +491,11 @@ namespace cytnx {
   }
   void Int16Storage::fill(const cytnx_int16 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -505,11 +505,11 @@ namespace cytnx {
   void Int16Storage::fill(const cytnx_bool &val) {
     cytnx_int16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Int32Storage.cpp b/src/backend/Int32Storage.cpp
index 30088d61d..2535b78ff 100644
--- a/src/backend/Int32Storage.cpp
+++ b/src/backend/Int32Storage.cpp
@@ -397,11 +397,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_double &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -411,11 +411,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_float &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -425,11 +425,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_int64 &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -439,11 +439,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_uint64 &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -452,11 +452,11 @@ namespace cytnx {
   }
   void Int32Storage::fill(const cytnx_int32 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -466,11 +466,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_uint32 &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -480,11 +480,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_int16 &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -494,11 +494,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_uint16 &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -508,11 +508,11 @@ namespace cytnx {
   void Int32Storage::fill(const cytnx_bool &val) {
     cytnx_int32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Int64Storage.cpp b/src/backend/Int64Storage.cpp
index adbcba8a7..01b3860ff 100644
--- a/src/backend/Int64Storage.cpp
+++ b/src/backend/Int64Storage.cpp
@@ -397,11 +397,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_double &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -411,11 +411,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_float &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);  // int64 tmp, same as CPU branch
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -424,11 +424,11 @@ namespace cytnx {
   }
   void Int64Storage::fill(const cytnx_int64 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -438,11 +438,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_uint64 &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -452,11 +452,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_int32 &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -466,11 +466,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_uint32 &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -480,11 +480,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_uint16 &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -494,11 +494,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_int16 &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -508,11 +508,11 @@ namespace cytnx {
   void Int64Storage::fill(const cytnx_bool &val) {
     cytnx_int64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_i64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Uint16Storage.cpp b/src/backend/Uint16Storage.cpp
index e96ad2c93..1352674d2 100644
--- a/src/backend/Uint16Storage.cpp
+++ b/src/backend/Uint16Storage.cpp
@@ -395,11 +395,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_double &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -409,11 +409,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_float &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -423,11 +423,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_int64 &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -437,11 +437,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_uint64 &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -451,11 +451,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_int32 &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -465,11 +465,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_uint32 &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -478,11 +478,11 @@ namespace cytnx {
   }
   void Uint16Storage::fill(const cytnx_uint16 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -492,11 +492,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_int16 &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -506,11 +506,11 @@ namespace cytnx {
   void Uint16Storage::fill(const cytnx_bool &val) {
     cytnx_uint16 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u16(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Uint32Storage.cpp b/src/backend/Uint32Storage.cpp
index f7157c12d..1c9103a56 100644
--- a/src/backend/Uint32Storage.cpp
+++ b/src/backend/Uint32Storage.cpp
@@ -401,11 +401,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_double &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -415,11 +415,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_float &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -429,11 +429,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_int64 &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -443,11 +443,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_uint64 &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -457,11 +457,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_int32 &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -470,11 +470,11 @@ namespace cytnx {
   }
   void Uint32Storage::fill(const cytnx_uint32 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -484,11 +484,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_uint16 &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -498,11 +498,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_int16 &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -512,11 +512,11 @@ namespace cytnx {
   void Uint32Storage::fill(const cytnx_bool &val) {
     cytnx_uint32 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u32(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/Uint64Storage.cpp b/src/backend/Uint64Storage.cpp
index 231a8cf95..d0c3fb43c 100644
--- a/src/backend/Uint64Storage.cpp
+++ b/src/backend/Uint64Storage.cpp
@@ -396,11 +396,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_double &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -410,11 +410,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_float &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -424,11 +424,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_int64 &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -437,11 +437,11 @@ namespace cytnx {
   }
   void Uint64Storage::fill(const cytnx_uint64 &val) {
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillCpu(this->Mem, val, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&val), this->len);
+      utils_internal::FillGpu(this->Mem, val, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -451,11 +451,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_int32 &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -465,11 +465,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_uint32 &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -479,11 +479,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_uint16 &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -493,11 +493,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_int16 &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
@@ -507,11 +507,11 @@ namespace cytnx {
   void Uint64Storage::fill(const cytnx_bool &val) {
     cytnx_uint64 tmp = val;
     if (this->device == Device.cpu) {
-      utils_internal::Fill_cpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillCpu(this->Mem, tmp, this->len);
     } else {
 #ifdef UNI_GPU
       checkCudaErrors(cudaSetDevice(this->device));
-      utils_internal::cuFill_gpu_u64(this->Mem, (void *)(&tmp), this->len);
+      utils_internal::FillGpu(this->Mem, tmp, this->len);
 #else
       cytnx_error_msg(true, "[ERROR][fill] fatal internal, %s",
                       "storage is on gpu without CUDA support\n");
diff --git a/src/backend/utils_internal_cpu/CMakeLists.txt b/src/backend/utils_internal_cpu/CMakeLists.txt
index 993be4911..752d9bccb 100644
--- a/src/backend/utils_internal_cpu/CMakeLists.txt
+++ b/src/backend/utils_internal_cpu/CMakeLists.txt
@@ -14,7 +14,6 @@ target_sources_local(cytnx
     Complexmem_cpu.hpp
     Alloc_cpu.cpp
     Cast_cpu.cpp
-    Fill_cpu.cpp
     GetElems_cpu.cpp
     GetElems_contiguous_cpu.cpp
     Movemem_cpu.cpp
diff --git a/src/backend/utils_internal_cpu/Fill_cpu.cpp b/src/backend/utils_internal_cpu/Fill_cpu.cpp
deleted file mode 100644
index 6fec2f46e..000000000
--- a/src/backend/utils_internal_cpu/Fill_cpu.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-#include "Fill_cpu.hpp"
-#include "backend/Storage.hpp"
-#ifdef UNI_OMP
-  #include <omp.h>
-#endif
-
-using namespace std;
-
-namespace cytnx {
-  namespace utils_internal {
-
-    void Fill_cpu_cd(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_complex128* ptr = (cytnx_complex128*)in;
-      cytnx_complex128 _val = *((cytnx_complex128*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_cf(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_complex64* ptr = (cytnx_complex64*)in;
-      cytnx_complex64 _val = *((cytnx_complex64*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_d(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_double* ptr = (cytnx_double*)in;
-      cytnx_double _val = *((cytnx_double*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_f(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_float* ptr = (cytnx_float*)in;
-      cytnx_float _val = *((cytnx_float*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_i64(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int64* ptr = (cytnx_int64*)in;
-      cytnx_int64 _val = *((cytnx_int64*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_u64(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint64* ptr = (cytnx_uint64*)in;
-      cytnx_uint64 _val = *((cytnx_uint64*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_i32(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int32* ptr = (cytnx_int32*)in;
-      cytnx_int32 _val = *((cytnx_int32*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_u32(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint32* ptr = (cytnx_uint32*)in;
-      cytnx_uint32 _val = *((cytnx_uint32*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_i16(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int16* ptr = (cytnx_int16*)in;
-      cytnx_int16 _val = *((cytnx_int16*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_u16(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint16* ptr = (cytnx_uint16*)in;
-      cytnx_uint16 _val = *((cytnx_uint16*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-    void Fill_cpu_b(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_bool* ptr = (cytnx_bool*)in;
-      cytnx_bool _val = *((cytnx_bool*)val);
-
-#ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
-#endif
-      for (cytnx_uint64 i = 0; i < Nelem; i++) {
-        ptr[i] = _val;
-      }
-    }
-
-  }  // namespace utils_internal
-}  // namespace cytnx
diff --git a/src/backend/utils_internal_cpu/Fill_cpu.hpp b/src/backend/utils_internal_cpu/Fill_cpu.hpp
index 5063289b5..5d25b72d9 100644
--- a/src/backend/utils_internal_cpu/Fill_cpu.hpp
+++ b/src/backend/utils_internal_cpu/Fill_cpu.hpp
@@ -1,29 +1,39 @@
-#ifndef _H_Fill_cpu_
-#define _H_Fill_cpu_
+#ifndef SRC_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
+#define SRC_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
 
-#include <cstdio>
-#include <cstdlib>
-#include <stdint.h>
-#include <climits>
 #include "Type.hpp"
-#include "backend/Storage.hpp"
-#include "cytnx_error.hpp"
+
+#ifdef UNI_OMP
+  #include <omp.h>
+#endif
+
 namespace cytnx {
   namespace utils_internal {
 
-    void Fill_cpu_cd(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_cf(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_d(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_f(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_i64(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_u64(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_i32(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_u32(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_u16(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_i16(void *in, void *val, const cytnx_uint64 &Nelem);
-    void Fill_cpu_b(void *in, void *val, const cytnx_uint64 &Nelem);
+    /**
+     * @brief Assign the given value to the first `count` elements in the range beginning at
+     * `first`.
+     *
+     * This function acts the same as `std::fill_n`. The execution will be parallelized when OMP is
+     * enabled.
+     *
+     * @tparam DType the data type of the elements in the range
+     *
+     * @param first the beginning of the range
+     * @param value the value to be assigned
+     * @param count the number of elements to modify
+     */
+    template <typename DType>
+    void FillCpu(void *first, const DType &value, cytnx_uint64 count) {
+      DType *typed_first = reinterpret_cast<DType *>(first);
+#ifdef UNI_OMP
+  #pragma omp parallel for schedule(dynamic)
+#endif
+      for (cytnx_uint64 i = 0; i < count; i++) {
+        typed_first[i] = value;
+      }
+    }
   }  // namespace utils_internal
-
 }  // namespace cytnx
 
-#endif
+#endif  // SRC_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
diff --git a/src/backend/utils_internal_gpu/cuFill_gpu.cu b/src/backend/utils_internal_gpu/cuFill_gpu.cu
index 680c97c4b..4faa445fb 100644
--- a/src/backend/utils_internal_gpu/cuFill_gpu.cu
+++ b/src/backend/utils_internal_gpu/cuFill_gpu.cu
@@ -1,117 +1,51 @@
-#include "cuFill_gpu.hpp"
-#include "backend/Storage.hpp"
-#ifdef UNI_OMP
-  #include <omp.h>
-#endif
+#include "backend/utils_internal_gpu/cuFill_gpu.hpp"
 
-using namespace std;
-namespace cytnx {
-  namespace utils_internal {
-
-    template <class T3>
-    __global__ void cuFill_kernel(T3* des, T3 val, cytnx_uint64 Nelem) {
-      if (blockIdx.x * blockDim.x + threadIdx.x < Nelem) {
-        des[blockIdx.x * blockDim.x + threadIdx.x] = val;
-      }
-    }
-
-    //========================================================================
-    void cuFill_gpu_cd(void* in, void* val, const cytnx_uint64& Nelem) {
-      cuDoubleComplex* ptr = (cuDoubleComplex*)in;
-      cuDoubleComplex _val = *((cuDoubleComplex*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
-
-    void cuFill_gpu_cf(void* in, void* val, const cytnx_uint64& Nelem) {
-      cuFloatComplex* ptr = (cuFloatComplex*)in;
-      cuFloatComplex _val = *((cuFloatComplex*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
-
-    void cuFill_gpu_d(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_double* ptr = (cytnx_double*)in;
-      cytnx_double _val = *((cytnx_double*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
+#include <complex>
 
-    void cuFill_gpu_f(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_float* ptr = (cytnx_float*)in;
-      cytnx_float _val = *((cytnx_float*)val);
+#include "cuda/std/complex"
 
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
-
-    void cuFill_gpu_i64(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int64* ptr = (cytnx_int64*)in;
-      cytnx_int64 _val = *((cytnx_int64*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
-
-    void cuFill_gpu_u64(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint64* ptr = (cytnx_uint64*)in;
-      cytnx_uint64 _val = *((cytnx_uint64*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
+#include "Type.hpp"
 
-    void cuFill_gpu_i32(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int32* ptr = (cytnx_int32*)in;
-      cytnx_int32 _val = *((cytnx_int32*)val);
+namespace cytnx {
+  namespace utils_internal {
 
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
+    template <class CudaDType>
+    __global__ void FillGpuKernel(CudaDType* first, CudaDType value, cytnx_uint64 count) {
+      if (blockIdx.x * blockDim.x + threadIdx.x < count) {
+        first[blockIdx.x * blockDim.x + threadIdx.x] = value;
+      }
     }
 
-    void cuFill_gpu_u32(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint32* ptr = (cytnx_uint32*)in;
-      cytnx_uint32 _val = *((cytnx_uint32*)val);
+    template <typename DType>
+    struct ToCudaDType {
+      typedef DType type;
+    };
 
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
+    template <typename DType>
+    struct ToCudaDType<std::complex<DType>> {
+      typedef cuda::std::complex<DType> type;
+    };
 
-    void cuFill_gpu_i16(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_int16* ptr = (cytnx_int16*)in;
-      cytnx_int16 _val = *((cytnx_int16*)val);
+    template <typename DType>
+    void FillGpu(void* first, const DType& value, cytnx_uint64 count) {
+      using CudaDType = typename ToCudaDType<DType>::type;
 
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
+      CudaDType* typed_first = reinterpret_cast<CudaDType*>(first);
+      cytnx_uint64 block_count = (count + 511) / 512;
+      FillGpuKernel<<<block_count, 512>>>(typed_first, static_cast<CudaDType>(value), count);
     }
 
-    void cuFill_gpu_u16(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_uint16* ptr = (cytnx_uint16*)in;
-      cytnx_uint16 _val = *((cytnx_uint16*)val);
+    template void FillGpu<cytnx_complex128>(void*, const cytnx_complex128&, cytnx_uint64);
+    template void FillGpu<cytnx_complex64>(void*, const cytnx_complex64&, cytnx_uint64);
+    template void FillGpu<cytnx_double>(void*, const cytnx_double&, cytnx_uint64);
+    template void FillGpu<cytnx_float>(void*, const cytnx_float&, cytnx_uint64);
+    template void FillGpu<cytnx_uint64>(void*, const cytnx_uint64&, cytnx_uint64);
+    template void FillGpu<cytnx_int64>(void*, const cytnx_int64&, cytnx_uint64);
+    template void FillGpu<cytnx_uint32>(void*, const cytnx_uint32&, cytnx_uint64);
+    template void FillGpu<cytnx_int32>(void*, const cytnx_int32&, cytnx_uint64);
+    template void FillGpu<cytnx_uint16>(void*, const cytnx_uint16&, cytnx_uint64);
+    template void FillGpu<cytnx_int16>(void*, const cytnx_int16&, cytnx_uint64);
+    template void FillGpu<cytnx_bool>(void*, const cytnx_bool&, cytnx_uint64);
 
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
-    void cuFill_gpu_b(void* in, void* val, const cytnx_uint64& Nelem) {
-      cytnx_bool* ptr = (cytnx_bool*)in;
-      cytnx_bool _val = *((cytnx_bool*)val);
-
-      cytnx_uint64 NBlocks = Nelem / 512;
-      if (Nelem % 512) NBlocks += 1;
-      cuFill_kernel<<<NBlocks, 512>>>(ptr, _val, Nelem);
-    }
   }  // namespace utils_internal
 }  // namespace cytnx
diff --git a/src/backend/utils_internal_gpu/cuFill_gpu.hpp b/src/backend/utils_internal_gpu/cuFill_gpu.hpp
index 4d30bdcd6..9e25e4d04 100644
--- a/src/backend/utils_internal_gpu/cuFill_gpu.hpp
+++ b/src/backend/utils_internal_gpu/cuFill_gpu.hpp
@@ -1,28 +1,26 @@
-#ifndef _H_cuFill_gpu_
-#define _H_cuFill_gpu_
+#ifndef SRC_BACKEND_UTILS_INTERNAL_GPU_CUFILL_GPU_H_
+#define SRC_BACKEND_UTILS_INTERNAL_GPU_CUFILL_GPU_H_
 
-#include <cstdio>
-#include <cstdlib>
-#include <stdint.h>
-#include <climits>
 #include "Type.hpp"
-#include "backend/Storage.hpp"
-#include "cytnx_error.hpp"
 
 namespace cytnx {
   namespace utils_internal {
-    void cuFill_gpu_cd(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_cf(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_d(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_f(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_i64(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_u64(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_i32(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_u32(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_u16(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_i16(void* in, void* val, const cytnx_uint64&);
-    void cuFill_gpu_b(void* in, void* val, const cytnx_uint64&);
+
+    /**
+     * @brief Assign the given value to the first `count` elements in the range beginning at
+     * `first`.
+     *
+     * This function acts the same as `std::fill_n` and is implemented in CUDA.
+     *
+     * @tparam DType the data type of the elements in the range
+     *
+     * @param first the beginning of the range
+     * @param value the value to be assigned
+     * @param count the number of elements to modify
+     */
+    template <typename DType>
+    void FillGpu(void* first, const DType& value, cytnx_uint64 count);
   }  // namespace utils_internal
 }  // namespace cytnx
 
-#endif
+#endif  // SRC_BACKEND_UTILS_INTERNAL_GPU_CUFILL_GPU_H_

From 086034b6065fc84f5b0a52dbc083687dcc8e6bad Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Sun, 10 Nov 2024 16:15:14 +0800
Subject: [PATCH 2/3] Increase block size used in FillGpu() to 1024

The maximum x- or y-dimension of a block is 1024 for compute
capability 2.x and above. The compute capability requirements of
cuTENSOR and cuQuantum are much higher than 2.x. Furthermore, it's not
easy to find a device that only supports compute capability 1.3 or
below.
---
 src/backend/utils_internal_gpu/cuFill_gpu.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils_internal_gpu/cuFill_gpu.cu b/src/backend/utils_internal_gpu/cuFill_gpu.cu
index 4faa445fb..0a1b080cc 100644
--- a/src/backend/utils_internal_gpu/cuFill_gpu.cu
+++ b/src/backend/utils_internal_gpu/cuFill_gpu.cu
@@ -31,8 +31,8 @@ namespace cytnx {
       using CudaDType = typename ToCudaDType<DType>::type;
 
       CudaDType* typed_first = reinterpret_cast<CudaDType*>(first);
-      cytnx_uint64 block_count = (count + 511) / 512;
-      FillGpuKernel<<<block_count, 512>>>(typed_first, static_cast<CudaDType>(value), count);
+      cytnx_uint64 block_count = (count + 1023) / 1024;
+      FillGpuKernel<<<block_count, 1024>>>(typed_first, static_cast<CudaDType>(value), count);
     }
 
     template void FillGpu<cytnx_complex128>(void*, const cytnx_complex128&, cytnx_uint64);

From eed01c21e031ff2dd7001b443edc4dc6e7152742 Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Tue, 19 Nov 2024 14:16:31 +0800
Subject: [PATCH 3/3] Change the scheduling type to static for FillCpu()

Dynamic scheduling is about 1000 times slower than static scheduling in
this case. Below are the results and the code used for benchmarking.

```
Total time for FillCpu: 0.0185553 seconds
Total time for FillCpuDynamic: 21.2537 seconds
```

```cpp
/** compile command: g++ -std=c++17 -fopenmp -O3 -o fill.o fill.cpp && ./fill.o */

#include <chrono>
#include <cstdlib>
#include <iostream>

using namespace std;

/**
 * Benchmark variant: assign `value` to the first `count` elements starting at
 * `first`, with the loop iterations statically scheduled across OpenMP threads.
 *
 * @tparam DType element type of the destination buffer
 * @param first start of the destination buffer (must hold at least `count` DType elements)
 * @param value value copied into every element
 * @param count number of elements to overwrite; 0 is a no-op
 */
template <typename DType>
void FillCpu(void *first, const DType &value, size_t count) {
  DType *typed_first = static_cast<DType *>(first);
  // NOTE(review): the directive was missing from the commit message (git strips
  // lines that start with '#'); restored as static per the commit subject --
  // confirm. It is indented so that it survives a future message clean-up.
  #pragma omp parallel for schedule(static)
  for (size_t i = 0; i < count; ++i) {  // size_t index: no signed/unsigned mix, no int overflow
    typed_first[i] = value;
  }
}

/**
 * Benchmark variant: assign `value` to the first `count` elements starting at
 * `first`, with the loop iterations dynamically scheduled across OpenMP threads.
 *
 * @tparam DType element type of the destination buffer
 * @param first start of the destination buffer (must hold at least `count` DType elements)
 * @param value value copied into every element
 * @param count number of elements to overwrite; 0 is a no-op
 */
template <typename DType>
void FillCpuDynamic(void *first, const DType &value, size_t count) {
  DType *typed_first = static_cast<DType *>(first);
  // NOTE(review): the directive was missing from the commit message (git strips
  // lines that start with '#'); restored as dynamic per the function name --
  // confirm. It is indented so that it survives a future message clean-up.
  #pragma omp parallel for schedule(dynamic)
  for (size_t i = 0; i < count; ++i) {  // size_t index: no signed/unsigned mix, no int overflow
    typed_first[i] = value;
  }
}

/**
 * Benchmark driver: fills the same 100000-element int buffer 10000 times with
 * FillCpu and then with FillCpuDynamic, printing the total wall-clock time of
 * each run.
 *
 * @return 0 on success, 1 if the buffer could not be allocated
 */
int main() {
  const int count = 100000;
  const int num_iterations = 10000;
  const int value = 10;

  int *ptr = static_cast<int *>(malloc(sizeof(int) * count));
  if (ptr == nullptr) {
    // Nothing to measure without a destination buffer.
    return 1;
  }

  {
    auto start = chrono::high_resolution_clock::now();
    for (int iter = 0; iter < num_iterations; ++iter) {
      FillCpu(ptr, value, count);
    }
    auto end = chrono::high_resolution_clock::now();
    const std::chrono::duration<double> total_time = end - start;
    cout << "Total time for FillCpu: " << total_time.count() << " seconds" << endl;
  }

  {
    auto start = chrono::high_resolution_clock::now();
    for (int iter = 0; iter < num_iterations; ++iter) {
      FillCpuDynamic(ptr, value, count);
    }
    auto end = chrono::high_resolution_clock::now();
    const std::chrono::duration<double> total_time = end - start;
    cout << "Total time for FillCpuDynamic: " << total_time.count() << " seconds" << endl;
  }

  free(ptr);  // release the buffer; it was previously leaked
  return 0;
}
```
---
 src/backend/utils_internal_cpu/Fill_cpu.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/utils_internal_cpu/Fill_cpu.hpp b/src/backend/utils_internal_cpu/Fill_cpu.hpp
index 5d25b72d9..aeb75a651 100644
--- a/src/backend/utils_internal_cpu/Fill_cpu.hpp
+++ b/src/backend/utils_internal_cpu/Fill_cpu.hpp
@@ -27,7 +27,7 @@ namespace cytnx {
     void FillCpu(void *first, const DType &value, cytnx_uint64 count) {
       DType *typed_first = reinterpret_cast<DType *>(first);
 #ifdef UNI_OMP
-  #pragma omp parallel for schedule(dynamic)
+  #pragma omp parallel for schedule(static)
 #endif
       for (cytnx_uint64 i = 0; i < count; i++) {
         typed_first[i] = value;